Skip to content
Snippets Groups Projects
Unverified Commit 05ec3b33 authored by Chris Truter's avatar Chris Truter Committed by GitHub
Browse files

Basic character set handling (#47799)

parent a8305b62
No related branches found
No related tags found
No related merge requests found
...@@ -36,6 +36,7 @@ ...@@ -36,6 +36,7 @@
:exclusions [it.unimi.dsi/fastutil :exclusions [it.unimi.dsi/fastutil
org.slf4j/slf4j-api]} org.slf4j/slf4j-api]}
com.draines/postal {:mvn/version "2.0.5"} ; SMTP library com.draines/postal {:mvn/version "2.0.5"} ; SMTP library
com.github.albfernandez/juniversalchardet {:mvn/version "2.5.0"}
com.github.seancorfield/honeysql {:mvn/version "2.6.1126"} ; Honey SQL 2. SQL generation from Clojure data maps com.github.seancorfield/honeysql {:mvn/version "2.6.1126"} ; Honey SQL 2. SQL generation from Clojure data maps
com.github.seancorfield/next.jdbc {:mvn/version "1.3.925"} ; Talk to JDBC DBs com.github.seancorfield/next.jdbc {:mvn/version "1.3.925"} ; Talk to JDBC DBs
com.github.steffan-westcott/clj-otel-api {:mvn/version "0.2.6"} ; Telemetry library com.github.steffan-westcott/clj-otel-api {:mvn/version "0.2.6"} ; Telemetry library
......
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
[clj-bom.core :as bom] [clj-bom.core :as bom]
[clojure.data :as data] [clojure.data :as data]
[clojure.data.csv :as csv] [clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.string :as str] [clojure.string :as str]
[flatland.ordered.map :as ordered-map] [flatland.ordered.map :as ordered-map]
[java-time.api :as t] [java-time.api :as t]
...@@ -38,9 +39,10 @@ ...@@ -38,9 +39,10 @@
[metabase.util.malli.schema :as ms] [metabase.util.malli.schema :as ms]
[toucan2.core :as t2]) [toucan2.core :as t2])
(:import (:import
(java.io File) (java.io File InputStreamReader Reader)
(java.nio.charset StandardCharsets) (java.nio.charset StandardCharsets)
(org.apache.tika Tika))) (org.apache.tika Tika)
(org.mozilla.universalchardet UniversalDetector)))
(set! *warn-on-reflection* true) (set! *warn-on-reflection* true)
...@@ -279,6 +281,30 @@ ...@@ -279,6 +281,30 @@
(defn- file-mime-type [^File file] (defn- file-mime-type [^File file]
(.detect tika file)) (.detect tika file))
(defn- detect-charset
  "Best-effort guess at the character encoding of `file` (anything accepted by `io/input-stream`).

  Feeds the content to a `UniversalDetector` in 8 KiB chunks, stopping as soon as the detector reports it is
  done, or finalising it once the input is exhausted. Returns whatever charset name the detector reports (which
  may be nil), or nil if any exception is thrown along the way."
  ^String [file]
  (try
    (with-open [in (io/input-stream file)]
      (let [sniffer (UniversalDetector.)
            chunk   (byte-array 8192)]
        (loop []
          (let [n         (.read in chunk)
                got-data? (pos? n)]
            ;; Only hand data to the detector when the read actually produced some bytes.
            (when got-data?
              (.handleData sniffer chunk 0 n))
            (cond
              ;; End of input: tell the detector there is no more data before asking for its verdict.
              (not got-data?)
              (do
                (.dataEnd sniffer)
                (.getDetectedCharset sniffer))

              ;; The detector is already confident; no need to read the rest of the file.
              (.isDone sniffer)
              (.getDetectedCharset sniffer)

              :else
              (recur))))))
    ;; Detection is advisory only, so failures are deliberately swallowed and reported as nil.
    (catch Exception _)))
(defn- ->reader
  "Open a `Reader` over `file`, decoding it with the charset reported by [[detect-charset]].

  Falls back to UTF-8 when no charset is detected, or when the detected name is not one this JVM can decode.
  The check happens before the file is opened, so an unusable charset name can neither fail the upload with an
  `UnsupportedEncodingException` nor leave an input stream open behind it.

  The caller is responsible for closing the returned reader (e.g. via `with-open`)."
  ^Reader [^File file]
  ;; Just live with unrecognized characters
  (let [detected         (detect-charset file)
        usable?          (and (some? detected)
                              (try
                                (java.nio.charset.Charset/isSupported detected)
                                ;; `isSupported` throws for syntactically illegal names; treat those as unsupported.
                                (catch IllegalArgumentException _ false)))
        ^String charset  (if usable? detected "UTF-8")]
    (-> (bom/bom-input-stream file)
        (InputStreamReader. charset))))
(defn- assert-separator-chosen [s] (defn- assert-separator-chosen [s]
(or s (throw (IllegalArgumentException. "Unable to determine separator")))) (or s (throw (IllegalArgumentException. "Unable to determine separator"))))
...@@ -289,7 +315,7 @@ ...@@ -289,7 +315,7 @@
[readable] [readable]
(let [count-columns (fn [s] (let [count-columns (fn [s]
;; Create a separate reader per separator, as the line-breaking behavior depends on the parser. ;; Create a separate reader per separator, as the line-breaking behavior depends on the parser.
(with-open [reader (bom/bom-reader readable)] (with-open [reader (->reader readable)]
(try (into [] (try (into []
(comp (take max-inferred-lines) (comp (take max-inferred-lines)
(map count)) (map count))
...@@ -343,7 +369,7 @@ ...@@ -343,7 +369,7 @@
Returns the file size, number of rows, and number of columns." Returns the file size, number of rows, and number of columns."
[driver db table-name filename ^File csv-file] [driver db table-name filename ^File csv-file]
(let [parse (infer-parser filename csv-file)] (let [parse (infer-parser filename csv-file)]
(with-open [reader (bom/bom-reader csv-file)] (with-open [reader (->reader csv-file)]
(let [auto-pk? (auto-pk-column? driver db) (let [auto-pk? (auto-pk-column? driver db)
[header & rows] (cond-> (parse reader) [header & rows] (cond-> (parse reader)
auto-pk? auto-pk?
...@@ -483,7 +509,7 @@ ...@@ -483,7 +509,7 @@
It may involve redundantly reading the file, or even failing again if the file is unreadable." It may involve redundantly reading the file, or even failing again if the file is unreadable."
[filename ^File file] [filename ^File file]
(let [parse (infer-parser filename file)] (let [parse (infer-parser filename file)]
(with-open [reader (bom/bom-reader file)] (with-open [reader (->reader file)]
(let [rows (parse reader)] (let [rows (parse reader)]
{:size-mb (file-size-mb file) {:size-mb (file-size-mb file)
:num-columns (count (first rows)) :num-columns (count (first rows))
...@@ -722,7 +748,7 @@ ...@@ -722,7 +748,7 @@
(defn- update-with-csv! [database table filename file & {:keys [replace-rows?]}] (defn- update-with-csv! [database table filename file & {:keys [replace-rows?]}]
(try (try
(let [parse (infer-parser filename file)] (let [parse (infer-parser filename file)]
(with-open [reader (bom/bom-reader file)] (with-open [reader (->reader file)]
(let [timer (u/start-timer) (let [timer (u/start-timer)
driver (driver.u/database->driver database) driver (driver.u/database->driver database)
auto-pk? (auto-pk-column? driver database) auto-pk? (auto-pk-column? driver database)
......
...@@ -565,6 +565,21 @@ ...@@ -565,6 +565,21 @@
(is (= 2 (is (= 2
(count (rows-for-table table))))))))))) (count (rows-for-table table)))))))))))
;; Uploads the same small CSV from several fixture files and checks that the non-ASCII column headers come
;; through as readable display names in every case.
(deftest create-from-csv-display-name-encodings-test
(mt/test-drivers (mt/normal-drivers-with-feature :uploads)
(with-mysql-local-infile-on-and-off
;; NOTE(review): each fixture is presumably saved in the encoding its filename suggests -- confirm against
;; the resource files.
(doseq [filename ["csv/iso-8859-1.csv"
"csv/utf-8.csv"
"csv/utf-16.csv"]]
(testing (str "Filename: " filename "\n")
(with-upload-table!
[table (create-from-csv-and-sync-with-defaults!
:file (io/file (io/resource filename))
:auxiliary-sync-steps :synchronous)]
;; The accented characters in the expected headers are what would be corrupted if the file were decoded
;; with the wrong charset.
(testing "Headers are displayed correctly"
(is (= (header-with-auto-pk ["Dirección" "País"])
(column-display-names-for-table table))))))))))
(deftest infer-separator-catch-exception-test (deftest infer-separator-catch-exception-test
(testing "errors in [[upload/infer-separator]] should not prevent the upload (#44034)" (testing "errors in [[upload/infer-separator]] should not prevent the upload (#44034)"
(mt/test-drivers (mt/normal-drivers-with-feature :uploads) (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
......
"Dirección";"País"
"foo";"bar"
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
"Dirección";"País"
"foo";"bar"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment