Skip to content
Snippets Groups Projects
Unverified Commit 05ec3b33 authored by Chris Truter's avatar Chris Truter Committed by GitHub
Browse files

Basic character set handling (#47799)

parent a8305b62
No related branches found
No related tags found
No related merge requests found
......@@ -36,6 +36,7 @@
:exclusions [it.unimi.dsi/fastutil
org.slf4j/slf4j-api]}
com.draines/postal {:mvn/version "2.0.5"} ; SMTP library
com.github.albfernandez/juniversalchardet {:mvn/version "2.5.0"}
com.github.seancorfield/honeysql {:mvn/version "2.6.1126"} ; Honey SQL 2. SQL generation from Clojure data maps
com.github.seancorfield/next.jdbc {:mvn/version "1.3.925"} ; Talk to JDBC DBs
com.github.steffan-westcott/clj-otel-api {:mvn/version "0.2.6"} ; Telemetry library
......
......@@ -3,6 +3,7 @@
[clj-bom.core :as bom]
[clojure.data :as data]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.string :as str]
[flatland.ordered.map :as ordered-map]
[java-time.api :as t]
......@@ -38,9 +39,10 @@
[metabase.util.malli.schema :as ms]
[toucan2.core :as t2])
(:import
(java.io File)
(java.io File InputStreamReader Reader)
(java.nio.charset StandardCharsets)
(org.apache.tika Tika)))
(org.apache.tika Tika)
(org.mozilla.universalchardet UniversalDetector)))
(set! *warn-on-reflection* true)
......@@ -279,6 +281,30 @@
(defn- file-mime-type
  "Returns whatever `tika`'s `detect` method reports for `file`.
  NOTE(review): `tika` is defined elsewhere in this namespace -- presumably a shared
  `org.apache.tika.Tika` instance whose `detect` yields a MIME type string; confirm."
  [^File file]
  (. tika detect file))
(defn- detect-charset
  "Best-effort guess at the character encoding of `file`.

  Feeds the file to a `UniversalDetector` in 8192-byte chunks, stopping as soon as the
  detector reports that it is done, or at end of input. Returns whatever charset name the
  detector reports at that point. Any exception raised along the way is swallowed and
  `nil` is returned instead, so callers must be prepared to fall back to a default."
  ^String [file]
  (try
    (with-open [in (io/input-stream file)]
      (let [chunk   (byte-array 8192)
            sniffer (UniversalDetector.)]
        (loop []
          (let [n (.read in chunk)]
            (if-not (pos? n)
              ;; Nothing more to read: tell the detector the data is complete, then ask for its verdict.
              (do
                (.dataEnd sniffer)
                (.getDetectedCharset sniffer))
              (do
                (.handleData sniffer chunk 0 n)
                ;; Keep reading only while the detector is still undecided.
                (if-not (.isDone sniffer)
                  (recur)
                  (.getDetectedCharset sniffer))))))))
    ;; Deliberately best-effort: a failed detection yields nil rather than an error.
    (catch Exception _)))
(defn- ->reader
  "Opens `file` as a character `Reader`, decoding it with the charset reported by
  [[detect-charset]].

  Falls back to UTF-8 when no charset was detected, or when the detected name is not one
  this JVM can decode. Without that check, `InputStreamReader.` would throw an
  `UnsupportedEncodingException` for charset names the runtime does not provide, failing the
  whole read instead of degrading gracefully.

  The bytes are read through `bom/bom-input-stream`. The caller owns the returned reader and
  is responsible for closing it (e.g. via `with-open`)."
  ^Reader [^File file]
  ;; Just live with unrecognized characters
  (let [detected   (detect-charset file)
        supported? (and (some? detected)
                        (try
                          (java.nio.charset.Charset/isSupported detected)
                          ;; `isSupported` throws on syntactically illegal charset names;
                          ;; treat those the same as unsupported ones.
                          (catch IllegalArgumentException _ false)))
        charset    (if supported? detected "UTF-8")]
    (-> (bom/bom-input-stream file)
        (InputStreamReader. ^String charset))))
(defn- assert-separator-chosen
  "Returns `s` unchanged when it is truthy; otherwise throws an `IllegalArgumentException`
  signalling that no separator could be determined."
  [s]
  (when-not s
    (throw (IllegalArgumentException. "Unable to determine separator")))
  s)
......@@ -289,7 +315,7 @@
[readable]
(let [count-columns (fn [s]
;; Create a separate reader per separator, as the line-breaking behavior depends on the parser.
(with-open [reader (bom/bom-reader readable)]
(with-open [reader (->reader readable)]
(try (into []
(comp (take max-inferred-lines)
(map count))
......@@ -343,7 +369,7 @@
Returns the file size, number of rows, and number of columns."
[driver db table-name filename ^File csv-file]
(let [parse (infer-parser filename csv-file)]
(with-open [reader (bom/bom-reader csv-file)]
(with-open [reader (->reader csv-file)]
(let [auto-pk? (auto-pk-column? driver db)
[header & rows] (cond-> (parse reader)
auto-pk?
......@@ -483,7 +509,7 @@
It may involve redundantly reading the file, or even failing again if the file is unreadable."
[filename ^File file]
(let [parse (infer-parser filename file)]
(with-open [reader (bom/bom-reader file)]
(with-open [reader (->reader file)]
(let [rows (parse reader)]
{:size-mb (file-size-mb file)
:num-columns (count (first rows))
......@@ -722,7 +748,7 @@
(defn- update-with-csv! [database table filename file & {:keys [replace-rows?]}]
(try
(let [parse (infer-parser filename file)]
(with-open [reader (bom/bom-reader file)]
(with-open [reader (->reader file)]
(let [timer (u/start-timer)
driver (driver.u/database->driver database)
auto-pk? (auto-pk-column? driver database)
......
......@@ -565,6 +565,21 @@
(is (= 2
(count (rows-for-table table)))))))))))
;; Uploads the same two-column CSV saved in several encodings and checks that the
;; non-ASCII header names come through intact as column display names.
(deftest create-from-csv-display-name-encodings-test
  (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
    (with-mysql-local-infile-on-and-off
     (doseq [filename ["csv/iso-8859-1.csv" "csv/utf-8.csv" "csv/utf-16.csv"]]
       (testing (str "Filename: " filename "\n")
         ;; Resolve the fixture from the classpath up front so the upload call stays readable.
         (let [csv-file (io/file (io/resource filename))]
           (with-upload-table!
             [table (create-from-csv-and-sync-with-defaults!
                     :file csv-file
                     :auxiliary-sync-steps :synchronous)]
             (testing "Headers are displayed correctly"
               (is (= (header-with-auto-pk ["Dirección" "País"])
                      (column-display-names-for-table table)))))))))))
(deftest infer-separator-catch-exception-test
(testing "errors in [[upload/infer-separator]] should not prevent the upload (#44034)"
(mt/test-drivers (mt/normal-drivers-with-feature :uploads)
......
"Dirección";"País"
"foo";"bar"
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
"Dirección";"País"
"foo";"bar"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment