Skip to content
Snippets Groups Projects
Unverified Commit ee48cb94 authored by github-automation-metabase's avatar github-automation-metabase Committed by GitHub
Browse files

Restrict inferred encodings for Uploads (#49024) (#49029)

parent d551db35
No related branches found
No related tags found
No related merge requests found
......@@ -3,7 +3,6 @@
[clj-bom.core :as bom]
[clojure.data :as data]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.string :as str]
[flatland.ordered.map :as ordered-map]
[java-time.api :as t]
......@@ -282,25 +281,23 @@
(defn- file-mime-type
  "Hands `file` to the `.detect` method of `tika` and returns the result unchanged.
  Going by the function name this is presumably a MIME type; confirm against the
  definition of `tika`, which is not visible in this section."
  [^File file]
  (.detect tika file))
(defn- detect-charset
  "Streams `file` through a `UniversalDetector` in chunks of up to 8192 bytes and
  returns whatever `.getDetectedCharset` reports afterwards.

  Reading stops as soon as the detector says it is done, or when a read yields no
  bytes, in which case the detector is told the data has ended. Any `Exception`
  raised along the way is swallowed and nil is returned instead."
  ^String [file]
  (try
    (let [sniffer (UniversalDetector.)
          chunk   (byte-array 8192)]
      (with-open [in (io/input-stream file)]
        (loop [n (.read in chunk)]
          (if-not (pos? n)
            ;; Nothing more to read: let the detector finalise its guess.
            (.dataEnd sniffer)
            (do
              (.handleData sniffer chunk 0 n)
              ;; Keep feeding only while the detector is still undecided.
              (when-not (.isDone sniffer)
                (recur (.read in chunk)))))))
      (.getDetectedCharset sniffer))
    (catch Exception _
      nil)))
(def ^:private supported-charsets
  "Charset names that `detect-charset` will return as-is when the detector reports
  them; any other detected name is replaced by the default there."
  #{"UTF-8"
    "UTF-16"
    "UTF-16BE"
    "UTF-16LE"
    "UTF-32"
    "UTF-32BE"
    "UTF-32LE"
    "WINDOWS-1252"})
(defn- detect-charset
  "Returns the charset name to decode `file` with. Never returns nil.

  The name reported by `UniversalDetector/detectCharset` is used only when it is a
  member of `supported-charsets`. In every other case — nothing detected, a name
  outside that set, or an `Exception` during detection — the result is \"UTF-8\"."
  ^String [^File file]
  (let [recognized (try
                     ;; If it's not a first-class supported encoding, just treat it as
                     ;; the default encoding.
                     (some-> (UniversalDetector/detectCharset file)
                             supported-charsets)
                     ;; If we can't detect the encoding, use the default, and live
                     ;; with unrecognized characters.
                     (catch Exception _
                       nil))]
    (if (some? recognized)
      recognized
      "UTF-8")))
(defn- ->reader ^Reader [^File file]
;; If we can't detect the encoding, just live with unrecognized characters.
(let [charset (or (detect-charset file) "UTF-8")]
(let [charset (detect-charset file)]
(-> (bom/bom-input-stream file)
(InputStreamReader. charset))))
......
......@@ -550,17 +550,17 @@
:file (csv-file-with lines)
:auxiliary-sync-steps :synchronous)]
(testing "Table and Fields exist after sync"
(is (=? (cond->> [["id" {:semantic_type :type/PK
:base_type :type/BigInteger}]
["nulls" {:base_type :type/Text}]
["string" {:base_type :type/Text}]
["bool" {:base_type :type/Boolean}]
["number" {:base_type :type/Float}]
["date" {:base_type :type/Date}]
["datetime" {:base_type :type/DateTime}]]
(is (=? (cond->> [["id" {:semantic_type :type/PK
:base_type :type/BigInteger}]
["nulls" {:base_type :type/Text}]
["string" {:base_type :type/Text}]
["bool" {:base_type :type/Boolean}]
["number" {:base_type :type/Float}]
["date" {:base_type :type/Date}]
["datetime" {:base_type :type/DateTime}]]
(auto-pk-column?)
(cons ["_mb_row_id" {:semantic_type :type/PK
:base_type :type/BigInteger}]))
(cons ["_mb_row_id" {:semantic_type :type/PK
:base_type :type/BigInteger}]))
(->> (t2/select :model/Field :table_id (:id table))
(sort-by :database_position)
(map (juxt (comp u/lower-case-en :name) identity))))))
......@@ -583,6 +583,17 @@
(is (= (header-with-auto-pk ["Dirección" "País"])
(column-display-names-for-table table))))))))))
(deftest detect-charset-test
  ;; Each case is [expected-charset resource-path]; the path is resolved with
  ;; `io/resource` before being handed to the private `upload/detect-charset`.
  (let [cases [["UTF-8"        "csv/utf-8.csv"]
               ["UTF-8"        "csv/48945-1.csv"]
               ["UTF-8"        "csv/48945-2.csv"]
               ["UTF-8"        "csv/48945-3.csv"]
               ["UTF-16BE"     "csv/utf-16.csv"]
               ;; The ISO-8859-1 fixture is expected to come back as WINDOWS-1252;
               ;; see https://stackoverflow.com/a/19111140
               ["WINDOWS-1252" "csv/iso-8859-1.csv"]]]
    (doseq [[encoding filename] cases]
      (testing (str "Correct charset detected for " filename)
        (is (= encoding (#'upload/detect-charset (io/file (io/resource filename)))))))))
(deftest infer-separator-catch-exception-test
(testing "errors in [[upload/infer-separator]] should not prevent the upload (#44034)"
(mt/test-drivers (mt/normal-drivers-with-feature :uploads)
......@@ -2178,7 +2189,7 @@
{:fail-msg "There's a value with the wrong type \\('double precision'\\) in the 'test_column' column"}
{:coerced 2.1})) ; column is promoted to float
{:upload-type int-type, :uncoerced "2.0", :coerced 2} ; value is coerced to int
{:upload-type float-type, :uncoerced "2", :coerced 2.0}
{:upload-type float-type, :uncoerced "2", :coerced 2.0} ; column is promoted to float
{:upload-type bool-type, :uncoerced "0", :coerced false}
{:upload-type bool-type, :uncoerced "1.0", :fail-msg "'1.0' is not a recognizable boolean"}
{:upload-type bool-type, :uncoerced "0.0", :fail-msg "'0.0' is not a recognizable boolean"}
......
a,b,c,d,e
"i ó",,,,
"i ó",,,,
"i ó",,,,
"i ó",,,,
tó space,b,c,d,é
"i ó",,,,
"i ó",,,,
"i ó",,,,
"i ó",,,,
a,b,c,d,Helló
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
"a",,,,
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment