Consider the first 10 lines when inferring upload separator (#46321)

ba66cf32 · Chris Truter · GitHub · b9a7566f · ba66cf32 · ba66cf32
Unverified Commit ba66cf32 authored 7 months ago by Chris Truter Committed by GitHub 7 months ago
--- a/src/metabase/upload.clj
+++ b/src/metabase/upload.clj
@@ -171,8 +171,16 @@
  (let [parsers (map #(upload-parsing/upload-type->parser % settings) col-upload-types)]
    (for [row rows]
      (for [[value parser] (u/map-all vector row parsers)]
-        (when-not (str/blank? value)
-          (parser value))))))
+        (do
+          (when-not parser
+            (throw (ex-info (format "Column count in data (%s) exceeds the number of in the header (%s)"
+                                    (count rows)
+                                    (count parsers))
+                            {:settings settings
+                             :col-upload-types rows
+                             :row row})))
+          (when-not (str/blank? value)
+            (parser value)))))))

 (defn- remove-indices
  "Removes the elements at the given indices from the collection. Indices is a set."
@@ -210,44 +218,44 @@

 (def ^:private separators ",;\t")

-(defn- assert-inferred-separator [maybe-s]
-  (or maybe-s
-      (throw (ex-info (tru "Unable to recognise file separator")
-                      {:status-code 422}))))
+;; This number was chosen arbitrarily. There is robustness / performance trade-off.
+(def ^:private max-inferred-lines 10)
+
+(defn- separator-priority
+  "Prefer separators according to the follow criteria, in order:
+
+   - Splitting the header at least once.
+   - Giving a consistent column split for all the lines.
+   - Not having more columns in any row than in the header.
+   - The maximum number of column splits.
+   - The number of fields in the header.
+   - The precedence order in how we define them, e.g. a bias towards comma.
+
+  This last preference is implicit in the order of [[separators]]"
+  [[header-column-count & data-row-column-counts]]
+  [(when header-column-count
+     (> header-column-count 1))
+   (apply = header-column-count data-row-column-counts)
+   (not (some #(> % header-column-count) data-row-column-counts))
+   (reduce max 0 data-row-column-counts)
+   header-column-count])

 (defn- infer-separator
  "Guess at what symbol is being used as a separator in the given CSV-like file.
  Our heuristic is to use the separator that gives us the most number of columns.
  Exclude separators which give incompatible column counts between the header and the first row."
-  [^File file]
+  [readable]
  (let [count-columns (fn [s]
-                        ;; Create a separate reader per separator, as the line-breaking behaviour depends on the parser.
-                        (with-open [reader (bom/bom-reader file)]
+                        ;; Create a separate reader per separator, as the line-breaking behavior depends on the parser.
+                        (with-open [reader (bom/bom-reader readable)]
                          (try (into []
-                                     ;; take first two rows and count the number of columns in each to compare headers
-                                     ;; vs data rows.
-                                     (comp (take 2) (map count))
+                                     (comp (take max-inferred-lines)
+                                           (map count))
                                     (csv/read-csv reader :separator s))
                               (catch Exception _e nil))))]
    (->> (map (juxt identity count-columns) separators)
-         ;; We cannot have more data columns than header columns
-         ;; We currently support files without any data rows, and these get a free pass.
-         (remove (fn [[_s [header-column-count data-column-count]]]
-                   (when data-column-count
-                     (> data-column-count header-column-count))))
-         ;; Prefer separators according to the follow criteria, in order:
-         ;; - Splitting the header at least once
-         ;; - Giving a consistent column split for the first two lines of the file
-         ;; - The number of fields in the header
-         ;; - The precedence order in how we define them, e.g.. bias towards comma
-         (sort-by (fn [[_ [header-column-count data-column-count]]]
-                    [(when header-column-count
-                       (> header-column-count 1))
-                     (= header-column-count data-column-count)
-                     header-column-count])
-                  u/reverse-compare)
-         ffirst
-         assert-inferred-separator)))
+         (sort-by (comp separator-priority second) u/reverse-compare)
+         ffirst)))

 (defn- infer-parser
  "Currently this only infers the separator, but in future it may also handle different quoting options."

--- a/test/metabase/upload_test.clj
+++ b/test/metabase/upload_test.clj
@@ -34,7 +34,7 @@
   [metabase.util.malli :as mu]
   [toucan2.core :as t2])
  (:import
-   (java.io File)))
+   (java.io ByteArrayInputStream File)))

 (set! *warn-on-reflection* true)

@@ -366,7 +366,7 @@
       (finally
         ;; I'm experimenting with disabling this, it seems preposterous that this would actually cause test flakes --
         ;; Cam
-         (do #_when #_(not= driver/*driver* :redshift) ; redshift tests flake when tables are dropped
+         (when true #_(not= driver/*driver* :redshift) ; redshift tests flake when tables are dropped
           (driver/drop-table! driver/*driver*
                               (:db_id table)
                               (#'upload/table-identifier table))))))
@@ -565,24 +565,68 @@
            (is (= (rows-with-auto-pk [["a,b,c" "d"]])
                   (rows-for-table table)))))))))

+(defn reusable-string-reader
+  "Because life is too short for zillions of temp files."
+  [^String s]
+  (let [bytes (.getBytes s "UTF-8")]
+    (reify
+      io/IOFactory
+      (make-input-stream [_ _opts]
+        (ByteArrayInputStream. bytes))
+      (make-reader [this opts]
+        (io/reader (io/make-input-stream this opts))))))
+
+(defn- infer-separator [rows]
+  (#'upload/infer-separator (reusable-string-reader (str/join "\n" rows))))
+
 (deftest infer-separator-test
  (testing "doesn't error when checking alternative separators (#44034)"
-    (let [file (csv-file-with ["\"c1\",\"c2\""
-                               "\"a,b,c\",\"d\""])]
-      (is (= \, (#'upload/infer-separator file)))))
+    (let [rows ["\"c1\",\"c2\""
+                "\"a,b,c\",\"d\""]]
+      (is (= \, (infer-separator rows)))))
  (doseq [[separator lines] example-files]
    (testing (str "inferring " separator)
-      (let [f (csv-file-with lines)
-            s ({:tab \tab :semi-colon \; :comma \,} separator)]
-        (is (= s (#'upload/infer-separator f))))))
+      (let [s ({:tab \tab :semi-colon \; :comma \,} separator)]
+        (is (= s (infer-separator lines))))))
  ;; it's actually decently hard to make it not stumble on comma or semicolon. The strategy here is that the data
  ;; column count is greater than the header column count regardless of the separators we choose
  (let [lines [","
               ",,,;;;\t\t"]]
-    (testing "throws an error if there's no clear winner"
-      (let [f (csv-file-with lines)]
-        (is (thrown-with-msg? clojure.lang.ExceptionInfo #"Unable to recognise file separator"
-                              (#'upload/infer-separator f)))))))
+    (testing "will defer data width errors to insertion time if other separators are degenerate"
+      (is (= \, (infer-separator lines))))))
+
+(deftest infer-separator-priority-test
+  (testing "Multiple header columns"
+    ;; Despite inconsistent counts, we pick \;
+    (is (= \; (infer-separator ["a;b" "1"]))))
+  (testing "Consistent column counts"
+    ;; despite more data columns for the other separators, we pick \;
+    (is (= \; (infer-separator ["a;b,c\td"
+                                "1;2,3,4\t5\t6"]))))
+  (testing "Headers for every column"
+    ;; despite more data columns for other separators, we pick \;
+    (is (= \; (infer-separator ["a,b;c\td"
+                                "1,2,3;4\t5"]))))
+  (testing "Greatest number of data columns"
+    ;; despite more headers for \, we pick \;
+    (is (= \; (infer-separator ["a;b;c;d,e,f,g,h\ti\tj"
+                                "1;2;3,4\t5"]))))
+  (testing "Greatest number of header columns"
+    (is (= \; (infer-separator ["a,b;c;d\te"]))))
+  (testing "Precedence"
+    (is (= \, (infer-separator [])))
+    (is (= \; (infer-separator ["a\tb;c"
+                                "1\t2;3"])))))
+
+(deftest infer-separator-multiline-test
+  (testing "it picks the only viable separator forced by a quote"
+    (is (= \; (infer-separator ["name, first;surname"
+                                "bond, james;bond"
+                                "\"semi;\";colon"]))))
+  (testing "it considers consistency across the split count"
+    (is (= \; (infer-separator ["product name; amount, in dollars"
+                                "blunderbuss;  1,000"
+                                "cyberwagon;   1,000,000"])))))

 (deftest create-from-csv-date-test
  (testing "Upload a CSV file with a datetime column"