From 8824bd9a19e6596754a62eb861a4e3838408ea86 Mon Sep 17 00:00:00 2001 From: dpsutton <dan@dpsutton.com> Date: Wed, 12 Jun 2024 17:21:52 -0500 Subject: [PATCH] Handle errors when inferring separator of csv (#44073) * Handle errors when inferring separator of csv This file ``` "c1","c2" "a,b,c","d" ``` was failing to upload for a silly reason: when we were checking for which separator it used, we had a parse error when using a semi-colon and that blew up the whole pipeline. * Use cal's helpful test suggestions --- src/metabase/upload.clj | 12 ++++++------ test/metabase/upload_test.clj | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 6 deletions(-) diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj index b7ca0b15d3b..6abd636469b 100644 --- a/src/metabase/upload.clj +++ b/src/metabase/upload.clj @@ -191,12 +191,12 @@ (let [count-columns (fn [s] ;; Create a separate reader per separator, as the line-breaking behaviour depends on the parser. (with-open [reader (bom/bom-reader file)] - (->> (csv/read-csv reader :separator s) - ;; we only consider the header row and the first data row - (take 2) - (map count) - ;; realize the list before the reader closes - doall)))] + (try (into [] + ;; take first two rows and count the number of columns in each to compare headers + ;; vs data rows. + (comp (take 2) (map count)) + (csv/read-csv reader :separator s)) + (catch Exception _e nil))))] (->> (map (juxt identity count-columns) separators) ;; We cannot have more data columns than header columns ;; We currently support files without any data rows, and these get a free pass. diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj index 573a5e4dfab..b537548669b 100644 --- a/test/metabase/upload_test.clj +++ b/test/metabase/upload_test.clj @@ -459,6 +459,39 @@ (is (= 2 (count (rows-for-table table))))))))))) +(deftest infer-separator-catch-exception-test + (testing "errors in [[upload/infer-separator]] should not prevent the upload (#44034)" + (mt/test-drivers (mt/normal-drivers-with-feature :uploads) + (with-mysql-local-infile-on-and-off + (with-upload-table! + [table (create-from-csv-and-sync-with-defaults! + :file (csv-file-with ["\"c1\",\"c2\"" + "\"a,b,c\",\"d\""]))] + (testing "Check the data was uploaded into the table correctly" + (is (= (header-with-auto-pk ["c1", "c2"]) + (column-names-for-table table))) + (is (= (rows-with-auto-pk [["a,b,c" "d"]]) + (rows-for-table table))))))))) + +(deftest infer-separator-test + (testing "doesn't error when checking alternative separators (#44034)" + (let [file (csv-file-with ["\"c1\",\"c2\"" + "\"a,b,c\",\"d\""])] + (is (= \, (#'upload/infer-separator file))))) + (doseq [[separator lines] example-files] + (testing (str "inferring " separator) + (let [f (csv-file-with lines) + s ({:tab \tab :semi-colon \; :comma \,} separator)] + (is (= s (#'upload/infer-separator f)))))) + ;; it's actually decently hard to make it not stumble on comma or semicolon. The strategy here is that the data + ;; column count is greater than the header column count regardless of the separators we choose + (let [lines ["," + ",,,;;;\t\t"]] + (testing "throws an error if there's no clear winner" + (let [f (csv-file-with lines)] + (is (thrown-with-msg? clojure.lang.ExceptionInfo #"Unable to recognise file separator" + (#'upload/infer-separator f))))))) + (deftest create-from-csv-date-test (testing "Upload a CSV file with a datetime column" (mt/test-drivers (mt/normal-drivers-with-feature :uploads) -- GitLab