From a767fd31c9fd69571298e6f4dabe9b822794d851 Mon Sep 17 00:00:00 2001
From: Chris Truter <crisptrutski@users.noreply.github.com>
Date: Mon, 18 Mar 2024 12:21:45 -0600
Subject: [PATCH] Support appending new CSV fields (#39951) (#40220)

---
 src/metabase/driver/h2.clj    |   8 +++
 src/metabase/upload.clj       | 125 ++++++++++++++++++++--------------
 test/metabase/upload_test.clj |  71 +++++++++++++------
 3 files changed, 129 insertions(+), 75 deletions(-)

diff --git a/src/metabase/driver/h2.clj b/src/metabase/driver/h2.clj
index ec9836388af..6a03cf100a2 100644
--- a/src/metabase/driver/h2.clj
+++ b/src/metabase/driver/h2.clj
@@ -580,3 +580,11 @@
   [_driver]
   ;; http://www.h2database.com/html/advanced.html#limits_limitations
   256)
+
+(defmethod driver/add-columns! :h2
+  [driver db-id table-name column-definitions & {:as settings}]
+  ;; H2 uses a different syntax for adding multiple columns, which is difficult to produce with HoneySQL, so as a
+  ;; simpler workaround we break the operation up into single-column statements.
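+  ;; Any keyword settings (e.g. :primary-key) are passed along unchanged to each single-column call.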
+  (let [f (get-method driver/add-columns! :sql-jdbc)]
+    (doseq [[k v] column-definitions]
+      (f driver db-id table-name {k v} settings))))
diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj
index b24d437259b..a7ba89b614d 100644
--- a/src/metabase/upload.clj
+++ b/src/metabase/upload.clj
@@ -570,28 +570,33 @@
 (defn- base-type->upload-type
   "Returns the most specific upload type for the given base type."
   [base-type]
-  (condp #(isa? %2 %1) base-type
-    :type/Float                  ::float
-    :type/BigInteger             ::int
-    :type/Integer                ::int
-    :type/Boolean                ::boolean
-    :type/DateTimeWithTZ         ::offset-datetime
-    :type/DateTime               ::datetime
-    :type/Date                   ::date
-    :type/Text                   ::text))
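+  ;; A nil base-type (e.g. for a CSV column that has no corresponding field yet) maps to nil instead of throwing.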
+  (when base-type
+    (condp #(isa? %2 %1) base-type
+      :type/Float                  ::float
+      :type/BigInteger             ::int
+      :type/Integer                ::int
+      :type/Boolean                ::boolean
+      :type/DateTimeWithTZ         ::offset-datetime
+      :type/DateTime               ::datetime
+      :type/Date                   ::date
+      :type/Text                   ::text)))
 
 (defn- not-blank [s]
   (when-not (str/blank? s)
     s))
 
 (defn- extra-and-missing-error-markdown [extra missing]
-  (->> [[(tru "The CSV file contains extra columns that are not in the table:") extra]
-        [(tru "The CSV file is missing columns that are in the table:") missing]]
-       (keep (fn [[header columns]]
-               (when (seq columns)
-                 (str/join "\n" (cons header (map #(str "- " %) columns))))))
-       (str/join "\n\n")
-       (not-blank)))
+  (when (seq missing)
+    (->> [[(tru "The CSV file is missing columns that are in the table:") missing]
+          ;; Even though we allow new columns to be implicitly added by uploads, we mention them in the error messages
+          ;; for missing fields, since a common cause of missing fields is a misspelled column name. Seeing the actual
+          ;; and expected names together could help customers spot the root cause more easily.
+          [(tru "There are new columns in the CSV file that are not in the table:") extra]]
+         (keep (fn [[header columns]]
+                 (when (seq columns)
+                   (str/join "\n" (cons header (map #(str "- " %) columns))))))
+         (str/join "\n\n")
+         (not-blank))))
 
 (def ^:private allowed-type-upgrades
   "A mapping of which types a column can be implicitly relaxed to, based on the content of appended values."
@@ -607,35 +612,50 @@
   ;; Assumes table-cols are unique when normalized
   (let [normalized-field-names (keys fields-by-normed-name)
         normalized-header      (map normalize-column-name header)
-        [extra missing _both] (data/diff (set normalized-header) (set normalized-field-names))]
+        [extra missing _both]  (data/diff (set normalized-header) (set normalized-field-names))]
     ;; check for duplicates
     (when (some #(< 1 %) (vals (frequencies normalized-header)))
       (throw (ex-info (tru "The CSV file contains duplicate column names.")
                       {:status-code 422})))
-    (when (or extra missing)
-      (let [error-message (extra-and-missing-error-markdown extra missing)]
-        (throw (ex-info error-message {:status-code 422}))))))
+    (when-let [error-message (extra-and-missing-error-markdown extra missing)]
+      (throw (ex-info error-message {:status-code 422})))))
 
 (defn- matching-or-upgradable? [current-type relaxed-type]
-  (or (= current-type relaxed-type)
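+  ;; A nil current-type corresponds to a brand-new column, so any inferred type is acceptable.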
+  (or (nil? current-type)
+      (= current-type relaxed-type)
       (when-let [f (allowed-type-upgrades current-type)]
         (f relaxed-type))))
 
-(defn- changed-field->new-type
-  "Given some fields and old and new types, filter out fields with unchanged types, then pair with the new types."
-  [fields old-types new-types]
-  (let [new-if-changed #(when (not= %1 %2) %2)]
-    (->> (map new-if-changed old-types new-types)
-         (map vector fields)
-         (filter second)
-         (into {}))))
-
-(defn- alter-columns! [driver database table field->new-type]
-  (driver/alter-columns! driver (:id database) (table-identifier table)
-                         (m/map-kv (fn [field column-type]
-                                     [(keyword (:name field))
-                                      (driver/upload-type->database-type driver column-type)])
-                                   field->new-type)))
+(defn- field-changes
+  "Given existing and newly inferred types for the given `field-names`, calculate which fields need to be added or updated, along with their new types."
+  [field-names existing-types new-types]
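+  ;; e.g. for field-names ["a" "b"], existing-types [::int nil], new-types [::float ::text]
+  ;; this returns {:added {"b" ::text}, :updated {"a" ::float}} (illustrative values).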
+  (reduce
+   (fn [m [f e n]]
+     (cond
+       (nil? e)   (assoc-in m [:added f] n)
+       (not= e n) (assoc-in m [:updated f] n)
+       :else      m))
+   {:added {}, :updated {}}
+   (map vector field-names existing-types new-types)))
+
+(defn- field->db-type [driver field->col-type]
+  (m/map-kv
+   (fn [field-name col-type]
+     [(keyword field-name)
+      (driver/upload-type->database-type driver col-type)])
+   field->col-type))
+
+(defn- add-columns! [driver database table field->type & args]
+  (when (seq field->type)
+    (apply driver/add-columns! driver (:id database) (table-identifier table)
+           (field->db-type driver field->type)
+           args)))
+
+(defn- alter-columns! [driver database table field->new-type & args]
+  (when (seq field->new-type)
+    (apply driver/alter-columns! driver (:id database) (table-identifier table)
+           (field->db-type driver field->new-type)
+           args)))
 
 (defn- append-csv!*
   [database table file]
@@ -652,24 +672,25 @@
                                 (not (contains? normed-name->field auto-pk-column-name)))
             _                  (check-schema (dissoc normed-name->field auto-pk-column-name) header)
             settings           (upload-parsing/get-settings)
-            old-column-types   (map (comp base-type->upload-type :base_type normed-name->field) normed-header)
+            old-types          (map (comp base-type->upload-type :base_type normed-name->field) normed-header)
             ;; in the happy, and most common, case all the values will match the existing types
             ;; for now we just plan for the worst and perform a fairly expensive operation to detect any type changes
             ;; we can come back and optimize this to an optimistic-with-fallback approach later.
-            detected-types     (column-types-from-rows settings old-column-types rows)
-            new-column-types   (map #(if (matching-or-upgradable? %1 %2) %2 %1) old-column-types detected-types)
-            _                  (when (and (not= old-column-types new-column-types)
-                                          ;; if we cannot coerce all the columns, don't bother coercing any of them
-                                          ;; we will instead throw an error when we try to parse as the old type
-                                          (= detected-types new-column-types))
-                                 (let [fields (map normed-name->field normed-header)]
-                                   (->> (changed-field->new-type fields old-column-types detected-types)
-                                        (alter-columns! driver database table))))
+            detected-types     (column-types-from-rows settings old-types rows)
+            new-types          (map #(if (matching-or-upgradable? %1 %2) %2 %1) old-types detected-types)
+            ;; avoid any schema modification unless we are able to fully upgrade the columns to support the given file
+            ;; choosing not to upgrade means we defer failure until we hit the first value that cannot be parsed as
+            ;; its previous type - there is scope to improve these error messages in the future.
+            modify-schema?     (and (not= old-types new-types) (= detected-types new-types))
+            _                  (when modify-schema?
+                                 (let [changes (field-changes normed-header old-types new-types)]
+                                   (add-columns! driver database table (:added changes))
+                                   (alter-columns! driver database table (:updated changes))))
             ;; this will fail if any of our required relaxations were rejected.
-            parsed-rows        (parse-rows settings new-column-types rows)
+            parsed-rows        (parse-rows settings new-types rows)
             row-count          (count parsed-rows)
             stats              {:num-rows          row-count
-                                :num-columns       (count new-column-types)
+                                :num-columns       (count new-types)
                                 :generated-columns (if create-auto-pk? 1 0)
                                 :size-mb           (file-size-mb file)
                                 :upload-seconds    (since-ms timer)}]
@@ -680,11 +701,9 @@
             (throw (ex-info (ex-message e) {:status-code 422}))))
 
         (when create-auto-pk?
-          (driver/add-columns! driver
-                               (:id database)
-                               (table-identifier table)
-                               {auto-pk-column-keyword (driver/upload-type->database-type driver ::auto-incrementing-int-pk)}
-                               :primary-key [auto-pk-column-keyword]))
+          (add-columns! driver database table
+                        {auto-pk-column-keyword ::auto-incrementing-int-pk}
+                        :primary-key [auto-pk-column-keyword]))
 
         (scan-and-sync-table! database table)
 
diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj
index 8fed29a9a37..4d72e4a01ea 100644
--- a/test/metabase/upload_test.clj
+++ b/test/metabase/upload_test.clj
@@ -617,7 +617,7 @@
             (is (some? table))))))))
 
 (deftest load-from-csv-offset-datetime-test
-  (testing "Upload a CSV file with a datetime column"
+  (testing "Upload a CSV file with an offset datetime column"
     (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
       (with-mysql-local-infile-on-and-off
         (mt/with-dynamic-redefs [driver/db-default-timezone (constantly "Z")
@@ -1133,7 +1133,6 @@
                     :user-id (str (mt/user->id :rasta))}
                    (last (snowplow-test/pop-event-data-and-user-id!))))))))))
 
-
 (deftest csv-upload-audit-log-test
   ;; Just test with h2 because these events are independent of the driver
   (mt/test-driver :h2
@@ -1351,40 +1350,53 @@
 (deftest append-column-mismatch-test
   (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
     (with-uploads-allowed
-      (testing "Append should fail if there are extra or missing columns in the CSV file"
+      (testing "Append should fail only if there are missing columns in the CSV file"
         (doseq [[csv-rows error-message]
-                {["_mb_row_id,id,name,extra column one,EXTRA COLUMN TWO"]
-                 (trim-lines "The CSV file contains extra columns that are not in the table:
-                              - extra_column_two
-                              - extra_column_one")
-
-                 [""]
+                {[""]
                  (trim-lines "The CSV file is missing columns that are in the table:
                               - id
                               - name")
 
+                 ;; Extra columns are fine, as long as no table columns are missing from the CSV.
+                 ["_mb_row_id,id,extra 1, extra 2,name"]
+                 nil
+
                  ["_mb_row_id,extra 1, extra 2"]
-                 (trim-lines "The CSV file contains extra columns that are not in the table:
+                 (trim-lines "The CSV file is missing columns that are in the table:
+                              - id
+                              - name
+
+                              There are new columns in the CSV file that are not in the table:
                               - extra_2
-                              - extra_1
+                              - extra_1")
 
-                              The CSV file is missing columns that are in the table:
-                              - id
-                              - name")}]
+                 ["_mb_row_id,id, extra 2"]
+                 (trim-lines "The CSV file is missing columns that are in the table:
+                              - name
+
+                              There are new columns in the CSV file that are not in the table:
+                              - extra_2")}]
           (with-upload-table!
             [table (create-upload-table!
                     {:col->upload-type (ordered-map/ordered-map
                                         :id         ::upload/int
                                         :name       ::upload/varchar-255)
                      :rows [[1,"some_text"]]})]
-            (let [file  (csv-file-with csv-rows (mt/random-name))]
-              (is (= {:message error-message
-                      :data {:status-code 422}}
-                     (catch-ex-info (append-csv! {:file     file
-                                                  :table-id (:id table)}))))
-              (testing "Check the data was not uploaded into the table"
-                (is (= [[1 "some_text"]]
-                       (rows-for-table table))))
+
+            (let [file (csv-file-with csv-rows (mt/random-name))]
+              (when error-message
+                (is (= {:message error-message
+                        :data    {:status-code 422}}
+                       (catch-ex-info (append-csv! {:file file :table-id (:id table)}))))
+                (testing "Check the data was not uploaded into the table"
+                  (is (= [[1 "some_text"]]
+                         (rows-for-table table)))))
+
+              (when-not error-message
+                (testing "Check the data was uploaded into the table"
+                  ;; No exception is thrown - but there were also no rows in the table to check
+                  (append-csv! {:file file :table-id (:id table)})))
+
               (io/delete-file file))))))))
 
 (deftest append-all-types-test
@@ -1639,6 +1651,21 @@
                    (rows-for-table table))))
           (io/delete-file file))))))
 
+(deftest append-new-column-test
+  (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
+    (with-uploads-allowed
+     (testing "Append should handle new columns being added in the latest CSV"
+       (with-upload-table! [table (create-upload-table!)]
+         ;; Reorder as well for good measure
+         (let [csv-rows ["game,name" "Witticisms,Fluke Skytalker"]
+               file     (csv-file-with csv-rows (mt/random-name))]
+           (testing "The new row is inserted with the values correctly reordered"
+             (is (= {:row-count 1} (append-csv! {:file file, :table-id (:id table)})))
+             (is (= [[1 "Obi-Wan Kenobi" nil]
+                     [2 "Fluke Skytalker" "Witticisms"]]
+                    (rows-for-table table))))
+           (io/delete-file file)))))))
+
 (deftest append-type-mismatch-test
   (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
     (with-mysql-local-infile-on-and-off
-- 
GitLab