diff --git a/deps.edn b/deps.edn
index f9832452b2a223664627db6490a86d10b1821fac..2994a82786ba1e796a394c9459e659db322b4a39 100644
--- a/deps.edn
+++ b/deps.edn
@@ -125,6 +125,7 @@
   org.apache.sshd/sshd-core {:mvn/version "2.12.1" ; ssh tunneling and test server
                              :exclusions [org.slf4j/slf4j-api org.slf4j/jcl-over-slf4j]}
+  org.apache.tika/tika-core {:mvn/version "2.9.2"}
   org.apache.xmlgraphics/batik-all {:mvn/version "1.17"} ; SVG -> image
   org.bouncycastle/bcpkix-jdk18on {:mvn/version "1.78"} ; Bouncy Castle crypto library -- explicit version of BC specified to resolve illegal reflective access errors
   org.bouncycastle/bcprov-jdk18on {:mvn/version "1.78"}
diff --git a/src/metabase/api/table.clj b/src/metabase/api/table.clj
index 743edac5cbebb7ca37e1006d91d9fd942d0a38d4..b1c4907b65f6301645a74ad5537a5724ffa6ba2f 100644
--- a/src/metabase/api/table.clj
+++ b/src/metabase/api/table.clj
@@ -588,6 +588,7 @@
   "This helper function exists to make testing the POST /api/table/:id/{action}-csv endpoints easier."
   [options :- [:map
               [:table-id ms/PositiveInt]
+              [:filename :string]
               [:file (ms/InstanceOfClass java.io.File)]
               [:action upload/update-action-schema]]]
   (try
@@ -607,6 +608,7 @@
   [id :as {raw-params :params}]
   {id ms/PositiveInt}
   (update-csv! {:table-id id
+                :filename (get-in raw-params ["file" :filename])
                 :file (get-in raw-params ["file" :tempfile])
                 :action ::upload/append}))
@@ -615,6 +617,7 @@
   [id :as {raw-params :params}]
   {id ms/PositiveInt}
   (update-csv! {:table-id id
+                :filename (get-in raw-params ["file" :filename])
                 :file (get-in raw-params ["file" :tempfile])
                 :action ::upload/replace}))
diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj
index c92d46e450805c69409e4def384ea2a3d98bbb37..fc027e2641ea03419bd162dafef3e4c9cc0b6af3 100644
--- a/src/metabase/upload.clj
+++ b/src/metabase/upload.clj
@@ -38,7 +38,8 @@
    [metabase.util.malli.schema :as ms]
    [toucan2.core :as t2])
   (:import
-   (java.io File)))
+   (java.io File)
+   (org.apache.tika Tika)))

 (set! *warn-on-reflection* true)
@@ -240,6 +241,19 @@
    (reduce max 0 data-row-column-counts)
    header-column-count])

+(def ^:private allowed-extensions #{nil "csv" "tsv" "txt"})
+
+(def ^:private allowed-mime-types #{"text/csv" "text/tab-separated-values" "text/plain"})
+
+(def ^:private ^Tika tika (Tika.))
+
+(defn- file-extension [filename]
+  (when filename
+    (-> filename (str/split #"\.") rest last)))
+
+(defn- file-mime-type [^File file]
+  (.detect tika file))
+
 (defn- infer-separator
   "Guess at what symbol is being used as a separator in the given CSV-like file.
   Our heuristic is to use the separator that gives us the most number of columns.
@@ -259,8 +273,10 @@
 (defn- infer-parser
   "Currently this only infers the separator, but in future it may also handle different quoting options."
-  [file]
-  (let [s (infer-separator file)]
+  [filename ^File file]
+  (let [s (if (= "tsv" (file-extension filename))
+            \tab
+            (infer-separator file))]
     (fn [stream]
       (csv/read-csv stream :separator s))))
@@ -288,8 +304,8 @@
 (defn- create-from-csv!
   "Creates a table from a CSV file. If the table already exists, it will throw an error. Returns the file size, number of rows, and number of columns."
-  [driver db table-name ^File csv-file]
-  (let [parse (infer-parser csv-file)]
+  [driver db table-name filename ^File csv-file]
+  (let [parse (infer-parser filename csv-file)]
     (with-open [reader (bom/bom-reader csv-file)]
       (let [auto-pk? (auto-pk-column? driver db)
            [header & rows] (cond-> (parse reader)
@@ -407,8 +423,8 @@
 (defn- fail-stats
   "If a given upload / append / replace fails, this function is used to create the failure event payload for snowplow. It may involve redundantly reading the file, or even failing again if the file is unreadable."
-  [^File file]
-  (let [parse (infer-parser file)]
+  [filename ^File file]
+  (let [parse (infer-parser filename file)]
     (with-open [reader (bom/bom-reader file)]
       (let [rows (parse reader)]
        {:size-mb (file-size-mb file)
@@ -418,12 +434,12 @@
 (defn- create-from-csv-and-sync!
   "This is separated from `create-csv-upload!` for testing"
-  [{:keys [db file schema table-name display-name]}]
+  [{:keys [db filename file schema table-name display-name]}]
   (let [driver (driver.u/database->driver db)
        schema (some->> schema (ddl.i/format-name driver))
        table-name (some->> table-name (ddl.i/format-name driver))
        schema+table-name (table-identifier {:schema schema :name table-name})
-       stats (create-from-csv! driver db schema+table-name file)
+       stats (create-from-csv! driver db schema+table-name filename file)
        ;; Sync immediately to create the Table and its Fields; the scan is settings-dependent and can be async
        table (sync-tables/create-table! db {:name table-name
                                             :schema (not-empty schema)
@@ -438,6 +454,20 @@
    {:table table
     :stats stats}))

+(defn- check-filetype [filename file]
+  (let [extension (file-extension filename)]
+    (when-not (contains? allowed-extensions extension)
+      (throw (ex-info (tru "Unsupported File Type")
+                      {:status-code 415 ; Unsupported Media Type
+                       :file-extension extension})))
+    ;; This might be expensive to compute, hence having this as a second case.
+    (let [mime-type (file-mime-type file)]
+      (when-not (contains? allowed-mime-types mime-type)
+        (throw (ex-info (tru "Unsupported File Type")
+                        {:status-code 415 ; Unsupported Media Type
+                         :file-extension extension
+                         :mime-type mime-type}))))))
+
 (mu/defn create-csv-upload!
   "Main entry point for CSV uploading.
@@ -472,6 +502,7 @@
                     (throw (ex-info (tru "The uploads database does not exist.")
                                     {:status-code 422})))]
    (check-can-create-upload database schema-name)
+   (check-filetype filename file)
    (collection/check-write-perms-for-collection collection-id)
    (try
      (let [timer (u/start-timer)
@@ -486,6 +517,7 @@
                               (u/lower-case-en))
            {:keys [stats table]} (create-from-csv-and-sync! {:db database
+                                                             :filename filename
                                                              :file file
                                                              :schema schema-name
                                                              :table-name table-name
@@ -518,7 +550,7 @@
              (assoc stats :model-id (:id card)))
            card)
          (catch Throwable e
-           (snowplow/track-event! ::snowplow/csv-upload-failed api/*current-user-id* (fail-stats file))
+           (snowplow/track-event! ::snowplow/csv-upload-failed api/*current-user-id* (fail-stats filename file))
           (throw e)))))

 ;;; +-----------------------------
@@ -624,9 +656,9 @@
  ;; Ideally we would do all the filtering in the query, but this would not allow us to leverage mlv2.
  (persisted-info/invalidate! {:card_id [:in model-ids]})))

-(defn- update-with-csv! [database table file & {:keys [replace-rows?]}]
+(defn- update-with-csv! [database table filename file & {:keys [replace-rows?]}]
   (try
-    (let [parse (infer-parser file)]
+    (let [parse (infer-parser filename file)]
       (with-open [reader (bom/bom-reader file)]
         (let [timer (u/start-timer)
              driver (driver.u/database->driver database)
@@ -699,7 +731,7 @@
        {:row-count row-count})))
    (catch Throwable e
-     (snowplow/track-event! ::snowplow/csv-append-failed api/*current-user-id* (fail-stats file))
+     (snowplow/track-event! ::snowplow/csv-append-failed api/*current-user-id* (fail-stats filename file))
      (throw e))))

 (defn- can-update-error
@@ -759,7 +791,7 @@
  ;; Attempt to delete the underlying data from the customer database.
  ;; We perform this before marking the table as inactive in the app db so that even if it false, the table is still
- ;; visible to administrators and the operation is easy to retry again later.
+ ;; visible to administrators, and the operation is easy to retry again later.
  (driver/drop-table! driver (:id database) table-name)
  ;; We mark the table as inactive synchronously, so that it will no longer shows up in the admin list.
@@ -793,16 +825,18 @@
   "Main entry point for updating an uploaded table with a CSV file. This will create an auto-incrementing primary key (auto-pk) column in the table for drivers that supported uploads before auto-pk columns were introduced by metabase#36249, if it does not already exist."
-  [{:keys [^File file table-id action]}
+  [{:keys [filename ^File file table-id action]}
   :- [:map
       [:table-id ms/PositiveInt]
+      [:filename :string]
       [:file (ms/InstanceOfClass File)]
       [:action update-action-schema]]]
  (let [table (api/check-404 (t2/select-one :model/Table :id table-id))
        database (table/database table)
        replace? (= ::replace action)]
    (check-can-update database table)
-   (update-with-csv! database table file :replace-rows? replace?)))
+   (check-filetype filename file)
+   (update-with-csv! database table filename file :replace-rows? replace?)))

 ;;; +--------------------------------
 ;;; | hydrate based_on_upload for FE
diff --git a/test/metabase/api/table_test.clj b/test/metabase/api/table_test.clj
index 74bacaa912920fa372a2dbd5709ed13845576b66..8153e3e6cf46abbdaacd2b9e979f675c4ac5744a 100644
--- a/test/metabase/api/table_test.clj
+++ b/test/metabase/api/table_test.clj
@@ -1090,7 +1090,7 @@
     (upload-test/create-upload-table!)))

 (defn- update-csv! [options]
-  (@#'api.table/update-csv! options))
+  (@#'api.table/update-csv! (merge {:filename "test.csv"} options)))

 (defn- update-csv-via-api!
   "Upload a small CSV file to the given collection ID. Default args can be overridden"
diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj
index 52658f86a06b275979e1a2e2f0664108683c00a5..2c0b796c5382c145778b31765757f53f50d06ef4 100644
--- a/test/metabase/upload_test.clj
+++ b/test/metabase/upload_test.clj
@@ -34,7 +34,7 @@
    [metabase.util.malli :as mu]
    [toucan2.core :as t2])
   (:import
-   (java.io ByteArrayInputStream File)))
+   (java.io ByteArrayInputStream File FileOutputStream)))

 (set! *warn-on-reflection* true)
@@ -118,6 +118,10 @@
    (#'upload/scan-and-sync-table! database table))
  (t2/select-one :model/Table (:id table))))

+(defn- tmp-file [prefix extension]
+  (doto (File/createTempFile prefix extension)
+    (.deleteOnExit)))
+
 (defn csv-file-with
   "Create a temp csv file with the given content and return the file"
   (^File [rows]
@@ -126,8 +130,7 @@
   (csv-file-with rows file-prefix io/writer))
  (^File [rows file-prefix writer-fn]
   (let [contents (str/join "\n" rows)
-        csv-file (doto (File/createTempFile file-prefix ".csv")
-                   (.deleteOnExit))]
+        csv-file (tmp-file file-prefix ".csv")]
     (with-open [^java.io.Writer w (writer-fn csv-file)]
       (.write w contents))
     csv-file)))
@@ -1034,7 +1037,7 @@
 (defn- update-csv!
   "Shorthand for synchronously updating a CSV"
   [action options]
-  (update-csv-synchronously! (assoc options :action action)))
+  (update-csv-synchronously! (merge {:filename "test.csv" :action action} options)))

 (deftest create-csv-upload!-schema-test
   (mt/test-drivers (mt/normal-drivers-with-feature :uploads :schemas)
@@ -1159,6 +1162,23 @@
                :upload-seconds pos?}}}
             (last-audit-event :upload-create)))))))))

+(defn- write-empty-gzip
+  "Writes the data for an empty gzip file"
+  [^File file]
+  (with-open [out (FileOutputStream. file)]
+    (.write out (byte-array
+                 [0x1F 0x8B ; GZIP magic number
+                  0x08      ; Compression method (deflate)
+                  0         ; Flags
+                  0 0 0 0   ; Modification time (none)
+                  0         ; Extra flags
+                  0xFF      ; Operating system (unknown)
+                  0x03 0    ; Compressed data (empty block)
+                  0 0 0 0   ; CRC32
+                  0 0 0 0   ; Input size
+                  ]))
+    file))
+
 (deftest ^:mb/once create-csv-upload!-failure-test
   (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
     (mt/with-empty-db
@@ -1191,7 +1211,20 @@
              #"^You do not have curate permissions for this Collection\.$"
              (do-with-uploaded-example-csv!
               {:user-id (mt/user->id :lucky) :schema-name "public", :table-prefix "uploaded_magic_"}
-              identity))))))))
+              identity)))))
+        (testing "File type must be allowed"
+          (is (thrown-with-msg?
+               clojure.lang.ExceptionInfo
+               #"Unsupported File Type"
+               (do-with-uploaded-example-csv!
+                {:file (tmp-file "illegal" ".jpg")}
+                identity)))
+          (is (thrown-with-msg?
+               clojure.lang.ExceptionInfo
+               #"Unsupported File Type"
+               (do-with-uploaded-example-csv!
+                {:file (write-empty-gzip (tmp-file "sneaky" ".csv"))}
+                identity)))))))

 (defn- find-schema-filters-prop [driver]
   (first (filter (fn [conn-prop]
@@ -1262,12 +1295,6 @@
    (driver/insert-into! driver db-id schema+table-name insert-col-names rows)
    (sync-upload-test-table! :database (mt/db) :table-name table-name :schema-name schema-name)))

-(defmacro maybe-apply-macro
-  [flag macro-fn & body]
-  `(if ~flag
-     (~macro-fn ~@body)
-     ~@body))
-
 (defn catch-ex-info* [f]
   (try
     (f)
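A rough, illustrative Clojure sketch of the two-step validation the patch adds in `check-filetype`: a cheap extension allow-list first, then Tika content sniffing as a backstop. It assumes `org.apache.tika/tika-core` is on the classpath; the `allowed-file?` helper is hypothetical and only mirrors the patch's logic (the real code throws an HTTP 415 instead of returning false), and the results noted in comments are typical Tika output, not guaranteed.

(require '[clojure.string :as str])
(import '(org.apache.tika Tika)
        '(java.io File))

;; Same allow-lists as the patch
(def allowed-extensions #{nil "csv" "tsv" "txt"})
(def allowed-mime-types #{"text/csv" "text/tab-separated-values" "text/plain"})

(def ^Tika tika (Tika.))

(defn file-extension [filename]
  (when filename
    (-> filename (str/split #"\.") rest last)))

;; Hypothetical predicate combining both checks
(defn allowed-file? [filename ^File file]
  (and (contains? allowed-extensions (file-extension filename))
       (contains? allowed-mime-types (.detect tika file))))

(comment
  ;; A plain CSV passes both checks; Tika typically reports "text/csv" or "text/plain" here.
  (let [f (doto (File/createTempFile "ok" ".csv") (.deleteOnExit))]
    (spit f "name,age\nAlice,30\n")
    (allowed-file? "ok.csv" f))
  ;; A gzip payload renamed to .csv passes the extension check but fails MIME sniffing,
  ;; which is why the upload endpoints reject it with 415 Unsupported Media Type.
  )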