From 95817d084e2c9a93ffef567da474f973b1598c08 Mon Sep 17 00:00:00 2001 From: Cal Herries <39073188+calherries@users.noreply.github.com> Date: Mon, 1 Jul 2024 14:05:33 -0600 Subject: [PATCH] Better display names for CSV upload tables (#44862) --- src/metabase/lib/util.cljc | 37 +------------- src/metabase/sync/sync_metadata/tables.clj | 59 ++++++++++++---------- src/metabase/upload.clj | 46 +++++++++++++---- src/metabase/util.cljc | 48 ++++++++++++++---- test/metabase/lib/util_test.cljc | 42 ++------------- test/metabase/upload_test.clj | 39 +++++++++++++- test/metabase/util_test.cljc | 44 ++++++++++++++-- 7 files changed, 190 insertions(+), 125 deletions(-) diff --git a/src/metabase/lib/util.cljc b/src/metabase/lib/util.cljc index 32a4f5b8cb8..8e6cede92b5 100644 --- a/src/metabase/lib/util.cljc +++ b/src/metabase/lib/util.cljc @@ -373,39 +373,6 @@ conjunction (last coll))))))) -(mu/defn ^:private string-byte-count :- [:int {:min 0}] - "Number of bytes in a string using UTF-8 encoding." - [s :- :string] - #?(:clj (count (.getBytes (str s) "UTF-8")) - :cljs (.. (js/TextEncoder.) (encode s) -length))) - -#?(:clj - (mu/defn ^:private string-character-at :- [:string {:min 0, :max 1}] - [s :- :string - i :-[:int {:min 0}]] - (str (.charAt ^String s i)))) - -(mu/defn ^:private truncate-string-to-byte-count :- :string - "Truncate string `s` to `max-length-bytes` UTF-8 bytes (as opposed to truncating to some number of - *characters*)." - [s :- :string - max-length-bytes :- [:int {:min 1}]] - #?(:clj - (loop [i 0, cumulative-byte-count 0] - (cond - (= cumulative-byte-count max-length-bytes) (subs s 0 i) - (> cumulative-byte-count max-length-bytes) (subs s 0 (dec i)) - (>= i (count s)) s - :else (recur (inc i) - (long (+ - cumulative-byte-count - (string-byte-count (string-character-at s i))))))) - - :cljs - (let [buf (js/Uint8Array. max-length-bytes) - result (.encodeInto (js/TextEncoder.) s buf)] ;; JS obj {read: chars_converted, write: bytes_written} - (subs s 0 (.-read result))))) - (def ^:private truncate-alias-max-length-bytes "Length to truncate column and table identifiers to. See [[metabase.driver.impl/default-alias-max-length-bytes]] for reasoning." @@ -444,10 +411,10 @@ ([s :- ::lib.schema.common/non-blank-string max-bytes :- [:int {:min 0}]] - (if (<= (string-byte-count s) max-bytes) + (if (<= (u/string-byte-count s) max-bytes) s (let [checksum (crc32-checksum s) - truncated (truncate-string-to-byte-count s (- max-bytes truncated-alias-hash-suffix-length))] + truncated (u/truncate-string-to-byte-count s (- max-bytes truncated-alias-hash-suffix-length))] (str truncated \_ checksum))))) (mu/defn legacy-string-table-id->card-id :- [:maybe ::lib.schema.id/card] diff --git a/src/metabase/sync/sync_metadata/tables.clj b/src/metabase/sync/sync_metadata/tables.clj index d99f514b227..1da7ea26c57 100644 --- a/src/metabase/sync/sync_metadata/tables.clj +++ b/src/metabase/sync/sync_metadata/tables.clj @@ -79,8 +79,8 @@ (mu/defn ^:private is-crufty-table? "Should we give newly created TABLE a `visibility_type` of `:cruft`?" - [table :- i/DatabaseMetadataTable] - (some #(re-find % (u/lower-case-en (:name table))) crufty-table-patterns)) + [table-name] + (some #(re-find % (u/lower-case-en table-name)) crufty-table-patterns)) ;;; ---------------------------------------------------- Syncing ----------------------------------------------------- @@ -94,34 +94,39 @@ {:details (assoc (:details database) :version (:version db-metadata))})) +(defn- cruft-dependent-columns [table-name] + ;; if this is a crufty table, mark initial sync as complete since we'll skip the subsequent sync steps + (let [is-crufty? (is-crufty-table? table-name)] + {:initial_sync_status (if is-crufty? "complete" "incomplete") + :visibility_type (when is-crufty? :cruft)})) + +(defn create-table! + "Creates a new table in the database, ready to be synced. + Throws an exception if there is already a table with the same name, schema and database ID." + [database table] + (t2/insert-returning-instance! + Table + (merge (cruft-dependent-columns (:name table)) + {:active true + :db_id (:id database) + :schema (:schema table) + :description (:description table) + :database_require_filter (:database_require_filter table) + :display_name (or (:display_name table) (humanization/name->human-readable-name (:name table))) + :name (:name table)}))) + (defn create-or-reactivate-table! "Create a single new table in the database, or mark it as active if it already exists." [database {schema :schema table-name :name :as table}] - (let [;; if this is a crufty table, mark initial sync as complete since we'll skip the subsequent sync steps - is-crufty? (is-crufty-table? table) - initial-sync-status (if is-crufty? "complete" "incomplete") - visibility-type (when is-crufty? :cruft)] - (if-let [existing-id (t2/select-one-pk Table - :db_id (u/the-id database) - :schema schema - :name table-name - :active false)] - ;; if the table already exists but is marked *inactive*, mark it as *active* - (t2/update! Table existing-id - {:active true - :visibility_type visibility-type - :initial_sync_status initial-sync-status}) - ;; otherwise create a new Table - (first (t2/insert-returning-instances! Table - :db_id (u/the-id database) - :schema schema - :description (:description table) - :database_require_filter (:database_require_filter table) - :name table-name - :display_name (humanization/name->human-readable-name table-name) - :active true - :visibility_type visibility-type - :initial_sync_status initial-sync-status))))) + (if-let [existing-id (t2/select-one-pk Table + :db_id (u/the-id database) + :schema schema + :name table-name + :active false)] + ;; if the table already exists but is marked *inactive*, mark it as *active* + (t2/update! Table existing-id (assoc (cruft-dependent-columns (:name table)) :active true)) + ;; otherwise create a new Table + (create-table! database table))) ;; TODO - should we make this logic case-insensitive like it is for fields? diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj index b9dcede3433..8a030308437 100644 --- a/src/metabase/upload.clj +++ b/src/metabase/upload.clj @@ -41,15 +41,30 @@ (set! *warn-on-reflection* true) -(def ^:private max-field-name-bytes - "This tracks the size of the metabase_field.name field, in bytes." - 254) +;; TODO: move these to a more appropriate namespace if they need to be reused +(defmulti max-bytes + "This tracks the size of various text fields in bytes." + {:arglists '([model column])} + (fn [model _column] model)) + +(defmethod max-bytes :model/Table [_ column] + (case column + :display_name 256 + :name 256)) + +(defmethod max-bytes :model/Field [_ column] + (case column + :name 254)) + +(defmethod max-bytes :model/Card [_ column] + (case column + :name 254)) (def ^:private min-safe (fnil min Long/MAX_VALUE Long/MAX_VALUE)) (defn- max-column-bytes [driver] (let [column-limit (some-> driver driver/column-name-length-limit)] - (min-safe column-limit max-field-name-bytes))) + (min-safe column-limit (max-bytes :model/Field :name)))) (defn- normalize-column-name [driver raw-name] @@ -110,7 +125,9 @@ slugified-name (or (u/slugify table-name) "") ;; since both the time-format and the slugified-name contain only ASCII characters, we can behave as if ;; [[driver/table-name-length-limit]] were defining a length in characters. - max-length (- (driver/table-name-length-limit driver) (count time-format)) + max-length (- (min-safe (driver/table-name-length-limit driver) + (max-bytes :model/Table :name)) + (count time-format)) acceptable-length (min (count slugified-name) max-length) truncated-name-without-time (subs slugified-name 0 acceptable-length)] (str truncated-name-without-time @@ -389,14 +406,16 @@ (defn- create-from-csv-and-sync! "This is separated from `create-csv-upload!` for testing" - [{:keys [db file schema table-name]}] + [{:keys [db file schema table-name display-name]}] (let [driver (driver.u/database->driver db) schema (some->> schema (ddl.i/format-name driver)) table-name (some->> table-name (ddl.i/format-name driver)) schema+table-name (table-identifier {:schema schema :name table-name}) stats (create-from-csv! driver db schema+table-name file) ;; Sync immediately to create the Table and its Fields; the scan is settings-dependent and can be async - table (sync-tables/create-or-reactivate-table! db {:name table-name :schema (not-empty schema)}) + table (sync-tables/create-table! db {:name table-name + :schema (not-empty schema) + :display_name display-name}) _set_is_upload (t2/update! :model/Table (:id table) {:is_upload true}) _sync (scan-and-sync-table! db table) ;; Set the display_name of the auto-generated primary key column to the same as its name, so that if users @@ -446,12 +465,19 @@ (let [timer (start-timer) filename-prefix (or (second (re-matches #"(.*)\.(csv|tsv)$" filename)) filename) + humanized-name (humanization/name->human-readable-name filename-prefix) + display-name (u/truncate-string-to-byte-count humanized-name (max-bytes :model/Table :display_name)) + card-name (u/truncate-string-to-byte-count humanized-name (max-bytes :model/Card :name)) driver (driver.u/database->driver database) - table-name (->> (str table-prefix filename-prefix) + table-name (->> (str table-prefix display-name) (unique-table-name driver) (u/lower-case-en)) {:keys [stats - table]} (create-from-csv-and-sync! {:db database :file file :schema schema-name :table-name table-name}) + table]} (create-from-csv-and-sync! {:db database + :file file + :schema schema-name + :table-name table-name + :display-name display-name}) card (card/create-card! {:collection_id collection-id :type :model @@ -460,7 +486,7 @@ :query {:source-table (:id table)} :type :query} :display :table - :name (humanization/name->human-readable-name filename-prefix) + :name card-name :visualization_settings {}} @api/*current-user*) upload-seconds (/ (since-ms timer) 1e3) diff --git a/src/metabase/util.cljc b/src/metabase/util.cljc index 5d712900509..2b34f0951ef 100644 --- a/src/metabase/util.cljc +++ b/src/metabase/util.cljc @@ -1,6 +1,14 @@ (ns metabase.util "Common utility functions useful throughout the codebase." (:require + #?@(:clj ([clojure.math.numeric-tower :as math] + [me.flowthing.pp :as pp] + [metabase.config :as config] + #_{:clj-kondo/ignore [:discouraged-namespace]} + [metabase.util.jvm :as u.jvm] + [metabase.util.string :as u.str] + [potemkin :as p] + [ring.util.codec :as codec])) [camel-snake-kebab.internals.macros :as csk.macros] [clojure.data :refer [diff]] [clojure.pprint :as pprint] @@ -15,15 +23,7 @@ [metabase.util.log :as log] [metabase.util.memoize :as memoize] [net.cgrand.macrovich :as macros] - [weavejester.dependency :as dep] - #?@(:clj ([clojure.math.numeric-tower :as math] - [me.flowthing.pp :as pp] - [metabase.config :as config] - #_{:clj-kondo/ignore [:discouraged-namespace]} - [metabase.util.jvm :as u.jvm] - [metabase.util.string :as u.str] - [potemkin :as p] - [ring.util.codec :as codec]))) + [weavejester.dependency :as dep]) #?(:clj (:import (clojure.lang Reflector) (java.text Normalizer Normalizer$Form) @@ -989,3 +989,33 @@ (transient {}) m))] (with-meta ret (meta m))))) + +(defn string-byte-count + "Number of bytes in a string using UTF-8 encoding." + [s] + #?(:clj (count (.getBytes (str s) "UTF-8")) + :cljs (.. (js/TextEncoder.) (encode s) -length))) + +#?(:clj + (defn ^:private string-character-at + [s i] + (str (.charAt ^String s i)))) + +(defn truncate-string-to-byte-count + "Truncate string `s` to `max-length-bytes` UTF-8 bytes (as opposed to truncating to some number of *characters*)." + [s max-length-bytes] + #?(:clj + (loop [i 0, cumulative-byte-count 0] + (cond + (= cumulative-byte-count max-length-bytes) (subs s 0 i) + (> cumulative-byte-count max-length-bytes) (subs s 0 (dec i)) + (>= i (count s)) s + :else (recur (inc i) + (long (+ + cumulative-byte-count + (string-byte-count (string-character-at s i))))))) + + :cljs + (let [buf (js/Uint8Array. max-length-bytes) + result (.encodeInto (js/TextEncoder.) s buf)] ;; JS obj {read: chars_converted, write: bytes_written} + (subs s 0 (.-read result))))) diff --git a/test/metabase/lib/util_test.cljc b/test/metabase/lib/util_test.cljc index 3b559552daa..15ae78d07ad 100644 --- a/test/metabase/lib/util_test.cljc +++ b/test/metabase/lib/util_test.cljc @@ -1,13 +1,13 @@ (ns metabase.lib.util-test (:require #?@(:cljs ([metabase.test-runner.assert-exprs.approximately-equal])) - [clojure.string :as str] [clojure.test :refer [are deftest is testing]] [metabase.lib.core :as lib] [metabase.lib.equality :as lib.equality] [metabase.lib.test-metadata :as meta] [metabase.lib.test-util :as lib.tu] - [metabase.lib.util :as lib.util])) + [metabase.lib.util :as lib.util] + [metabase.util :as u])) #?(:cljs (comment metabase.test-runner.assert-exprs.approximately-equal/keep-me)) @@ -223,46 +223,10 @@ "0601246074" "00000001" "2915035893" "00000000")) -(deftest ^:parallel truncate-string-to-byte-count-test - (letfn [(truncate-string-to-byte-count [s byte-length] - (let [truncated (#'lib.util/truncate-string-to-byte-count s byte-length)] - (is (<= (#'lib.util/string-byte-count truncated) byte-length)) - (is (str/starts-with? s truncated)) - truncated))] - (doseq [[s max-length->expected] {"12345" - {1 "1" - 2 "12" - 3 "123" - 4 "1234" - 5 "12345" - 6 "12345" - 10 "12345"} - - "가나다ë¼" - {1 "" - 2 "" - 3 "ê°€" - 4 "ê°€" - 5 "ê°€" - 6 "가나" - 7 "가나" - 8 "가나" - 9 "가나다" - 10 "가나다" - 11 "가나다" - 12 "가나다ë¼" - 13 "가나다ë¼" - 15 "가나다ë¼" - 20 "가나다ë¼"}} - [max-length expected] max-length->expected] - (testing (pr-str (list `lib.util/truncate-string-to-byte-count s max-length)) - (is (= expected - (truncate-string-to-byte-count s max-length))))))) - (deftest ^:parallel truncate-alias-test (letfn [(truncate-alias [s max-bytes] (let [truncated (lib.util/truncate-alias s max-bytes)] - (is (<= (#'lib.util/string-byte-count truncated) max-bytes)) + (is (<= (u/string-byte-count truncated) max-bytes)) truncated))] (doseq [[s max-bytes->expected] { ;; 20-character plain ASCII string "01234567890123456789" diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj index 126cb76e7b1..ad279e12f85 100644 --- a/test/metabase/upload_test.clj +++ b/test/metabase/upload_test.clj @@ -366,6 +366,39 @@ (mt/with-model-cleanup [:model/Table] (do-with-upload-table! ~create-table-expr (fn [~table-binding] ~@body))))) +(deftest create-from-csv-display-name-test + (mt/test-drivers (mt/normal-drivers-with-feature :uploads) + (let [test-names-match (fn [table expected] + (is (= expected + (:display_name table) + (:name (table->card table)))))] + (testing "The table's display name and model's name is humanized from the CSV file name" + (let [csv-file-prefix "some_FILE-prefix"] + (with-upload-table! [table (card->table (upload-example-csv! :csv-file-prefix csv-file-prefix))] + (test-names-match table "Some File Prefix")))) + (testing "Unicode characters are preserved in the display name, even when the table name is slugified" + (let [csv-file-prefix "出色的"] + (with-redefs [upload/strictly-monotonic-now (constantly #t "2024-06-28T00:00:00")] + (with-upload-table! [table (card->table (upload-example-csv! :csv-file-prefix csv-file-prefix))] + (test-names-match table "出色的") + (is (= (ddl.i/format-name driver/*driver* "%e5%87%ba%e8%89%b2%e7%9a%84_20240628000000") + (:name table))))))) + (testing "The names should be truncated to the right size" + ;; we can assume app DBs use UTF-8 encoding (metabase#11753) + (let [max-bytes 50] + (with-redefs [; redef this because the UNIX filename limit is 255 bytes, so we can't test it in CI + upload/max-bytes (constantly max-bytes)] + (doseq [c ["a" "出"]] + (let [long-csv-file-prefix (apply str (repeat (inc max-bytes) c)) + char-size (count (.getBytes c "UTF-8"))] + (with-upload-table! [table (card->table (upload-example-csv! :csv-file-prefix long-csv-file-prefix))] + (testing "The card name should be truncated to max bytes with UTF-8 encoding" + (is (= (str/capitalize (apply str (repeat (quot max-bytes char-size) c))) + (:name (table->card table))))) + (testing "The display name should be truncated to the max bytes with UTF-8 encoding" + (is (= (str/capitalize (apply str (repeat (quot max-bytes char-size) c))) + (:display_name table))))))))))))) + (deftest create-from-csv-table-name-test (testing "Can upload two files with the same name" (mt/test-drivers (mt/normal-drivers-with-feature :uploads) @@ -375,11 +408,15 @@ (with-upload-table! [table-2 (card->table (upload-example-csv! :csv-file-prefix csv-file-prefix))] (mt/with-current-user (mt/user->id :crowberto) + (testing "both tables have the same display name" + (is (= "Some File Prefix" + (:display_name table-1) + (:display_name table-2)) (testing "tables are different between the two uploads" (is (some? (:id table-1))) (is (some? (:id table-2))) (is (not= (:id table-1) - (:id table-2))))))))))) + (:id table-2))))))))))))) (defn- query [db-id source-table] (qp/process-query {:database db-id diff --git a/test/metabase/util_test.cljc b/test/metabase/util_test.cljc index 2c8d44858f4..c48bc949b0c 100644 --- a/test/metabase/util_test.cljc +++ b/test/metabase/util_test.cljc @@ -1,16 +1,16 @@ (ns ^:mb/once metabase.util-test "Tests for functions in `metabase.util`." (:require - #?@(:clj [[metabase.test :as mt]]) + #?@(:clj [[metabase.test :as mt] + [malli.generator :as mg]]) + [clojure.string :as str] [clojure.test :refer [are deftest is testing]] [clojure.test.check.clojure-test :refer [defspec]] [clojure.test.check.generators :as gen] [clojure.test.check.properties :as prop] [flatland.ordered.map :refer [ordered-map]] - #_:clj-kondo/ignore - [malli.generator :as mg] [metabase.util :as u]) - #?(:clj (:import [java.time Month DayOfWeek]))) + #?(:clj (:import [java.time DayOfWeek Month]))) #?(:clj (set! *warn-on-reflection* true)) @@ -529,3 +529,39 @@ (u/case-enum Month/JANUARY Month/JANUARY 1 DayOfWeek/SUNDAY 2)))))) + +(deftest ^:parallel truncate-string-to-byte-count-test + (letfn [(truncate-string-to-byte-count [s byte-length] + (let [truncated (#'u/truncate-string-to-byte-count s byte-length)] + (is (<= (#'u/string-byte-count truncated) byte-length)) + (is (str/starts-with? s truncated)) + truncated))] + (doseq [[s max-length->expected] {"12345" + {1 "1" + 2 "12" + 3 "123" + 4 "1234" + 5 "12345" + 6 "12345" + 10 "12345"} + + "가나다ë¼" + {1 "" + 2 "" + 3 "ê°€" + 4 "ê°€" + 5 "ê°€" + 6 "가나" + 7 "가나" + 8 "가나" + 9 "가나다" + 10 "가나다" + 11 "가나다" + 12 "가나다ë¼" + 13 "가나다ë¼" + 15 "가나다ë¼" + 20 "가나다ë¼"}} + [max-length expected] max-length->expected] + (testing (pr-str (list `lib.util/truncate-string-to-byte-count s max-length)) + (is (= expected + (truncate-string-to-byte-count s max-length))))))) -- GitLab