diff --git a/src/metabase/db/metadata_queries.clj b/src/metabase/db/metadata_queries.clj index b493b4b6abf9a8018543b7e3e37f8bd5e5802948..7d9929d67163a9e78ba1776a00d15c2f27927100 100644 --- a/src/metabase/db/metadata_queries.clj +++ b/src/metabase/db/metadata_queries.clj @@ -89,17 +89,32 @@ inferring special types and what-not; we don't want to scan millions of values at any rate." 10000) +(def TableRowsSampleOptions + "Schema for `table-rows-sample` options" + (s/maybe {(s/optional-key :truncation-size) s/Int})) + (s/defn table-rows-sample :- (s/maybe si/TableSample) "Run a basic MBQL query to fetch a sample of rows belonging to a Table." {:style/indent 1} - [table :- si/TableInstance, fields :- [si/FieldInstance]] - (let [results ((resolve 'metabase.query-processor/process-query) - {:database (:db_id table) - :type :query - :query {:source-table (u/get-id table) - :fields (vec (for [field fields] - [:field-id (u/get-id field)])) - :limit max-sample-rows} - :middleware {:format-rows? false - :skip-results-metadata? true}})] - (get-in results [:data :rows]))) + ([table :- si/TableInstance, fields :- [si/FieldInstance]] + (table-rows-sample table fields nil)) + ([table :- si/TableInstance, fields :- [si/FieldInstance] + {:keys [truncation-size] :as _opts} :- TableRowsSampleOptions] + (let [text-fields (filter (comp #{:type/Text} :base_type) fields) + field->expressions (when truncation-size + (into {} (for [field text-fields] + [field [(str (gensym "substring")) + [:substring [:field-id (u/get-id field)] 1 truncation-size]]]))) + results ((resolve 'metabase.query-processor/process-query) + {:database (:db_id table) + :type :query + :query {:source-table (u/get-id table) + :expressions (into {} (vals field->expressions)) + :fields (vec (for [field fields] + (if-let [[expression-name _] (get field->expressions field)] + [:expression expression-name] + [:field-id (u/get-id field)]))) + :limit max-sample-rows} + :middleware {:format-rows? false + :skip-results-metadata? true}})] + (get-in results [:data :rows])))) diff --git a/src/metabase/driver/postgres.clj b/src/metabase/driver/postgres.clj index 0214c3c4a1f0d01825e250118cd405a4edaefbae..a5a77003e4d7da07343ca23877216a6dc88384ee 100644 --- a/src/metabase/driver/postgres.clj +++ b/src/metabase/driver/postgres.clj @@ -94,14 +94,14 @@ (assoc driver.common/default-additional-options-details :placeholder "prepareThreshold=0")])) -(defn- enum-types [driver database] +(defn- enum-types [_driver database] (set - (map (comp keyword :typname) - (jdbc/query (sql-jdbc.conn/connection-details->spec driver (:details database)) - [(str "SELECT DISTINCT t.typname " - "FROM pg_enum e " - "LEFT JOIN pg_type t " - " ON t.oid = e.enumtypid")])))) + (map (comp keyword :typname) + (jdbc/query (sql-jdbc.conn/db->pooled-connection-spec database) + [(str "SELECT DISTINCT t.typname " + "FROM pg_enum e " + "LEFT JOIN pg_type t " + " ON t.oid = e.enumtypid")])))) (def ^:private ^:dynamic *enum-types* nil) @@ -220,7 +220,7 @@ :box :type/* :bpchar :type/Text ; "blank-padded char" is the internal name of "character" :bytea :type/* ; byte array - :cidr :type/Text ; IPv4/IPv6 network address + :cidr :type/Structured ; IPv4/IPv6 network address :circle :type/* :citext :type/Text ; case-insensitive text :date :type/Date @@ -234,11 +234,11 @@ :int4 :type/Integer :int8 :type/BigInteger :interval :type/* ; time span - :json :type/Text - :jsonb :type/Text + :json :type/Structured + :jsonb :type/Structured :line :type/* :lseg :type/* - :macaddr :type/Text + :macaddr :type/Structured :money :type/Decimal :numeric :type/Decimal :path :type/* @@ -262,7 +262,7 @@ :uuid :type/UUID :varbit :type/* :varchar :type/Text - :xml :type/Text + :xml :type/Structured (keyword "bit varying") :type/* (keyword "character varying") :type/Text (keyword "double precision") :type/Float @@ -281,8 +281,10 @@ [_ database-type _] ;; this is really, really simple right now. if its postgres :json type then it's :type/SerializedJSON special-type (case database-type - "json" :type/SerializedJSON - "inet" :type/IPAddress + "json" :type/SerializedJSON + "jsonb" :type/SerializedJSON + "xml" :type/XML + "inet" :type/IPAddress nil)) (def ^:private ssl-params diff --git a/src/metabase/driver/sql_jdbc/connection.clj b/src/metabase/driver/sql_jdbc/connection.clj index 1a16452b1d59bac84100e8cafe9bb4c82f9a8f48..0fdf0441022044582269f03918d751e69831b906 100644 --- a/src/metabase/driver/sql_jdbc/connection.clj +++ b/src/metabase/driver/sql_jdbc/connection.clj @@ -104,7 +104,9 @@ more than one pool is ever open for a single database." [database-id pool-spec-or-nil] {:pre [(integer? database-id)]} - (let [[old-id->pool] (swap-vals! database-id->connection-pool assoc database-id pool-spec-or-nil)] + (let [[old-id->pool] (if pool-spec-or-nil + (swap-vals! database-id->connection-pool assoc database-id pool-spec-or-nil) + (swap-vals! database-id->connection-pool dissoc database-id))] ;; if we replaced a different pool with the new pool that is different from the old one, destroy the old pool (when-let [old-pool-spec (get old-id->pool database-id)] (when-not (identical? old-pool-spec pool-spec-or-nil) diff --git a/src/metabase/sync/analyze/fingerprint.clj b/src/metabase/sync/analyze/fingerprint.clj index a3e13ee4b42dca5ee4e6e8419a9f93210ed59485..57e615d18519069293573fee283b5ee88b885688 100644 --- a/src/metabase/sync/analyze/fingerprint.clj +++ b/src/metabase/sync/analyze/fingerprint.clj @@ -40,6 +40,12 @@ :updated-fingerprints 0 :fingerprints-attempted fields-count}) +(def truncation-size + "The maximum size of :type/Text to be selected from the database in `table-rows-sample`. In practice we see large + text blobs and want to balance taking enough for distinct counts and but not so much that we risk out of memory + issues when syncing." + 1234) + (s/defn ^:private fingerprint-table! [table :- i/TableInstance, fields :- [i/FieldInstance]] (transduce identity @@ -60,7 +66,7 @@ (update count-info :updated-fingerprints inc)))) (empty-stats-map (count fingerprints)) (map vector fields fingerprints)))) - (metadata-queries/table-rows-sample table fields))) + (metadata-queries/table-rows-sample table fields {:truncation-size truncation-size}))) ;;; +----------------------------------------------------------------------------------------------------------------+ ;;; | WHICH FIELDS NEED UPDATED FINGERPRINTS? | @@ -138,6 +144,7 @@ [:not (mdb/isa :special_type :type/PK)] [:= :special_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] + [:not= :base_type "type/Structured"] (cons :or (versions-clauses))]}) ([table :- i/TableInstance] diff --git a/src/metabase/sync/analyze/fingerprint/fingerprinters.clj b/src/metabase/sync/analyze/fingerprint/fingerprinters.clj index dad91feef5c749a33d23225c6b78f7d1f1b31745..887213f4f4f0f61fed4f553851ec68972a59c34a 100644 --- a/src/metabase/sync/analyze/fingerprint/fingerprinters.clj +++ b/src/metabase/sync/analyze/fingerprint/fingerprinters.clj @@ -1,7 +1,6 @@ (ns metabase.sync.analyze.fingerprint.fingerprinters "Non-identifying fingerprinters for various field types." (:require [bigml.histogram.core :as hist] - [cheshire.core :as json] [java-time :as t] [kixi.stats [core :as stats] @@ -225,10 +224,20 @@ :q3 q3))))) (defn- valid-serialized-json? - "Is x a serialized JSON dictionary or array." + "Is x a serialized JSON dictionary or array. Hueristically recognize maps and arrays. Uses the following strategies: + - leading character {: assume valid JSON + - leading character [: assume valid json unless its of the form [ident] where ident is not a boolean." [x] (u/ignore-exceptions - ((some-fn map? sequential?) (json/parse-string x)))) + (when (and x (string? x)) + (let [matcher (case (first x) + \[ (fn bracket-matcher [s] + (cond (re-find #"^\[\s*(?:true|false)" s) true + (re-find #"^\[\s*[a-zA-Z]" s) false + :else true)) + \{ (constantly true) + (constantly false))] + (matcher x))))) (deffingerprinter :type/Text ((map str) ; we cast to str to support `field-literal` type overwriting: diff --git a/src/metabase/types.clj b/src/metabase/types.clj index c5793816f6f47a1aadfa1fdc96f2dde1ba282559..397c08a62b78baca837c981245a2e9e55919c9d3 100644 --- a/src/metabase/types.clj +++ b/src/metabase/types.clj @@ -158,6 +158,12 @@ (derive :type/Country :type/Address) (derive :type/ZipCode :type/Address) +;;; Structured + +(derive :type/Structured :type/*) +(derive :type/SerializedJSON :type/Structured) +(derive :type/XML :type/Structured) + ;;; Legacy Special Types. These will hopefully be going away in the future when we add columns like `:is_pk` and ;;; `:cardinality` diff --git a/src/metabase/util/ssh.clj b/src/metabase/util/ssh.clj index 491177a65bfda7e5d549a81884ef250c90f3a43d..8d04c3019ba7a7f503cb092ff855283855ace1f3 100644 --- a/src/metabase/util/ssh.clj +++ b/src/metabase/util/ssh.clj @@ -2,12 +2,16 @@ (:require [clojure.tools.logging :as log] [metabase.util :as u]) (:import java.io.ByteArrayInputStream + java.util.concurrent.TimeUnit org.apache.sshd.client.future.ConnectFuture org.apache.sshd.client.session.ClientSession org.apache.sshd.client.session.forward.PortForwardingTracker org.apache.sshd.client.SshClient [org.apache.sshd.common.config.keys FilePasswordProvider FilePasswordProvider$ResourceDecodeResult] - org.apache.sshd.common.session.SessionHolder + [org.apache.sshd.common.session + SessionHeartbeatController + SessionHeartbeatController$HeartbeatType + SessionHolder] org.apache.sshd.common.util.GenericUtils org.apache.sshd.common.util.io.resource.AbstractIoResource org.apache.sshd.common.util.net.SshdSocketAddress @@ -50,6 +54,9 @@ session (doto ^ClientSession (.getSession conn-status) (maybe-add-tunnel-password! tunnel-pass) (maybe-add-tunnel-private-key! tunnel-private-key tunnel-private-key-passphrase) + (.setSessionHeartbeat SessionHeartbeatController$HeartbeatType/IGNORE + TimeUnit/SECONDS + 180) (.. auth (verify default-ssh-timeout))) tracker (.createLocalPortForwardingTracker session (SshdSocketAddress. "" 0) diff --git a/test/metabase/db/metadata_queries_test.clj b/test/metabase/db/metadata_queries_test.clj index 15aac91fab9d407e6bc5d829c4014931d04bc0e6..9a732f2724336d89f9cb7769cb5a0cfaf1047fe8 100644 --- a/test/metabase/db/metadata_queries_test.clj +++ b/test/metabase/db/metadata_queries_test.clj @@ -3,7 +3,8 @@ [metabase [models :refer [Field Table]] [test :as mt]] - [metabase.db.metadata-queries :as metadata-queries])) + [metabase.db.metadata-queries :as metadata-queries] + [metabase.driver.sql-jdbc.test-util :as sql-jdbc.tu])) ;; Redshift tests are randomly failing -- see https://github.com/metabase/metabase/issues/2767 (defn- metadata-queries-test-drivers [] @@ -31,3 +32,25 @@ (mt/test-drivers (metadata-queries-test-drivers) (is (= [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] (map int (metadata-queries/field-distinct-values (Field (mt/id :checkins :user_id)))))))) + +(deftest table-rows-sample-test + (let [expected [["20th Century Cafe"] + ["25°"] + ["33 Taps"] + ["800 Degrees Neapolitan Pizzeria"] + ["BCD Tofu House"]] + table (Table (mt/id :venues)) + fields [(Field (mt/id :venues :name))] + fetch! #(->> (metadata-queries/table-rows-sample table fields (when % {:truncation-size %})) + ;; since order is not guaranteed do some sorting here so we always get the same results + (sort-by first) + (take 5))] + (is (= :type/Text (-> fields first :base_type))) + (mt/test-drivers (sql-jdbc.tu/sql-jdbc-drivers) + (is (= expected (fetch! nil))) + (testing "truncates text fields (see #13288)" + (doseq [size [1 4 80]] + (is (= (mapv (fn [[s]] [(subs (or s "") 0 (min size (count s)))]) + expected) + (fetch! size)) + "Did not truncate a text field")))))) diff --git a/test/metabase/driver/postgres_test.clj b/test/metabase/driver/postgres_test.clj index 039acfa8713c7b2df87b6de2286c809ce78c57b2..225586a4f71db0edd0113deca2e6d02fa010a2d9 100644 --- a/test/metabase/driver/postgres_test.clj +++ b/test/metabase/driver/postgres_test.clj @@ -370,7 +370,8 @@ (create-enums-db!) (mt/with-temp Database [database {:engine :postgres, :details (enums-test-db-details)}] (sync-metadata/sync-db-metadata! database) - (f database))) + (f database) + (#'sql-jdbc.conn/set-pool! (u/id database) nil))) (deftest enums-test (mt/test-driver :postgres diff --git a/test/metabase/driver/sql_jdbc/connection_test.clj b/test/metabase/driver/sql_jdbc/connection_test.clj index 5e187620dea47b85009f909a20cadbc771435824..957054571090ddb04be676d434a4c65dc3ae6fd2 100644 --- a/test/metabase/driver/sql_jdbc/connection_test.clj +++ b/test/metabase/driver/sql_jdbc/connection_test.clj @@ -1,6 +1,14 @@ (ns metabase.driver.sql-jdbc.connection-test - (:require [expectations :refer [expect]] + (:require [clojure.java.jdbc :as jdbc] + [clojure.test :refer :all] + [expectations :refer [expect]] + [metabase + [db :as mdb] + [test :as mt] + [util :as u]] + [metabase.driver.sql-jdbc.connection :as sql-jdbc.conn] [metabase.driver.util :as driver.u] + [metabase.models.database :refer [Database]] [metabase.test.data :as data] [metabase.test.util.log :as tu.log])) @@ -28,3 +36,37 @@ false (tu.log/suppress-output (driver.u/can-connect-with-details? :postgres {:host "google.com", :port 80}))) + +(deftest db->pooled-connection-spec-test + (mt/test-driver :h2 + (testing "creating and removing specs works" + ;; need to create a new, nonexistent h2 db + (binding [mdb/*allow-potentailly-unsafe-connections* true] + (let [destroyed? (atom false) + original-destroy @#'sql-jdbc.conn/destroy-pool! + spec (sql-jdbc.conn/connection-details->spec + :h2 + (mt/dbdef->connection-details :h2 :server {:database-name "connection_test"}))] + (with-redefs [sql-jdbc.conn/destroy-pool! (fn [id destroyed-spec] + (original-destroy id destroyed-spec) + (reset! destroyed? true))] + (jdbc/with-db-connection [conn spec] + (jdbc/execute! spec ["CREATE TABLE birds (name varchar)"]) + (jdbc/execute! spec ["INSERT INTO birds values ('rasta'),('lucky')"]) + (mt/with-temp Database [database {:engine :h2 :details spec}] + (testing "database id is not in our connection map initially" + ;; deref'ing a var to get the atom. looks weird + (is (not (contains? @@#'sql-jdbc.conn/database-id->connection-pool + (u/id database))))) + (testing "when getting a pooled connection it is now in our connection map" + (let [stored-spec (sql-jdbc.conn/db->pooled-connection-spec database) + birds (jdbc/query stored-spec ["SELECT * FROM birds"])] + (is (seq birds)) + (is (contains? @@#'sql-jdbc.conn/database-id->connection-pool + (u/id database))))) + (testing "and is no longer in our connection map after cleanup" + (#'sql-jdbc.conn/set-pool! (u/id database) nil) + (is (not (contains? @@#'sql-jdbc.conn/database-id->connection-pool + (u/id database))))) + (testing "the pool has been destroyed" + (is @destroyed?)))))))))) diff --git a/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj b/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj index e232674348a8a5c38450325522b7eb05dfa12780..62b11aaa156bd887aa3e4e6546f178630f0837b9 100644 --- a/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj +++ b/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj @@ -1,7 +1,8 @@ (ns metabase.sync.analyze.fingerprint.fingerprinters-test - (:require [clojure.test :refer :all] + (:require [cheshire.core :as json] + [clojure.test :refer :all] [metabase.models.field :as field :refer [Field]] - [metabase.sync.analyze.fingerprint.fingerprinters :refer :all] + [metabase.sync.analyze.fingerprint.fingerprinters :as f] [metabase.test :as mt] [schema.core :as s] [toucan.db :as db])) @@ -19,7 +20,7 @@ :type {:type/DateTime {:earliest "2013-01-01" :latest "2018-01-01"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/DateTime})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/DateTime})) [#t "2013" nil #t "2018" nil nil #t "2015"]))) (testing "handle ChronoLocalDateTime" (is (= {:global {:distinct-count 2 @@ -27,7 +28,7 @@ :type {:type/DateTime {:earliest "2013-01-01T20:04:00Z" :latest "2018-01-01T04:04:00Z"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) [(java.time.LocalDateTime/of 2013 01 01 20 04 0 0) (java.time.LocalDateTime/of 2018 01 01 04 04 0 0)])))) (testing "handle comparing explicit Instant with ChronoLocalDateTime" @@ -36,7 +37,7 @@ :type {:type/DateTime {:earliest "2007-12-03T10:15:30Z" :latest "2018-01-01T04:04:00Z"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) [(java.time.Instant/parse "2007-12-03T10:15:30.00Z") (java.time.LocalDateTime/of 2018 01 01 04 04 0 0)])))) (testing "mixing numbers and strings" @@ -45,7 +46,7 @@ :type {:type/DateTime {:earliest "1970-01-01T00:00:01.234Z" :latest "2007-12-03T10:15:30Z"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) ["2007-12-03T10:15:30.00Z" 1234])))) (testing "nil temporal values" (is (= {:global {:distinct-count 1 @@ -53,7 +54,7 @@ :type {:type/DateTime {:earliest nil :latest nil}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/DateTime})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/DateTime})) (repeat 10 nil))))) (testing "handle all supported types" (is (= {:global {:distinct-count 5 @@ -61,7 +62,7 @@ :type {:type/DateTime {:earliest "1970-01-01T00:00:01.234Z" :latest "2020-07-06T20:25:33.36Z"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Temporal})) [(java.time.LocalDateTime/of 2013 01 01 20 04 0 0) ; LocalDateTime 1234 ; int 1594067133360 ; long @@ -75,8 +76,8 @@ :type {:type/DateTime {:earliest "2013-01-01" :latest "2018-01-01"}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/DateTime - :special_type :type/FK})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/DateTime + :special_type :type/FK})) [#t "2013" #t "2018" #t "2015"]))))) (deftest fingerprint-numeric-values-test @@ -89,7 +90,7 @@ :q3 2.75 :sd 1.0}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Number})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Number})) [1.0 2.0 3.0]))) (testing "We should robustly survive weird values such as NaN, Infinity, and nil" (is (= {:global {:distinct-count 7 @@ -101,7 +102,7 @@ :q3 2.75 :sd 1.0}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Number})) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Number})) [1.0 2.0 3.0 Double/NaN Double/POSITIVE_INFINITY Double/NEGATIVE_INFINITY nil nil]))))) (deftest fingerprint-string-values-test @@ -112,8 +113,18 @@ :percent-email 0.0 :average-length 6.4}}} (transduce identity - (fingerprinter (field/map->FieldInstance {:base_type :type/Text})) - ["metabase" "more" "like" "metabae" "[1, 2, 3]"])))) + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Text})) + ["metabase" "more" "like" "metabae" "[1, 2, 3]"]))) + (let [truncated-json (subs (json/generate-string (vec (range 50))) 0 30)] + (is (= {:global {:distinct-count 5 + :nil% 0.0} + :type {:type/Text {:percent-json 0.2 + :percent-url 0.0 + :percent-email 0.0 + :average-length 10.6}}} + (transduce identity + (f/fingerprinter (field/map->FieldInstance {:base_type :type/Text})) + ["metabase" "more" "like" "metabae" truncated-json]))))) (deftest fingerprints-in-db-test (mt/test-drivers (mt/normal-drivers) @@ -142,3 +153,15 @@ :sd (s/pred #(< 0.76 % 0.78) "between 0.76 and 0.78") :avg (s/eq 2.03)}}} (db/select-one-field :fingerprint Field :id (mt/id :venues :price)))))))) + +(deftest valid-serialized-json?-test + (testing "recognizes substrings of json" + (let [partial-json (fn [x] + (let [json (json/generate-string x)] + (subs json 0 (/ (count json) 2))))] + (is (#'f/valid-serialized-json? (partial-json [1 2 3]))) + (is (#'f/valid-serialized-json? (partial-json {:a 1 :b 2}))) + (is (#'f/valid-serialized-json? (partial-json [{:a 2}]))) + (is (#'f/valid-serialized-json? (partial-json [true true]))) + (is (not (#'f/valid-serialized-json? "bob"))) + (is (not (#'f/valid-serialized-json? "[bob]")))))) diff --git a/test/metabase/sync/analyze/fingerprint_test.clj b/test/metabase/sync/analyze/fingerprint_test.clj index 453112f73a3c63af404bc694465e705af90d320f..99e322d850902c2dc294c30b78d8c8ae6782b429 100644 --- a/test/metabase/sync/analyze/fingerprint_test.clj +++ b/test/metabase/sync/analyze/fingerprint_test.clj @@ -4,6 +4,7 @@ [expectations :refer :all] [metabase [db :as mdb] + [test :as mt] [util :as u]] [metabase.db.metadata-queries :as metadata-queries] [metabase.models @@ -40,6 +41,7 @@ [:not (mdb/isa :special_type :type/PK)] [:= :special_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] + [:not= :base_type "type/Structured"] [:or [:and [:< :fingerprint_version 1] @@ -55,6 +57,7 @@ [:not (mdb/isa :special_type :type/PK)] [:= :special_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] + [:not= :base_type "type/Structured"] [:or [:and [:< :fingerprint_version 2] @@ -77,6 +80,7 @@ [:not (mdb/isa :special_type :type/PK)] [:= :special_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] + [:not= :base_type "type/Structured"] [:or [:and [:< :fingerprint_version 2] @@ -99,6 +103,7 @@ [:not (mdb/isa :special_type :type/PK)] [:= :special_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] + [:not= :base_type "type/Structured"] [:or [:and [:< :fingerprint_version 4] @@ -239,3 +244,14 @@ (with-redefs [fingerprint/fingerprint-table! (fn [_] (throw (Exception. "this should not be called!")))] (is (= (fingerprint/empty-stats-map 0) (fingerprint/fingerprint-fields-for-db! fake-db [(Table (data/id :venues))] (fn [_ _])))))))) + +(deftest fingerprinting-test + (testing "fingerprinting truncates text fields (see #13288)" + (doseq [size [4 8 10]] + (let [table (Table (mt/id :categories)) + field (Field (mt/id :categories :name))] + (with-redefs [fingerprint/truncation-size size] + (#'fingerprint/fingerprint-table! table [field]) + (let [field' (db/select-one [Field :fingerprint] :id (u/id field)) + fingerprinted-size (get-in field' [:fingerprint :type :type/Text :average-length])] + (is (<= fingerprinted-size size))))))))