From 2081799850d246014379e6dc813bccab95a425bc Mon Sep 17 00:00:00 2001 From: Cal Herries <39073188+calherries@users.noreply.github.com> Date: Wed, 24 Jul 2024 00:19:30 +0300 Subject: [PATCH] Skip syncing field values for oracle CLOB columns (#44654) --- docs/developers-guide/driver-changelog.md | 19 ++++- .../oracle/src/metabase/driver/oracle.clj | 2 +- .../test/metabase/driver/oracle_test.clj | 12 ++- src/metabase/models/field_values.clj | 5 +- src/metabase/sync/analyze/fingerprint.clj | 4 +- src/metabase/types.cljc | 30 ++++--- test/metabase/models/field_values_test.clj | 78 +++++++++++++------ .../sync/analyze/fingerprint_test.clj | 8 +- 8 files changed, 108 insertions(+), 50 deletions(-) diff --git a/docs/developers-guide/driver-changelog.md b/docs/developers-guide/driver-changelog.md index edd1cf14870..357a5120e65 100644 --- a/docs/developers-guide/driver-changelog.md +++ b/docs/developers-guide/driver-changelog.md @@ -50,9 +50,20 @@ title: Driver interface changelog - `metabase.query-processor.compile/compile-and-splice-parameters` has been removed; replace usages with `metabase.query-processor.compile/compile-with-inline-parameters`. -- The three-arity of `metabase.driver.sql.query-processor/format-honeysql` (which had an additional parameter for - Honey SQL version) has been removed; replace all usages with the two-arity version. Honey SQL 2 has been the only - supported version since Metabase 0.49.0. + - The three-arity of `metabase.driver.sql.query-processor/format-honeysql` (which had an additional parameter for + Honey SQL version) has been removed; replace all usages with the two-arity version. Honey SQL 2 has been the only + supported version since Metabase 0.49.0. + +## Metabase 0.50.16 + + - `:type/fingerprinting-unsupported` has been added in the `metabase.types` namespace. Similar to + `:type/field-values-unsupported` for field values scanning, it is used to determine whether a specific field + should have its fingerprint computed or not. At the time of writing that logic is performed in + `metabase.sync.analyze.fingerprint/fields-to-fingerprint-base-clause`. + + - `:type/Large` has been also been added in the `metabase.types` namespace. It can be used by driver authors to + signal that a specific field contains large enough values to skip fingerprinting or field values scanning. It + can be used for other purposes as well in the future. Examples include Oracle CLOB or Postgres JSON columns. - The `:skip-drop-db?` option sometimes passed to methods for loading and destroying test data is no longer passed, you can remove code that checks for it. Test data code is now better about avoiding unneeded/redundant calls to @@ -107,7 +118,7 @@ title: Driver interface changelog - The `:foreign-keys` driver feature has been removed. `:metadata/keys-constraints` should be used for drivers that support foreign key relationships reporting during sync. Implicit joins now depend on the `:left-join` feature instead. The default value is true for `:sql` based drivers. All join features are now enabled for `:sql` based drivers - by default. Previously, those depended on the `:foreign-keys` feature. If your driver supports `:left-join`, + by default. Previously, those depended on the `:foreign-keys` feature. If your driver supports `:left-join`, the test for remapping and implicit joins will be now executed. - The`:parameterized-sql` driver feature has been added to distinguish drivers that don't support parametrized SQL in tests. Currently, this is disabled only for `:sparksql`. diff --git a/modules/drivers/oracle/src/metabase/driver/oracle.clj b/modules/drivers/oracle/src/metabase/driver/oracle.clj index 16a5495849e..38a70f0bd8c 100644 --- a/modules/drivers/oracle/src/metabase/driver/oracle.clj +++ b/modules/drivers/oracle/src/metabase/driver/oracle.clj @@ -57,7 +57,7 @@ [#"BLOB" :type/*] [#"RAW" :type/*] [#"CHAR" :type/Text] - [#"CLOB" :type/Text] + [#"CLOB" :type/OracleCLOB] [#"DATE" :type/Date] [#"DOUBLE" :type/Float] ;; Expression filter type diff --git a/modules/drivers/oracle/test/metabase/driver/oracle_test.clj b/modules/drivers/oracle/test/metabase/driver/oracle_test.clj index b880c5596eb..4ce53890bed 100644 --- a/modules/drivers/oracle/test/metabase/driver/oracle_test.clj +++ b/modules/drivers/oracle/test/metabase/driver/oracle_test.clj @@ -14,7 +14,6 @@ [metabase.driver.util :as driver.u] [metabase.lib.test-util :as lib.tu] [metabase.models.database :refer [Database]] - [metabase.models.field :refer [Field]] [metabase.models.table :refer [Table]] [metabase.public-settings.premium-features :as premium-features] [metabase.query-processor :as qp] @@ -289,9 +288,14 @@ (execute! "CREATE TABLE \"%s\".\"messages\" (\"id\" %s, \"message\" CLOB)" username pk-type) (execute! "INSERT INTO \"%s\".\"messages\" (\"id\", \"message\") VALUES (1, 'Hello')" username) (execute! "INSERT INTO \"%s\".\"messages\" (\"id\", \"message\") VALUES (2, NULL)" username) - (t2.with-temp/with-temp [Table table {:schema username, :name "messages", :db_id (mt/id)} - Field id-field {:table_id (u/the-id table), :name "id", :base_type "type/Integer"} - Field _ {:table_id (u/the-id table), :name "message", :base_type "type/Text"}] + (sync/sync-database! (mt/db) {:scan :schema}) + (let [table (t2/select-one :model/Table :schema username, :name "messages", :db_id (mt/id)) + id-field (t2/select-one :model/Field :table_id (u/the-id table), :name "id")] + (testing "The CLOB is synced as a text field" + (let [base-type (t2/select-one-fn :base_type :model/Field :table_id (u/the-id table), :name "message")] + ;; type/OracleCLOB is important for skipping fingerprinting and field values scanning #44109 + (is (= :type/OracleCLOB base-type)) + (is (isa? base-type :type/Text)))) (is (= [[1M "Hello"] [2M nil]] (mt/rows diff --git a/src/metabase/models/field_values.clj b/src/metabase/models/field_values.clj index 60dc879fe05..111ea918713 100644 --- a/src/metabase/models/field_values.clj +++ b/src/metabase/models/field_values.clj @@ -233,10 +233,7 @@ (boolean (and (not (contains? #{:retired :sensitive :hidden :details-only} (keyword visibility-type))) - (not (some #(isa? (keyword base-type) %) [:type/field-values-unsupported - :type/Structured - :type/Temporal - :type/Collection])) + (not (isa? (keyword base-type) :type/field-values-unsupported)) (not (= (keyword base-type) :type/*)) (#{:list :auto-list} (keyword has-field-values))))))) diff --git a/src/metabase/sync/analyze/fingerprint.clj b/src/metabase/sync/analyze/fingerprint.clj index 790323ac9a7..0965a9e7574 100644 --- a/src/metabase/sync/analyze/fingerprint.clj +++ b/src/metabase/sync/analyze/fingerprint.clj @@ -160,8 +160,8 @@ [:not (mdb.query/isa :semantic_type :type/PK)] [:= :semantic_type nil]] [:not-in :visibility_type ["retired" "sensitive"]] - [:not-in :base_type (-> (set (mapcat mdb.query/type-keyword->descendants [:type/Structured :type/Collection])) - (conj (u/qualified-name :type/*)))]]) + [:not-in :base_type (conj (mdb.query/type-keyword->descendants :type/fingerprint-unsupported) + (u/qualified-name :type/*))]]) (def ^:dynamic *refingerprint?* "Whether we are refingerprinting or doing the normal fingerprinting. Refingerprinting should get fields that already diff --git a/src/metabase/types.cljc b/src/metabase/types.cljc index 7d2ae98df5b..b0dd07fb775 100644 --- a/src/metabase/types.cljc +++ b/src/metabase/types.cljc @@ -66,6 +66,20 @@ (derive :entity/SubscriptionTable :entity/GenericTable) (derive :entity/EventTable :entity/GenericTable) +;;; Modifier Types + +;; `:type/field-values-unsupported` enables driver developers to opt out of field values calculation for specific +;; fields. For more details see the `driver-changelog.yml`, section `Metabase 0.50.0`. +(derive :type/field-values-unsupported :type/*) + +;; `:type/fingerprint-unsupported` enables driver developers to opt out of fingerprinting for specific +;; fields. +(derive :type/fingerprint-unsupported :type/*) + +;; `:type/Large` enables driver developers to signal that the field values can be relatively large and can use a lot of +;; memory. These types will be excluded from scanning and fingerprinting, and possibly other features in the future. +(derive :type/Large :type/field-values-unsupported) +(derive :type/Large :type/fingerprint-unsupported) ;;; Numeric Types @@ -168,9 +182,13 @@ (derive :type/PostgresEnum :type/Text) +(derive :type/OracleCLOB :type/Text) +(derive :type/OracleCLOB :type/Large) + ;;; DateTime Types (derive :type/Temporal :type/*) +(derive :type/Temporal :type/field-values-unsupported) (derive :type/Date :type/Temporal) ;; You could have Dates with TZ info but it's not supported by JSR-310 so we'll not worry about that for now. @@ -245,13 +263,7 @@ (derive :type/Boolean :type/*) (derive :type/DruidHyperUnique :type/*) - -;; `:type/field-values-unsupported` enables driver developers to opt out of field values calculation for specific -;; fields. For more details see the `driver-changelog.yml`, section `Metabase 0.50.0`. -(derive :type/field-values-unsupported :type/*) - (derive :type/DruidHyperUnique :type/field-values-unsupported) -(derive :type/DruidJSON :type/field-values-unsupported) ;;; The Snowflake `VARIANT` type is allowed to be anything, so just mark it as deriving from the core root types so ;;; we're allowed to use any sort of filter with it (whether it makes sense or not). See @@ -260,8 +272,7 @@ :type/Text :type/Temporal :type/Boolean - :type/Collection - :type/field-values-unsupported]] + :type/Collection]] (derive :type/SnowflakeVariant t)) ;;; Text-Like Types: Things that should be displayed as text for most purposes but that *shouldn't* support advanced @@ -278,7 +289,9 @@ ;;; Structured/Collections (derive :type/Collection :type/*) +(derive :type/Collection :type/Large) (derive :type/Structured :type/*) +(derive :type/Structured :type/Large) (derive :type/Dictionary :type/Collection) (derive :type/Array :type/Collection) @@ -287,7 +300,6 @@ (derive :type/JSON :type/Structured) (derive :type/JSON :type/Collection) -;; DruidJSON is specific -- it derives also :type/field-values-unsupported (derive :type/DruidJSON :type/JSON) ;; `:type/XML` -- an actual native XML data column diff --git a/test/metabase/models/field_values_test.clj b/test/metabase/models/field_values_test.clj index 20375ab6718..da878087991 100644 --- a/test/metabase/models/field_values_test.clj +++ b/test/metabase/models/field_values_test.clj @@ -22,6 +22,60 @@ [toucan2.tools.with-temp :as t2.with-temp]) (:import (clojure.lang ExceptionInfo))) +(def ^:private base-types-without-field-values + #{:type/* + :type/JSON + :type/Array + :type/DruidJSON + :type/Dictionary + :type/Structured + :type/Collection + :type/OracleCLOB + :type/SnowflakeVariant + :type/DruidHyperUnique + :type/TimeWithTZ + :type/TimeWithLocalTZ + :type/Time + :type/UpdatedTime + :type/Instant + :type/UpdatedDate + :type/JoinTimestamp + :type/DeletionTime + :type/CancelationDate + :type/CancelationTime + :type/DeletionDate + :type/DateTimeWithZoneID + :type/UpdatedTimestamp + :type/Birthdate + :type/Date + :type/SerializedJSON + :type/DateTimeWithZoneOffset + :type/Temporal + :type/CreationTimestamp + :type/Large + :type/JoinTime + :type/CreationTime + :type/DateTimeWithTZ + :type/JoinDate + :type/CancelationTimestamp + :type/CreationDate + :type/XML + :type/field-values-unsupported + :type/DeletionTimestamp + :type/TimeWithZoneOffset + :type/DateTime + :type/DateTimeWithLocalTZ + :type/Interval}) + +(deftest ^:parallel base-type-should-have-field-values-test + (doseq [base-type (conj (descendants :type/*) :type/*)] + (let [expected (not (contains? base-types-without-field-values base-type))] + (testing (str base-type "should " (when-not expected "not ") "have field values") + (is (= expected + (#'field-values/field-should-have-field-values? {:has_field_values :list + :visibility_type :normal + :base_type base-type}))))))) + (deftest ^:parallel field-should-have-field-values?-test (doseq [[group input->expected] {"Text and Category Fields" {{:has_field_values :list @@ -90,29 +144,7 @@ {:base_type :type/Time :has_field_values :list :visibility_type :normal} - false} - - "type/* fields should be excluded" - {{:has_field_values :list - :visibility_type :normal - :base_type :type/*} - false} - - "type/Structured fields should be excluded" - (into {} - (for [base-type (conj (descendants :type/Structured) :type/Structured)] - [{:has_field_values :list - :visibility_type :normal - :base_type base-type} - false])) - - "type/Collection fields should be excluded" - (into {} - (for [base-type (conj (descendants :type/Collection) :type/Collection)] - [{:has_field_values :list - :visibility_type :normal - :base_type base-type} - false]))} + false}} [input expected] input->expected] (testing (str group "\n") (testing (pr-str (list 'field-should-have-field-values? input)) diff --git a/test/metabase/sync/analyze/fingerprint_test.clj b/test/metabase/sync/analyze/fingerprint_test.clj index dbec7cda6d8..4555dfd60df 100644 --- a/test/metabase/sync/analyze/fingerprint_test.clj +++ b/test/metabase/sync/analyze/fingerprint_test.clj @@ -34,12 +34,14 @@ "type/Dictionary" "type/Array" "type/Collection" - "type/XML"} - ;; collection and structured subtypes never get fingerprinted + "type/XML" + "type/fingerprint-unsupported"} (comp (mapcat descendants) (map u/qualified-name)) [:type/Collection - :type/Structured])) + :type/Structured + :type/Large + :type/fingerprint-unsupported])) (deftest ^:parallel honeysql-for-fields-that-need-fingerprint-updating-test (testing (str "Make sure we generate the correct HoneySQL WHERE clause based on whatever is in " -- GitLab