Skip to content
Snippets Groups Projects
Unverified Commit 5fda4308 authored by Cam Saul's avatar Cam Saul
Browse files

Field fingerprint versioning

parent d946e6b0
No related merge requests found
Showing
with 609 additions and 182 deletions
......@@ -3778,3 +3778,18 @@ databaseChangeLog:
defaultValue: '0 50 0 * * ? *' # run at 12:50 AM
constraints:
nullable: false
- changeSet:
id: 61
author: camsaul
comment: 'Added 0.26.0'
changes:
- addColumn:
tableName: metabase_field
columns:
- column:
name: fingerprint_version
type: int
remarks: 'The version of the fingerprint for this Field. Used so we can keep track of which Fields need to be analyzed again when new things are added to fingerprints.'
defaultValue: 0
constraints:
nullable: false
......@@ -36,7 +36,8 @@
(for [table (-> (db/select Table, :active true, {:order-by [[:name :asc]]})
(hydrate :db))
:when (mi/can-read? table)]
;; if for some reason a Table doesn't have rows set then set it to 0 so UI doesn't barf. TODO - should that be part of `post-select` instead?
;; if for some reason a Table doesn't have rows set then set it to 0 so UI doesn't barf.
;; TODO - should that be part of `post-select` instead?
(update table :rows (fn [n]
(or n 0)))))
......@@ -99,7 +100,7 @@
["Quarter of Year" "quarter-of-year"]
["Year" "year"]])
(conj
(mapv (fn [[name params]]
(mapv (fn [[name params]]
{:name name
:mbql (apply vector "binning-strategy" nil params)
:type "type/Number"})
......@@ -107,12 +108,11 @@
["10 bins" ["num-bins" 10]]
["50 bins" ["num-bins" 50]]
["100 bins" ["num-bins" 100]]])
{:name "Don't bin"
:mbql nil
:type "type/Number"}
)
{:name "Don't bin"
:mbql nil
:type "type/Number"})
(conj
(mapv (fn [[name params]]
(mapv (fn [[name params]]
{:name name
:mbql (apply vector "binning-strategy" nil params)
:type "type/Coordinate"})
......@@ -121,10 +121,9 @@
["Bin every 10 degrees" ["bin-width" 10.0]]
["Bin every 20 degrees" ["bin-width" 20.0]]
["Bin every 50 degrees" ["bin-width" 50.0]]])
{:name "Don't bin"
:mbql nil
:type "type/Coordinate"}
)))))
{:name "Don't bin"
:mbql nil
:type "type/Coordinate"})))))
(def ^:private dimension-options-for-response
(m/map-kv (fn [k v]
......@@ -226,20 +225,23 @@
(not= (keyword visibility_type) :sensitive))))))))
(defn- card-result-metadata->virtual-fields
"Return a sequence of 'virtual' fields metadata for the 'virtual' table for a Card in the Saved Questions 'virtual' database."
"Return a sequence of 'virtual' fields metadata for the 'virtual' table for a Card in the Saved Questions 'virtual'
database."
[card-id metadata]
(for [col metadata]
(assoc col
:table_id (str "card__" card-id)
:id [:field-literal (:name col) (or (:base_type col) :type/*)]
;; don't return :special_type if it's a PK or FK because it confuses the frontend since it can't actually be used that way IRL
;; don't return :special_type if it's a PK or FK because it confuses the frontend since it can't actually be
;; used that way IRL
:special_type (when-let [special-type (keyword (:special_type col))]
(when-not (or (isa? special-type :type/PK)
(isa? special-type :type/FK))
special-type)))))
(defn card->virtual-table
"Return metadata for a 'virtual' table for a CARD in the Saved Questions 'virtual' database. Optionally include 'virtual' fields as well."
"Return metadata for a 'virtual' table for a CARD in the Saved Questions 'virtual' database. Optionally include
'virtual' fields as well."
[card & {:keys [include-fields?]}]
;; if collection isn't already hydrated then do so
(let [card (hydrate card :colllection)]
......
......@@ -16,14 +16,52 @@
[schema.core :as s]
[toucan.db :as db]))
;; How does analysis decide which Fields should get analyzed?
;;
;; Good question. There are two situations in which Fields should get analyzed:
;;
;; * Whenever a new Field is first detected, *or*
;; * When the fingerprinters are updated in such a way that this Field (based on its base type) ought to be
;;   re-fingerprinted
;;
;; So how do we check all that?
;;
;; 1. We keep track of which base types are affected by new fingerprint versions. See the discussion in
;; `metabase.sync.interface` for more details.
;;
;; 2. FINGERPRINTING
;;
;; 2a. When running fingerprinting, we calculate a fairly sophisticated SQL query to only fetch Fields that
;; need to be re-fingerprinted based on type info and their current fingerprint version
;;
;; 2b. All of these fields get updated fingerprints and marked with the newest version. We also set
;; `last_analyzed` to `nil` so we know we need to re-run classification for them
;;
;; 3. CLASSIFICATION
;;
;; All Fields that have the latest fingerprint version but a `nil` `last_analyzed` time need to be re-classified.
;; Classification takes place for these Fields and special types and the like are updated as needed.
;;
;; 4. MARKING FIELDS AS RECENTLY ANALYZED
;;
;; Once all of the above is done, we update the `last_analyzed` timestamp for all the Fields that got
;; re-fingerprinted and re-classified.
;;
;; So what happens during the next analysis?
;;
;; During the next analysis phase, Fields whose fingerprint is up-to-date will be skipped. However, if a new
;; fingerprint version is introduced, Fields that need it will be upgraded to it. We'll still only reclassify the
;; newly re-fingerprinted Fields, because we'll know to skip the ones from last time since their value of
;; `last_analyzed` is not `nil`.
(s/defn ^:private ^:always-validate update-fields-last-analyzed!
"Update the `last_analyzed` date for all the fields in TABLE."
"Update the `last_analyzed` date for all the recently re-fingerprinted/re-classified Fields in TABLE."
[table :- i/TableInstance]
(db/update-where! Field {:table_id (u/get-id table)
:active true
:visibility_type [:not= "retired"]
:preview_display true
:last_analyzed nil}
;; The WHERE portion of this query should match up with that of `classify/fields-to-classify`
(db/update-where! Field {:table_id (u/get-id table)
:fingerprint_version i/latest-fingerprint-version
:last_analyzed nil}
:last_analyzed (u/new-sql-timestamp)))
......@@ -38,8 +76,8 @@
(s/defn ^:always-validate analyze-db!
"Perform in-depth analysis on the data for all Tables in a given DATABASE.
This is dependent on what each database driver supports, but includes things like cardinality testing and table row counting.
This also updates the `:last_analyzed` value for each affected Field."
This is dependent on what each database driver supports, but includes things like cardinality testing and table row
counting. This also updates the `:last_analyzed` value for each affected Field."
[database :- i/DatabaseInstance]
(sync-util/sync-operation :analyze database (format "Analyze data for %s" (sync-util/name-for-logging database))
(let [tables (sync-util/db->sync-tables database)]
......
(ns metabase.sync.analyze.classify
"Analysis sub-step that takes a fingerprint for a Field and infers and saves appropriate information like special type.
Each 'classifier' takes the information available to it and decides whether or not to run.
"Analysis sub-step that takes a fingerprint for a Field and infers and saves appropriate information like special
type. Each 'classifier' takes the information available to it and decides whether or not to run.
We currently have the following classifiers:
1. `name`: Looks at the name of a Field and infers a special type if possible
2. `no-preview-display`: Looks at average length of text Field recorded in fingerprint and decides whether or not we should hide this Field
2. `no-preview-display`: Looks at average length of text Field recorded in fingerprint and decides whether or not
we should hide this Field
3. `category`: Looks at the number of distinct values of Field and determines whether it can be a Category
4. `text-fingerprint`: Looks at percentages recorded in a text Fields' TextFingerprint and infers a special type if possible
4. `text-fingerprint`: Looks at percentages recorded in a text Fields' TextFingerprint and infers a special type
if possible
All classifier functions take two arguments, a `FieldInstance` and a possibly `nil` `Fingerprint`, and should return the Field
with any appropriate changes (such as a new special type). If no changes are appropriate, a classifier may return nil.
Error handling is handled by `run-classifiers` below, so individual classiers do not need to handle errors themselves.
All classifier functions take two arguments, a `FieldInstance` and a possibly `nil` `Fingerprint`, and should
return the Field with any appropriate changes (such as a new special type). If no changes are appropriate, a
classifier may return nil. Error handling is done by `run-classifiers` below, so individual classifiers do not
need to handle errors themselves.
In the future, we plan to add more classifiers, including ML ones that run offline."
(:require [clojure.data :as data]
......@@ -28,9 +31,9 @@
[schema.core :as s]
[toucan.db :as db]))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING INDIVIDUAL FIELDS |
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING INDIVIDUAL FIELDS |
;;; +----------------------------------------------------------------------------------------------------------------+
(def ^:private values-that-can-be-set
"Columns of Field that classifiers are allowed to set."
......@@ -40,7 +43,9 @@
"Save the updates in UPDATED-FIELD."
[original-field :- i/FieldInstance, updated-field :- i/FieldInstance]
(let [[_ values-to-set] (data/diff original-field updated-field)]
(log/debug (format "Based on classification, updating these values of %s: %s" (sync-util/name-for-logging original-field) values-to-set))
(log/debug (format "Based on classification, updating these values of %s: %s"
(sync-util/name-for-logging original-field)
values-to-set))
;; Check that we're not trying to set anything that we're not allowed to
(doseq [k (keys values-to-set)]
(when-not (contains? values-that-can-be-set k)
......@@ -51,8 +56,8 @@
(def ^:private classifiers
"Various classifier functions available. These should all take two args, a `FieldInstance` and a possibly `nil` `Fingerprint`,
and return `FieldInstance` with any inferred property changes, or `nil` if none could be inferred.
"Various classifier functions available. These should all take two args, a `FieldInstance` and a possibly `nil`
`Fingerprint`, and return `FieldInstance` with any inferred property changes, or `nil` if none could be inferred.
Order is important!"
[name/infer-special-type
category/infer-is-category
......@@ -84,21 +89,18 @@
(save-field-updates! field updated-field))))))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING ALL FIELDS IN A TABLE |
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; +------------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING ALL FIELDS IN A TABLE |
;;; +------------------------------------------------------------------------------------------------------------------+
(s/defn ^:private ^:always-validate fields-to-classify :- (s/maybe [i/FieldInstance])
"Return a sequences of Fields belonging to TABLE for which we should attempt to determine special type.
This should include NEW fields that are active, visible, and without an existing special type."
This should include Fields that have the latest fingerprint, but have not yet *completed* analysis."
[table :- i/TableInstance]
(seq (db/select Field
:table_id (u/get-id table)
:special_type nil
:active true
:visibility_type [:not= "retired"]
:preview_display true
:last_analyzed nil)))
:table_id (u/get-id table)
:fingerprint_version i/latest-fingerprint-version
:last_analyzed nil)))
(s/defn ^:always-validate classify-fields!
"Run various classifiers on the appropriate FIELDS in a TABLE that have not been previously analyzed.
......
(ns metabase.sync.analyze.fingerprint
"Analysis sub-step that takes a sample of values for a Field and saving a non-identifying fingerprint
used for classification. This fingerprint is saved as a column on the Field it belongs to."
(:require [clojure.tools.logging :as log]
(:require [clojure.set :as set]
[clojure.tools.logging :as log]
[honeysql.helpers :as h]
[metabase.models.field :refer [Field]]
[metabase.sync
[interface :as i]
......@@ -12,6 +14,7 @@
[sample :as sample]
[text :as text]]
[metabase.util :as u]
[metabase.util.schema :as su]
[schema.core :as s]
[toucan.db :as db]))
......@@ -43,24 +46,104 @@
(sync-util/with-error-handling (format "Error generating fingerprint for %s" (sync-util/name-for-logging field))
(when-let [fingerprint (fingerprint field)]
(log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field)))
;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll
;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the
;; latest fingerprints.
(db/update! Field (u/get-id field)
:fingerprint fingerprint))))
:fingerprint fingerprint
:fingerprint_version i/latest-fingerprint-version
:last_analyzed nil))))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | FINGERPRINTING ALL FIELDS IN A TABLE |
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | WHICH FIELDS NEED UPDATED FINGERPRINTS? |
;;; +----------------------------------------------------------------------------------------------------------------+
;; Logic for building the somewhat-complicated query we use to determine which Fields need new Fingerprints
;;
;; This ends up giving us a SQL query that looks something like:
;;
;; SELECT *
;; FROM metabase_field
;; WHERE active = true
;; AND preview_display = true
;; AND visibility_type <> 'retired'
;; AND table_id = 1
;; AND ((fingerprint_version < 1 AND
;; base_type IN ("type/Longitude", "type/Latitude", "type/Integer"))
;; OR
;; (fingerprint_version < 2 AND
;; base_type IN ("type/Text", "type/SerializedJSON")))
(s/defn ^:private ^:always-validate base-types->descendants :- #{su/FieldTypeKeywordOrString}
  "Expand the set of BASE-TYPES so it also contains every descendant of each type. The resulting types are
   returned as qualified-name strings so HoneySQL won't mistake them for column identifiers."
  [base-types :- #{su/FieldType}]
  (set (map u/keyword->qualified-name
            (reduce set/union
                    (for [base-type base-types]
                      (conj (descendants base-type) base-type))))))
;; It's even cooler if we could generate efficient SQL that looks at what types have already
;; been marked for upgrade so we don't need to generate overly-complicated queries.
;;
;; e.g. instead of doing:
;;
;; WHERE ((version < 2 AND base_type IN ("type/Integer", "type/BigInteger", "type/Text")) OR
;; (version < 1 AND base_type IN ("type/Boolean", "type/Integer", "type/BigInteger")))
;;
;; we could do:
;;
;; WHERE ((version < 2 AND base_type IN ("type/Integer", "type/BigInteger", "type/Text")) OR
;; (version < 1 AND base_type IN ("type/Boolean")))
;;
;; (In the example above, something that is a `type/Integer` or `type/Text` would get upgraded
;; as long as it's less than version 2; so no need to also check if those types are less than 1, which
;; would always be the case.)
;;
;; This way we can also completely omit adding clauses for versions that have been "eclipsed" by others.
;; This would keep the SQL query from growing boundlessly as new fingerprint versions are added
(s/defn ^:private ^:always-validate versions-clauses :- [s/Any]
  "Build a sequence of HoneySQL clauses of the form `[:and [:< :fingerprint_version v] [:in :base_type #{...}]]`,
   one per fingerprint version that still has base types not already covered by a newer version."
  []
  ;; keep track of all the base types (including descendants) for each version, starting from most recent
  (let [versions+base-types (reverse (sort-by first (seq i/fingerprint-version->types-that-should-be-re-fingerprinted)))
        already-seen (atom #{})]
    ;; NOTE(review): `for` is lazy and each realized step `swap!`s `already-seen`; the de-duplication is only
    ;; correct when the clauses are realized sequentially, newest version first — which a single consumer does.
    (for [[version base-types] versions+base-types
          :let [descendants (base-types->descendants base-types)
                not-yet-seen (set/difference descendants @already-seen)]
          ;; if all the descendants of any given version have already been seen, we can skip this clause altogether
          :when (seq not-yet-seen)]
      ;; otherwise record the newly seen types and generate an appropriate clause
      (do
        (swap! already-seen set/union not-yet-seen)
        [:and
         [:< :fingerprint_version version]
         [:in :base_type not-yet-seen]]))))
(s/defn ^:private ^:always-validate honeysql-for-fields-that-need-fingerprint-updating :- {:where s/Any}
  "Return appropriate WHERE clause for all the Fields whose Fingerprint needs to be re-calculated: active,
   non-retired Fields whose fingerprint version is older than a version that applies to their base type.
   With a TABLE argument, additionally restrict to Fields belonging to that Table."
  ([]
   ;; one `[:and [:< :fingerprint_version v] [:in :base_type ...]]` clause per relevant version, OR'ed together
   {:where [:and
            [:= :active true]
            [:not= :visibility_type "retired"]
            (cons :or (versions-clauses))]})
  ([table :- i/TableInstance]
   (h/merge-where (honeysql-for-fields-that-need-fingerprint-updating)
                  [:= :table_id (u/get-id table)])))
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | FINGERPRINTING ALL FIELDS IN A TABLE |
;;; +----------------------------------------------------------------------------------------------------------------+
(s/defn ^:private ^:always-validate fields-to-fingerprint :- (s/maybe [i/FieldInstance])
"Return a sequences of Fields belonging to TABLE for which we should generate (and save) fingerprints.
This should include NEW fields that are active and visible."
[table :- i/TableInstance]
(seq (db/select Field
:table_id (u/get-id table)
:active true
:visibility_type [:not= "retired"]
:preview_display true
:last_analyzed nil)))
(honeysql-for-fields-that-need-fingerprint-updating table))))
(s/defn ^:always-validate fingerprint-fields!
"Generate and save fingerprints for all the Fields in TABLE that have not been previously analyzed."
......
......@@ -4,6 +4,7 @@
[database :refer [Database]]
[field :refer [Field]]
[table :refer [Table]]]
metabase.types
[metabase.util.schema :as su]
[schema.core :as s]))
......@@ -44,22 +45,23 @@
"Schema for the expected output of `describe-table-fks`."
(s/maybe #{FKMetadataEntry}))
;; These schemas are provided purely as conveniences since adding `:import` statements to get the corresponding classes from the model namespaces
;; also requires a `:require`, which `clj-refactor` seems more than happy to strip out from the ns declaration when running `cljr-clean-ns`.
;; Plus as a bonus in the future we could add additional validations to these, e.g. requiring that a Field have a base_type
;; These schemas are provided purely as conveniences since adding `:import` statements to get the corresponding
;; classes from the model namespaces also requires a `:require`, which `clj-refactor` seems more than happy to strip
;; out from the ns declaration when running `cljr-clean-ns`. Plus as a bonus in the future we could add additional
;; validations to these, e.g. requiring that a Field have a base_type
(def DatabaseInstance "Schema for a valid instance of a Metabase Database." (class Database))
(def TableInstance "Schema for a valid instance of a Metabase Table." (class Table))
(def FieldInstance "Schema for a valid instance of a Metabase Field." (class Field))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | SAMPLING & FINGERPRINTS |
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | SAMPLING & FINGERPRINTS |
;;; +----------------------------------------------------------------------------------------------------------------+
(def ValuesSample
"Schema for a sample of VALUES returned by the `sample` sub-stage of analysis and passed into the `fingerprint` stage.
Guaranteed to be non-empty and non-nil."
"Schema for a sample of VALUES returned by the `sample` sub-stage of analysis and passed into the `fingerprint`
stage. Guaranteed to be non-empty and non-nil."
;; Validating against this is actually pretty quick, in the order of microseconds even for a 10,000 value sequence
(s/constrained [(s/pred (complement nil?))] seq "Non-empty sequence of non-nil values."))
......@@ -95,9 +97,41 @@
"Type-specific fingerprint with exactly one key"))
(def Fingerprint
"Schema for a Field 'fingerprint' generated as part of the analysis stage. Used to power the 'classification' sub-stage of
analysis. Stored as the `fingerprint` column of Field."
{(s/optional-key :version) su/IntGreaterThanZero ; Fingerprints with no version key are assumed to have version of 1
(s/optional-key :global) GlobalFingerprint
"Schema for a Field 'fingerprint' generated as part of the analysis stage. Used to power the 'classification'
sub-stage of analysis. Stored as the `fingerprint` column of Field."
{(s/optional-key :global) GlobalFingerprint
(s/optional-key :type) TypeSpecificFingerprint
(s/optional-key :experimental) {s/Keyword s/Any}})
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | FINGERPRINT VERSIONING |
;;; +----------------------------------------------------------------------------------------------------------------+
;; Occasionally we want to update the schema of our Field fingerprints and add new logic to populate the additional
;; keys. However, by default, analysis (which includes fingerprinting) only runs on *NEW* Fields, meaning *EXISTING*
;; Fields won't get new fingerprints with the updated info.
;;
;; To work around this, we can use a versioning system. Fields whose Fingerprint's version is lower than the current
;; version should get updated during the next sync/analysis regardless of whether they are or are not new Fields.
;; However, this could be quite inefficient: if we add a new fingerprint field for `:type/Number` Fields, why should
;; we re-fingerprint `:type/Text` Fields? Ideally, we'd only re-fingerprint the numeric Fields.
;;
;; Thus, our implementation below. Each new fingerprint version lists a set of types that should be upgraded to it.
;; Our fingerprinting logic will calculate whether a fingerprint needs to be recalculated based on its version and the
;; changes that have been made in subsequent versions. Only the Fields that would benefit from the new Fingerprint
;; info need be re-fingerprinted.
;;
;; Thus, if Fingerprint v2 contains some new info for numeric Fields, only Fields that derive from `:type/Number` need
;; be upgraded to v2. Textual Fields with a v1 fingerprint can stay at v1 for the time being. Later, if we introduce a
;; v3 that includes new "global" fingerprint info, both the v2-fingerprinted numeric Fields and the v1-fingerprinted
;; textual Fields can be upgraded to v3.
(def fingerprint-version->types-that-should-be-re-fingerprinted
  "Map of fingerprint version to the set of Field base types that need to be upgraded to this version the next
   time we do analysis. The highest-numbered entry is considered the latest version of fingerprints.
   When introducing a new fingerprint version, add an entry here listing only the base types whose fingerprints
   actually changed; Fields of unrelated types will be left at their current version and skipped by analysis."
  {1 #{:type/*}})
(def latest-fingerprint-version
  "The newest (highest-numbered) version of our Field fingerprints."
  (reduce max (keys fingerprint-version->types-that-should-be-re-fingerprinted)))
......@@ -251,14 +251,15 @@
(merge
default-field-details
(match-$ (hydrate/hydrate field :values)
{:updated_at $
:id $
:raw_column_id $
:created_at $
:last_analyzed $
:fingerprint $
:fk_target_field_id $
:values $})))
{:updated_at $
:id $
:raw_column_id $
:created_at $
:last_analyzed $
:fingerprint $
:fingerprint_version $
:fk_target_field_id $
:values $})))
;; ## GET /api/meta/table/:id/query_metadata
;; TODO - add in example with Field :values
......
......@@ -40,45 +40,46 @@
;; ## GET /api/field/:id
(expect
(tu/match-$ (Field (id :users :name))
{:description nil
:table_id (id :users)
:raw_column_id $
:fingerprint $
:table (tu/match-$ (Table (id :users))
{:description nil
:entity_type nil
:visibility_type nil
:db (db-details)
:schema "PUBLIC"
:name "USERS"
:display_name "Users"
:rows 15
:updated_at $
:entity_name nil
:active true
:id (id :users)
:db_id (id)
:caveats nil
:points_of_interest nil
:show_in_getting_started false
:raw_table_id $
:created_at $})
:special_type "type/Name"
:name "NAME"
:display_name "Name"
:caveats nil
:points_of_interest nil
:updated_at $
:last_analyzed $
:active true
:id (id :users :name)
:visibility_type "normal"
:position 0
:preview_display true
:created_at $
:base_type "type/Text"
:fk_target_field_id nil
:parent_id nil})
{:description nil
:table_id (id :users)
:raw_column_id $
:fingerprint $
:fingerprint_version $
:table (tu/match-$ (Table (id :users))
{:description nil
:entity_type nil
:visibility_type nil
:db (db-details)
:schema "PUBLIC"
:name "USERS"
:display_name "Users"
:rows 15
:updated_at $
:entity_name nil
:active true
:id (id :users)
:db_id (id)
:caveats nil
:points_of_interest nil
:show_in_getting_started false
:raw_table_id $
:created_at $})
:special_type "type/Name"
:name "NAME"
:display_name "Name"
:caveats nil
:points_of_interest nil
:updated_at $
:last_analyzed $
:active true
:id (id :users :name)
:visibility_type "normal"
:position 0
:preview_display true
:created_at $
:base_type "type/Text"
:fk_target_field_id nil
:parent_id nil})
((user->client :rasta) :get 200 (format "field/%d" (id :users :name))))
......
......@@ -88,13 +88,14 @@
(merge
field-defaults
(match-$ field
{:updated_at $
:id $
:created_at $
:fk_target_field_id $
:raw_column_id $
:last_analyzed $
:fingerprint $})))
{:updated_at $
:id $
:created_at $
:fk_target_field_id $
:raw_column_id $
:last_analyzed $
:fingerprint $
:fingerprint_version $})))
(defn- fk-field-details [field]
(-> (field-details field)
......
(ns metabase.sync.analyze.classify-test
(:require [expectations :refer :all]
[metabase.models
[field :refer [Field]]
[table :refer [Table]]]
[metabase.sync.analyze.classify :as classify]
[metabase.sync.interface :as i]
[metabase.util :as u]
[toucan.util.test :as tt]))
;; Check that only the right Fields get classified
(expect
  ["Current fingerprint, not analyzed"]
  ;; For max version pick something we'll hopefully never hit. Don't think we'll ever have 32k different versions :D
  (with-redefs [i/latest-fingerprint-version Short/MAX_VALUE]
    (tt/with-temp* [Table [table]
                    ;; SHOULD be classified: latest fingerprint version, but analysis not yet completed
                    Field [_ {:table_id (u/get-id table)
                              :name "Current fingerprint, not analyzed"
                              :fingerprint_version Short/MAX_VALUE
                              :last_analyzed nil}]
                    ;; should NOT be classified: analysis has already run for the latest fingerprint version
                    Field [_ {:table_id (u/get-id table)
                              :name "Current fingerprint, already analzed"
                              :fingerprint_version Short/MAX_VALUE
                              :last_analyzed (u/->Timestamp "2017-08-09")}]
                    ;; should NOT be classified: fingerprint is out of date, needs re-fingerprinting first
                    Field [_ {:table_id (u/get-id table)
                              :name "Old fingerprint, not analyzed"
                              :fingerprint_version (dec Short/MAX_VALUE)
                              :last_analyzed nil}]
                    ;; should NOT be classified: fingerprint out of date *and* analysis already ran
                    Field [_ {:table_id (u/get-id table)
                              :name "Old fingerprint, already analzed"
                              :fingerprint_version (dec Short/MAX_VALUE)
                              :last_analyzed (u/->Timestamp "2017-08-09")}]]
      (for [field (#'classify/fields-to-classify table)]
        (:name field)))))
(ns metabase.sync.analyze.fingerprint-test
"Basic tests to make sure the fingerprint generation code is doing something that makes sense."
(:require [expectations :refer :all]
[metabase.models.field :refer [Field]]
[metabase.models
[field :refer [Field]]
[table :refer [Table]]]
[metabase.sync.analyze.fingerprint :as fingerprint]
[metabase.test.data :as data]))
[metabase.sync.interface :as i]
[metabase.test.data :as data]
[metabase.util :as u]
[toucan.db :as db]
[toucan.util.test :as tt]))
;; basic test for a numeric Field
(expect
......@@ -27,3 +33,178 @@
(expect
{:global {:distinct-count 618}}
(#'fingerprint/fingerprint (Field (data/id :checkins :date))))
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | TESTS FOR WHICH FIELDS NEED FINGERPRINTING |
;;; +----------------------------------------------------------------------------------------------------------------+
;; Check that our `base-types->descendants` function properly returns a set of descendants including parent type
(expect
#{"type/URL" "type/ImageURL" "type/AvatarURL"}
(#'fingerprint/base-types->descendants #{:type/URL}))
(expect
#{"type/ImageURL" "type/AvatarURL"}
(#'fingerprint/base-types->descendants #{:type/ImageURL :type/AvatarURL}))
;; Make sure we generate the correct HoneySQL WHERE clause based on whatever is in
;; `fingerprint-version->types-that-should-be-re-fingerprinted`
(expect
{:where
[:and
[:= :active true]
[:not= :visibility_type "retired"]
[:or
[:and
[:< :fingerprint_version 1]
[:in :base_type #{"type/URL" "type/ImageURL" "type/AvatarURL"}]]]]}
(with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted {1 #{:type/URL}}]
(#'fingerprint/honeysql-for-fields-that-need-fingerprint-updating)))
(expect
{:where
[:and
[:= :active true]
[:not= :visibility_type "retired"]
[:or
[:and
[:< :fingerprint_version 2]
[:in :base_type #{"type/Decimal" "type/Latitude" "type/Longitude" "type/Coordinate" "type/Float"}]]
[:and
[:< :fingerprint_version 1]
[:in :base_type #{"type/ImageURL" "type/AvatarURL"}]]]]}
(with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted {1 #{:type/ImageURL :type/AvatarURL}
2 #{:type/Float}}]
(#'fingerprint/honeysql-for-fields-that-need-fingerprint-updating)))
;; Make sure that our SQL generation code is clever enough to remove version checks when a newer version completely
;; eclipses them
(expect
{:where
[:and
[:= :active true]
[:not= :visibility_type "retired"]
[:or
[:and
[:< :fingerprint_version 2]
[:in :base_type #{"type/Decimal" "type/Latitude" "type/Longitude" "type/Coordinate" "type/Float"}]]
;; no type/Float stuff should be included for 1
[:and
[:< :fingerprint_version 1]
[:in :base_type #{"type/SerializedJSON" "type/Name" "type/Text" "type/UUID" "type/State" "type/City"
"type/Country" "type/URL" "type/Email" "type/ImageURL" "type/AvatarURL"
"type/Description"}]]]]}
(with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted {1 #{:type/Float :type/Text}
2 #{:type/Float}}]
(#'fingerprint/honeysql-for-fields-that-need-fingerprint-updating)))
;; Make sure that our SQL generation code is also clever enough to completely skip completely eclipsed versions
(expect
{:where
[:and
[:= :active true]
[:not= :visibility_type "retired"]
[:or
[:and
[:< :fingerprint_version 4]
[:in :base_type #{"type/Decimal" "type/Latitude" "type/Longitude" "type/Coordinate" "type/Float"}]]
[:and
[:< :fingerprint_version 3]
[:in :base_type #{"type/URL" "type/ImageURL" "type/AvatarURL"}]]
;; version 2 can be eliminated completely since everything relevant there is included in 4
;; The only things that should go in 1 should be `:type/City` since `:type/Coordinate` is included in 4
[:and
[:< :fingerprint_version 1]
[:in :base_type #{"type/City"}]]]]}
(with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted {1 #{:type/Coordinate :type/City}
2 #{:type/Coordinate}
3 #{:type/URL}
4 #{:type/Float}}]
(#'fingerprint/honeysql-for-fields-that-need-fingerprint-updating)))
;; Make sure that the above functions are used correctly to determine which Fields get (re-)fingerprinted
(defn- field-was-fingerprinted? {:style/indent 0} [fingerprint-versions field-properties]
  ;; Stub out `fingerprint!` so we can observe whether `fingerprint-fields!` decided to fingerprint a temp
  ;; Field built from FIELD-PROPERTIES, given the supplied map of FINGERPRINT-VERSIONS.
  (let [was-called? (atom false)]
    (with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted fingerprint-versions
                  fingerprint/fingerprint! (fn [& _] (reset! was-called? true))]
      (tt/with-temp* [Table [table]
                      Field [_ (assoc field-properties :table_id (u/get-id table))]]
        (fingerprint/fingerprint-fields! table))
      @was-called?)))
;; Field's base type is a *subtype* of a type covered by a newer fingerprint version -> should be re-fingerprinted
(expect
  (field-was-fingerprinted?
    {2 #{:type/Float}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; Field is *not* a subtype of any type covered by the newer fingerprint version -> should be skipped
(expect
  false
  (field-was-fingerprinted?
    {2 #{:type/Text}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; Field is a subtype of *one* of several types listed for the newer fingerprint version -> still re-fingerprinted
(expect
  (field-was-fingerprinted?
    {2 #{:type/Float :type/Text}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; Field already has the same version as the latest fingerprint version -> nothing to do
(expect
  false
  (field-was-fingerprinted?
    {1 #{:type/Float}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; Field has a newer version than the latest fingerprint version (should never happen) -> don't re-fingerprint
(expect
  false
  (field-was-fingerprinted?
    {1 #{:type/Float}}
    {:base_type :type/Decimal, :fingerprint_version 2}))
;; Field's base type is *exactly* a type covered by the newer fingerprint version -> re-fingerprinted
(expect
  (field-was-fingerprinted?
    {2 #{:type/Float}}
    {:base_type :type/Float, :fingerprint_version 1}))
;; Field's base type is a *parent* of the newer fingerprint version's type -> not a match, skipped
(expect
  false
  (field-was-fingerprinted?
    {2 #{:type/Decimal}}
    {:base_type :type/Float, :fingerprint_version 1}))
;; when several newer fingerprint versions exist, matching *any* of them means the Field gets re-fingerprinted
(expect
  (field-was-fingerprinted?
    {2 #{:type/Float}
     3 #{:type/Text}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; ...same as above, but the matching type appears in the *later* of the two newer versions
(expect
  (field-was-fingerprinted?
    {2 #{:type/Text}
     3 #{:type/Float}}
    {:base_type :type/Decimal, :fingerprint_version 1}))
;; Make sure the `fingerprint!` function is correctly updating the correct columns of Field: it should save the new
;; fingerprint, stamp the Field with the (redef'd) latest fingerprint version, and clear `last_analyzed`
(expect
  {:fingerprint         {:fake-fingerprint? true}
   :fingerprint_version 3
   :last_analyzed       nil}
  (with-redefs [i/latest-fingerprint-version 3
                fingerprint/fingerprint      (constantly {:fake-fingerprint? true})]
    (tt/with-temp Field [field {:base_type           :type/Integer
                                :fingerprint         nil
                                :fingerprint_version 1
                                :last_analyzed       (u/->Timestamp "2017-08-09")}]
      (#'fingerprint/fingerprint! field)
      ;; read the columns back out of the app DB to verify what was actually persisted
      (db/select-one [Field :fingerprint :fingerprint_version :last_analyzed] :id (u/get-id field)))))
......@@ -6,6 +6,7 @@
[table :refer [Table]]]
[metabase.sync
[analyze :as analyze]
[interface :as i]
[sync-metadata :as sync-metadata]]
[metabase.test.data :as data]
[metabase.util :as u]
......@@ -15,21 +16,23 @@
;; fixed timestamp used to simulate Fields that were already analyzed at some point in the past
(def ^:private fake-analysis-completion-date
  (u/->Timestamp "2017-08-01"))
;; Check that Fields do *not* get analyzed if they're not newly created
;; Check that Fields do *not* get analyzed if they're not newly created and fingerprint version is current
(expect
;; PK is ok because it gets marked as part of metadata sync
#{{:name "LONGITUDE", :special_type nil, :last_analyzed fake-analysis-completion-date}
{:name "CATEGORY_ID", :special_type nil, :last_analyzed fake-analysis-completion-date}
{:name "PRICE", :special_type nil, :last_analyzed fake-analysis-completion-date}
{:name "LATITUDE", :special_type nil, :last_analyzed fake-analysis-completion-date}
{:name "NAME", :special_type nil, :last_analyzed fake-analysis-completion-date}
{:name "ID", :special_type :type/PK, :last_analyzed fake-analysis-completion-date}} ; PK is ok because it gets marked as part of metadata sync
{:name "ID", :special_type :type/PK, :last_analyzed fake-analysis-completion-date}}
(tt/with-temp* [Database [db {:engine "h2", :details (:details (data/db))}]
Table [table {:name "VENUES", :db_id (u/get-id db)}]]
;; sync the metadata, but DON't do analysis YET
(sync-metadata/sync-table-metadata! table)
;; now mark all the Tables as analyzed so they won't be subject to analysis
    ;; now mark all the Fields as analyzed so they won't be subject to analysis
(db/update-where! Field {:table_id (u/get-id table)}
:last_analyzed fake-analysis-completion-date)
:last_analyzed fake-analysis-completion-date
:fingerprint_version Short/MAX_VALUE)
;; ok, NOW run the analysis process
(analyze/analyze-table! table)
;; check and make sure all the Fields don't have special types and their last_analyzed date didn't change
......@@ -53,3 +56,28 @@
;; fields *SHOULD* have special types now
(set (for [field (db/select [Field :name :special_type :last_analyzed] :table_id (u/get-id table))]
(into {} (update field :last_analyzed boolean))))))
;; Make sure that only the correct Fields get marked as recently analyzed: a Field should only get its
;; `last_analyzed` timestamp stamped if its fingerprint is up-to-date *and* it hasn't been analyzed yet
(expect
  #{"Current fingerprint, not analyzed"}
  (with-redefs [i/latest-fingerprint-version Short/MAX_VALUE
                u/new-sql-timestamp          (constantly (u/->Timestamp "1999-01-01"))]
    (tt/with-temp* [Table [table]
                    ;; up-to-date fingerprint, never analyzed -> SHOULD get stamped
                    Field [_ {:table_id            (u/get-id table)
                              :name                "Current fingerprint, not analyzed"
                              :fingerprint_version Short/MAX_VALUE
                              :last_analyzed       nil}]
                    ;; up-to-date fingerprint, but already analyzed -> should be left alone
                    Field [_ {:table_id            (u/get-id table)
                              :name                "Current fingerprint, already analzed"
                              :fingerprint_version Short/MAX_VALUE
                              :last_analyzed       (u/->Timestamp "2017-08-09")}]
                    ;; stale fingerprint version -> should be left alone in both cases
                    Field [_ {:table_id            (u/get-id table)
                              :name                "Old fingerprint, not analyzed"
                              :fingerprint_version (dec Short/MAX_VALUE)
                              :last_analyzed       nil}]
                    Field [_ {:table_id            (u/get-id table)
                              :name                "Old fingerprint, already analzed"
                              :fingerprint_version (dec Short/MAX_VALUE)
                              :last_analyzed       (u/->Timestamp "2017-08-09")}]]
      (#'analyze/update-fields-last-analyzed! table)
      ;; which Fields were stamped with the redef'd "new" timestamp?
      (db/select-field :name Field :last_analyzed (u/new-sql-timestamp)))))
(ns metabase.sync-database.analyze-test
  ;; TODO - this namespace follows the old pattern of sync namespaces. Tests should be moved to appropriate new homes
  ;; at some point
(:require [clojure.string :as str]
[expectations :refer :all]
[metabase
......@@ -83,15 +84,15 @@
(expect false (values-are-valid-emails? ["false"]))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | Tests to avoid analyzing hidden tables |
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | Tests to avoid analyzing hidden tables |
;;; +----------------------------------------------------------------------------------------------------------------+
(defn- unanalyzed-fields-count [table]
(defn- fake-field-was-analyzed? [field]
;; don't let ourselves be fooled if the test passes because the table is
;; totally broken or has no fields. Make sure we actually test something
(assert (pos? (db/count Field :table_id (u/get-id table))))
(db/count Field :last_analyzed nil, :table_id (u/get-id table)))
(assert (db/exists? Field :id (u/get-id field)))
(db/exists? Field :id (u/get-id field), :last_analyzed [:not= nil]))
(defn- latest-sync-time [table]
(db/select-one-field :last_analyzed Field
......@@ -99,7 +100,7 @@
:table_id (u/get-id table)
{:order-by [[:last_analyzed :desc]]}))
(defn- set-table-visibility-type!
(defn- set-table-visibility-type-via-api!
"Change the VISIBILITY-TYPE of TABLE via an API call.
(This is done via the API so we can see which, if any, side effects (e.g. analysis) get triggered.)"
[table visibility-type]
......@@ -120,56 +121,55 @@
additional-options))
(defn- fake-field [table & {:as additional-options}]
(merge {:table_id (u/get-id table), :name "NAME"}
(merge {:table_id (u/get-id table), :name "PRICE", :base_type "type/Integer"}
additional-options))
;; expect all the kinds of hidden tables to stay un-analyzed through transitions and repeated syncing
(expect
1
false
(tt/with-temp* [Table [table (fake-table)]
Field [field (fake-field table)]]
(set-table-visibility-type! table "hidden")
(set-table-visibility-type-via-api! table "hidden")
(api-sync! table)
(set-table-visibility-type! table "cruft")
(set-table-visibility-type! table "cruft")
(set-table-visibility-type-via-api! table "cruft")
(set-table-visibility-type-via-api! table "cruft")
(api-sync! table)
(set-table-visibility-type! table "technical")
(set-table-visibility-type-via-api! table "technical")
(api-sync! table)
(set-table-visibility-type! table "technical")
(set-table-visibility-type-via-api! table "technical")
(api-sync! table)
(api-sync! table)
(unanalyzed-fields-count table)))
(fake-field-was-analyzed? field)))
;; same test not coming through the api
(defn- analyze-table! [table]
  ;; we're calling `analyze-db!` instead of `analyze-table!` because the latter doesn't care if you try to sync a
  ;; hidden table and will allow that. TODO - Does that behavior make sense?
  (analyze/analyze-db! (Database (:db_id table))))
(expect
1
false
(tt/with-temp* [Table [table (fake-table)]
Field [field (fake-field table)]]
(set-table-visibility-type! table "hidden")
(set-table-visibility-type-via-api! table "hidden")
(analyze-table! table)
(set-table-visibility-type! table "cruft")
(set-table-visibility-type! table "cruft")
(set-table-visibility-type-via-api! table "cruft")
(set-table-visibility-type-via-api! table "cruft")
(analyze-table! table)
(set-table-visibility-type! table "technical")
(set-table-visibility-type-via-api! table "technical")
(analyze-table! table)
(set-table-visibility-type! table "technical")
(set-table-visibility-type-via-api! table "technical")
(analyze-table! table)
(analyze-table! table)
(unanalyzed-fields-count table)))
(fake-field-was-analyzed? field)))
;; un-hiding a table should cause it to be analyzed
(expect
0
(tt/with-temp* [Table [table (fake-table)]
Field [field (fake-field table)]]
(set-table-visibility-type! table "hidden")
(set-table-visibility-type! table nil)
(unanalyzed-fields-count table)))
(set-table-visibility-type-via-api! table "hidden")
(set-table-visibility-type-via-api! table nil)
(fake-field-was-analyzed? field)))
;; re-hiding a table should not cause it to be analyzed
(expect
......@@ -177,9 +177,9 @@
(tt/with-temp* [Table [table (fake-table :visibility_type "hidden")]
Field [field (fake-field table)]]
;; switch the table to visible (triggering a sync) and get the last sync time
(let [last-sync-time (do (set-table-visibility-type! table nil)
(let [last-sync-time (do (set-table-visibility-type-via-api! table nil)
(latest-sync-time table))]
;; now make it hidden again
(set-table-visibility-type! table "hidden")
(set-table-visibility-type-via-api! table "hidden")
;; sync time shouldn't change
(= last-sync-time (latest-sync-time table)))))
......@@ -22,10 +22,11 @@
Done for the sake of making debugging some of the tests below easier."
[tables]
(for [table tables]
(-> (u/select-non-nil-keys table [:schema :name :raw_table_id :fields])
(update :fields (fn [fields]
(for [field fields]
(u/select-non-nil-keys field [:table_id :name :fk_target_field_id :parent_id :base_type :special_type])))))))
(-> (u/select-non-nil-keys table [:schema :name :fields])
(update :fields (fn [fields]
(for [field fields]
(u/select-non-nil-keys field [:table_id :name :fk_target_field_id :parent_id :base_type
:special_type])))))))
(defn- get-tables [database-or-id]
(->> (hydrate (db/select Table, :db_id (u/get-id database-or-id), {:order-by [:id]}) :fields)
......
......@@ -65,7 +65,8 @@
:describe-table-fks describe-table-fks
:features (constantly #{:foreign-keys})
:details-fields (constantly [])
:field-values-lazy-seq (constantly [])}))
;; enough values that it won't get marked as a Category, but still get a fingerprint or w/e
:field-values-lazy-seq (fn [& _] (range 500))}))
(driver/register-driver! :sync-test (SyncTestDriver.))
......@@ -74,7 +75,11 @@
(defn- table-details [table]
(into {} (-> (dissoc table :db :pk_field :field_values)
(assoc :fields (for [field (db/select Field, :table_id (:id table), {:order-by [:name]})]
(into {} (dissoc field :table :db :children :qualified-name :qualified-name-components :values :target))))
(into {} (-> (dissoc field
:table :db :children :qualified-name :qualified-name-components
:values :target)
(update :fingerprint map?)
(update :fingerprint_version (complement zero?))))))
tu/boolean-ids-and-timestamps)))
(def ^:private table-defaults
......@@ -95,22 +100,23 @@
:updated_at true})
(def ^:private field-defaults
{:id true
:table_id true
:raw_column_id false
:description nil
:caveats nil
:points_of_interest nil
:active true
:parent_id false
:position 0
:preview_display true
:visibility_type :normal
:fk_target_field_id false
:created_at true
:updated_at true
:last_analyzed true
:fingerprint nil})
{:id true
:table_id true
:raw_column_id false
:description nil
:caveats nil
:points_of_interest nil
:active true
:parent_id false
:position 0
:preview_display true
:visibility_type :normal
:fk_target_field_id false
:created_at true
:updated_at true
:last_analyzed true
:fingerprint true
:fingerprint_version true})
;; ## SYNC DATABASE
(expect
......@@ -238,7 +244,7 @@
field-id (db/select-one-id Field, :table_id table-id, :name "title")]
(tt/with-temp FieldValues [_ {:field_id field-id
:values "[1,2,3]"}]
(let [initial-field-values (db/select-one-field :values FieldValues, :field_id field-id)]
(let [initial-field-values (db/select-one-field :values FieldValues, :field_id field-id)]
(sync-database! db)
[initial-field-values
(db/select-one-field :values FieldValues, :field_id field-id)])))))
......
......@@ -66,7 +66,7 @@
:describe-table describe-table
:features (constantly #{:dynamic-schema :nested-fields})
:details-fields (constantly [])
:field-values-lazy-seq (constantly nil)
:field-values-lazy-seq (fn [& _] (range 500)) ; enough so it can get fingerprinted, but not be a category
:table-rows-seq table-rows-seq}))
(driver/register-driver! :toucanery (ToucaneryDriver.))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment