Skip to content
Snippets Groups Projects
Commit 2183b8ac authored by Cam Saul's avatar Cam Saul
Browse files

Split analysis into separate fingerprint/classification stages

parent fe232d5a
No related branches found
No related tags found
No related merge requests found
Showing
with 644 additions and 494 deletions
......@@ -3741,3 +3741,14 @@ databaseChangeLog:
columns:
- column:
name: field_id
- changeSet:
id: 59
author: camsaul
changes:
- addColumn:
tableName: metabase_field
columns:
- column:
name: fingerprint
type: text
remarks: 'Serialized JSON containing non-identifying information about this Field, such as min, max, and percent JSON. Used for classification.'
......@@ -67,16 +67,17 @@
:types (constantly {:base_type :keyword
:special_type :keyword
:visibility_type :keyword
:description :clob})
:description :clob
:fingerprint :json})
:properties (constantly {:timestamped? true})
:pre-insert pre-insert
:pre-update pre-update
:pre-delete pre-delete})
i/IObjectPermissions
(merge i/IObjectPermissionsDefaults
{:perms-objects-set perms-objects-set
:can-read? (partial i/current-user-has-full-permissions? :read)
:can-write? i/superuser?}))
{:perms-objects-set perms-objects-set
:can-read? (partial i/current-user-has-full-permissions? :read)
:can-write? i/superuser?}))
;;; ------------------------------------------------------------ Hydration / Util Fns ------------------------------------------------------------
......
......@@ -9,7 +9,8 @@
[interface :as i]
[util :as sync-util]]
[metabase.sync.analyze
[special-types :as special-types]
[classify :as classify]
[fingerprint :as fingerprint]
[table-row-count :as table-row-count]]
[metabase.util :as u]
[schema.core :as s]
......@@ -30,7 +31,8 @@
"Perform in-depth analysis for a TABLE."
[table :- i/TableInstance]
(table-row-count/update-row-count! table)
(special-types/infer-special-types! table)
(fingerprint/fingerprint-fields! table)
(classify/classify-fields! table)
(update-fields-last-analyzed! table))
......
(ns metabase.sync.analyze.classifiers.category
"Classifier that determines whether a Field should be marked as a `:type/Category` based on the number of distinct values it has."
(:require [clojure.tools.logging :as log]
[metabase.models.field-values :as field-values]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.util.schema :as su]
[schema.core :as s]))
(s/defn ^:private ^:always-validate cannot-be-category? :- s/Bool
  "Return `true` when BASE-TYPE derives from `:type/DateTime` or `:type/Collection`.
   `infer-is-category` never marks Fields with such base types as categories."
  [base-type :- su/FieldType]
  (boolean
   (some (fn [excluded-type]
           (isa? base-type excluded-type))
         [:type/DateTime :type/Collection])))
(s/defn ^:always-validate infer-is-category :- (s/maybe i/FieldInstance)
  "Classifier that marks FIELD as `:type/Category` when the distinct count recorded in FINGERPRINT is
   below `field-values/low-cardinality-threshold`.

   Returns FIELD with `:special_type` set to `:type/Category`, or `nil` when FIELD already has a special
   type, FINGERPRINT is `nil` or has no `[:global :distinct-count]`, the base type cannot be a category,
   or the distinct count is not below the threshold."
  [field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
  (let [threshold field-values/low-cardinality-threshold]
    (when (and (not (:special_type field))
               fingerprint
               (not (cannot-be-category? (:base_type field))))
      (when-let [distinct-count (get-in fingerprint [:global :distinct-count])]
        (when (< distinct-count threshold)
          (log/debug (format "%s has %d distinct values. Since that is less than %d, we're marking it as a category."
                             (sync-util/name-for-logging field)
                             distinct-count
                             threshold))
          (assoc field :special_type :type/Category))))))
(ns metabase.sync.analyze.special-types.name
"Logic for inferring the special types of Fields based on their name."
(ns metabase.sync.analyze.classifiers.name
"Classifier that infers the special type of a Field based on its name and base type."
(:require [clojure.string :as str]
[clojure.tools.logging :as log]
[metabase
......@@ -69,7 +69,7 @@
(assert (isa? special-type :type/*))))
(s/defn ^:private ^:always-validate infer-special-type-by-name :- (s/maybe su/FieldType)
(s/defn ^:private ^:always-validate special-type-for-name-and-base-type :- (s/maybe su/FieldType)
"If `name` and `base-type` matches a known pattern, return the `special_type` we should assign to it."
[field-name :- su/NonBlankString, base-type :- su/FieldType]
(or (when (= "id" (str/lower-case field-name)) :type/PK)
......@@ -79,14 +79,11 @@
special-type))
pattern+base-types+special-type)))
(s/defn ^:always-validate infer-special-types-by-name!
  "For each Field in FIELDS, look up a special type from its name and base type and, when one is found,
   save it as the Field's `:special_type`. Each Field is processed inside its own error handler.
   TABLE is only used for the log message."
  [table :- i/TableInstance, fields :- [i/FieldInstance]]
  (doseq [{field-name :name, base-type :base_type, :as field} fields]
    (sync-util/with-error-handling (format "Error inferring special type by name for %s" (sync-util/name-for-logging field))
      (let [special-type (infer-special-type-by-name field-name base-type)]
        (when special-type
          (log/debug (format "Based on the name of %s %s, we're giving it a special type of %s."
                             (sync-util/name-for-logging table)
                             (sync-util/name-for-logging field)
                             special-type))
          (db/update! Field (u/get-id field)
            :special_type special-type))))))
(s/defn ^:always-validate infer-special-type :- (s/maybe i/FieldInstance)
  "Classifier that infers the special type of FIELD from its name and base type.
   The second (fingerprint) argument is accepted so this matches the other classifiers, but it is ignored.
   Returns FIELD with the inferred `:special_type`, or `nil` when nothing could be inferred."
  [field :- i/FieldInstance, _ :- (s/maybe i/Fingerprint)]
  (let [{field-name :name, base-type :base_type} field]
    (when-let [special-type (special-type-for-name-and-base-type field-name base-type)]
      (log/debug (format "Based on the name of %s, we're giving it a special type of %s."
                         (sync-util/name-for-logging field)
                         special-type))
      (assoc field :special_type special-type))))
(ns metabase.sync.analyze.classifiers.no-preview-display
"Classifier that decides whether a Field should be marked `:no_preview_display.`"
(:require [metabase.sync.interface :as i]
[schema.core :as s]))
(def ^:private ^:const ^Integer average-length-no-preview-threshold
"Fields whose values' average length is greater than this amount should be marked as `preview_display = false`.
The comparison in `infer-no-preview-display` is a strict `>`, so an average of exactly this value does not qualify.
NOTE(review): the literal `50` is read as a Long, so the `^Integer` hint looks inaccurate - confirm whether it is needed."
50)
(s/defn ^:always-validate infer-no-preview-display :- (s/maybe i/FieldInstance)
  "Classifier that determines whether FIELD should be hidden from previews.
   If FIELD is textual and the average length recorded in FINGERPRINT is greater than
   `average-length-no-preview-threshold`, return FIELD with `:preview_display` set to `false` so it isn't
   displayed in the UI. Returns `nil` when FIELD is not textual, when FINGERPRINT has no
   `[:type :type/Text :average-length]`, or when the average length does not exceed the threshold."
  [field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
  (when (isa? (:base_type field) :type/Text)
    (when-let [average-length (get-in fingerprint [:type :type/Text :average-length])]
      (when (> average-length average-length-no-preview-threshold)
        ;; The Field column is `:preview_display`. Classifiers may only set `:special_type` and
        ;; `:preview_display`; any other key (such as `:no_preview_display`) is rejected when the
        ;; changes are saved.
        (assoc field
          :preview_display false)))))
(ns metabase.sync.analyze.classifiers.text-fingerprint
"Logic for inferring the special types of *Text* fields based on their TextFingerprints.
These tests only run against Fields that *don't* have existing special types."
(:require [clojure.tools.logging :as log]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.util.schema :as su]
[schema.core :as s]))
(def ^:private ^:const ^Float percent-valid-threshold
"Fields that have at least this fraction of values satisfying some predicate (such as `u/is-email?`)
should be given the corresponding special type (such as `:type/Email`).
NOTE(review): the literal `0.95` is read as a Double, so the `^Float` hint looks inaccurate - confirm whether it is needed."
0.95)
(s/defn ^:private ^:always-validate percent-key-at-or-above-threshold? :- s/Bool
  "Is the value of PERCENT-KEY inside TEXT-FINGERPRINT at or above `percent-valid-threshold`?
   Returns `false` when TEXT-FINGERPRINT has no value for PERCENT-KEY."
  [text-fingerprint :- i/TextFingerprint, percent-key :- s/Keyword]
  (boolean
   (when-let [percent (get text-fingerprint percent-key)]
     (>= percent percent-valid-threshold))))

(def ^:private percent-key->special-type
  "Map of keys inside the `TextFingerprint` to the corresponding special types we should mark a Field as if the
   value of the key is at or above `percent-valid-threshold`."
  {:percent-json  :type/SerializedJSON
   :percent-url   :type/URL
   :percent-email :type/Email})

(s/defn ^:private ^:always-validate infer-special-type-for-text-fingerprint :- (s/maybe su/FieldType)
  "Check the percentages inside TEXT-FINGERPRINT and return the special type for the first key in
   `percent-key->special-type` whose percentage reaches the threshold, or `nil` if none does."
  [text-fingerprint :- i/TextFingerprint]
  (some (fn [[percent-key special-type]]
          (when (percent-key-at-or-above-threshold? text-fingerprint percent-key)
            special-type))
        (seq percent-key->special-type)))
(s/defn ^:always-validate infer-special-type :- (s/maybe i/FieldInstance)
  "Classifier for `:type/Text` Fields that have no special type yet and whose FINGERPRINT contains a
   `TextFingerprint`. Currently only the recorded percentages are checked.
   Returns FIELD with the inferred `:special_type`, or `nil` when nothing could be inferred."
  [field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
  (when (and (isa? (:base_type field) :type/Text)
             (not (:special_type field)))
    (when-let [text-stats (get-in fingerprint [:type :type/Text])]
      (when-let [special-type (infer-special-type-for-text-fingerprint text-stats)]
        (log/debug (format "Based on the fingerprint of %s, we're marking it as %s." (sync-util/name-for-logging field) special-type))
        (assoc field :special_type special-type)))))
(ns metabase.sync.analyze.classify
"Analysis sub-step that takes a fingerprint for a Field and infers and saves appropriate information like special type.
Each 'classifier' takes the information available to it and decides whether or not to run.
We currently have the following classifiers:
1. `name`: Looks at the name of a Field and infers a special type if possible
2. `no-preview-display`: Looks at the average length of a text Field recorded in its fingerprint and decides whether or not we should hide this Field
3. `category`: Looks at the number of distinct values of a Field and determines whether it can be a Category
4. `text-fingerprint`: Looks at percentages recorded in a text Field's TextFingerprint and infers a special type if possible
All classifier functions take two arguments, a `FieldInstance` and a possibly `nil` `Fingerprint`, and should return the Field
with any appropriate changes (such as a new special type). If no changes are appropriate, a classifier may return nil.
Error handling is done by `run-classifiers` below, so individual classifiers do not need to handle errors themselves.
In the future, we plan to add more classifiers, including ML ones that run offline."
(:require [clojure.data :as data]
[clojure.tools.logging :as log]
[metabase.models.field :refer [Field]]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.sync.analyze.classifiers
[category :as category]
[name :as name]
[no-preview-display :as no-preview-display]
[text-fingerprint :as text-fingerprint]]
[metabase.util :as u]
[schema.core :as s]
[toucan.db :as db]))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING INDIVIDUAL FIELDS |
;;; +------------------------------------------------------------------------------------------------------------------------+
(def ^:private values-that-can-be-set
"Columns of Field that classifiers are allowed to set.
`save-field-updates!` throws if a classifier changed any key that is not in this set."
#{:special_type :preview_display})
(s/defn ^:private ^:always-validate save-field-updates!
  "Persist the values of UPDATED-FIELD that differ from ORIGINAL-FIELD.
   Throws an Exception if any changed key is not listed in `values-that-can-be-set`."
  [original-field :- i/FieldInstance, updated-field :- i/FieldInstance]
  ;; the second element of the diff holds the entries present only in UPDATED-FIELD
  (let [changes (second (data/diff original-field updated-field))]
    (log/debug (format "Based on classification, updating these values of %s: %s" (sync-util/name-for-logging original-field) changes))
    ;; refuse to write anything classifiers are not permitted to touch
    (when-let [forbidden-key (first (remove values-that-can-be-set (keys changes)))]
      (throw (Exception. (format "Classifiers are not allowed to set the value of %s." forbidden-key))))
    ;; everything checked out, so write the changes
    (db/update! Field (u/get-id original-field)
      changes)))
(def ^:private classifiers
"Various classifier functions available. These should all take two args, a `FieldInstance` and a possibly `nil` `Fingerprint`,
and return `FieldInstance` with any inferred property changes, or `nil` if none could be inferred.
Order is important! `run-classifiers` calls these in sequence and passes each one the Field as updated by the
previous ones; the `category` and `text-fingerprint` classifiers do nothing once a special type has been set."
[name/infer-special-type
category/infer-is-category
no-preview-display/infer-no-preview-display
text-fingerprint/infer-special-type])
(s/defn ^:private ^:always-validate run-classifiers :- i/FieldInstance
  "Thread FIELD through every function in `classifiers`, in order, together with FINGERPRINT.
   Each classifier runs inside its own error handler; when a call yields a falsey result the Field is
   carried forward unchanged. Returns the Field with all accumulated changes."
  [field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
  (reduce (fn [current-field classifier]
            (or (sync-util/with-error-handling (format "Error running classifier on %s" (sync-util/name-for-logging current-field))
                  (classifier current-field fingerprint))
                current-field))
          field
          classifiers))
(s/defn ^:private ^:always-validate classify!
  "Run the classifiers on FIELD and its FINGERPRINT, and save whatever they changed.
   With one argument, the fingerprint is taken from FIELD's `:fingerprint` key, falling back to fetching
   the `:fingerprint` column from the application DB."
  ([field :- i/FieldInstance]
   (let [fingerprint (or (:fingerprint field)
                         (db/select-one-field :fingerprint Field :id (u/get-id field)))]
     (classify! field fingerprint)))
  ([field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
   (sync-util/with-error-handling (format "Error classifying %s" (sync-util/name-for-logging field))
     (let [classified-field (run-classifiers field fingerprint)]
       ;; only touch the DB when a classifier actually changed something
       (when (not= field classified-field)
         (save-field-updates! field classified-field))))))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | CLASSIFYING ALL FIELDS IN A TABLE |
;;; +------------------------------------------------------------------------------------------------------------------------+
(s/defn ^:private ^:always-validate fields-to-classify :- (s/maybe [i/FieldInstance])
  "Return a sequence of the Fields belonging to TABLE that should be classified, or `nil` if there are none.
   Selected Fields are active, not retired, have `preview_display` enabled, have no special type, and
   have never been analyzed (`last_analyzed` is nil)."
  [table :- i/TableInstance]
  (let [candidate-fields (db/select Field
                           :table_id        (u/get-id table)
                           :special_type    nil
                           :active          true
                           :visibility_type [:not= "retired"]
                           :preview_display true
                           :last_analyzed   nil)]
    (seq candidate-fields)))
(s/defn ^:always-validate classify-fields!
  "Classify every Field of TABLE returned by `fields-to-classify`, i.e. those not previously analyzed.
   Classification can infer (and save) things like the special type and preview display status."
  [table :- i/TableInstance]
  ;; `doseq` over nil is a no-op, so no explicit emptiness check is needed
  (doseq [field (fields-to-classify table)]
    (classify! field)))
(ns metabase.sync.analyze.fingerprint
"Analysis sub-step that takes a sample of values for a Field and saves a non-identifying fingerprint
used for classification. This fingerprint is saved as a column on the Field it belongs to."
(:require [clojure.tools.logging :as log]
[metabase.models.field :refer [Field]]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.sync.analyze.fingerprint
[global :as global]
[number :as number]
[sample :as sample]
[text :as text]]
[metabase.util :as u]
[schema.core :as s]
[toucan.db :as db]))
(s/defn ^:private ^:always-validate type-specific-fingerprint :- (s/maybe i/TypeSpecificFingerprint)
  "Return type-specific fingerprint info for FIELD computed from a sample of VALUES.
   Fields whose base type derives from `:type/Text` get a `:type/Text` entry, those deriving from
   `:type/Number` get a `:type/Number` entry, and all other Fields get `nil`."
  [field :- i/FieldInstance, values :- i/ValuesSample]
  (let [base-type (:base_type field)]
    (cond
      (isa? base-type :type/Text)   {:type/Text (text/text-fingerprint values)}
      (isa? base-type :type/Number) {:type/Number (number/number-fingerprint values)})))
(s/defn ^:private ^:always-validate fingerprint :- (s/maybe i/Fingerprint)
  "Generate a 'fingerprint' for FIELD.
   With one argument, a sample is fetched with `sample/basic-sample`, and `nil` is returned when no sample
   is available. With two arguments, the fingerprint is built from the given VALUES: global info goes
   under `:global` and type-specific info, if any, under `:type`."
  ([field :- i/FieldInstance]
   (when-let [sampled-values (sample/basic-sample field)]
     (fingerprint field sampled-values)))
  ([field :- i/FieldInstance, values :- i/ValuesSample]
   (let [global-info (global/global-fingerprint values)
         type-info   (type-specific-fingerprint field values)]
     (merge
      (when global-info
        {:global global-info})
      (when type-info
        {:type type-info})))))
(s/defn ^:private ^:always-validate fingerprint!
  "Generate a fingerprint for FIELD and, if one could be generated, save it in the Field's `:fingerprint`
   column. The whole operation runs inside an error handler."
  [field :- i/FieldInstance]
  (sync-util/with-error-handling (format "Error generating fingerprint for %s" (sync-util/name-for-logging field))
    (let [new-fingerprint (fingerprint field)]
      (when new-fingerprint
        (log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field)))
        (db/update! Field (u/get-id field)
          :fingerprint new-fingerprint)))))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | FINGERPRINTING ALL FIELDS IN A TABLE |
;;; +------------------------------------------------------------------------------------------------------------------------+
(s/defn ^:private ^:always-validate fields-to-fingerprint :- (s/maybe [i/FieldInstance])
  "Return a sequence of the Fields belonging to TABLE that should be fingerprinted, or `nil` if there are none.
   Selected Fields are active, not retired, have `preview_display` enabled, and have never been
   analyzed (`last_analyzed` is nil)."
  [table :- i/TableInstance]
  (let [candidate-fields (db/select Field
                           :table_id        (u/get-id table)
                           :active          true
                           :visibility_type [:not= "retired"]
                           :preview_display true
                           :last_analyzed   nil)]
    (seq candidate-fields)))
(s/defn ^:always-validate fingerprint-fields!
  "Generate and save a fingerprint for every Field of TABLE returned by `fields-to-fingerprint`,
   i.e. those that have not been previously analyzed."
  [table :- i/TableInstance]
  ;; `doseq` over nil is a no-op, so no explicit emptiness check is needed
  (doseq [field (fields-to-fingerprint table)]
    (fingerprint! field)))
(ns metabase.sync.analyze.fingerprint.global
"Logic for generating a `GlobalFingerprint` from a sequence of values for a Field of *any* type."
(:require [metabase.sync.interface :as i]
[schema.core :as s]))
(s/defn ^:always-validate global-fingerprint :- i/GlobalFingerprint
  "Generate the fingerprint info that applies to Fields of every type. Currently this is just the number
   of distinct values found in the sample VALUES."
  [values :- i/ValuesSample]
  ;; TODO - this logic isn't as nice as the old logic that actually called the DB
  ;; We used to do (queries/field-distinct-count field field-values/low-cardinality-threshold)
  ;; Consider whether we are so married to the idea of only generating fingerprints from samples that we
  ;; are ok with inaccurate counts like the one we'll surely be getting here
  (let [unique-values (set values)]
    {:distinct-count (count unique-values)}))
(ns metabase.sync.analyze.fingerprint.number
"Logic for generating a `NumberFingerprint` from a sequence of values for a `:type/Number` Field."
(:require [metabase.sync.interface :as i]
[schema.core :as s]))
(s/defn ^:private ^:always-validate average :- s/Num
  "Return the arithmetic mean of VALUES as a double."
  [values :- i/ValuesSample]
  (let [total (double (reduce + values))
        n     (double (count values))]
    (/ total n)))
(s/defn ^:always-validate number-fingerprint :- i/NumberFingerprint
  "Generate a fingerprint for the sample VALUES of a `:type/Number` Field, recording the smallest value,
   the largest value, and the average."
  [values :- i/ValuesSample]
  (let [smallest (reduce min values)
        largest  (reduce max values)]
    {:min smallest
     :max largest
     :avg (average values)}))
(ns metabase.sync.analyze.fingerprint.sample
"Analysis sub-step that fetches a sample of values for a given Field, which is used to generate a fingerprint for it.
Currently this is dumb and just fetches a contiguous sequence of values, but in the future we plan to make this
more sophisticated and have different types of samples for different Fields."
(:require [metabase.driver :as driver]
[metabase.models.table :refer [Table]]
[metabase.sync.interface :as i]
[schema.core :as s]
[toucan.db :as db]))
(s/defn ^:always-validate basic-sample :- (s/maybe i/ValuesSample)
  "Fetch a sample of non-nil values for FIELD to generate a fingerprint from. At most
   `max-sync-lazy-seq-results` values (10,000 at the time of this writing) are read from the driver
   before nils are removed. Returns `nil` if no values are available."
  [field :- i/FieldInstance]
  ;; TODO - we should make `->driver` a method so we can pass things like Fields into it
  (let [database-id (db/select-one-field :db_id Table :id (:table_id field))
        driver      (driver/->driver database-id)
        raw-values  (take driver/max-sync-lazy-seq-results
                          (driver/field-values-lazy-seq driver field))]
    (seq (remove nil? raw-values))))
(ns metabase.sync.analyze.fingerprint.text
"Logic for generating a `TextFingerprint` from a sequence of values for a `:type/Text` Field."
(:require [cheshire.core :as json]
[metabase.sync.interface :as i]
[metabase.util :as u]
[schema.core :as s]))
(s/defn ^:private ^:always-validate average-length :- (s/constrained Double #(>= % 0))
  "Return the average length of the string representations of VALUES."
  [values :- i/ValuesSample]
  (let [lengths (map (fn [value] (count (str value))) values)
        total   (reduce + lengths)]
    (/ (double total)
       (double (count values)))))
(s/defn ^:private ^:always-validate percent-satisfying-predicate :- i/Percent
  "Return the fraction (between 0 and 1) of VALUES that satisfy PRED. A value for which PRED throws
   is counted as not satisfying it."
  [pred :- (s/pred fn?), values :- i/ValuesSample]
  (let [satisfies?  (fn [value]
                      (boolean (u/ignore-exceptions (pred value))))
        match-count (count (filter satisfies? values))]
    (/ (double match-count)
       (double (count values)))))
(defn- valid-serialized-json?
  "True if X parses as a JSON dictionary or array."
  [x]
  ;; a nil/false parse result is neither a map nor sequential, so it yields false
  (let [parsed (json/parse-string x)]
    (or (map? parsed)
        (sequential? parsed))))
(s/defn ^:always-validate text-fingerprint :- i/TextFingerprint
  "Generate a fingerprint for the sample VALUES of a `:type/Text` Field: the fraction of values that are
   serialized JSON, URLs, and emails, plus the average length of the values."
  [values :- i/ValuesSample]
  (let [fraction-matching (fn [pred]
                            (percent-satisfying-predicate pred values))]
    {:percent-json   (fraction-matching valid-serialized-json?)
     :percent-url    (fraction-matching u/is-url?)
     :percent-email  (fraction-matching u/is-email?)
     :average-length (average-length values)}))
(ns metabase.sync.analyze.special-types
"Logic for scanning values of a given field and updating special types as appropriate.
Also known as 'fingerprinting', 'analysis', or 'classification'.
(Note: this namespace is sort of a misnomer, since special type isn't the only thing that can get set by
the functions here. `:preview_display` can also get set to `false` if a Field has on average very
large (long) values.)"
(:require [metabase.models.field :refer [Field]]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.sync.analyze.special-types
[name :as name]
[values :as values]]
[metabase.util :as u]
[schema.core :as s]
[toucan.db :as db]))
(s/defn ^:private ^:always-validate fields-to-infer-special-types-for :- (s/maybe [i/FieldInstance])
  "Return a sequence of the Fields belonging to TABLE that we should try to infer a special type for, or
   `nil` if there are none. Selected Fields are active, not retired, have `preview_display` enabled,
   have no special type, and have never been analyzed."
  [table :- i/TableInstance]
  (let [candidate-fields (db/select Field
                           :table_id        (u/get-id table)
                           :special_type    nil
                           :active          true
                           :visibility_type [:not= "retired"]
                           :preview_display true
                           :last_analyzed   nil)] ; only analyze NEW fields
    (seq candidate-fields)))
(s/defn ^:always-validate infer-special-types!
  "Infer (and set) the special types and preview display status for Fields belonging to TABLE.
   Inference by name runs first; the candidate Fields are then fetched again so that inference by value
   only sees Fields that *still* have no special type."
  [table :- i/TableInstance]
  (sync-util/with-error-handling (format "Error inferring special types for %s" (sync-util/name-for-logging table))
    ;; each step re-queries the candidate Fields, so the second step skips whatever the first one typed
    (doseq [infer-step! [name/infer-special-types-by-name!
                         values/infer-special-types-by-value!]]
      (when-let [fields (fields-to-infer-special-types-for table)]
        (infer-step! table fields)))))
(ns metabase.sync.analyze.special-types.values
"Logic for inferring (and setting) the special types of fields based on tests done against a sequence of their values.
Also sets `:preview_display` to `false` if a Field has on average very long text values."
(:require [cheshire.core :as json]
[clojure.tools.logging :as log]
[metabase
[driver :as driver]
[util :as u]]
[metabase.db.metadata-queries :as queries]
[metabase.models
[field :refer [Field]]
[field-values :as field-values]]
[metabase.sync
[interface :as i]
[util :as sync-util]]
[metabase.util.schema :as su]
[schema.core :as s]
[toucan.db :as db]))
(def ^:private Values
"Schema for the VALUES passed to each of the functions below: a non-empty sequence containing no nils."
;; Validating against this is actually pretty quick, in the order of microseconds even for a 10,000 value sequence
(s/constrained [(s/pred (complement nil?))] seq "Non-empty sequence of non-nil values."))
;;; ------------------------------------------------------------ No Preview Display ------------------------------------------------------------
(def ^:private ^:const ^Integer average-length-no-preview-threshold
"Fields whose values' average length is greater than this amount should be marked as `preview_display = false`.
The comparison in `field-should-be-marked-no-preview-display?` is a strict `>`."
50)
(s/defn ^:private ^:always-validate avg-length :- Double
  "Return the average length of the string representations of VALUES."
  [values :- Values]
  (let [lengths (map (fn [value] (count (str value))) values)
        total   (reduce + lengths)]
    (/ (double total)
       (double (count values)))))
(s/defn ^:private ^:always-validate field-should-be-marked-no-preview-display? :- s/Bool
  "True if FIELD is textual and the average length of VALUES is greater than
   `average-length-no-preview-threshold`, meaning it shouldn't be displayed in the UI."
  [field :- i/FieldInstance, values :- Values]
  (if (isa? (:base_type field) :type/Text)
    (> (avg-length values) average-length-no-preview-threshold)
    false))
;;; ------------------------------------------------------------ Predicate-based tests ------------------------------------------------------------
(def ^:private ^:const ^Float percent-valid-threshold
"Fields that have at least this fraction of values satisfying some predicate (such as `u/is-email?`)
should be given the corresponding special type (such as `:type/Email`)."
0.95)
(s/defn ^:private ^:always-validate percent-satisfying-predicate :- Double
  "Return the fraction of VALUES that satisfy PRED. A value for which PRED throws is counted as not
   satisfying it."
  [pred :- (s/pred fn?), values :- Values]
  (let [satisfies?  (fn [value]
                      (boolean (u/ignore-exceptions (pred value))))
        match-count (count (filter satisfies? values))]
    (/ (double match-count)
       (double (count values)))))
(s/defn ^:private ^:always-validate values-satisfy-predicate? :- s/Bool
  "True if FIELD is textual and the fraction of VALUES satisfying PRED is at least
   `percent-valid-threshold`, so the Field should be given the corresponding special type."
  [pred :- (s/pred fn?), field :- i/FieldInstance, values :- Values]
  (if (isa? (:base_type field) :type/Text)
    (>= (percent-satisfying-predicate pred values)
        percent-valid-threshold)
    false))
(s/defn ^:private ^:always-validate test:url :- (s/maybe (s/eq :type/URL))
  "Return `:type/URL` if FIELD is textual and enough of its non-nil VALUES are URLs; otherwise `nil`."
  [field :- i/FieldInstance, values :- Values]
  (let [mostly-urls? (values-satisfy-predicate? u/is-url? field values)]
    (when mostly-urls?
      :type/URL)))
(defn- valid-serialized-json?
  "True if X parses as a JSON dictionary or array."
  [x]
  ;; a nil/false parse result is neither a map nor sequential, so it yields false
  (let [parsed (json/parse-string x)]
    (or (map? parsed)
        (sequential? parsed))))
(s/defn ^:private ^:always-validate test:json :- (s/maybe (s/eq :type/SerializedJSON))
  "Return `:type/SerializedJSON` if FIELD is textual and enough of its non-nil VALUES are valid serialized
   JSON dictionaries or arrays; otherwise `nil`."
  [field :- i/FieldInstance, values :- Values]
  (let [mostly-json? (values-satisfy-predicate? valid-serialized-json? field values)]
    (when mostly-json?
      :type/SerializedJSON)))
(s/defn ^:private ^:always-validate test:email :- (s/maybe (s/eq :type/Email))
  "Return `:type/Email` if FIELD is textual and enough of its non-nil VALUES are valid emails;
   otherwise `nil`."
  [field :- i/FieldInstance, values :- Values]
  (let [mostly-emails? (values-satisfy-predicate? u/is-email? field values)]
    (when mostly-emails?
      :type/Email)))
;;; ------------------------------------------------------------ Category ------------------------------------------------------------
;; Base types that derive from `::cannot-be-category` are skipped by `test:category` below.
(derive :type/DateTime ::cannot-be-category)
(derive :type/Collection ::cannot-be-category)
(s/defn ^:private ^:always-validate test:category :- (s/maybe (s/eq :type/Category))
  "Return `:type/Category` if FIELD's base type may be a category and its distinct count, as reported by
   `queries/field-distinct-count`, is below `field-values/low-cardinality-threshold`; otherwise `nil`.
   The second argument (the values sample) is ignored."
  [field :- i/FieldInstance, _]
  (let [threshold field-values/low-cardinality-threshold]
    (when-not (isa? (:base_type field) ::cannot-be-category)
      (let [distinct-count (queries/field-distinct-count field threshold)]
        (when (< distinct-count threshold)
          (log/debug (format "%s has %d distinct values. Since that is less than %d, we're marking it as a category."
                             (sync-util/name-for-logging field)
                             distinct-count
                             threshold))
          :type/Category)))))
;;; ------------------------------------------------------------ Putting it all together ------------------------------------------------------------
(def ^:private test-fns
"Various test functions, in the order the 'tests' against values should be run.
Each test function takes two args, `field` and `values`, and returns a special type or `nil`.
`infer-special-type` uses the result of the first test that returns a special type."
[test:url
test:json
test:email
test:category])
(s/defn ^:private ^:always-validate do-test :- (s/maybe su/FieldType)
  "Run TEST-FN against FIELD and VALUES inside an error handler and return its result.
   A debug message is logged for any special type other than `:type/Category`."
  [field :- i/FieldInstance, values :- Values, test-fn :- (s/pred fn?)]
  (sync-util/with-error-handling (format "Error checking if values of %s match special-type" (sync-util/name-for-logging field))
    (u/prog1 (test-fn field values)
      ;; `test:category` has its own logging, so skip it here
      (when-not (or (not <>)
                    (= <> :type/Category))
        (log/debug (format "Based on the values of %s, we're marking it as %s." (sync-util/name-for-logging field) <>))))))
(s/defn ^:private ^:always-validate infer-special-type :- (s/maybe su/FieldType)
  "Run each of the `test-fns`, in order, against FIELD and VALUES and return the first special type one
   of them produces, or `nil` if none does."
  [field :- i/FieldInstance, values :- Values]
  (some (fn [test-fn]
          (do-test field values test-fn))
        test-fns))
(s/defn ^:private ^:always-validate field-values :- (s/maybe Values)
  "Procure a sequence of non-nil values for FIELD for use in the various tests above. At most
   `max-sync-lazy-seq-results` values (10,000 at the time of this writing) are read from DRIVER before
   nils are removed. Returns `nil` if no values remain."
  [driver, field :- i/FieldInstance]
  (let [raw-values (take driver/max-sync-lazy-seq-results
                         (driver/field-values-lazy-seq driver field))]
    (seq (remove nil? raw-values))))
(s/defn ^:private ^:always-validate infer-special-types-for-field!
  "Fetch a sample of FIELD's values and either mark the Field `preview_display = false` (when its values
   are too long on average) or try to infer and save a special type for it. Does nothing when no values
   are available."
  [driver, field :- i/FieldInstance]
  (when-let [sample (field-values driver field)]
    (let [too-long? (sync-util/with-error-handling (format "Error checking if %s should be marked no preview display" (sync-util/name-for-logging field))
                      (field-should-be-marked-no-preview-display? field sample))]
      (if-not too-long?
        ;; values are short enough to preview, so run the normal series of tests and see if it can have a nice special type
        (when-let [special-type (infer-special-type field sample)]
          (db/update! Field (u/get-id field)
            :special_type special-type))
        ;; values are too long on average; mark it 'no preview display' so it doesn't show up in results
        (db/update! Field (u/get-id field)
          :preview_display false)))))
(s/defn ^:always-validate infer-special-types-by-value!
  "Infer (and set) the special types of all the FIELDS belonging to TABLE by looking at their values.
   Each Field is processed inside its own error handler."
  [table :- i/TableInstance, fields :- [i/FieldInstance]]
  (let [table-driver (driver/->driver (:db_id table))]
    (doseq [field fields]
      (sync-util/with-error-handling (format "Error inferring special type by values for %s" (sync-util/name-for-logging field))
        (infer-special-types-for-field! table-driver field)))))
......@@ -51,3 +51,52 @@
(def DatabaseInstance "Schema for a valid instance of a Metabase Database." (class Database))
(def TableInstance "Schema for a valid instance of a Metabase Table." (class Table))
(def FieldInstance "Schema for a valid instance of a Metabase Field." (class Field))
;;; +------------------------------------------------------------------------------------------------------------------------+
;;; | SAMPLING & FINGERPRINTS |
;;; +------------------------------------------------------------------------------------------------------------------------+
(def ValuesSample
"Schema for a sample of VALUES returned by the `sample` sub-stage of analysis and passed into the `fingerprint` stage.
Must be a non-empty sequence that contains no nils."
;; Validating against this is actually pretty quick, in the order of microseconds even for a 10,000 value sequence
(s/constrained [(s/pred (complement nil?))] seq "Non-empty sequence of non-nil values."))
(def GlobalFingerprint
"Fingerprint values that Fields of all types should have.
`:distinct-count` is an optional integer count of distinct values."
{(s/optional-key :distinct-count) s/Int})
(def Percent
"Schema for something representing a percentage. A number between 0 and 1, inclusive."
(s/constrained s/Num #(<= 0 % 1) "Valid percentage between (inclusive) 0 and 1."))
(def NumberFingerprint
"Schema for fingerprint information for Fields deriving from `:type/Number`.
All keys (`:min`, `:max`, `:avg`) are optional and must be numbers when present."
{(s/optional-key :min) s/Num
(s/optional-key :max) s/Num
(s/optional-key :avg) s/Num})
(def TextFingerprint
"Schema for fingerprint information for Fields deriving from `:type/Text`. All keys are optional. The `:percent-*` keys
must be valid `Percent` values (between 0 and 1 inclusive); `:average-length` must be a non-negative Double."
{(s/optional-key :percent-json) Percent
(s/optional-key :percent-url) Percent
(s/optional-key :percent-email) Percent
;; NOTE(review): this requires a `Double` specifically, whereas `NumberFingerprint` accepts any `s/Num` -- confirm
;; that integral values are never expected here.
(s/optional-key :average-length) (s/constrained Double #(>= % 0) "Valid number greater than or equal to zero")})
(def TypeSpecificFingerprint
  "Schema for type-specific fingerprint information. The map may hold a `:type/Number` entry or a `:type/Text` entry,
   and must contain exactly one key."
  (let [exactly-one-key? (fn [fingerprint]
                           (-> fingerprint keys count (= 1)))]
    (s/constrained
     {(s/optional-key :type/Number) NumberFingerprint
      (s/optional-key :type/Text)   TextFingerprint}
     exactly-one-key?
     "Type-specific fingerprint with exactly one key")))
(def Fingerprint
"Schema for a Field 'fingerprint' generated as part of the analysis stage. Used to power the 'classification' sub-stage of
analysis. Stored as the `fingerprint` column of Field. All three top-level keys are optional: `:global` holds a
`GlobalFingerprint`, `:type` holds a `TypeSpecificFingerprint`, and `:experimental` accepts any map with keyword keys."
{(s/optional-key :global) GlobalFingerprint
(s/optional-key :type) TypeSpecificFingerprint
(s/optional-key :experimental) {s/Keyword s/Any}})
......@@ -73,7 +73,7 @@
:id $
:details $
:updated_at $
:features (mapv name (driver/features (driver/engine->driver (:engine db))))}))))
:features (map name (driver/features (driver/engine->driver (:engine db))))}))))
;; # DB LIFECYCLE ENDPOINTS
......@@ -156,16 +156,23 @@
;; TODO - this is a test code smell, each test should clean up after itself and this step shouldn't be necessary. One day we should be able to remove this!
;; If you're writing a test that needs this, fix your brain and your test
;; If you're writing a NEW test that needs this, fix your brain and your test!
;; To reïterate, this is BAD BAD BAD BAD BAD BAD! It will break tests if you use it! Don't use it!
(defn- ^:deprecated delete-randomly-created-databases!
"Delete all the randomly created Databases we've made so far. Optionally specify one or more IDs to SKIP."
[& {:keys [skip]}]
(db/delete! Database :id [:not-in (into (set skip)
(for [engine datasets/all-valid-engines
:let [id (datasets/when-testing-engine engine
(:id (get-or-create-test-data-db! (driver/engine->driver engine))))]
:when id]
id))]))
(let [ids-to-skip (into (set skip)
(for [engine datasets/all-valid-engines
:let [id (datasets/when-testing-engine engine
(:id (get-or-create-test-data-db! (driver/engine->driver engine))))]
:when id]
id))]
(when-let [dbs (seq (db/select [Database :name :engine :id] :id [:not-in ids-to-skip]))]
(println (u/format-color 'red (str "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"
"WARNING: deleting randomly created databases:\n%s\n"
"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n")
(u/pprint-to-str dbs))))
(db/delete! Database :id [:not-in ids-to-skip])))
;; ## GET /api/database
......@@ -238,6 +245,19 @@
:preview_display true
:parent_id nil})
(defn- field-details
  "Build the expected API details for FIELD: `default-field-details` overlaid with a map produced by `match-$` from
   FIELD hydrated with `:values`. Presumably each `$` placeholder resolves to the hydrated Field's own value for that
   key -- check `match-$` to confirm."
  [field]
  (let [hydrated-field  (hydrate/hydrate field :values)
        field-specifics (match-$ hydrated-field
                          {:updated_at         $
                           :id                 $
                           :raw_column_id      $
                           :created_at         $
                           :last_analyzed      $
                           :fingerprint        $
                           :fk_target_field_id $
                           :values             $})]
    (merge default-field-details field-specifics)))
;; ## GET /api/meta/table/:id/query_metadata
;; TODO - add in example with Field :values
(expect
......@@ -254,36 +274,20 @@
{:schema "PUBLIC"
:name "CATEGORIES"
:display_name "Categories"
:fields [(merge default-field-details
(match-$ (hydrate/hydrate (Field (id :categories :id)) :values)
{:table_id (id :categories)
:special_type "type/PK"
:name "ID"
:display_name "ID"
:updated_at $
:id $
:raw_column_id $
:created_at $
:last_analyzed $
:base_type "type/BigInteger"
:visibility_type "normal"
:fk_target_field_id $
:values $}))
(merge default-field-details
(match-$ (hydrate/hydrate (Field (id :categories :name)) :values)
{:table_id (id :categories)
:special_type "type/Name"
:name "NAME"
:display_name "Name"
:updated_at $
:id $
:raw_column_id $
:created_at $
:last_analyzed $
:base_type "type/Text"
:visibility_type "normal"
:fk_target_field_id $
:values $}))]
:fields [(assoc (field-details (Field (id :categories :id)))
:table_id (id :categories)
:special_type "type/PK"
:name "ID"
:display_name "ID"
:base_type "type/BigInteger"
:visibility_type "normal")
(assoc (field-details (Field (id :categories :name)))
:table_id (id :categories)
:special_type "type/Name"
:name "NAME"
:display_name "Name"
:base_type "type/Text"
:visibility_type "normal")]
:segments []
:metrics []
:rows 75
......
......@@ -41,6 +41,7 @@
{:description nil
:table_id (id :users)
:raw_column_id $
:fingerprint $
:table (tu/match-$ (Table (id :users))
{:description nil
:entity_type nil
......
This diff is collapsed.
(ns metabase.models.field-test
(:require [expectations :refer :all]
[metabase.models.field-values :refer :all]
[metabase.sync.analyze.special-types :as special-types]
[metabase.sync.analyze.special-types.name :as name]))
[metabase.sync.analyze.classifiers.name :as name]))
;; field-should-have-field-values?
......@@ -70,9 +69,9 @@
;;; infer-field-special-type
(expect :type/PK (#'name/infer-special-type-by-name "id" :type/Integer))
(expect :type/PK (#'name/special-type-for-name-and-base-type "id" :type/Integer))
;; other pattern matches based on type/regex (remember, base_type matters in matching!)
(expect :type/Category (#'name/infer-special-type-by-name "rating" :type/Integer))
(expect nil (#'name/infer-special-type-by-name "rating" :type/Boolean))
(expect :type/Country (#'name/infer-special-type-by-name "country" :type/Text))
(expect nil (#'name/infer-special-type-by-name "country" :type/Integer))
(expect :type/Category (#'name/special-type-for-name-and-base-type "rating" :type/Integer))
(expect nil (#'name/special-type-for-name-and-base-type "rating" :type/Boolean))
(expect :type/Country (#'name/special-type-for-name-and-base-type "country" :type/Text))
(expect nil (#'name/special-type-for-name-and-base-type "country" :type/Integer))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment