Skip to content
Snippets Groups Projects
Commit e4e625bf authored by Simon Belak's avatar Simon Belak
Browse files

Correctly handle PKs; skip results_metadata step when fingerprinting

[ci all]
parent 1a3e6d7f
No related branches found
No related tags found
No related merge requests found
Showing
with 98 additions and 80 deletions
......@@ -688,8 +688,8 @@
engine (-> source ((some-fn :db_id :database_id)) Database :engine)
table->fields (if (instance? (type Table) source)
(comp (->> (db/select Field
:table_id [:in (map u/get-id tables)]
:visibility_type "normal")
:table_id [:in (map u/get-id tables)]
:visibility_type "normal")
field/with-targets
(map #(assoc % :engine engine))
(group-by :table_id))
......
......@@ -491,6 +491,6 @@
:fields (vec (for [field fields]
[:field-id (u/get-id field)]))
:limit max-sample-rows}
:middleware {:format-rows? false
:skip-fingerprinting? true}})]
:middleware {:format-rows? false
:skip-results-metadata? true}})]
(get-in results [:data :rows])))
......@@ -49,22 +49,22 @@
[qp]
(fn [{{:keys [card-id nested?]} :info, :as query}]
(let [results (qp query)]
(try
(let [metadata (seq (if (-> query :middleware :skip-fingerprinting?)
(qr/results->column-metadata results)
(qr/results->column-metadata+fingerprint results)))]
;; At the very least we can skip the Extra DB call to update this Card's metadata results
;; if its DB doesn't support nested queries in the first place
(when (and (i/driver-supports? :nested-queries)
card-id
(not nested?))
(record-metadata! card-id metadata))
;; add the metadata and checksum to the response
(assoc results :results_metadata {:checksum (metadata-checksum metadata)
:columns metadata}))
;; if for some reason we weren't able to record results metadata for this query then just proceed as normal
;; rather than failing the entire query
(catch Throwable e
(log/error "Error recording results metadata for query:" (.getMessage e) "\n"
(u/pprint-to-str (u/filtered-stacktrace e)))
results)))))
(if (-> query :middleware :skip-results-metadata?)
results
(try
(let [metadata (seq (qr/results->column-metadata results))]
;; At the very least we can skip the Extra DB call to update this Card's metadata results
;; if its DB doesn't support nested queries in the first place
(when (and (i/driver-supports? :nested-queries)
card-id
(not nested?))
(record-metadata! card-id metadata))
;; add the metadata and checksum to the response
(assoc results :results_metadata {:checksum (metadata-checksum metadata)
:columns metadata}))
;; if for some reason we weren't able to record results metadata for this query then just proceed as normal
;; rather than failing the entire query
(catch Throwable e
(log/error "Error recording results metadata for query:" (.getMessage e) "\n"
(u/pprint-to-str (u/filtered-stacktrace e)))
results))))))
......@@ -30,11 +30,18 @@
(isa? special-type :type/PK)
(isa? special-type :type/FK)))
(defn- not-all-empty?
[fingerprint]
(or (some-> fingerprint :type :type/Number :min some?)
(some-> fingerprint :type :type/Text :average-length pos?)))
(s/defn ^:private field-should-be-category? :- (s/maybe s/Bool)
[distinct-count :- s/Int, field :- su/Map]
;; only mark a Field as a Category if it doesn't already have a special type
(when-not (:special_type field)
(when (<= distinct-count field-values/category-cardinality-threshold)
[fingerprint :- (s/maybe i/Fingerprint), field :- su/Map]
(let [distinct-count (get-in fingerprint [:global :distinct-count])]
;; only mark a Field as a Category if it doesn't already have a special type
(when (and (nil? (:special_type field))
(not-all-empty? fingerprint)
(<= distinct-count field-values/category-cardinality-threshold))
(log/debug (format "%s has %d distinct values. Since that is less than %d, we're marking it as a category."
(sync-util/name-for-logging field)
distinct-count
......@@ -62,5 +69,5 @@
(when-not (cannot-be-category-or-list? (:base_type field) (:special_type field))
(when-let [distinct-count (get-in fingerprint [:global :distinct-count])]
(cond-> field
(field-should-be-category? distinct-count field) (assoc :special_type :type/Category)
(field-should-be-category? fingerprint field) (assoc :special_type :type/Category)
(field-should-be-auto-list? distinct-count field) (assoc :has_field_values :auto-list))))))
......@@ -21,6 +21,7 @@
(def ^:private time-type #{:type/Time})
(def ^:private date-type #{:type/Date})
(def ^:private number-type #{:type/Number})
(def ^:private any-type #{:type/*})
(def ^:private pattern+base-types+special-type
......@@ -31,7 +32,8 @@
* Convert field name to lowercase before matching against a pattern
* Consider a nil set-of-valid-base-types to mean \"match any base type\""
[[#"^.*_lat$" float-type :type/Latitude]
[[#"^id$" any-type :type/PK]
[#"^.*_lat$" float-type :type/Latitude]
[#"^.*_lon$" float-type :type/Longitude]
[#"^.*_lng$" float-type :type/Longitude]
[#"^.*_long$" float-type :type/Longitude]
......@@ -114,12 +116,11 @@
(s/defn ^:private special-type-for-name-and-base-type :- (s/maybe su/FieldType)
"If `name` and `base-type` matches a known pattern, return the `special_type` we should assign to it."
[field-name :- su/NonBlankString, base-type :- su/FieldType]
(or (when (= "id" (str/lower-case field-name)) :type/PK)
(some (fn [[name-pattern valid-base-types special-type]]
(when (and (some (partial isa? base-type) valid-base-types)
(re-find name-pattern (str/lower-case field-name)))
special-type))
pattern+base-types+special-type)))
(some (fn [[name-pattern valid-base-types special-type]]
(when (and (some (partial isa? base-type) valid-base-types)
(re-find name-pattern (str/lower-case field-name)))
special-type))
pattern+base-types+special-type))
(def ^:private FieldOrColumn
"Schema that allows a `metabase.model.field/Field` or a column from a query resultset"
......@@ -140,10 +141,10 @@
"Returns `field-or-column` with a computed special type based on the name and base type of the `field-or-column`"
[field-or-column :- FieldOrColumn, _ :- (s/maybe i/Fingerprint)]
(when-let [inferred-special-type (infer-special-type field-or-column)]
(log/debug (format "Based on the name of %s, we're giving it a special type of %s."
(sync-util/name-for-logging field-or-column)
inferred-special-type))
(assoc field-or-column :special_type inferred-special-type)))
(log/debug (format "Based on the name of %s, we're giving it a special type of %s."
(sync-util/name-for-logging field-or-column)
inferred-special-type))
(assoc field-or-column :special_type inferred-special-type)))
(defn- prefix-or-postfix
[s]
......
......@@ -21,16 +21,14 @@
(s/defn ^:private save-fingerprint!
[field :- i/FieldInstance, fingerprint :- (s/maybe i/Fingerprint)]
;; don't bother saving fingerprint if it's completely empty
(when (seq fingerprint)
(log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field)))
;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll
;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the
;; latest fingerprints.
(db/update! Field (u/get-id field)
:fingerprint fingerprint
:fingerprint_version i/latest-fingerprint-version
:last_analyzed nil)))
(log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field)))
;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll
;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the
;; latest fingerprints.
(db/update! Field (u/get-id field)
:fingerprint fingerprint
:fingerprint_version i/latest-fingerprint-version
:last_analyzed nil))
(defn- empty-stats-map [fields-count]
{:no-data-fingerprints 0
......
......@@ -3,6 +3,7 @@
(:require [cheshire.core :as json]
[clj-time.coerce :as t.coerce]
[kixi.stats.core :as stats]
[metabase.sync.analyze.classifiers.name :as classify.name]
[metabase.sync.util :as sync-util]
[metabase.util :as u]
[metabase.util.date :as du]
......@@ -48,7 +49,11 @@
(defmulti
^{:doc "Return a fingerprinter transducer for a given field based on the field's type."
:arglists '([field])}
fingerprinter (juxt :base_type (some-fn :special_type (constantly :type/*))))
fingerprinter (fn [{:keys [base_type special_type unit]}]
[(if (du/date-extract-units unit)
:type/Integer
base_type)
(or special_type :type/*)]))
(def ^:private global-fingerprinter
(redux/post-complete
......@@ -136,11 +141,18 @@
acc)
acc)))
(defprotocol ^:private IDateCoercible
"Protocol for converting objects to `java.util.Date`"
(->date ^java.util.Date [this]
"Coerce object to a `java.util.Date`."))
(extend-protocol IDateCoercible
nil (->date [_] nil)
String (->date [this] (-> this du/str->date-time t.coerce/to-date))
java.util.Date (->date [this] this))
(deffingerprinter :type/DateTime
((map (fn [dt]
(if (string? dt)
(-> dt du/str->date-time t.coerce/to-date-time)
dt)))
((map ->date)
(redux/fuse {:earliest earliest
:latest latest})))
......@@ -167,4 +179,9 @@
(defn fingerprint-fields
"Return a transducer for fingerprinting a resultset with fields `fields`."
[fields]
(apply col-wise (map fingerprinter fields)))
(apply col-wise (for [field fields]
(fingerprinter
(cond-> field
;; Try to get a better guestimate of what we're dealing with on first sync
(every? nil? ((juxt :special_type :last_analyzed) field))
(assoc :special_type (classify.name/infer-special-type field)))))))
......@@ -40,14 +40,10 @@
special type with a nil special type"
[result-metadata col]
(update result-metadata :special_type (fn [original-value]
;; If the original special type is a PK or FK, we don't want to use a new
;; computed special type because it'll just be confusing as we can't do any
;; meaningful binning etc on it. If it's not of that type and we are able to
;; compute a special type based on the results, use that
(if-let [new-special-type (and (not (isa? original-value :type/PK))
(not (isa? original-value :type/FK))
(classify-name/infer-special-type col))]
new-special-type
;; If we already know the special type, becouse it is stored, don't classify again.
;; Also try to refine special type set upstream for aggregation cols (which comes back as :type/Number)
(case original-value
(nil :type/Number) (classify-name/infer-special-type col)
original-value))))
(s/defn ^:private stored-column-metadata->result-column-metadata :- ResultColumnMetadata
......@@ -66,17 +62,12 @@
:unit nil})))
(s/defn results->column-metadata :- ResultsMetadata
"Return the desired storage format for the column metadata coming back from RESULTS."
[results]
(for [col (:cols results)]
(-> col
stored-column-metadata->result-column-metadata
(maybe-infer-special-type col))))
(s/defn results->column-metadata+fingerprint :- ResultsMetadata
"Return the desired storage format for the column metadata coming back from RESULTS and fingerprint the RESULTS."
[results]
(let [result-metadata (results->column-metadata results)]
(let [result-metadata (for [col (:cols results)]
(-> col
stored-column-metadata->result-column-metadata
(maybe-infer-special-type col)))]
(transduce identity
(redux/post-complete
(apply f/col-wise (for [metadata result-metadata]
......
......@@ -105,7 +105,7 @@
{(s/optional-key :percent-json) (s/maybe Percent)
(s/optional-key :percent-url) (s/maybe Percent)
(s/optional-key :percent-email) (s/maybe Percent)
(s/optional-key :average-length) su/PositiveNum})
(s/optional-key :average-length) s/Num})
(def DateTimeFingerprint
"Schema for fingerprint information for Fields deriving from `:type/DateTime`."
......
......@@ -277,7 +277,8 @@
(* amount multiplier)))
(->Timestamp cal))))
(def ^:private ^:const date-extract-units
(def ^:const date-extract-units
"Units which return a (numerical, periodic) component of a date"
#{:minute-of-hour :hour-of-day :day-of-week :day-of-month :day-of-year :week-of-year :month-of-year :quarter-of-year
:year})
......
......@@ -193,7 +193,7 @@
:display_name "Category ID"
:fingerprint (if (data/fks-supported?)
{:global {:distinct-count 28}}
{:global {:distinct-count 28, :type {:type/Number {:min 2.0, :max 74.0, :avg 29.98}}}})}
{:global {:distinct-count 28}, :type {:type/Number {:min 2.0, :max 74.0, :avg 29.98}}})}
:price {:special_type :type/Category
:base_type (data/expected-base-type->actual :type/Integer)
:name (data/format-name "price")
......@@ -256,7 +256,7 @@
:display_name "User ID"
:fingerprint (if (data/fks-supported?)
{:global {:distinct-count 15}}
{:global {:distinct-count 15, :type {:type/Number {:min 1.0, :max 15.0, :avg 7.929}}}})})))
{:global {:distinct-count 15}, :type {:type/Number {:min 1.0, :max 15.0, :avg 7.929}}})})))
;;; #### aggregate columns
......
......@@ -35,7 +35,7 @@
(->> query-map
qp/process-query
:data
results->column-metadata+fingerprint
results->column-metadata
(tu/round-all-decimals 2)))
(defn- query-for-card [card]
......
......@@ -30,7 +30,8 @@
:schema "default"
:fields #{{:name "id"
:database-type "SERIAL"
:base-type :type/Integer}
:base-type :type/Integer
:special-type :type/PK}
{:name "title"
:database-type "VARCHAR"
:base-type :type/Text
......@@ -143,7 +144,8 @@
{:name "id"
:display_name "ID"
:database_type "SERIAL"
:base_type :type/Integer})
:base_type :type/Integer
:special_type :type/PK})
(merge field-defaults
{:name "studio"
:display_name "Studio"
......@@ -190,7 +192,8 @@
{:name "id"
:display_name "ID"
:database_type "SERIAL"
:base_type :type/Integer})
:base_type :type/Integer
:special_type :type/PK})
(merge field-defaults
{:name "studio"
:display_name "Studio"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment