Skip to content
Snippets Groups Projects
Unverified Commit 1256e7b8 authored by dpsutton's avatar dpsutton Committed by GitHub
Browse files

Fingerprints reduce over table-rows-sample instead of realizing all (#13688)

* Fingerprints reduce over table-rows-sample instead of realizing all

tests are failing because we test fingerprinters a lot by supplying
fake results and then reducing over those results. Now we are reducing
over them in the query processor and need to get a better testing
strategy.

I think i can stub out the query-processor to take the `:rff` and just
transduce over some fake supplied results but need to verify

* Hijack query-processor to use rff on fake data

* Docstring and make private `table-rows-sample-query`

* Fix tests to use query function rather than hijack qp

also was calling empty not empty? :(

* drivers check [ci drivers]

* Docstring cleanup for rff and remove outdated comment

* Make rff required for table-rows-sample

now you must reduce over the reducible row set rather than realizing
the results in memory and then working with them

* empty commit for drivers [ci drivers]
parent cfa6bf84
Branches
Tags
No related merge requests found
......@@ -21,8 +21,9 @@
[4 "Wurstküche"]
[5 "Brite Spot Family Restaurant"]]
(->> (metadata-queries/table-rows-sample (Table (mt/id :venues))
[(Field (mt/id :venues :id))
(Field (mt/id :venues :name))])
[(Field (mt/id :venues :id))
(Field (mt/id :venues :name))]
(constantly conj))
(sort-by first)
(take 5)))))
......@@ -36,7 +37,8 @@
#'bigquery/page-callback page-callback}
(let [actual (->> (metadata-queries/table-rows-sample (Table (mt/id :venues))
[(Field (mt/id :venues :id))
(Field (mt/id :venues :name))])
(Field (mt/id :venues :name))]
(constantly conj))
(sort-by first)
(take 5))]
(is (= [[1 "Red Medicine"]
......
......@@ -234,7 +234,8 @@
(->> (metadata-queries/table-rows-sample (Table (mt/id :checkins))
[(Field (mt/id :checkins :id))
(Field (mt/id :checkins :venue_name))
(Field (mt/id :checkins :timestamp))])
(Field (mt/id :checkins :timestamp))]
(constantly conj))
(sort-by first)
(take 5)))
......
......@@ -171,7 +171,8 @@
[5 "Brite Spot Family Restaurant"]]
(vec (take 5 (metadata-queries/table-rows-sample (Table (mt/id :venues))
[(Field (mt/id :venues :id))
(Field (mt/id :venues :name))])))))))))
(Field (mt/id :venues :name))]
(constantly conj))))))))))
;; ## Big-picture tests for the way data should look post-sync
......
......@@ -125,7 +125,8 @@
["Wurstküche"]
["Brite Spot Family Restaurant"]]
(take 5 (metadata-queries/table-rows-sample (Table (mt/id :venues))
[(Field (mt/id :venues :name))]))))))
[(Field (mt/id :venues :name))]
(constantly conj)))))))
;;; APPLY-PAGE
......
......@@ -93,32 +93,41 @@
(def TableRowsSampleOptions
"Schema for `table-rows-sample` options"
(s/maybe {(s/optional-key :truncation-size) s/Int}))
(s/maybe {(s/optional-key :truncation-size) s/Int
(s/optional-key :rff) s/Any}))
(s/defn table-rows-sample :- (s/maybe si/TableSample)
"Run a basic MBQL query to fetch a sample of rows belonging to a Table. Can pass in optional `:truncation-size` for
text fields which will be honored only if the driver supports expressions."
(defn- table-rows-sample-query
"Returns the mbql query to query a table for sample rows"
[table fields {:keys [truncation-size] :as _opts}]
(let [driver (-> table table/database driver.u/database->driver)
text-fields (filter (comp #{:type/Text} :base_type) fields)
field->expressions (when (and truncation-size (driver/supports? driver :expressions))
(into {} (for [field text-fields]
[field [(str (gensym "substring"))
[:substring [:field-id (u/get-id field)] 1 truncation-size]]])))]
{:database (:db_id table)
:type :query
:query {:source-table (u/get-id table)
:expressions (into {} (vals field->expressions))
:fields (vec (for [field fields]
(if-let [[expression-name _] (get field->expressions field)]
[:expression expression-name]
[:field-id (u/get-id field)])))
:limit max-sample-rows}
:middleware {:format-rows? false
:skip-results-metadata? true}}))
(s/defn table-rows-sample
"Run a basic MBQL query to fetch a sample of rows of FIELDS belonging to a TABLE.
Options: a map of
`:truncation-size`: [optional] size to truncate text fields if the driver supports expressions.
`:rff`: [optional] a reducing function function (a function that given initial results metadata returns a reducing
function) to reduce over the result set in the the query-processor rather than realizing the whole collection"
{:style/indent 1}
([table :- si/TableInstance, fields :- [si/FieldInstance]]
(table-rows-sample table fields nil))
([table :- si/TableInstance, fields :- [si/FieldInstance]
{:keys [truncation-size] :as _opts} :- TableRowsSampleOptions]
(let [driver (-> table table/database driver.u/database->driver)
text-fields (filter (comp #{:type/Text} :base_type) fields)
field->expressions (when (and truncation-size (driver/supports? driver :expressions))
(into {} (for [field text-fields]
[field [(str (gensym "substring"))
[:substring [:field-id (u/get-id field)] 1 truncation-size]]])))
results ((resolve 'metabase.query-processor/process-query)
{:database (:db_id table)
:type :query
:query {:source-table (u/get-id table)
:expressions (into {} (vals field->expressions))
:fields (vec (for [field fields]
(if-let [[expression-name _] (get field->expressions field)]
[:expression expression-name]
[:field-id (u/get-id field)])))
:limit max-sample-rows}
:middleware {:format-rows? false
:skip-results-metadata? true}})]
(get-in results [:data :rows]))))
([table :- si/TableInstance, fields :- [si/FieldInstance], rff]
(table-rows-sample table fields rff nil))
([table :- si/TableInstance, fields :- [si/FieldInstance], rff, opts :- TableRowsSampleOptions]
(let [query (table-rows-sample-query table fields opts)
qp (resolve 'metabase.query-processor/process-query)]
(qp query {:rff rff}))))
......@@ -48,25 +48,25 @@
(s/defn ^:private fingerprint-table!
[table :- i/TableInstance, fields :- [i/FieldInstance]]
(transduce identity
(redux/post-complete
(f/fingerprint-fields fields)
(fn [fingerprints]
(reduce (fn [count-info [field fingerprint]]
(cond
(instance? Throwable fingerprint)
(update count-info :failed-fingerprints inc)
(some-> fingerprint :global :distinct-count zero?)
(update count-info :no-data-fingerprints inc)
:else
(do
(save-fingerprint! field fingerprint)
(update count-info :updated-fingerprints inc))))
(empty-stats-map (count fingerprints))
(map vector fields fingerprints))))
(metadata-queries/table-rows-sample table fields {:truncation-size truncation-size})))
(let [rff (fn [_metadata]
(redux/post-complete
(f/fingerprint-fields fields)
(fn [fingerprints]
(reduce (fn [count-info [field fingerprint]]
(cond
(instance? Throwable fingerprint)
(update count-info :failed-fingerprints inc)
(some-> fingerprint :global :distinct-count zero?)
(update count-info :no-data-fingerprints inc)
:else
(do
(save-fingerprint! field fingerprint)
(update count-info :updated-fingerprints inc))))
(empty-stats-map (count fingerprints))
(map vector fields fingerprints)))))]
(metadata-queries/table-rows-sample table fields rff {:truncation-size truncation-size})))
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | WHICH FIELDS NEED UPDATED FINGERPRINTS? |
......
......@@ -48,7 +48,7 @@
["BCD Tofu House"]]
table (Table (mt/id :venues))
fields [(Field (mt/id :venues :name))]
fetch! #(->> (metadata-queries/table-rows-sample table fields (when % {:truncation-size %}))
fetch! #(->> (metadata-queries/table-rows-sample table fields (constantly conj) (when % {:truncation-size %}))
;; since order is not guaranteed do some sorting here so we always get the same results
(sort-by first)
(take 5))]
......@@ -63,18 +63,14 @@
"Did not truncate a text field")))))
(testing "substring checking"
;; turn off validation so we can just return the query that is emitted and check for expressions
(s/without-fn-validation
;; just return the query map
(with-redefs [qp/process-query (fn [query] {:data {:rows (:query query)}})
table/database (constantly (database/map->DatabaseInstance {:id 5678}))]
(let [table (table/map->TableInstance {:id 1234})
fields [(field/map->FieldInstance {:id 4321 :base_type :type/Text})]]
(testing "uses substrings if driver supports expressions"
(with-redefs [driver/supports? (constantly true)]
(let [query (metadata-queries/table-rows-sample table fields {:truncation-size 4})]
(is (seq (:expressions query))))))
(testing "doesnt' use substrings if driver doesn't support expressions"
(with-redefs [driver/supports? (constantly false)]
(let [query (metadata-queries/table-rows-sample table fields {:truncation-size 4})]
(is (empty (:expressions query)))))))))))
(with-redefs [table/database (constantly (database/map->DatabaseInstance {:id 5678}))]
(let [table (table/map->TableInstance {:id 1234})
fields [(field/map->FieldInstance {:id 4321 :base_type :type/Text})]]
(testing "uses substrings if driver supports expressions"
(with-redefs [driver/supports? (constantly true)]
(let [query (#'metadata-queries/table-rows-sample-query table fields {:truncation-size 4})]
(is (seq (get-in query [:query :expressions]))))))
(testing "doesnt' use substrings if driver doesn't support expressions"
(with-redefs [driver/supports? (constantly false)]
(let [query (#'metadata-queries/table-rows-sample-query table fields {:truncation-size 4})]
(is (empty? (get-in query [:query :expressions]))))))))))
......@@ -102,7 +102,7 @@
fields (db/select Field :table_id (u/id table) :name "year_column")]
(testing "Can select from this table"
(is (= [[#t "2001-01-01"] [#t "2002-01-01"] [#t "1999-01-01"]]
(metadata-queries/table-rows-sample table fields))))
(metadata-queries/table-rows-sample table fields (constantly conj)))))
(testing "We can fingerprint this table"
(is (= 1
(:updated-fingerprints (#'fingerprint/fingerprint-table! table fields)))))))))
......
......@@ -62,7 +62,8 @@
["800 Degrees Neapolitan Pizzeria"]
["BCD Tofu House"]]
(->> (metadata-queries/table-rows-sample (Table (mt/id :venues))
[(Field (mt/id :venues :name))])
[(Field (mt/id :venues :name))]
(constantly conj))
;; since order is not guaranteed do some sorting here so we always get the same results
(sort-by first)
(take 5))))))
......
......@@ -3,6 +3,7 @@
(:require [clojure.test :refer :all]
[metabase
[db :as mdb]
[query-processor :as qp]
[test :as mt]
[util :as u]]
[metabase.db.metadata-queries :as metadata-queries]
......@@ -120,7 +121,8 @@
(defn- field-was-fingerprinted? {:style/indent 0} [fingerprint-versions field-properties]
(let [fingerprinted? (atom false)]
(with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted fingerprint-versions
metadata-queries/table-rows-sample (constantly [[1] [2] [3] [4] [5]])
qp/process-query (fn [_ {:keys [rff]}]
(transduce identity (rff :metadata) [[1] [2] [3] [4] [5]]))
fingerprint/save-fingerprint! (fn [& _] (reset! fingerprinted? true))]
(tt/with-temp* [Table [table]
Field [_ (assoc field-properties :table_id (u/get-id table))]]
......@@ -205,7 +207,8 @@
:fingerprint_version 1
:last_analyzed #t "2017-08-09T00:00:00"}]
(with-redefs [i/latest-fingerprint-version 3
metadata-queries/table-rows-sample (constantly [[1] [2] [3] [4] [5]])
qp/process-query (fn [_ {:keys [rff]}]
(transduce identity (rff :metadata) [[1] [2] [3] [4] [5]]))
fingerprinters/fingerprinter (constantly (fingerprinters/constant-fingerprinter {:experimental {:fake-fingerprint? true}}))]
(is (= {:no-data-fingerprints 0, :failed-fingerprints 0,
:updated-fingerprints 1, :fingerprints-attempted 1}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment