diff --git a/src/metabase/driver.clj b/src/metabase/driver.clj index 5988c55b6178efe4a2f6af3c3a594b945d69b498..d47efb96408f9b7b315bbc6aeb61cb60cdabcf8f 100644 --- a/src/metabase/driver.clj +++ b/src/metabase/driver.clj @@ -15,20 +15,12 @@ ;;; ## INTERFACE + CONSTANTS -(def ^:const max-sync-lazy-seq-results - "The maximum number of values we should return when using `field-values-lazy-seq`. +(def ^:const max-sample-rows + "The maximum number of values we should return when using `table-rows-sample`. This many is probably fine for inferring special types and what-not; we don't want to scan millions of values at any rate." 10000) -(def ^:const field-values-lazy-seq-chunk-size - "How many Field values should be fetched at a time for a chunked implementation of `field-values-lazy-seq`?" - ;; Hopefully this is a good balance between - ;; 1. Not doing too many DB calls - ;; 2. Not running out of mem - ;; 3. Not fetching too many results for things like mark-json-field! which will fail after the first result that isn't valid JSON - 500) - (def ^:const connection-error-messages "Generic error messages that drivers should return in their implementation of `humanize-connection-error-message`." {:cannot-connect-check-host-and-port "Hmm, we couldn't connect to the database. Make sure your host and port settings are correct" @@ -42,28 +34,30 @@ :username-or-password-incorrect "Looks like the username or password is incorrect."}) (defprotocol IDriver - "Methods that Metabase drivers must implement. Methods marked *OPTIONAL* have default implementations in `IDriverDefaultsMixin`. - Drivers should also implement `getName` form `clojure.lang.Named`, so we can call `name` on them: + "Methods that Metabase drivers must implement. Methods marked *OPTIONAL* have default implementations in + `IDriverDefaultsMixin`. Drivers should also implement `getName` form `clojure.lang.Named`, so we can call `name` on + them: (name (PostgresDriver.)) -> \"PostgreSQL\" This name should be a \"nice-name\" that we'll display to the user." (can-connect? ^Boolean [this, ^java.util.Map details-map] - "Check whether we can connect to a `Database` with DETAILS-MAP and perform a simple query. For example, a SQL database might - try running a query like `SELECT 1;`. This function should return `true` or `false`.") + "Check whether we can connect to a `Database` with DETAILS-MAP and perform a simple query. For example, a SQL + database might try running a query like `SELECT 1;`. This function should return `true` or `false`.") (date-interval [this, ^Keyword unit, ^Number amount] - "*OPTIONAL* Return an driver-appropriate representation of a moment relative to the current moment in time. By default, this returns an `Timestamp` by calling - `metabase.util/relative-date`; but when possible drivers should return a native form so we can be sure the correct timezone is applied. For example, SQL drivers should - return a HoneySQL form to call the appropriate SQL fns: + "*OPTIONAL* Return an driver-appropriate representation of a moment relative to the current moment in time. By + default, this returns an `Timestamp` by calling `metabase.util/relative-date`; but when possible drivers should + return a native form so we can be sure the correct timezone is applied. For example, SQL drivers should return a + HoneySQL form to call the appropriate SQL fns: (date-interval (PostgresDriver.) :month 1) -> (hsql/call :+ :%now (hsql/raw \"INTERVAL '1 month'\"))") (describe-database ^java.util.Map [this, ^DatabaseInstance database] - "Return a map containing information that describes all of the schema settings in DATABASE, most notably a set of tables. - It is expected that this function will be peformant and avoid draining meaningful resources of the database. - Results should match the `DatabaseMetadata` schema.") + "Return a map containing information that describes all of the schema settings in DATABASE, most notably a set of + tables. It is expected that this function will be peformant and avoid draining meaningful resources of the + database. Results should match the `DatabaseMetadata` schema.") (describe-table ^java.util.Map [this, ^DatabaseInstance database, ^TableInstance table] "Return a map containing information that describes the physical schema of TABLE. @@ -76,8 +70,8 @@ (details-fields ^clojure.lang.Sequential [this] "A vector of maps that contain information about connection properties that should - be exposed to the user for databases that will use this driver. This information is used to build the UI for editing - a `Database` `details` map, and for validating it on the Backend. It should include things like `host`, + be exposed to the user for databases that will use this driver. This information is used to build the UI for + editing a `Database` `details` map, and for validating it on the Backend. It should include things like `host`, `port`, and other driver-specific parameters. Each field information map should have the following properties: * `:name` @@ -94,13 +88,14 @@ * `:default` *(OPTIONAL)* - A default value for this field if the user hasn't set an explicit value. This is shown in the UI as a placeholder. + A default value for this field if the user hasn't set an explicit value. This is shown in the UI as a + placeholder. * `:placeholder` *(OPTIONAL)* - Placeholder value to show in the UI if user hasn't set an explicit value. Similar to `:default`, but this value is - *not* saved to `:details` if no explicit value is set. Since `:default` values are also shown as placeholders, you - cannot specify both `:default` and `:placeholder`. + Placeholder value to show in the UI if user hasn't set an explicit value. Similar to `:default`, but this value + is *not* saved to `:details` if no explicit value is set. Since `:default` values are also shown as + placeholders, you cannot specify both `:default` and `:placeholder`. * `:required` *(OPTIONAL)* @@ -123,7 +118,8 @@ [2 \"Rasta Can\"]]}") (features ^java.util.Set [this] - "*OPTIONAL*. A set of keyword names of optional features supported by this driver, such as `:foreign-keys`. Valid features are: + "*OPTIONAL*. A set of keyword names of optional features supported by this driver, such as `:foreign-keys`. Valid + features are: * `:foreign-keys` - Does this database support foreign key relationships? * `:nested-fields` - Does this database support nested fields (e.g. Mongo)? @@ -136,31 +132,27 @@ * `:expression-aggregations` - Does the driver support using expressions inside aggregations? e.g. something like \"sum(x) + count(y)\" or \"avg(x + y)\" * `:nested-queries` - Does the driver support using a query as the `:source-query` of another MBQL query? Examples are CTEs or subselects in SQL queries.") - (field-values-lazy-seq ^clojure.lang.Sequential [this, ^FieldInstance field] - "Return a lazy sequence of all values of FIELD. - This is used to implement some methods of the database sync process which require rows of data during execution. - - The lazy sequence should not return more than `max-sync-lazy-seq-results`, which is currently `10000`. - For drivers that provide a chunked implementation, a recommended chunk size is `field-values-lazy-seq-chunk-size`, which is currently `500`.") - (format-custom-field-name ^String [this, ^String custom-field-name] - "*OPTIONAL*. Return the custom name passed via an MBQL `:named` clause so it matches the way it is returned in the results. - This is used by the post-processing annotation stage to find the correct metadata to include with fields in the results. - The default implementation is `identity`, meaning the resulting field will have exactly the same name as passed to the `:named` clause. - Certain drivers like Redshift always lowercase these names, so this method is provided for those situations.") + "*OPTIONAL*. Return the custom name passed via an MBQL `:named` clause so it matches the way it is returned in the + results. This is used by the post-processing annotation stage to find the correct metadata to include with fields + in the results. The default implementation is `identity`, meaning the resulting field will have exactly the same + name as passed to the `:named` clause. Certain drivers like Redshift always lowercase these names, so this method + is provided for those situations.") (humanize-connection-error-message ^String [this, ^String message] "*OPTIONAL*. Return a humanized (user-facing) version of an connection error message string. - Generic error messages are provided in the constant `connection-error-messages`; return one of these whenever possible.") + Generic error messages are provided in the constant `connection-error-messages`; return one of these whenever + possible.") (mbql->native ^java.util.Map [this, ^java.util.Map query] "Transpile an MBQL structured query into the appropriate native query form. - The input QUERY will be a [fully-expanded MBQL query](https://github.com/metabase/metabase/wiki/Expanded-Queries) with - all the necessary pieces of information to build a properly formatted native query for the given database. + The input QUERY will be a [fully-expanded MBQL query](https://github.com/metabase/metabase/wiki/Expanded-Queries) + with all the necessary pieces of information to build a properly formatted native query for the given database. - If the underlying query language supports remarks or comments, the driver should use `query->remark` to generate an appropriate message and include that in an appropriate place; - alternatively a driver might directly include the query's `:info` dictionary if the underlying language is JSON-based. + If the underlying query language supports remarks or comments, the driver should use `query->remark` to generate an + appropriate message and include that in an appropriate place; alternatively a driver might directly include the + query's `:info` dictionary if the underlying language is JSON-based. The result of this function will be passed directly into calls to `execute-query`. @@ -174,28 +166,49 @@ the event that the driver was doing some caching or connection pooling.") (process-query-in-context [this, ^clojure.lang.IFn qp] - "*OPTIONAL*. Similar to `sync-in-context`, but for running queries rather than syncing. This should be used to do things like open DB connections - that need to remain open for the duration of post-processing. This function follows a middleware pattern and is injected into the QP - middleware stack immediately after the Query Expander; in other words, it will receive the expanded query. - See the Mongo and H2 drivers for examples of how this is intended to be used. + "*OPTIONAL*. Similar to `sync-in-context`, but for running queries rather than syncing. This should be used to do + things like open DB connections that need to remain open for the duration of post-processing. This function + follows a middleware pattern and is injected into the QP middleware stack immediately after the Query Expander; + in other words, it will receive the expanded query. See the Mongo and H2 drivers for examples of how this is + intended to be used. (defn process-query-in-context [driver qp] (fn [query] (qp query)))") (sync-in-context [this, ^DatabaseInstance database, ^clojure.lang.IFn f] - "*OPTIONAL*. Drivers may provide this function if they need to do special setup before a sync operation such as `sync-database!`. The sync - operation itself is encapsulated as the lambda F, which must be called with no arguments. + "*OPTIONAL*. Drivers may provide this function if they need to do special setup before a sync operation such as + `sync-database!`. The sync operation itself is encapsulated as the lambda F, which must be called with no + arguments. (defn sync-in-context [driver database f] (with-connection [_ database] (f)))") (table-rows-seq ^clojure.lang.Sequential [this, ^DatabaseInstance database, ^java.util.Map table] - "*OPTIONAL*. Return a sequence of *all* the rows in a given TABLE, which is guaranteed to have at least `:name` and `:schema` keys. - (It is guaranteed too satisfy the `DatabaseMetadataTable` schema in `metabase.sync.interface`.) - Currently, this is only used for iterating over the values in a `_metabase_metadata` table. As such, the results are not expected to be returned lazily. - There is no expectation that the results be returned in any given order.")) + "*OPTIONAL*. Return a sequence of *all* the rows in a given TABLE, which is guaranteed to have at least `:name` + and `:schema` keys. (It is guaranteed too satisfy the `DatabaseMetadataTable` schema in + `metabase.sync.interface`.) Currently, this is only used for iterating over the values in a `_metabase_metadata` + table. As such, the results are not expected to be returned lazily. + There is no expectation that the results be returned in any given order.") + + ;; TODO - Not 100% sure we need this method since it seems like we could just use an MBQL query to fetch this info. + (table-rows-sample ^clojure.lang.Sequential [this, ^TableInstance table, fields] + "Return a sample of rows in TABLE with the specified FIELDS. This is used to implement some methods of the + database sync process which require rows of data during execution. At this time, this should just return a basic + sequence of rows in the fastest way possible, with no special sorting or any sort of randomization done to ensure + a good sample. (Improved sampling is something we plan to add in the future.) + + The sample should return up to `max-sample-rows` rows, which is currently `10000`.")) + +(defn- table-rows-sample-via-qp [_ table fields] + (let [results ((resolve 'metabase.query-processor/process-query) {:database (:db_id table) + :type :query + :query {:source-table (u/get-id table) + :fields (vec (for [field fields] + [:field-id (u/get-id field)])) + :limit max-sample-rows}})] + (get-in results [:data :rows]))) (def IDriverDefaultsMixin @@ -208,7 +221,8 @@ :notify-database-updated (constantly nil) :process-query-in-context (u/drop-first-arg identity) :sync-in-context (fn [_ _ f] (f)) - :table-rows-seq (constantly nil)}) + :table-rows-seq (constantly nil) + :table-rows-sample table-rows-sample-via-qp}) ;;; ## CONFIG diff --git a/src/metabase/driver/bigquery.clj b/src/metabase/driver/bigquery.clj index d63e228bb82e28ab6a2acf1c49bf2196a9ea6e59..1906554eecd90a5b029f3d673dd3cdeef460fe0c 100644 --- a/src/metabase/driver/bigquery.clj +++ b/src/metabase/driver/bigquery.clj @@ -1,7 +1,7 @@ (ns metabase.driver.bigquery (:require [clojure [set :as set] - [string :as s] + [string :as str] [walk :as walk]] [clojure.tools.logging :as log] [honeysql @@ -119,7 +119,7 @@ (let [request (doto (QueryRequest.) (.setTimeoutMs (* query-timeout-seconds 1000)) ;; if the query contains a `#standardSQL` directive then use Standard SQL instead of legacy SQL - (.setUseLegacySql (not (s/includes? (s/lower-case query-string) "#standardsql"))) + (.setUseLegacySql (not (str/includes? (str/lower-case query-string) "#standardsql"))) (.setQuery query-string))] (google/execute (.query (.jobs client) project-id request))))) @@ -134,7 +134,8 @@ ;; Add the appropriate number of milliseconds to the number to convert it to the local timezone. ;; We do this because the dates come back in UTC but we want the grouping to match the local time (HUH?) ;; This gives us the same results as the other `has-questionable-timezone-support?` drivers - ;; Not sure if this is actually desirable, but if it's not, it probably means all of those other drivers are doing it wrong + ;; Not sure if this is actually desirable, but if it's not, it probably means all of those other drivers are + ;; doing it wrong (u/->Timestamp (- (* (Double/parseDouble s) 1000) (.getDSTSavings default-timezone) (.getRawOffset default-timezone)))))) @@ -155,7 +156,8 @@ (post-process-native response query-timeout-seconds)) ([^QueryResponse response, ^Integer timeout-seconds] (if-not (.getJobComplete response) - ;; 99% of the time by the time this is called `.getJobComplete` will return `true`. On the off chance it doesn't, wait a few seconds for the job to finish. + ;; 99% of the time by the time this is called `.getJobComplete` will return `true`. On the off chance it doesn't, + ;; wait a few seconds for the job to finish. (do (when (zero? timeout-seconds) (throw (ex-info "Query timed out." (into {} response)))) @@ -172,34 +174,25 @@ :rows (for [^TableRow row (.getRows response)] (for [[^TableCell cell, parser] (partition 2 (interleave (.getF row) parsers))] (when-let [v (.getV cell)] - ;; There is a weird error where everything that *should* be NULL comes back as an Object. See https://jira.talendforge.org/browse/TBD-1592 + ;; There is a weird error where everything that *should* be NULL comes back as an Object. + ;; See https://jira.talendforge.org/browse/TBD-1592 ;; Everything else comes back as a String luckily so we can proceed normally. (when-not (= (class v) Object) (parser v)))))})))) (defn- process-native* [database query-string] - ;; automatically retry the query if it times out or otherwise fails. This is on top of the auto-retry added by `execute` so operations going through `process-native*` may be - ;; retried up to 3 times. + ;; automatically retry the query if it times out or otherwise fails. This is on top of the auto-retry added by + ;; `execute` so operations going through `process-native*` may be retried up to 3 times. (u/auto-retry 1 (post-process-native (execute-bigquery database query-string)))) -(defn- field-values-lazy-seq [{field-name :name, :as field-instance}] - {:pre [(map? field-instance)]} - (let [{table-name :name, :as table} (field/table field-instance) - {{dataset-name :dataset-id} :details, :as db} (table/database table) - query (format "SELECT [%s.%s.%s] FROM [%s.%s] LIMIT %d" - dataset-name table-name field-name dataset-name table-name driver/field-values-lazy-seq-chunk-size) - fetch-page (fn [page] - (map first (:rows (process-native* db (str query " OFFSET " (* page driver/field-values-lazy-seq-chunk-size)))))) - fetch-all (fn fetch-all [page] - (lazy-seq (let [results (fetch-page page) - total-results-fetched (* page driver/field-values-lazy-seq-chunk-size)] - (concat results - (when (and (= (count results) driver/field-values-lazy-seq-chunk-size) - (< total-results-fetched driver/max-sync-lazy-seq-results)) - (fetch-all (inc page)))))))] - (fetch-all 0))) +(defn- table-rows-sample [{table-name :name, :as table} fields] + (let [{{dataset-name :dataset-id} :details, :as db} (table/database table)] + (:rows (process-native* db (format "SELECT %s FROM [%s.%s] LIMIT %d" + (str/join ", " (for [{field-name :name} fields] + (format "[%s.%s.%s]" dataset-name table-name field-name))) + dataset-name table-name driver/max-sample-rows))))) @@ -252,7 +245,8 @@ (declare driver) -;; Make the dataset-id the "schema" of every field or table in the query because otherwise BigQuery can't figure out where things is from +;; Make the dataset-id the "schema" of every field or table in the query because otherwise BigQuery can't figure out +;; where things is from (defn- qualify-fields-and-tables-with-dataset-id [{{{:keys [dataset-id]} :details} :database, :as query}] (walk/postwalk (fn [x] (cond @@ -268,14 +262,15 @@ {:pre [(map? honeysql-form)]} ;; replace identifiers like [shakespeare].[word] with ones like [shakespeare.word] since that's hat BigQuery expects (let [[sql & args] (sql/honeysql-form->sql+args driver honeysql-form) - sql (s/replace (hx/unescape-dots sql) #"\]\.\[" ".")] + sql (str/replace (hx/unescape-dots sql) #"\]\.\[" ".")] (assert (empty? args) "BigQuery statements can't be parameterized!") sql)) (defn- post-process-mbql [dataset-id table-name {:keys [columns rows]}] - ;; Since we don't alias column names the come back like "veryNiceDataset_shakepeare_corpus". Strip off the dataset and table IDs - (let [demangle-name (u/rpartial s/replace (re-pattern (str \^ dataset-id \_ table-name \_)) "") + ;; Since we don't alias column names the come back like "veryNiceDataset_shakepeare_corpus". Strip off the dataset + ;; and table IDs + (let [demangle-name (u/rpartial str/replace (re-pattern (str \^ dataset-id \_ table-name \_)) "") columns (for [column columns] (keyword (demangle-name column))) rows (for [row rows] @@ -334,13 +329,15 @@ ag-type))) :else (str schema-name \. table-name \. field-name))) -;; TODO - Making 2 DB calls for each field to fetch its dataset is inefficient and makes me cry, but this method is currently only used for SQL params so it's not a huge deal at this point +;; TODO - Making 2 DB calls for each field to fetch its dataset is inefficient and makes me cry, but this method is +;; currently only used for SQL params so it's not a huge deal at this point (defn- field->identifier [{table-id :table_id, :as field}] (let [db-id (db/select-one-field :db_id 'Table :id table-id) dataset (:dataset-id (db/select-one-field :details Database, :id db-id))] (hsql/raw (apply format "[%s.%s.%s]" dataset (field/qualified-name-components field))))) -;; We have to override the default SQL implementations of breakout and order-by because BigQuery propogates casting functions in SELECT +;; We have to override the default SQL implementations of breakout and order-by because BigQuery propogates casting +;; functions in SELECT ;; BAD: ;; SELECT msec_to_timestamp([sad_toucan_incidents.incidents.timestamp]) AS [sad_toucan_incidents.incidents.timestamp], count(*) AS [count] ;; FROM [sad_toucan_incidents.incidents] @@ -439,9 +436,9 @@ ;; From the dox: Fields must contain only letters, numbers, and underscores, start with a letter or underscore, and be at most 128 characters long. (defn- format-custom-field-name ^String [^String custom-field-name] - (s/join (take 128 (-> (s/trim custom-field-name) - (s/replace #"[^\w\d_]" "_") - (s/replace #"(^\d)" "_$1"))))) + (str/join (take 128 (-> (str/trim custom-field-name) + (str/replace #"[^\w\d_]" "_") + (str/replace #"(^\d)" "_$1"))))) (defrecord BigQueryDriver [] @@ -506,7 +503,7 @@ (when-not config/is-test? ;; during unit tests don't treat bigquery as having FK support #{:foreign-keys}))) - :field-values-lazy-seq (u/drop-first-arg field-values-lazy-seq) + :table-rows-sample (u/drop-first-arg table-rows-sample) :format-custom-field-name (u/drop-first-arg format-custom-field-name) :mbql->native (u/drop-first-arg mbql->native)})) diff --git a/src/metabase/driver/druid.clj b/src/metabase/driver/druid.clj index 297290189732ab2d18398cc980c0417491fe1940..0e52bdd63cfb4b5edb90a49398786c400b7f6f80 100644 --- a/src/metabase/driver/druid.clj +++ b/src/metabase/driver/druid.clj @@ -8,9 +8,11 @@ [util :as u]] [metabase.driver.druid.query-processor :as qp] [metabase.models + [database :refer [Database]] [field :as field] [table :as table]] - [metabase.util.ssh :as ssh])) + [metabase.util.ssh :as ssh] + [toucan.db :as db])) ;;; ### Request helper fns @@ -98,49 +100,21 @@ {:schema nil, :name table-name}))}))) -;;; ### field-values-lazy-seq - -(defn- field-values-lazy-seq-fetch-one-page [details table-name field-name & [paging-identifiers]] - {:pre [(map? details) (or (string? table-name) (keyword? table-name)) (or (string? field-name) (keyword? field-name)) (or (nil? paging-identifiers) (map? paging-identifiers))]} - (let [[{{:keys [pagingIdentifiers events]} :result}] (do-query details {:queryType :select - :dataSource table-name - :intervals ["1900-01-01/2100-01-01"] - :granularity :all - :dimensions [field-name] - :metrics [] - :pagingSpec (merge {:threshold driver/field-values-lazy-seq-chunk-size} - (when paging-identifiers - {:pagingIdentifiers paging-identifiers}))})] - ;; return pair of [paging-identifiers values] - [ ;; Paging identifiers return the largest offset of their results, e.g. 49 for page 1. - ;; We need to inc that number so the next page starts after that (e.g. 50) - (let [[[k offset]] (seq pagingIdentifiers)] - {k (inc offset)}) - ;; Unnest the values - (for [event events] - (get-in event [:event (keyword field-name)]))])) - -(defn- field-values-lazy-seq - ([field] - (field-values-lazy-seq (:details (table/database (field/table field))) - (:name (field/table field)) - (:name field) - 0 - nil)) - - ([details table-name field-name total-items-fetched paging-identifiers] - {:pre [(map? details) - (or (string? table-name) (keyword? table-name)) - (or (string? field-name) (keyword? field-name)) - (integer? total-items-fetched) - (or (nil? paging-identifiers) (map? paging-identifiers))]} - (lazy-seq (let [[paging-identifiers values] (field-values-lazy-seq-fetch-one-page details table-name field-name paging-identifiers) - total-items-fetched (+ total-items-fetched driver/field-values-lazy-seq-chunk-size)] - (concat values - (when (and (seq values) - (< total-items-fetched driver/max-sync-lazy-seq-results) - (= (count values) driver/field-values-lazy-seq-chunk-size)) - (field-values-lazy-seq details table-name field-name total-items-fetched paging-identifiers))))))) +;;; ### table-rows-sample + +(defn- table-rows-sample [table fields] + (let [db-details (db/select-one-field :details Database :id (:db_id table)) + [results] (do-query db-details {:queryType :select + :dataSource (:name table) + :intervals ["1900-01-01/2100-01-01"] + :granularity :all + :dimensions (map :name fields) + :metrics [] + :pagingSpec {:threshold driver/max-sample-rows}})] + ;; Unnest the values + (for [event (get-in results [:result :events])] + (for [field fields] + (get-in event [:event (keyword (:name field))]))))) ;;; ### DruidrDriver Class Definition @@ -165,7 +139,7 @@ :default 8082}])) :execute-query (fn [_ query] (qp/execute-query do-query query)) :features (constantly #{:basic-aggregations :set-timezone :expression-aggregations}) - :field-values-lazy-seq (u/drop-first-arg field-values-lazy-seq) + ;; :table-rows-sample (u/drop-first-arg table-rows-sample) :mbql->native (u/drop-first-arg qp/mbql->native)})) (defn -init-driver diff --git a/src/metabase/driver/generic_sql.clj b/src/metabase/driver/generic_sql.clj index d446edee7e2e9de03422eff4fb3a1e13e3390ab2..9a0cce98dd6bdf634850fb9a33d1531a7ef51fd0 100644 --- a/src/metabase/driver/generic_sql.clj +++ b/src/metabase/driver/generic_sql.clj @@ -231,51 +231,33 @@ ([table field] (hx/qualify-and-escape-dots (:schema table) (:name table) (:name field)))) +(def ^:private ^:dynamic *jdbc-options* {}) + (defn- query "Execute a HONEYSQL-FROM query against DATABASE, DRIVER, and optionally TABLE." ([driver database honeysql-form] (jdbc/query (db->jdbc-connection-spec database) - (honeysql-form->sql+args driver honeysql-form))) + (honeysql-form->sql+args driver honeysql-form) + *jdbc-options*)) ([driver database table honeysql-form] (query driver database (merge {:from [(qualify+escape table)]} honeysql-form)))) -(defn- field-values-lazy-seq [driver field] - (let [table (field/table field) - db (table/database table) - field-k (qualify+escape table field) - pk-field (field/Field (table/pk-field-id table)) - pk-field-k (when pk-field - (qualify+escape table pk-field)) - transform-fn (if (isa? (:base_type field) :type/Text) - u/jdbc-clob->str - identity) - select* {:select [[field-k :field]] - :from [(qualify+escape table)] ; if we don't specify an explicit ORDER BY some DBs like Redshift will return them in a (seemingly) random order - :order-by [[(or pk-field-k field-k) :asc]]} ; try to order by the table's Primary Key to avoid doing full table scans - fetch-one-page (fn [page-num] - (for [{v :field} (query driver db (apply-page driver select* {:page {:items driver/field-values-lazy-seq-chunk-size - :page (inc page-num)}}))] - (transform-fn v))) - - ;; This function returns a chunked lazy seq that will fetch some range of results, e.g. 0 - 500, then concat that chunk of results - ;; with a recursive call to (lazily) fetch the next chunk of results, until we run out of results or hit the limit. - fetch-page (fn -fetch-page [page-num] - (lazy-seq - (let [results (fetch-one-page page-num) - total-items-fetched (* (inc page-num) driver/field-values-lazy-seq-chunk-size)] - (concat results (when (and (seq results) - (< total-items-fetched driver/max-sync-lazy-seq-results) - (= (count results) driver/field-values-lazy-seq-chunk-size)) - (-fetch-page (inc page-num)))))))] - (fetch-page 0))) - - (defn- table-rows-seq [driver database table] (query driver database table {:select [:*]})) +(defn- table-rows-sample [driver table fields] + (->> (binding [*jdbc-options* {:as-arrays? true}] + (query driver (table/database table) table (apply-limit driver + {:select (for [field fields] + (keyword (:name field)))} + {:limit driver/max-sample-rows}))) + ;; the first row coming back will be the columns list so go ahead and drop it like it's hot + (drop 1))) + + (defn features "Default implementation of `IDriver` `features` for SQL drivers." [driver] @@ -410,7 +392,7 @@ :describe-table-fks describe-table-fks :execute-query (resolve 'metabase.driver.generic-sql.query-processor/execute-query) :features features - :field-values-lazy-seq field-values-lazy-seq :mbql->native (resolve 'metabase.driver.generic-sql.query-processor/mbql->native) :notify-database-updated notify-database-updated - :table-rows-seq table-rows-seq})) + :table-rows-seq table-rows-seq + :table-rows-sample table-rows-sample})) diff --git a/src/metabase/driver/googleanalytics.clj b/src/metabase/driver/googleanalytics.clj index 0fff9fe3927e35425a1e01d6f3093996aff6827b..a84228c9f5730fb256218c124b9c2f6e6cae01a9 100644 --- a/src/metabase/driver/googleanalytics.clj +++ b/src/metabase/driver/googleanalytics.clj @@ -242,7 +242,6 @@ :placeholder "4/HSk-KtxkSzTt61j5zcbee2Rmm5JHkRFbL5gD5lgkXek" :required true}]) :execute-query (u/drop-first-arg (partial qp/execute-query do-query)) - :field-values-lazy-seq (constantly []) :process-query-in-context (u/drop-first-arg process-query-in-context) :mbql->native (u/drop-first-arg qp/mbql->native) :table-rows-seq (u/drop-first-arg table-rows-seq)})) diff --git a/src/metabase/driver/mongo.clj b/src/metabase/driver/mongo.clj index 6ce11eb88c28d0afd906a81dc1a7524e58deb2bd..27fc86e36ebc0786d1a25c071e4515cf825808aa 100644 --- a/src/metabase/driver/mongo.clj +++ b/src/metabase/driver/mongo.clj @@ -131,7 +131,7 @@ ;; TODO: ideally this would take the LAST set of rows added to the table so we could ensure this data changes on reruns (let [parsed-rows (try (->> (mc/find-maps conn (:name table)) - (take driver/max-sync-lazy-seq-results) + (take driver/max-sample-rows) (reduce (fn [field-defs row] (loop [[k & more-keys] (keys row) @@ -147,16 +147,19 @@ :fields (set (for [field (keys parsed-rows)] (describe-table-field field (field parsed-rows))))}))) -(s/defn ^:private ^:always-validate field-values-lazy-seq [field :- si/FieldInstance] - (lazy-seq - (assert *mongo-connection* - "You must have an open Mongo connection in order to get lazy results with field-values-lazy-seq.") - (let [table (field/table field) - name-components (rest (field/qualified-name-components field))] - (assert (seq name-components)) - (for [row (mq/with-collection *mongo-connection* (:name table) - (mq/fields [(str/join \. name-components)]))] - (get-in row (map keyword name-components)))))) +(s/defn ^:private ^:always-validate table-rows-sample [table :- si/TableInstance, fields :- [si/FieldInstance]] + (assert *mongo-connection* + "You must have an open Mongo connection in order to get lazy results with table-rows-sample.") + (let [fields (for [field fields] + (let [name-components (rest (field/qualified-name-components field))] + (assert (seq name-components)) + (assoc field :name-components name-components))) + results (mq/with-collection *mongo-connection* (:name table) + (mq/fields (for [field fields] + (str/join \. (:name-components field)))))] + (for [row results] + (for [field fields] + (get-in row (map keyword (:name-components field))))))) (defrecord MongoDriver [] @@ -200,7 +203,7 @@ :placeholder "readPreference=nearest&replicaSet=test"}])) :execute-query (u/drop-first-arg qp/execute-query) :features (constantly #{:basic-aggregations :dynamic-schema :nested-fields}) - :field-values-lazy-seq (u/drop-first-arg field-values-lazy-seq) + :table-rows-sample (u/drop-first-arg table-rows-sample) :humanize-connection-error-message (u/drop-first-arg humanize-connection-error-message) :mbql->native (u/drop-first-arg qp/mbql->native) :process-query-in-context (u/drop-first-arg process-query-in-context) diff --git a/src/metabase/driver/presto.clj b/src/metabase/driver/presto.clj index a493cf8af5ad6fb7b6f9bf2013709e3ad7bae9af..54cd195c7d308fd2f989f31d0f603183f91d84f5 100644 --- a/src/metabase/driver/presto.clj +++ b/src/metabase/driver/presto.clj @@ -181,17 +181,15 @@ :columns (map (comp keyword :name) columns) :rows rows})) -(defn- field-values-lazy-seq [{field-name :name, :as field}] +(defn- table-rows-sample [table fields] ;; TODO - look into making this actually lazy - (let [table (field/table field) - {:keys [details]} (table/database table) + (let [{:keys [details]} (table/database table) sql (format "SELECT %s FROM %s LIMIT %d" - (quote-name field-name) - (quote+combine-names (:schema table) (:name table)) - driver/max-sync-lazy-seq-results) - {:keys [rows]} (execute-presto-query! details sql)] - (for [row rows] - (first row)))) + (str/join ", " (for [{field-name :name} fields] + (quote-name field-name))) + (quote+combine-names (:schema table) (:name table)) + driver/max-sample-rows)] + (:rows (execute-presto-query! details sql)))) (defn- humanize-connection-error-message [message] (condp re-matches message @@ -312,7 +310,7 @@ (when-not config/is-test? ;; during unit tests don't treat presto as having FK support #{:foreign-keys}))) - :field-values-lazy-seq (u/drop-first-arg field-values-lazy-seq) + :table-rows-sample (u/drop-first-arg table-rows-sample) :humanize-connection-error-message (u/drop-first-arg humanize-connection-error-message) :table-rows-seq (u/drop-first-arg table-rows-seq)}) diff --git a/src/metabase/sync/analyze/fingerprint.clj b/src/metabase/sync/analyze/fingerprint.clj index d8c0c13225d362c5a97fcd5e80642d174b32605b..359da6bd7829e6d9d85aae2ae55ffbc27b6a5553 100644 --- a/src/metabase/sync/analyze/fingerprint.clj +++ b/src/metabase/sync/analyze/fingerprint.clj @@ -19,40 +19,43 @@ [toucan.db :as db])) (s/defn ^:private ^:always-validate type-specific-fingerprint :- (s/maybe i/TypeSpecificFingerprint) - "Return type-specific fingerprint info for FIELD and a sample of VALUES if it has an elligible base type - such as a derivative of `:type/Text` or of `:type/Number`." - [field :- i/FieldInstance, values :- i/ValuesSample] + "Return type-specific fingerprint info for FIELD AND. a FieldSample of Values if it has an elligible base type" + [field :- i/FieldInstance, values :- i/FieldSample] (condp #(isa? %2 %1) (:base_type field) :type/Text {:type/Text (text/text-fingerprint values)} :type/Number {:type/Number (number/number-fingerprint values)} nil)) -(s/defn ^:private ^:always-validate fingerprint :- (s/maybe i/Fingerprint) - "Generate a 'fingerprint' from a SAMPLE of values." - ([field :- i/FieldInstance] - (when-let [values (sample/basic-sample field)] - (fingerprint field values))) - ([field :- i/FieldInstance, values :- i/ValuesSample] - (merge - (when-let [global-fingerprint (global/global-fingerprint values)] - {:global global-fingerprint}) - (when-let [type-specific-fingerprint (type-specific-fingerprint field values)] - {:type type-specific-fingerprint})))) - - -(s/defn ^:private ^:always-validate fingerprint! - "Generate and save a fingerprint for a FIELD." - [field :- i/FieldInstance] - (sync-util/with-error-handling (format "Error generating fingerprint for %s" (sync-util/name-for-logging field)) - (when-let [fingerprint (fingerprint field)] - (log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field))) - ;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll - ;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the - ;; latest fingerprints. - (db/update! Field (u/get-id field) - :fingerprint fingerprint - :fingerprint_version i/latest-fingerprint-version - :last_analyzed nil)))) +(s/defn ^:private ^:always-validate fingerprint :- i/Fingerprint + "Generate a 'fingerprint' from a FieldSample of VALUES." + [field :- i/FieldInstance, values :- i/FieldSample] + (merge + (when-let [global-fingerprint (global/global-fingerprint values)] + {:global global-fingerprint}) + (when-let [type-specific-fingerprint (type-specific-fingerprint field values)] + {:type type-specific-fingerprint}))) + + +(s/defn ^:private ^:always-validate save-fingerprint! + [field :- i/FieldInstance, fingerprint :- i/Fingerprint] + ;; don't bother saving fingerprint if it's completely empty + (when (seq fingerprint) + (log/debug (format "Saving fingerprint for %s" (sync-util/name-for-logging field))) + ;; All Fields who get new fingerprints should get marked as having the latest fingerprint version, but we'll + ;; clear their values for `last_analyzed`. This way we know these fields haven't "completed" analysis for the + ;; latest fingerprints. + (db/update! Field (u/get-id field) + :fingerprint fingerprint + :fingerprint_version i/latest-fingerprint-version + :last_analyzed nil))) + +(s/defn ^:private ^:always-validate fingerprint-table! + [table :- i/TableInstance, fields :- [i/FieldInstance]] + (doseq [[field sample] (sample/sample-fields table fields)] + (when sample + (sync-util/with-error-handling (format "Error generating fingerprint for %s" (sync-util/name-for-logging field)) + (let [fingerprint (fingerprint field sample)] + (save-fingerprint! field fingerprint)))))) ;;; +----------------------------------------------------------------------------------------------------------------+ @@ -145,9 +148,9 @@ (seq (db/select Field (honeysql-for-fields-that-need-fingerprint-updating table)))) +;; TODO - `fingerprint-fields!` and `fingerprint-table!` should probably have their names switched (s/defn ^:always-validate fingerprint-fields! "Generate and save fingerprints for all the Fields in TABLE that have not been previously analyzed." [table :- i/TableInstance] (when-let [fields (fields-to-fingerprint table)] - (doseq [field fields] - (fingerprint! field)))) + (fingerprint-table! table fields))) diff --git a/src/metabase/sync/analyze/fingerprint/global.clj b/src/metabase/sync/analyze/fingerprint/global.clj index 996b3ebd8d0a26287e437aa0ee880a6f85fcf538..a8fdedd4576a39f1ef48a35bdce650f2436d05b4 100644 --- a/src/metabase/sync/analyze/fingerprint/global.clj +++ b/src/metabase/sync/analyze/fingerprint/global.clj @@ -5,7 +5,7 @@ (s/defn ^:always-validate global-fingerprint :- i/GlobalFingerprint "Generate a fingerprint of global information for Fields of all types." - [values :- i/ValuesSample] + [values :- i/FieldSample] ;; TODO - this logic isn't as nice as the old logic that actually called the DB ;; We used to do (queries/field-distinct-count field field-values/low-cardinality-threshold) ;; Consider whether we are so married to the idea of only generating fingerprints from samples that we diff --git a/src/metabase/sync/analyze/fingerprint/number.clj b/src/metabase/sync/analyze/fingerprint/number.clj index 5fb5ce21dec4406d8865475e25b06220b1615faa..242261ab3dbcbc4e907ed9a4759535194fc0509d 100644 --- a/src/metabase/sync/analyze/fingerprint/number.clj +++ b/src/metabase/sync/analyze/fingerprint/number.clj @@ -5,13 +5,13 @@ (s/defn ^:private ^:always-validate average :- s/Num "Return the average of VALUES." - [values :- i/ValuesSample] + [values :- i/FieldSample] (/ (double (reduce + values)) (double (count values)))) (s/defn ^:always-validate number-fingerprint :- i/NumberFingerprint "Generate a fingerprint containing information about values that belong to a `:type/Number` Field." - [values :- i/ValuesSample] + [values :- i/FieldSample] {:min (apply min values) :max (apply max values) :avg (average values)}) diff --git a/src/metabase/sync/analyze/fingerprint/sample.clj b/src/metabase/sync/analyze/fingerprint/sample.clj index a76ce2e28fa7d97d10885a2e75a274f4cf076471..8f57613d604fe1207275675b54eb486944db4b90 100644 --- a/src/metabase/sync/analyze/fingerprint/sample.clj +++ b/src/metabase/sync/analyze/fingerprint/sample.clj @@ -1,26 +1,42 @@ (ns metabase.sync.analyze.fingerprint.sample - "Analysis sub-step that fetches a sample of values for a given Field, which is used to generate a fingerprint for it. - Currently this is dumb and just fetches a contiguous sequence of values, but in the future we plan to make this - more sophisticated and have different types of samples for different Fields." - (:require [metabase.driver :as driver] - [metabase.models - [database :refer [Database]] - [table :refer [Table]]] + "Analysis sub-step that fetches a sample of rows for a given Table and some set of Fields belonging to it, which is + used to generate fingerprints for those Fields. Currently this is dumb and just fetches a contiguous sequence of + rows, but in the future we plan to make this more sophisticated and have different types of samples for different + Fields, or do a better job getting a more-random sample of rows." + (:require [medley.core :as m] + [metabase.driver :as driver] + [metabase.models.database :refer [Database]] [metabase.sync.interface :as i] - [schema.core :as s] - [toucan.db :as db])) + [schema.core :as s])) -(s/defn ^:always-validate basic-sample :- (s/maybe i/ValuesSample) - "Procure a sequence of non-nil values, up to `max-sync-lazy-seq-results` (10,000 at the time of this writing), for use - in the various tests above. Maybe return `nil` if no values are available." - [field :- i/FieldInstance] +(s/defn ^:private ^:always-validate basic-sample :- (s/maybe i/TableSample) + "Procure a sequence of table rows, up to `max-sample-rows` (10,000 at the time of this writing), for + use in the fingerprinting sub-stage of analysis. Returns `nil` if no rows are available." + [table :- i/TableInstance, fields :- [i/FieldInstance]] ;; TODO - we should make `->driver` a method so we can pass things like Fields into it - (let [db-id (db/select-one-field :db_id Table :id (:table_id field)) + (let [db-id (:db_id table) driver (driver/->driver db-id) database (Database db-id)] (driver/sync-in-context driver database (fn [] - (->> (driver/field-values-lazy-seq driver field) - (take driver/max-sync-lazy-seq-results) - (filter (complement nil?)) + (->> (driver/table-rows-sample driver table fields) + (take driver/max-sample-rows) seq))))) + +(s/defn ^:private ^:always-validate table-sample->field-sample :- (s/maybe i/FieldSample) + "Fetch a sample for the Field whose values are at INDEX in the TABLE-SAMPLE. + Filters out `nil` values; returns `nil` if a non-empty sample cannot be obtained." + [table-sample :- i/TableSample, i :- s/Int] + (->> (for [row table-sample] + (nth row i)) + (filter (complement nil?)) + seq)) + +(s/defn ^:always-validate sample-fields :- [(s/pair i/FieldInstance "Field", (s/maybe i/FieldSample) "FieldSample")] + "Fetch samples for a series of FIELDS. Returns tuples of Field and sample. + This may return `nil` if the driver doesn't support `table-rows-sample` or the sample could not be fetched for some + other reason." + [table :- i/TableInstance, fields :- [i/FieldInstance]] + (when-let [table-sample (basic-sample table fields)] + (for [[i field] (m/indexed fields)] + [field (table-sample->field-sample table-sample i)]))) diff --git a/src/metabase/sync/analyze/fingerprint/text.clj b/src/metabase/sync/analyze/fingerprint/text.clj index 5129e192696b85e9cb40f882da5fb853e7a0f621..f825073fe7c1f2f05fe2ae06fea1c6df3323c0e1 100644 --- a/src/metabase/sync/analyze/fingerprint/text.clj +++ b/src/metabase/sync/analyze/fingerprint/text.clj @@ -7,7 +7,7 @@ (s/defn ^:private ^:always-validate average-length :- (s/constrained Double #(>= % 0)) "Return the average length of VALUES." - [values :- i/ValuesSample] + [values :- i/FieldSample] (let [total-length (reduce + (for [value values] (count (str value))))] (/ (double total-length) @@ -15,7 +15,7 @@ (s/defn ^:private ^:always-validate percent-satisfying-predicate :- i/Percent "Return the percentage of VALUES that satisfy PRED." - [pred :- (s/pred fn?), values :- i/ValuesSample] + [pred :- (s/pred fn?), values :- i/FieldSample] (let [total-count (count values) pred #(boolean (u/ignore-exceptions (pred %))) matching-count (count (get (group-by pred values) true []))] @@ -32,7 +32,7 @@ (s/defn ^:always-validate text-fingerprint :- i/TextFingerprint "Generate a fingerprint containing information about values that belong to a `:type/Text` Field." - [values :- i/ValuesSample] + [values :- i/FieldSample] {:percent-json (percent-satisfying-predicate valid-serialized-json? values) :percent-url (percent-satisfying-predicate u/is-url? values) :percent-email (percent-satisfying-predicate u/is-email? values) diff --git a/src/metabase/sync/interface.clj b/src/metabase/sync/interface.clj index df7eb4de1737f7531c4a5c4da159d326a22fa12b..bdde20462eab3162148d90a725e3ae92c6485935 100644 --- a/src/metabase/sync/interface.clj +++ b/src/metabase/sync/interface.clj @@ -59,12 +59,24 @@ ;;; | SAMPLING & FINGERPRINTS | ;;; +----------------------------------------------------------------------------------------------------------------+ -(def ValuesSample - "Schema for a sample of VALUES returned by the `sample` sub-stage of analysis and passed into the `fingerprint` +(def FieldSample + "Schema for a sample of values returned by the `sample` sub-stage of analysis and passed into the `fingerprint` stage. Guaranteed to be non-empty and non-nil." ;; Validating against this is actually pretty quick, in the order of microseconds even for a 10,000 value sequence (s/constrained [(s/pred (complement nil?))] seq "Non-empty sequence of non-nil values.")) +(def TableSample + "Schema for a sample of values of certain Fields for a TABLE. This should basically just be a sequence of rows where + each row is a sequence of values in the same order as the Fields passed in (basically the format you get from JDBC + when `:as-arrays?` is `false`). + + e.g. if Fields passed in were `ID` and `Name` the Table sample should look something like: + + [[1 \"Rasta Toucan\"] + [2 \"Lucky Pigeon\"] + [3 \"Kanye Nest\"]]" + [[s/Any]]) + (def GlobalFingerprint "Fingerprint values that Fields of all types should have." diff --git a/test/metabase/api/database_test.clj b/test/metabase/api/database_test.clj index c8815a5dbf41d647494c00261e01d0762dd67a89..7ae4be03485f57584005baefbcf35f3fcc03e73f 100644 --- a/test/metabase/api/database_test.clj +++ b/test/metabase/api/database_test.clj @@ -171,9 +171,10 @@ id))] (when-let [dbs (seq (db/select [Database :name :engine :id] :id [:not-in ids-to-skip]))] (println (u/format-color 'red (str "\n!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" - "WARNING: deleting randomly created databases:\n%s\n" + "WARNING: deleting randomly created databases:\n%s" "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n\n") - (u/pprint-to-str dbs)))) + (u/pprint-to-str (for [db dbs] + (dissoc db :features)))))) (db/delete! Database :id [:not-in ids-to-skip]))) diff --git a/test/metabase/driver/bigquery_test.clj b/test/metabase/driver/bigquery_test.clj index 85252d0c9c36c1fed7d033c6b499bbb59add842f..335b869b15ef4ffd77b1965d142b3ba0c92f9836 100644 --- a/test/metabase/driver/bigquery_test.clj +++ b/test/metabase/driver/bigquery_test.clj @@ -1,13 +1,19 @@ (ns metabase.driver.bigquery-test (:require [expectations :refer :all] [metabase + [driver :as driver] [query-processor :as qp] [query-processor-test :as qptest]] + metabase.driver.bigquery + [metabase.models + [field :refer [Field]] + [table :refer [Table]]] [metabase.query-processor.middleware.expand :as ql] [metabase.test [data :as data] [util :as tu]] - [metabase.test.data.datasets :refer [expect-with-engine]])) + [metabase.test.data.datasets :refer [expect-with-engine]]) + (:import metabase.driver.bigquery.BigQueryDriver)) (def ^:private col-defaults {:remapped_to nil, :remapped_from nil}) @@ -21,6 +27,20 @@ :database (data/id)}) [:data :rows])) +;;; table-rows-sample +(expect-with-engine :bigquery + [[1 "Red Medicine"] + [2 "Stout Burgers & Beers"] + [3 "The Apple Pan"] + [4 "Wurstküche"] + [5 "Brite Spot Family Restaurant"]] + (->> (driver/table-rows-sample (BigQueryDriver.) + (Table (data/id :venues)) + [(Field (data/id :venues :id)) + (Field (data/id :venues :name))]) + (sort-by first) + (take 5))) + ;; make sure that BigQuery native queries maintain the column ordering specified in the SQL -- post-processing ordering shouldn't apply (Issue #2821) (expect-with-engine :bigquery diff --git a/test/metabase/driver/druid_test.clj b/test/metabase/driver/druid_test.clj index 029fc169691b3c504396ad62813dbce60172058b..31460d92678a953f7035de2d285a75eabfa0c67b 100644 --- a/test/metabase/driver/druid_test.clj +++ b/test/metabase/driver/druid_test.clj @@ -8,11 +8,31 @@ [query-processor-test :refer [rows rows+column-names]] [timeseries-query-processor-test :as timeseries-qp-test] [util :as u]] - [metabase.models.metric :refer [Metric]] + metabase.driver.druid + [metabase.models + [field :refer [Field]] + [table :refer [Table]] + [metric :refer [Metric]]] [metabase.query-processor.middleware.expand :as ql] [metabase.test.data :as data] [metabase.test.data.datasets :as datasets :refer [expect-with-engine]] - [toucan.util.test :as tt])) + [toucan.util.test :as tt]) + (:import metabase.driver.druid.DruidDriver)) + +;;; table-rows-sample +(datasets/expect-with-engine :druid + ;; druid returns a timestamp along with the query, but that shouldn't really matter here :D + [["1" "The Misfit Restaurant + Bar" "2014-04-07T07:00:00.000Z"] + ["10" "Dal Rae Restaurant" "2015-08-22T07:00:00.000Z"] + ["100" "PizzaHacker" "2014-07-26T07:00:00.000Z"] + ["1000" "Tito's Tacos" "2014-06-03T07:00:00.000Z"] + ["101" "Golden Road Brewing" "2015-09-04T07:00:00.000Z"]] + (->> (driver/table-rows-sample (DruidDriver.) + (Table (data/id :checkins)) + [(Field (data/id :checkins :id)) + (Field (data/id :checkins :venue_name))]) + (sort-by first) + (take 5))) (def ^:const ^:private ^String native-query-1 (json/generate-string diff --git a/test/metabase/driver/generic_sql_test.clj b/test/metabase/driver/generic_sql_test.clj index 75c4ed7337106e6a30af5abebd108d40a9eb4b84..b0fb417b1add90030cc8ecaa3509ca49646cac40 100644 --- a/test/metabase/driver/generic_sql_test.clj +++ b/test/metabase/driver/generic_sql_test.clj @@ -7,7 +7,8 @@ [table :as table :refer [Table]]] [metabase.test.data :refer :all] [metabase.test.data.datasets :as datasets] - [toucan.db :as db]) + [toucan.db :as db] + [metabase.test.data :as data]) (:import metabase.driver.h2.H2Driver)) (def ^:private users-table (delay (Table :name "USERS"))) @@ -64,14 +65,17 @@ :dest-column-name "ID"}} (driver/describe-table-fks (H2Driver.) (db) @venues-table)) -;;; FIELD-VALUES-LAZY-SEQ +;;; TABLE-ROWS-SAMPLE (datasets/expect-with-engines @generic-sql-engines - ["Red Medicine" - "Stout Burgers & Beers" - "The Apple Pan" - "Wurstküche" - "Brite Spot Family Restaurant"] - (take 5 (#'sql/field-values-lazy-seq datasets/*driver* (db/select-one 'Field :id (id :venues :name))))) + [["20th Century Cafe"] + ["25°"] + ["33 Taps"] + ["800 Degrees Neapolitan Pizzeria"] + ["BCD Tofu House"]] + (->> (#'sql/table-rows-sample datasets/*driver* (Table (data/id :venues)) [(Field (data/id :venues :name))]) + ;; since order is not guaranteed do some sorting here so we always get the same results + (sort-by first) + (take 5))) ;;; TABLE-ROWS-SEQ diff --git a/test/metabase/driver/mongo_test.clj b/test/metabase/driver/mongo_test.clj index 132c3384f4ac7b8aecdb1ee3632ee0791ee26c35..40bb925810e44bdc08b4852444c8eb44b3d5a13c 100644 --- a/test/metabase/driver/mongo_test.clj +++ b/test/metabase/driver/mongo_test.clj @@ -117,6 +117,22 @@ :pk? true}}} (driver/describe-table (MongoDriver.) (data/db) (Table (data/id :venues)))) + +;;; table-rows-sample +(datasets/expect-with-engine :mongo + [[1 "Red Medicine"] + [2 "Stout Burgers & Beers"] + [3 "The Apple Pan"] + [4 "Wurstküche"] + [5 "Brite Spot Family Restaurant"]] + (driver/sync-in-context (MongoDriver.) (data/db) + (fn [] + (vec (take 5 (driver/table-rows-sample (MongoDriver.) + (Table (data/id :venues)) + [(Field (data/id :venues :id)) + (Field (data/id :venues :name))])))))) + + ;; ## Big-picture tests for the way data should look post-sync ;; Test that Tables got synced correctly, and row counts are correct diff --git a/test/metabase/driver/presto_test.clj b/test/metabase/driver/presto_test.clj index 476d70ab0adc50e7f0a8db93ddfa596dce941724..5cc83cfdd75f7d72525bb2f380a458c96a46cb10 100644 --- a/test/metabase/driver/presto_test.clj +++ b/test/metabase/driver/presto_test.clj @@ -2,7 +2,9 @@ (:require [expectations :refer :all] [metabase.driver :as driver] [metabase.driver.generic-sql :as sql] - [metabase.models.table :as table] + [metabase.models + [field :refer [Field]] + [table :refer [Table] :as table]] [metabase.test [data :as data] [util :refer [resolve-private-vars]]] @@ -96,14 +98,16 @@ :base-type :type/Integer}}} (driver/describe-table (PrestoDriver.) (data/db) (db/select-one 'Table :id (data/id :venues)))) -;;; FIELD-VALUES-LAZY-SEQ +;;; TABLE-ROWS-SAMPLE (datasets/expect-with-engine :presto - ["Red Medicine" - "Stout Burgers & Beers" - "The Apple Pan" - "Wurstküche" - "Brite Spot Family Restaurant"] - (take 5 (driver/field-values-lazy-seq (PrestoDriver.) (db/select-one 'Field :id (data/id :venues :name))))) + [["Red Medicine"] + ["Stout Burgers & Beers"] + ["The Apple Pan"] + ["Wurstküche"] + ["Brite Spot Family Restaurant"]] + (take 5 (driver/table-rows-sample (PrestoDriver.) + (Table (data/id :venues)) + [(Field (data/id :venues :name))]))) ;;; TABLE-ROWS-SEQ (datasets/expect-with-engine :presto diff --git a/test/metabase/sync/analyze/fingerprint/sample_test.clj b/test/metabase/sync/analyze/fingerprint/sample_test.clj new file mode 100644 index 0000000000000000000000000000000000000000..df27eb39d32d7b52302e390799d9c306440f0fbb --- /dev/null +++ b/test/metabase/sync/analyze/fingerprint/sample_test.clj @@ -0,0 +1,63 @@ +(ns metabase.sync.analyze.fingerprint.sample-test + (:require [expectations :refer :all] + [metabase.models + [field :refer [Field]] + [table :refer [Table]]] + [metabase.sync.analyze.fingerprint.sample :as sample] + [metabase.test.data :as data])) + +;;; +----------------------------------------------------------------------------------------------------------------+ +;;; | TESTS FOR BASIC-SAMPLE | +;;; +----------------------------------------------------------------------------------------------------------------+ + +;; Actually the order the rows come back in isn't really guaranteed so this test is sort of testing a circumstantial +;; side-effect of the way H2 returns rows when order isn't specified +(expect + [[1 "Red Medicine"] + [2 "Stout Burgers & Beers"] + [3 "The Apple Pan"] + [4 "Wurstküche"] + [5 "Brite Spot Family Restaurant"]] + (take 5 (#'sample/basic-sample + (Table (data/id :venues)) + [(Field (data/id :venues :id)) + (Field (data/id :venues :name))]))) + +;;; +----------------------------------------------------------------------------------------------------------------+ +;;; | TESTS FOR TABLE-SAMPLE->FIELD-SAMPLE | +;;; +----------------------------------------------------------------------------------------------------------------+ + +(def ^:private table-sample + [[100 "ABC" nil] + [200 "DEF" nil] + [300 nil nil] + [400 "GHI" nil] + [500 "JKL" nil]]) + +(expect + [100 200 300 400 500] + (#'sample/table-sample->field-sample table-sample 0)) + +;; should skip any `nil` values +(expect + ["ABC" "DEF" "GHI" "JKL"] + (#'sample/table-sample->field-sample table-sample 1)) + +;; should return `nil` if all values are `nil` (instead of empty sequence) +(expect + nil + (#'sample/table-sample->field-sample table-sample 2)) + + +;;; +----------------------------------------------------------------------------------------------------------------+ +;;; | TESTS FOR SAMPLE-FIELDS | +;;; +----------------------------------------------------------------------------------------------------------------+ + +(expect + [["ID" [1 2 3 4 5]] + ["NAME" ["Red Medicine" "Stout Burgers & Beers" "The Apple Pan" "Wurstküche" "Brite Spot Family Restaurant"]]] + (for [[field sample] (sample/sample-fields + (Table (data/id :venues)) + [(Field (data/id :venues :id)) + (Field (data/id :venues :name))])] + [(:name field) (take 5 sample)])) diff --git a/test/metabase/sync/analyze/fingerprint_test.clj b/test/metabase/sync/analyze/fingerprint_test.clj index aaa9c9feeadcd4e323a30d31adb8cdfae2968544..0ef8ebc51f4923fcd7d3de9400d3e43a9eb8d9d6 100644 --- a/test/metabase/sync/analyze/fingerprint_test.clj +++ b/test/metabase/sync/analyze/fingerprint_test.clj @@ -2,37 +2,42 @@ "Basic tests to make sure the fingerprint generatation code is doing something that makes sense." (:require [expectations :refer :all] [metabase.models - [field :refer [Field]] + [field :as field :refer [Field]] [table :refer [Table]]] [metabase.sync.analyze.fingerprint :as fingerprint] + [metabase.sync.analyze.fingerprint.sample :as sample] [metabase.sync.interface :as i] [metabase.test.data :as data] [metabase.util :as u] [toucan.db :as db] [toucan.util.test :as tt])) +(defn- fingerprint [field] + (let [[[_ sample]] (sample/sample-fields (field/table field) [field])] + (#'fingerprint/fingerprint field sample))) + ;; basic test for a numeric Field (expect {:global {:distinct-count 4} :type {:type/Number {:min 1, :max 4, :avg 2.03}}} - (#'fingerprint/fingerprint (Field (data/id :venues :price)))) + (fingerprint (Field (data/id :venues :price)))) ;; basic test for a Text Field (expect {:global {:distinct-count 100} :type {:type/Text {:percent-json 0.0, :percent-url 0.0, :percent-email 0.0, :average-length 15.63}}} - (#'fingerprint/fingerprint (Field (data/id :venues :name)))) + (fingerprint (Field (data/id :venues :name)))) ;; a non-integer numeric Field (expect {:global {:distinct-count 94} :type {:type/Number {:min 10.0646, :max 40.7794, :avg 35.50589199999998}}} - (#'fingerprint/fingerprint (Field (data/id :venues :latitude)))) + (fingerprint (Field (data/id :venues :latitude)))) ;; a datetime field (expect {:global {:distinct-count 618}} - (#'fingerprint/fingerprint (Field (data/id :checkins :date)))) + (fingerprint (Field (data/id :checkins :date)))) ;;; +----------------------------------------------------------------------------------------------------------------+ @@ -127,9 +132,11 @@ ;; Make sure that the above functions are used correctly to determine which Fields get (re-)fingerprinted (defn- field-was-fingerprinted? {:style/indent 0} [fingerprint-versions field-properties] - (let [fingerprinted? (atom false)] + (let [fingerprinted? (atom false) + fake-field (field/map->FieldInstance {:name "Fake Field"})] (with-redefs [i/fingerprint-version->types-that-should-be-re-fingerprinted fingerprint-versions - fingerprint/fingerprint! (fn [& _] (reset! fingerprinted? true))] + sample/sample-fields (constantly [[fake-field [1 2 3 4 5]]]) + fingerprint/save-fingerprint! (fn [& _] (reset! fingerprinted? true))] (tt/with-temp* [Table [table] Field [_ (assoc field-properties :table_id (u/get-id table))]] (fingerprint/fingerprint-fields! table)) @@ -197,14 +204,16 @@ ;; Make sure the `fingerprint!` function is correctly updating the correct columns of Field (expect - {:fingerprint {:fake-fingerprint? true} + {:fingerprint {:experimental {:fake-fingerprint? true}} :fingerprint_version 3 :last_analyzed nil} - (with-redefs [i/latest-fingerprint-version 3 - fingerprint/fingerprint (constantly {:fake-fingerprint? true})] - (tt/with-temp Field [field {:base_type :type/Integer - :fingerprint nil - :fingerprint_version 1 - :last_analyzed (u/->Timestamp "2017-08-09")}] - (#'fingerprint/fingerprint! field) + (tt/with-temp Field [field {:base_type :type/Integer + :table_id (data/id :venues) + :fingerprint nil + :fingerprint_version 1 + :last_analyzed (u/->Timestamp "2017-08-09")}] + (with-redefs [i/latest-fingerprint-version 3 + sample/sample-fields (constantly [[field [1 2 3 4 5]]]) + fingerprint/fingerprint (constantly {:experimental {:fake-fingerprint? true}})] + (#'fingerprint/fingerprint-table! (Table (data/id :venues)) [field]) (db/select-one [Field :fingerprint :fingerprint_version :last_analyzed] :id (u/get-id field))))) diff --git a/test/metabase/sync/analyze/special_types/values_test.clj b/test/metabase/sync/analyze/special_types/values_test.clj index 6eb021652a85b4776c628c3f4d91538d67d520c9..223a2a4da3e74635d22190cb05e8935568f07852 100644 --- a/test/metabase/sync/analyze/special_types/values_test.clj +++ b/test/metabase/sync/analyze/special_types/values_test.clj @@ -1,7 +1,10 @@ (ns metabase.sync.analyze.special-types.values-test - (:require [metabase.models.field :refer [Field]] + (:require [metabase.models + [field :refer [Field]] + [table :refer [Table]]] [metabase.query-processor-test :as qp-test] [metabase.sync.analyze.fingerprint :as fingerprint] + [metabase.sync.analyze.fingerprint.sample :as sample] [metabase.test.data :as data] [metabase.test.data.datasets :as datasets])) @@ -9,5 +12,7 @@ ;; This test won't work for Druid because it doesn't have a 'venues' Table. TODO - Add a test for Druid as well (datasets/expect-with-engines qp-test/non-timeseries-engines 16 - (Math/round (get-in (#'fingerprint/fingerprint (Field (data/id :venues :name))) - [:type :type/Text :average-length]))) + (let [field (Field (data/id :venues :name)) + [[_ sample]] (sample/sample-fields (Table (data/id :venues)) [field])] + (Math/round (get-in (#'fingerprint/fingerprint field sample) + [:type :type/Text :average-length])))) diff --git a/test/metabase/sync_database_test.clj b/test/metabase/sync_database_test.clj index 5fc1cdc381d380cee3ecbe23e7a0c4e41a228900..eff38b70ee3e1c24db16fab27b3a7fccc29392f7 100644 --- a/test/metabase/sync_database_test.clj +++ b/test/metabase/sync_database_test.clj @@ -57,6 +57,11 @@ :schema nil} :dest-column-name "studio"}}))) +;; enough values that it won't get marked as a Category, but still get a fingerprint or w/e +(defn- table-rows-sample [_ _ fields] + (for [i (range 500)] + (repeat (count fields) i))) + (extend SyncTestDriver driver/IDriver (merge driver/IDriverDefaultsMixin @@ -65,8 +70,7 @@ :describe-table-fks describe-table-fks :features (constantly #{:foreign-keys}) :details-fields (constantly []) - ;; enough values that it won't get marked as a Category, but still get a fingerprint or w/e - :field-values-lazy-seq (fn [& _] (range 500))})) + :table-rows-sample table-rows-sample})) (driver/register-driver! :sync-test (SyncTestDriver.)) @@ -160,6 +164,15 @@ (sync-database! db) (mapv table-details (db/select Table, :db_id (u/get-id db), {:order-by [:name]})))) +;; NOCOMMIT +(defn- x [] + (tt/with-temp Database [db {:engine :sync-test}] + (sync-database! db) + ;; we are purposely running the sync twice to test for possible logic issues which only manifest + ;; on resync of a database, such as adding tables that already exist or duplicating fields + (sync-database! db) + (mapv table-details (db/select Table, :db_id (u/get-id db), {:order-by [:name]})))) + ;; ## SYNC TABLE diff --git a/test/metabase/test/data/bigquery.clj b/test/metabase/test/data/bigquery.clj index cd6b667c0793b7c2a1b6cdaa22a6b5e37af6a88d..1edbfd3d1e2fa54b0563d0b8984fb1a974419094 100644 --- a/test/metabase/test/data/bigquery.clj +++ b/test/metabase/test/data/bigquery.clj @@ -12,11 +12,11 @@ [metabase.util :as u]) (:import com.google.api.client.util.DateTime com.google.api.services.bigquery.Bigquery - [com.google.api.services.bigquery.model Dataset DatasetReference QueryRequest Table TableDataInsertAllRequest TableDataInsertAllRequest$Rows TableFieldSchema TableReference TableRow TableSchema] + [com.google.api.services.bigquery.model Dataset DatasetReference QueryRequest Table + TableDataInsertAllRequest TableDataInsertAllRequest$Rows TableFieldSchema TableReference TableRow + TableSchema] metabase.driver.bigquery.BigQueryDriver)) -(resolve-private-vars metabase.driver.bigquery post-process-native) - ;;; # ------------------------------------------------------------ CONNECTION DETAILS ------------------------------------------------------------ (def ^:private ^String normalize-name (comp (u/rpartial s/replace #"-" "_") name)) @@ -75,9 +75,10 @@ (println (u/format-color 'blue "Created BigQuery table '%s.%s'." dataset-id table-id))) (defn- table-row-count ^Integer [^String dataset-id, ^String table-id] - (ffirst (:rows (post-process-native (google/execute (.query (.jobs bigquery) project-id - (doto (QueryRequest.) - (.setQuery (format "SELECT COUNT(*) FROM [%s.%s]" dataset-id table-id))))))))) + (ffirst (:rows (#'bigquery/post-process-native + (google/execute (.query (.jobs bigquery) project-id + (doto (QueryRequest.) + (.setQuery (format "SELECT COUNT(*) FROM [%s.%s]" dataset-id table-id))))))))) ;; This is a dirty HACK (defn- ^DateTime timestamp-honeysql-form->GoogleDateTime diff --git a/test/metabase/test/mock/toucanery.clj b/test/metabase/test/mock/toucanery.clj index b9e2a022d0f18fb93fc96706c1c6f830799b6a6d..fe7360f88bba04156201dd347e39bba11cb368cf 100644 --- a/test/metabase/test/mock/toucanery.clj +++ b/test/metabase/test/mock/toucanery.clj @@ -54,6 +54,11 @@ [{:keypath "movies.filming.description", :value "If the movie is currently being filmed."} {:keypath "movies.description", :value "A cinematic adventure."}])) +;; enough so it can get fingerprinted, but not be a category +(defn- table-rows-sample [_ _ fields] + (for [i (range 500)] + (repeat (count fields) i))) + (defrecord ToucaneryDriver [] clojure.lang.Named @@ -62,12 +67,12 @@ (extend ToucaneryDriver driver/IDriver (merge driver/IDriverDefaultsMixin - {:describe-database describe-database - :describe-table describe-table - :features (constantly #{:dynamic-schema :nested-fields}) - :details-fields (constantly []) - :field-values-lazy-seq (fn [& _] (range 500)) ; enough so it can get fingerprinted, but not be a category - :table-rows-seq table-rows-seq})) + {:describe-database describe-database + :describe-table describe-table + :features (constantly #{:dynamic-schema :nested-fields}) + :details-fields (constantly []) + :table-rows-seq table-rows-seq + :table-rows-sample table-rows-sample})) (driver/register-driver! :toucanery (ToucaneryDriver.))