From f5bd634428d9a13a4b1332f7fa977727c80f3b9a Mon Sep 17 00:00:00 2001 From: Alexander Solovyov <alexander@solovyov.net> Date: Wed, 24 Apr 2024 16:38:59 +0300 Subject: [PATCH] Store fields used in MBQL cards in QueryField (#41623) --- .../migrations/001_update_migrations.yaml | 16 +++--- src/metabase/db/custom_migrations.clj | 46 ++++++++++++---- src/metabase/lib/util.cljc | 9 ++-- src/metabase/models/card.clj | 6 +-- src/metabase/models/field_usage.clj | 6 +-- src/metabase/models/query_field.clj | 47 +++++++++++++++- src/metabase/native_query_analyzer.clj | 54 +++++-------------- test/metabase/db/custom_migrations_test.clj | 35 +++++++----- test/metabase/models/query_field_test.clj | 23 ++++++++ .../metabase/pulse/pulse_integration_test.clj | 4 +- 10 files changed, 162 insertions(+), 84 deletions(-) diff --git a/resources/migrations/001_update_migrations.yaml b/resources/migrations/001_update_migrations.yaml index 210db42e876..2217b187e67 100644 --- a/resources/migrations/001_update_migrations.yaml +++ b/resources/migrations/001_update_migrations.yaml @@ -6542,14 +6542,6 @@ databaseChangeLog: - customChange: class: "metabase.db.custom_migrations.CreateSampleContent" - - changeSet: - id: v50.2024-04-09T15:55:23 - author: piranha - comment: Backfill query_field for native queries - changes: - - customChange: - class: "metabase.db.custom_migrations.BackfillQueryField" - - changeSet: id: v50.2024-04-12T12:33:09 author: piranha @@ -6569,6 +6561,14 @@ databaseChangeLog: relativeToChangelogFile: true rollback: # no rollback necessary, we're not removing the columns yet + - changeSet: + id: v50.2024-04-18T14:33:19 + author: piranha + comment: Backfill query_field + changes: + - customChange: + class: "metabase.db.custom_migrations.BackfillQueryField" + - changeSet: id: v50.2024-04-19T17:04:04 author: noahmoss diff --git a/src/metabase/db/custom_migrations.clj b/src/metabase/db/custom_migrations.clj index 383b6fac973..fc9958b7706 100644 --- a/src/metabase/db/custom_migrations.clj +++ b/src/metabase/db/custom_migrations.clj @@ -1259,13 +1259,39 @@ (qs/shutdown scheduler))) (define-migration BackfillQueryField - (let [update-query-fields! (requiring-resolve 'metabase.native-query-analyzer/update-query-fields-for-card!) - cards (t2/select :model/Card :id [:in {:from [[:report_card :c]] - :left-join [[:query_field :f] [:= :f.card_id :c.id]] - :select [:c.id] - :where [:and - [:not :c.archived] - [:= :c.query_type "native"] - [:= :f.id nil]]}])] - (doseq [card cards] - (update-query-fields! card)))) + (let [field-ids-for-sql (requiring-resolve 'metabase.native-query-analyzer/field-ids-for-sql) + ;; practically #'metabase.models.query-field/field-ids-for-mbql + mbql-fields-xf (comp + (map #(match % + ["field" (id :guard integer?) opts] [id (:source-field opts)] + :else nil)) + cat + (remove nil?)) + field-ids-for-mbql (fn [query] + {:direct (->> (tree-seq coll? seq query) + (into #{} mbql-fields-xf) + not-empty)}) + cards (t2/select :report_card :id [:in {:from [[:report_card :c]] + :left-join [[:query_field :f] [:= :f.card_id :c.id]] + :select [:c.id] + :where [:and + [:not :c.archived] + [:= :f.id nil]]}])] + (doseq [{card-id :id query :dataset_query :as card} cards] + (let [query (json/parse-string query true) + {:keys [direct indirect] :as res} (if (= (:type query) "native") + (field-ids-for-sql query) + (field-ids-for-mbql query)) + id->record (fn [direct? field-id] + {:card_id card-id + :field_id field-id + :direct_reference direct?}) + records (concat + (map (partial id->record true) direct) + (map (partial id->record false) indirect)) + known (set + (when (seq records) + (t2/select-fn-set :id :metabase_field :id [:in (map :field_id records)]))) + records (filterv (comp known :field_id) records)] + (when (seq records) + (t2/insert! :query_field records)))))) diff --git a/src/metabase/lib/util.cljc b/src/metabase/lib/util.cljc index fb8faaabb00..660beb0c067 100644 --- a/src/metabase/lib/util.cljc +++ b/src/metabase/lib/util.cljc @@ -575,7 +575,10 @@ (when (#{:mbql/query :query :native :internal} query-type) query-type)))) -(mu/defn referenced-field-ids :- [:maybe [:sequential ::lib.schema.id/field]] - "Find all the integer field IDs in ``, Which can arbitrarily be anything that is part of MLv2 query schema." +(mu/defn referenced-field-ids :- [:maybe [:set ::lib.schema.id/field]] + "Find all the integer field IDs in `coll`, Which can arbitrarily be anything that is part of MLv2 query schema." [coll] - (lib.util.match/match coll [:field _ (id :guard int?)] id)) + (not-empty + (into #{} + (comp cat (filter some?)) + (lib.util.match/match coll [:field opts (id :guard int?)] [id (:source-field opts)])))) diff --git a/src/metabase/models/card.clj b/src/metabase/models/card.clj index a4943421c73..258271251ec 100644 --- a/src/metabase/models/card.clj +++ b/src/metabase/models/card.clj @@ -29,10 +29,10 @@ [metabase.models.permissions :as perms] [metabase.models.pulse :as pulse] [metabase.models.query :as query] + [metabase.models.query-field :as query-field] [metabase.models.revision :as revision] [metabase.models.serialization :as serdes] [metabase.moderation :as moderation] - [metabase.native-query-analyzer :as query-analyzer] [metabase.public-settings :as public-settings] [metabase.public-settings.premium-features :as premium-features @@ -527,7 +527,7 @@ (log/info "Card references Fields in params:" field-ids) (field-values/update-field-values-for-on-demand-dbs! field-ids)) (parameter-card/upsert-or-delete-from-parameters! "card" (:id card) (:parameters card)) - (query-analyzer/update-query-fields-for-card! card))) + (query-field/update-query-fields-for-card! card))) (t2/define-before-update :model/Card [{:keys [verified-result-metadata?] :as card}] @@ -555,7 +555,7 @@ [card] (u/prog1 card (when (contains? (t2/changes card) :dataset_query) - (query-analyzer/update-query-fields-for-card! card)))) + (query-field/update-query-fields-for-card! card)))) ;; Cards don't normally get deleted (they get archived instead) so this mostly affects tests (t2/define-before-delete :model/Card diff --git a/src/metabase/models/field_usage.clj b/src/metabase/models/field_usage.clj index c2819a6dc7d..1a4aa78c222 100644 --- a/src/metabase/models/field_usage.clj +++ b/src/metabase/models/field_usage.clj @@ -56,8 +56,8 @@ :breakout_binning_num_bins (:num-bins binning-option)})))) (defn- expression->field-usage - [expresison-clause] - (when-let [field-ids (seq (lib.util/referenced-field-ids expresison-clause))] + [expression-clause] + (when-let [field-ids (seq (lib.util/referenced-field-ids expression-clause))] (for [field-id field-ids] {:field_id field-id :used_in :expression}))) @@ -66,7 +66,7 @@ (defn- join->field-usages [query join] - (let [join-query (fetch-source-query/resolve-source-cards (assoc query :stages (get join :stages)))] + (let [join-query (fetch-source-query/resolve-source-cards (assoc query :stages (:stages join)))] ;; treat the source query as a :mbql/query (pmbql->field-usages join-query))) diff --git a/src/metabase/models/query_field.clj b/src/metabase/models/query_field.clj index c5baf273198..04ed030ff6a 100644 --- a/src/metabase/models/query_field.clj +++ b/src/metabase/models/query_field.clj @@ -1,10 +1,55 @@ (ns metabase.models.query-field (:require + [metabase.legacy-mbql.util :as mbql.u] + [metabase.native-query-analyzer :as query-analyzer] + [metabase.util.log :as log] [methodical.core :as methodical] [toucan2.core :as t2])) - (methodical/defmethod t2/table-name :model/QueryField [_model] :query_field) (doto :model/QueryField (derive :metabase/model)) + +;;; Updating QueryField from card + +(defn- field-ids-for-mbql + "Find out ids of all fields used in a query. Conforms to the same protocol as [[query-analyzer/field-ids-for-sql]], + so returns `{:direct #{...int ids}}` map. + + Does not track wildcards for queries rendered as tables afterwards." + [query] + {:direct (mbql.u/referenced-field-ids query)}) + +(defn update-query-fields-for-card! + "Clears QueryFields associated with this card and creates fresh, up-to-date-ones. + + Any card is accepted, but this functionality only works for ones with a native query. + + If you're invoking this from a test, be sure to turn on [[*parse-queries-in-test?*]]. + + Returns `nil` (and logs the error) if there was a parse error." + [{card-id :id, query :dataset_query}] + (when query + (try + (let [{:keys [direct indirect] :as res} (case (:type query) + :native (query-analyzer/field-ids-for-sql query) + :query (field-ids-for-mbql query) + nil nil) + id->record (fn [direct? field-id] + {:card_id card-id + :field_id field-id + :direct_reference direct?}) + query-field-records (concat + (map (partial id->record true) direct) + (map (partial id->record false) indirect))] + ;; when response is `nil`, it's a disabled parser, not unknown columns + (when (some? res) + ;; This feels inefficient at first glance, but the number of records should be quite small and doing some + ;; sort of upsert-or-delete would involve comparisons in Clojure-land that are more expensive than just + ;; "killing and filling" the records. + (t2/with-transaction [_conn] + (t2/delete! :model/QueryField :card_id card-id) + (t2/insert! :model/QueryField query-field-records)))) + (catch Exception e + (log/error e "Error parsing native query"))))) diff --git a/src/metabase/native_query_analyzer.clj b/src/metabase/native_query_analyzer.clj index 88c68b94067..0bf1887e4b2 100644 --- a/src/metabase/native_query_analyzer.clj +++ b/src/metabase/native_query_analyzer.clj @@ -12,7 +12,6 @@ [metabase.native-query-analyzer.parameter-substitution :as nqa.sub] [metabase.public-settings :as public-settings] [metabase.util :as u] - [metabase.util.log :as log] [toucan2.core :as t2])) (def ^:dynamic *parse-queries-in-test?* @@ -80,50 +79,21 @@ ;; limit to the named tables (seq table-wildcards) (active-fields-from-tables table-wildcards)))) -(defn- field-ids-for-card +(defn field-ids-for-sql "Returns a `{:direct #{...} :indirect #{...}}` map with field IDs that (may) be referenced in the given cards's query. Errs on the side of optimism: i.e., it may return fields that are *not* in the query, and is unlikely to fail to return fields that are in the query. Direct references are columns that are named in the query; indirect ones are from wildcards. If a field could be both direct and indirect, it will *only* show up in the `:direct` set." - [card] - (let [query (:dataset_query card) - db-id (:database query) - sql-string (:query (nqa.sub/replace-tags query)) - parsed-query (macaw/query->components (macaw/parsed-query sql-string)) - direct-ids (direct-field-ids-for-query parsed-query db-id) - indirect-ids (set/difference - (indirect-field-ids-for-query parsed-query db-id) - direct-ids)] - {:direct direct-ids - :indirect indirect-ids})) - -(defn update-query-fields-for-card! - "Clears QueryFields associated with this card and creates fresh, up-to-date-ones. - - Any card is accepted, but this functionality only works for ones with a native query. - - If you're invoking this from a test, be sure to turn on [[*parse-queries-in-test?*]]. - - Returns `nil` (and logs the error) if there was a parse error." - [{card-id :id, query :dataset_query :as card}] - (when (and (active?) - (= :native (:type query))) - (try - (let [{:keys [direct indirect]} (field-ids-for-card card) - id->record (fn [direct? field-id] - {:card_id card-id - :field_id field-id - :direct_reference direct?}) - query-field-records (concat - (map (partial id->record true) direct) - (map (partial id->record false) indirect))] - ;; This feels inefficient at first glance, but the number of records should be quite small and doing some sort - ;; of upsert-or-delete would involve comparisons in Clojure-land that are more expensive than just "killing and - ;; filling" the records. - (t2/with-transaction [_conn] - (t2/delete! :model/QueryField :card_id card-id) - (t2/insert! :model/QueryField query-field-records))) - (catch Exception e - (log/error e "Error parsing native query"))))) + [query] + (when (active?) + (let [db-id (:database query) + sql-string (:query (nqa.sub/replace-tags query)) + parsed-query (macaw/query->components (macaw/parsed-query sql-string)) + direct-ids (direct-field-ids-for-query parsed-query db-id) + indirect-ids (set/difference + (indirect-field-ids-for-query parsed-query db-id) + direct-ids)] + {:direct direct-ids + :indirect indirect-ids}))) diff --git a/test/metabase/db/custom_migrations_test.clj b/test/metabase/db/custom_migrations_test.clj index b3f1f091179..41f9c157cfc 100644 --- a/test/metabase/db/custom_migrations_test.clj +++ b/test/metabase/db/custom_migrations_test.clj @@ -1675,16 +1675,22 @@ (is (nil? (:cache_field_values_schedule db)))))))))) (deftest backfill-query-field-test - (impl/test-migrations "v50.2024-04-09T15:55:23" [migrate!] + (impl/test-migrations "v50.2024-04-18T14:33:19" [migrate!] (let [user-id (:id (new-instance-with-default :core_user)) - ;; it is already `false`, but binding it anyway to indicate it's important - card-id (binding [query-analyzer/*parse-queries-in-test?* false] - (:id (new-instance-with-default - :report_card - {:creator_id user-id - :database_id (mt/id) - :query_type "native" - :dataset_query (json/generate-string (mt/native-query {:query "SELECT id FROM venues"}))}))) + c1-id (:id (new-instance-with-default + :report_card + {:creator_id user-id + :database_id (mt/id) + :query_type "native" + :dataset_query (json/generate-string + (mt/native-query {:query "SELECT id FROM venues"}))})) + c2-id (:id (new-instance-with-default + :report_card + {:creator_id user-id + :database_id (mt/id) + :query_type "query" + :dataset_query (json/generate-string + (mt/mbql-query venues {:aggregation [[:distinct $name]]}))})) archived-id (binding [query-analyzer/*parse-queries-in-test?* false] (:id (new-instance-with-default :report_card @@ -1692,15 +1698,20 @@ :creator_id user-id :database_id (mt/id) :query_type "native" - :dataset_query (json/generate-string (mt/native-query {:query "SELECT id FROM venues"}))}))) + :dataset_query (json/generate-string + (mt/native-query {:query "SELECT id FROM venues"}))}))) ;; (first (vals %)) are necessary since h2 generates :count(id) as name for column get-count #(t2/select-one-fn (comp first vals) [:model/QueryField [[:count :id]]] :card_id %)] (testing "QueryField is empty - queries weren't analyzed" - (is (zero? (get-count card-id))) + ;; they were not analyzed since `new-instance-with-default` inserts directly into the table, omitting model + ;; hooks + (is (zero? (get-count c1-id))) + (is (zero? (get-count c2-id))) (is (zero? (get-count archived-id)))) (binding [query-analyzer/*parse-queries-in-test?* true] (migrate!)) (testing "QueryField is filled now" - (is (pos? (get-count card-id))) + (is (pos? (get-count c1-id))) + (is (pos? (get-count c2-id))) (testing "but not for archived card" (is (zero? (get-count archived-id)))))))) diff --git a/test/metabase/models/query_field_test.clj b/test/metabase/models/query_field_test.clj index af97442b554..f8e30920b7e 100644 --- a/test/metabase/models/query_field_test.clj +++ b/test/metabase/models/query_field_test.clj @@ -2,6 +2,8 @@ (:require [clojure.set :as set] [clojure.test :refer :all] + [metabase.models :refer [Card]] + [metabase.models.query-field :as query-field] [metabase.native-query-analyzer :as query-analyzer] [metabase.test :as mt] [toucan2.core :as t2] @@ -129,3 +131,24 @@ ;; subset since it also includes the PKs/FKs (is (set/subset? #{total-qf tax-qf} (t2/select-fn-set qf->map :model/QueryField :card_id card-id :direct_reference true)))))))) + +(deftest ^:parallel parse-mbql-test + (testing "Parsing MBQL query returns correct used fields" + (mt/with-temp [Card c1 {:dataset_query (mt/mbql-query venues + {:aggregation [[:distinct $name] + [:distinct $price]] + :limit 5})} + Card c2 {:dataset_query {:query {:source-table (str "card__" (:id c1))} + :database (:id (mt/db)) + :type :query}} + Card c3 {:dataset_query (mt/mbql-query checkins + {:joins [{:source-table (str "card__" (:id c2)) + :alias "Venues" + :condition [:= $checkins.venue_id $venues.id]}]})}] + (mt/$ids + (is (= {:direct #{%venues.name %venues.price}} + (#'query-field/field-ids-for-mbql (:dataset_query c1)))) + (is (= {:direct nil} + (#'query-field/field-ids-for-mbql (:dataset_query c2)))) + (is (= {:direct #{%venues.id %checkins.venue_id}} + (#'query-field/field-ids-for-mbql (:dataset_query c3)))))))) diff --git a/test/metabase/pulse/pulse_integration_test.clj b/test/metabase/pulse/pulse_integration_test.clj index 52e212caa64..4dea84c5033 100644 --- a/test/metabase/pulse/pulse_integration_test.clj +++ b/test/metabase/pulse/pulse_integration_test.clj @@ -292,7 +292,7 @@ [date-str n] (format "WITH T AS (SELECT CAST('%s' AS TIMESTAMP) AS example_timestamp), - SAMPLE AS (SELECT T.example_timestamp AS full_datetime_utc, + \"SAMPLE\" AS (SELECT T.example_timestamp AS full_datetime_utc, T.example_timestamp AT TIME ZONE 'US/Pacific' AS full_datetime_pacific, CAST(T.example_timestamp AS TIMESTAMP) AS example_timestamp, CAST(T.example_timestamp AS TIMESTAMP WITH TIME ZONE) AS example_timestamp_with_time_zone, @@ -308,7 +308,7 @@ EXTRACT(SECOND FROM T.example_timestamp) AS example_second FROM T) SELECT * - FROM SAMPLE + FROM \"SAMPLE\" CROSS JOIN generate_series(1, %s);" date-str n)) -- GitLab