From f5bd634428d9a13a4b1332f7fa977727c80f3b9a Mon Sep 17 00:00:00 2001
From: Alexander Solovyov <alexander@solovyov.net>
Date: Wed, 24 Apr 2024 16:38:59 +0300
Subject: [PATCH] Store fields used in MBQL cards in QueryField (#41623)

---
 .../migrations/001_update_migrations.yaml     | 16 +++---
 src/metabase/db/custom_migrations.clj         | 46 ++++++++++++----
 src/metabase/lib/util.cljc                    |  9 ++--
 src/metabase/models/card.clj                  |  6 +--
 src/metabase/models/field_usage.clj           |  6 +--
 src/metabase/models/query_field.clj           | 47 +++++++++++++++-
 src/metabase/native_query_analyzer.clj        | 54 +++++--------------
 test/metabase/db/custom_migrations_test.clj   | 35 +++++++-----
 test/metabase/models/query_field_test.clj     | 23 ++++++++
 .../metabase/pulse/pulse_integration_test.clj |  4 +-
 10 files changed, 162 insertions(+), 84 deletions(-)

diff --git a/resources/migrations/001_update_migrations.yaml b/resources/migrations/001_update_migrations.yaml
index 210db42e876..2217b187e67 100644
--- a/resources/migrations/001_update_migrations.yaml
+++ b/resources/migrations/001_update_migrations.yaml
@@ -6542,14 +6542,6 @@ databaseChangeLog:
         - customChange:
             class: "metabase.db.custom_migrations.CreateSampleContent"
 
-  - changeSet:
-      id: v50.2024-04-09T15:55:23
-      author: piranha
-      comment: Backfill query_field for native queries
-      changes:
-        - customChange:
-            class: "metabase.db.custom_migrations.BackfillQueryField"
-
   - changeSet:
       id: v50.2024-04-12T12:33:09
       author: piranha
@@ -6569,6 +6561,14 @@ databaseChangeLog:
             relativeToChangelogFile: true
       rollback: # no rollback necessary, we're not removing the columns yet
 
+  - changeSet:
+      id: v50.2024-04-18T14:33:19
+      author: piranha
+      comment: Backfill query_field
+      changes:
+        - customChange:
+            class: "metabase.db.custom_migrations.BackfillQueryField"
+
   - changeSet:
       id: v50.2024-04-19T17:04:04
       author: noahmoss
diff --git a/src/metabase/db/custom_migrations.clj b/src/metabase/db/custom_migrations.clj
index 383b6fac973..fc9958b7706 100644
--- a/src/metabase/db/custom_migrations.clj
+++ b/src/metabase/db/custom_migrations.clj
@@ -1259,13 +1259,39 @@
     (qs/shutdown scheduler)))
 
 (define-migration BackfillQueryField
-  (let [update-query-fields! (requiring-resolve 'metabase.native-query-analyzer/update-query-fields-for-card!)
-        cards                (t2/select :model/Card :id [:in {:from      [[:report_card :c]]
-                                                              :left-join [[:query_field :f] [:= :f.card_id :c.id]]
-                                                              :select    [:c.id]
-                                                              :where     [:and
-                                                                          [:not :c.archived]
-                                                                          [:= :c.query_type "native"]
-                                                                          [:= :f.id nil]]}])]
-    (doseq [card cards]
-      (update-query-fields! card))))
+  (let [field-ids-for-sql  (requiring-resolve 'metabase.native-query-analyzer/field-ids-for-sql)
+        ;; practically #'metabase.models.query-field/field-ids-for-mbql
+        mbql-fields-xf     (comp
+                            (map #(match %
+                                    ["field" (id :guard integer?) opts] [id (:source-field opts)]
+                                    :else                               nil))
+                            cat
+                            (remove nil?))
+        field-ids-for-mbql (fn [query]
+                             {:direct (->> (tree-seq coll? seq query)
+                                           (into #{} mbql-fields-xf)
+                                           not-empty)})
+        cards              (t2/select :report_card :id [:in {:from      [[:report_card :c]]
+                                                             :left-join [[:query_field :f] [:= :f.card_id :c.id]]
+                                                             :select    [:c.id]
+                                                             :where     [:and
+                                                                         [:not :c.archived]
+                                                                         [:= :f.id nil]]}])]
+    (doseq [{card-id :id query :dataset_query :as card} cards]
+      (let [query                             (json/parse-string query true)
+            {:keys [direct indirect] :as res} (if (= (:type query) "native")
+                                                (field-ids-for-sql query)
+                                                (field-ids-for-mbql query))
+            id->record                        (fn [direct? field-id]
+                                                {:card_id          card-id
+                                                 :field_id         field-id
+                                                 :direct_reference direct?})
+            records                           (concat
+                                               (map (partial id->record true) direct)
+                                               (map (partial id->record false) indirect))
+            known                             (set
+                                               (when (seq records)
+                                                 (t2/select-fn-set :id :metabase_field :id [:in (map :field_id records)])))
+            records                           (filterv (comp known :field_id) records)]
+        (when (seq records)
+          (t2/insert! :query_field records))))))
diff --git a/src/metabase/lib/util.cljc b/src/metabase/lib/util.cljc
index fb8faaabb00..660beb0c067 100644
--- a/src/metabase/lib/util.cljc
+++ b/src/metabase/lib/util.cljc
@@ -575,7 +575,10 @@
       (when (#{:mbql/query :query :native :internal} query-type)
         query-type))))
 
-(mu/defn referenced-field-ids :- [:maybe [:sequential ::lib.schema.id/field]]
-  "Find all the integer field IDs in ``, Which can arbitrarily be anything that is part of MLv2 query schema."
+(mu/defn referenced-field-ids :- [:maybe [:set ::lib.schema.id/field]]
+  "Find all the integer field IDs in `coll`, Which can arbitrarily be anything that is part of MLv2 query schema."
   [coll]
-  (lib.util.match/match coll [:field _ (id :guard int?)] id))
+  (not-empty
+   (into #{}
+         (comp cat (filter some?))
+         (lib.util.match/match coll [:field opts (id :guard int?)] [id (:source-field opts)]))))
diff --git a/src/metabase/models/card.clj b/src/metabase/models/card.clj
index a4943421c73..258271251ec 100644
--- a/src/metabase/models/card.clj
+++ b/src/metabase/models/card.clj
@@ -29,10 +29,10 @@
    [metabase.models.permissions :as perms]
    [metabase.models.pulse :as pulse]
    [metabase.models.query :as query]
+   [metabase.models.query-field :as query-field]
    [metabase.models.revision :as revision]
    [metabase.models.serialization :as serdes]
    [metabase.moderation :as moderation]
-   [metabase.native-query-analyzer :as query-analyzer]
    [metabase.public-settings :as public-settings]
    [metabase.public-settings.premium-features
     :as premium-features
@@ -527,7 +527,7 @@
       (log/info "Card references Fields in params:" field-ids)
       (field-values/update-field-values-for-on-demand-dbs! field-ids))
     (parameter-card/upsert-or-delete-from-parameters! "card" (:id card) (:parameters card))
-    (query-analyzer/update-query-fields-for-card! card)))
+    (query-field/update-query-fields-for-card! card)))
 
 (t2/define-before-update :model/Card
   [{:keys [verified-result-metadata?] :as card}]
@@ -555,7 +555,7 @@
   [card]
   (u/prog1 card
     (when (contains? (t2/changes card) :dataset_query)
-      (query-analyzer/update-query-fields-for-card! card))))
+      (query-field/update-query-fields-for-card! card))))
 
 ;; Cards don't normally get deleted (they get archived instead) so this mostly affects tests
 (t2/define-before-delete :model/Card
diff --git a/src/metabase/models/field_usage.clj b/src/metabase/models/field_usage.clj
index c2819a6dc7d..1a4aa78c222 100644
--- a/src/metabase/models/field_usage.clj
+++ b/src/metabase/models/field_usage.clj
@@ -56,8 +56,8 @@
         :breakout_binning_num_bins  (:num-bins binning-option)}))))
 
 (defn- expression->field-usage
-  [expresison-clause]
-  (when-let [field-ids (seq (lib.util/referenced-field-ids expresison-clause))]
+  [expression-clause]
+  (when-let [field-ids (seq (lib.util/referenced-field-ids expression-clause))]
     (for [field-id field-ids]
       {:field_id field-id
        :used_in  :expression})))
@@ -66,7 +66,7 @@
 
 (defn- join->field-usages
   [query join]
-  (let [join-query (fetch-source-query/resolve-source-cards (assoc query :stages (get join :stages)))]
+  (let [join-query (fetch-source-query/resolve-source-cards (assoc query :stages (:stages join)))]
     ;; treat the source query as a :mbql/query
     (pmbql->field-usages join-query)))
 
diff --git a/src/metabase/models/query_field.clj b/src/metabase/models/query_field.clj
index c5baf273198..04ed030ff6a 100644
--- a/src/metabase/models/query_field.clj
+++ b/src/metabase/models/query_field.clj
@@ -1,10 +1,55 @@
 (ns metabase.models.query-field
   (:require
+   [metabase.legacy-mbql.util :as mbql.u]
+   [metabase.native-query-analyzer :as query-analyzer]
+   [metabase.util.log :as log]
    [methodical.core :as methodical]
    [toucan2.core :as t2]))
 
-
 (methodical/defmethod t2/table-name :model/QueryField [_model] :query_field)
 
 (doto :model/QueryField
   (derive :metabase/model))
+
+;;; Updating QueryField from card
+
+(defn- field-ids-for-mbql
+  "Find out ids of all fields used in a query. Conforms to the same protocol as [[query-analyzer/field-ids-for-sql]],
+  so returns `{:direct #{...int ids}}` map.
+
+  Does not track wildcards for queries rendered as tables afterwards."
+  [query]
+  {:direct (mbql.u/referenced-field-ids query)})
+
+(defn update-query-fields-for-card!
+  "Clears QueryFields associated with this card and creates fresh, up-to-date-ones.
+
+  Any card is accepted, but this functionality only works for ones with a native query.
+
+  If you're invoking this from a test, be sure to turn on [[*parse-queries-in-test?*]].
+
+  Returns `nil` (and logs the error) if there was a parse error."
+  [{card-id :id, query :dataset_query}]
+  (when query
+    (try
+      (let [{:keys [direct indirect] :as res} (case (:type query)
+                                                :native (query-analyzer/field-ids-for-sql query)
+                                                :query  (field-ids-for-mbql query)
+                                                nil     nil)
+            id->record                        (fn [direct? field-id]
+                                                {:card_id          card-id
+                                                 :field_id         field-id
+                                                 :direct_reference direct?})
+            query-field-records               (concat
+                                               (map (partial id->record true) direct)
+                                               (map (partial id->record false) indirect))]
+        ;; when response is `nil`, it's a disabled parser, not unknown columns
+        (when (some? res)
+          ;; This feels inefficient at first glance, but the number of records should be quite small and doing some
+          ;; sort of upsert-or-delete would involve comparisons in Clojure-land that are more expensive than just
+          ;; "killing and filling" the records.
+          (t2/with-transaction [_conn]
+            (t2/delete! :model/QueryField :card_id card-id)
+            (t2/insert! :model/QueryField query-field-records))))
+      (catch Exception e
+        (log/error e "Error parsing native query")))))
diff --git a/src/metabase/native_query_analyzer.clj b/src/metabase/native_query_analyzer.clj
index 88c68b94067..0bf1887e4b2 100644
--- a/src/metabase/native_query_analyzer.clj
+++ b/src/metabase/native_query_analyzer.clj
@@ -12,7 +12,6 @@
    [metabase.native-query-analyzer.parameter-substitution :as nqa.sub]
    [metabase.public-settings :as public-settings]
    [metabase.util :as u]
-   [metabase.util.log :as log]
    [toucan2.core :as t2]))
 
 (def ^:dynamic *parse-queries-in-test?*
@@ -80,50 +79,21 @@
       ;; limit to the named tables
       (seq table-wildcards)            (active-fields-from-tables table-wildcards))))
 
-(defn- field-ids-for-card
+(defn field-ids-for-sql
   "Returns a `{:direct #{...} :indirect #{...}}` map with field IDs that (may) be referenced in the given cards's
   query. Errs on the side of optimism: i.e., it may return fields that are *not* in the query, and is unlikely to fail
   to return fields that are in the query.
 
   Direct references are columns that are named in the query; indirect ones are from wildcards. If a field could be
   both direct and indirect, it will *only* show up in the `:direct` set."
-  [card]
-  (let [query        (:dataset_query card)
-        db-id        (:database query)
-        sql-string   (:query (nqa.sub/replace-tags query))
-        parsed-query (macaw/query->components (macaw/parsed-query sql-string))
-        direct-ids   (direct-field-ids-for-query parsed-query db-id)
-        indirect-ids (set/difference
-                      (indirect-field-ids-for-query parsed-query db-id)
-                      direct-ids)]
-    {:direct   direct-ids
-     :indirect indirect-ids}))
-
-(defn update-query-fields-for-card!
-  "Clears QueryFields associated with this card and creates fresh, up-to-date-ones.
-
-  Any card is accepted, but this functionality only works for ones with a native query.
-
-  If you're invoking this from a test, be sure to turn on [[*parse-queries-in-test?*]].
-
-  Returns `nil` (and logs the error) if there was a parse error."
-  [{card-id :id, query :dataset_query :as card}]
-  (when (and (active?)
-             (= :native (:type query)))
-    (try
-      (let [{:keys [direct indirect]} (field-ids-for-card card)
-            id->record                (fn [direct? field-id]
-                                        {:card_id          card-id
-                                         :field_id         field-id
-                                         :direct_reference direct?})
-            query-field-records       (concat
-                                       (map (partial id->record true) direct)
-                                       (map (partial id->record false) indirect))]
-        ;; This feels inefficient at first glance, but the number of records should be quite small and doing some sort
-        ;; of upsert-or-delete would involve comparisons in Clojure-land that are more expensive than just "killing and
-        ;; filling" the records.
-        (t2/with-transaction [_conn]
-          (t2/delete! :model/QueryField :card_id card-id)
-          (t2/insert! :model/QueryField query-field-records)))
-      (catch Exception e
-        (log/error e "Error parsing native query")))))
+  [query]
+  (when (active?)
+    (let [db-id        (:database query)
+          sql-string   (:query (nqa.sub/replace-tags query))
+          parsed-query (macaw/query->components (macaw/parsed-query sql-string))
+          direct-ids   (direct-field-ids-for-query parsed-query db-id)
+          indirect-ids (set/difference
+                        (indirect-field-ids-for-query parsed-query db-id)
+                        direct-ids)]
+      {:direct   direct-ids
+       :indirect indirect-ids})))
diff --git a/test/metabase/db/custom_migrations_test.clj b/test/metabase/db/custom_migrations_test.clj
index b3f1f091179..41f9c157cfc 100644
--- a/test/metabase/db/custom_migrations_test.clj
+++ b/test/metabase/db/custom_migrations_test.clj
@@ -1675,16 +1675,22 @@
               (is (nil? (:cache_field_values_schedule db))))))))))
 
 (deftest backfill-query-field-test
-  (impl/test-migrations "v50.2024-04-09T15:55:23" [migrate!]
+  (impl/test-migrations "v50.2024-04-18T14:33:19" [migrate!]
     (let [user-id     (:id (new-instance-with-default :core_user))
-          ;; it is already `false`, but binding it anyway to indicate it's important
-          card-id     (binding [query-analyzer/*parse-queries-in-test?* false]
-                        (:id (new-instance-with-default
-                              :report_card
-                              {:creator_id    user-id
-                               :database_id   (mt/id)
-                               :query_type    "native"
-                               :dataset_query (json/generate-string (mt/native-query {:query "SELECT id FROM venues"}))})))
+          c1-id       (:id (new-instance-with-default
+                            :report_card
+                            {:creator_id    user-id
+                             :database_id   (mt/id)
+                             :query_type    "native"
+                             :dataset_query (json/generate-string
+                                             (mt/native-query {:query "SELECT id FROM venues"}))}))
+          c2-id       (:id (new-instance-with-default
+                            :report_card
+                            {:creator_id    user-id
+                             :database_id   (mt/id)
+                             :query_type    "query"
+                             :dataset_query (json/generate-string
+                                             (mt/mbql-query venues {:aggregation [[:distinct $name]]}))}))
           archived-id (binding [query-analyzer/*parse-queries-in-test?* false]
                         (:id (new-instance-with-default
                               :report_card
@@ -1692,15 +1698,20 @@
                                :creator_id    user-id
                                :database_id   (mt/id)
                                :query_type    "native"
-                               :dataset_query (json/generate-string (mt/native-query {:query "SELECT id FROM venues"}))})))
+                               :dataset_query (json/generate-string
+                                               (mt/native-query {:query "SELECT id FROM venues"}))})))
           ;; (first (vals %)) are necessary since h2 generates :count(id) as name for column
           get-count   #(t2/select-one-fn (comp first vals) [:model/QueryField [[:count :id]]] :card_id %)]
       (testing "QueryField is empty - queries weren't analyzed"
-        (is (zero? (get-count card-id)))
+        ;; they were not analyzed since `new-instance-with-default` inserts directly into the table, omitting model
+        ;; hooks
+        (is (zero? (get-count c1-id)))
+        (is (zero? (get-count c2-id)))
         (is (zero? (get-count archived-id))))
       (binding [query-analyzer/*parse-queries-in-test?* true]
         (migrate!))
       (testing "QueryField is filled now"
-        (is (pos? (get-count card-id)))
+        (is (pos? (get-count c1-id)))
+        (is (pos? (get-count c2-id)))
         (testing "but not for archived card"
           (is (zero? (get-count archived-id))))))))
diff --git a/test/metabase/models/query_field_test.clj b/test/metabase/models/query_field_test.clj
index af97442b554..f8e30920b7e 100644
--- a/test/metabase/models/query_field_test.clj
+++ b/test/metabase/models/query_field_test.clj
@@ -2,6 +2,8 @@
   (:require
    [clojure.set :as set]
    [clojure.test :refer :all]
+   [metabase.models :refer [Card]]
+   [metabase.models.query-field :as query-field]
    [metabase.native-query-analyzer :as query-analyzer]
    [metabase.test :as mt]
    [toucan2.core :as t2]
@@ -129,3 +131,24 @@
           ;; subset since it also includes the PKs/FKs
           (is (set/subset? #{total-qf tax-qf}
                            (t2/select-fn-set qf->map :model/QueryField :card_id card-id :direct_reference true))))))))
+
+(deftest ^:parallel parse-mbql-test
+  (testing "Parsing MBQL query returns correct used fields"
+    (mt/with-temp [Card c1 {:dataset_query (mt/mbql-query venues
+                                             {:aggregation [[:distinct $name]
+                                                            [:distinct $price]]
+                                              :limit       5})}
+                   Card c2 {:dataset_query {:query    {:source-table (str "card__" (:id c1))}
+                                            :database (:id (mt/db))
+                                            :type     :query}}
+                   Card c3 {:dataset_query (mt/mbql-query checkins
+                                             {:joins [{:source-table (str "card__" (:id c2))
+                                                       :alias "Venues"
+                                                       :condition [:= $checkins.venue_id $venues.id]}]})}]
+      (mt/$ids
+        (is (= {:direct #{%venues.name %venues.price}}
+               (#'query-field/field-ids-for-mbql (:dataset_query c1))))
+        (is (= {:direct nil}
+               (#'query-field/field-ids-for-mbql (:dataset_query c2))))
+        (is (= {:direct #{%venues.id %checkins.venue_id}}
+               (#'query-field/field-ids-for-mbql (:dataset_query c3))))))))
diff --git a/test/metabase/pulse/pulse_integration_test.clj b/test/metabase/pulse/pulse_integration_test.clj
index 52e212caa64..4dea84c5033 100644
--- a/test/metabase/pulse/pulse_integration_test.clj
+++ b/test/metabase/pulse/pulse_integration_test.clj
@@ -292,7 +292,7 @@
   [date-str n]
   (format
     "WITH T AS (SELECT CAST('%s' AS TIMESTAMP) AS example_timestamp),
-          SAMPLE AS (SELECT T.example_timestamp                                   AS full_datetime_utc,
+          \"SAMPLE\" AS (SELECT T.example_timestamp                                   AS full_datetime_utc,
                             T.example_timestamp AT TIME ZONE 'US/Pacific'         AS full_datetime_pacific,
                             CAST(T.example_timestamp AS TIMESTAMP)                AS example_timestamp,
                             CAST(T.example_timestamp AS TIMESTAMP WITH TIME ZONE) AS example_timestamp_with_time_zone,
@@ -308,7 +308,7 @@
                             EXTRACT(SECOND FROM T.example_timestamp)              AS example_second
                      FROM T)
      SELECT *
-     FROM SAMPLE
+     FROM \"SAMPLE\"
               CROSS JOIN
           generate_series(1, %s);"
     date-str n))
-- 
GitLab