From 719fc4f750f72511bf65388e60c80cdf412d978e Mon Sep 17 00:00:00 2001
From: Chris Truter <crisptrutski@users.noreply.github.com>
Date: Tue, 23 Apr 2024 19:14:00 +0200
Subject: [PATCH] Invalidate model caching when updating upload table (#41653)

---
 src/metabase/models/persisted_info.clj |  9 +++
 src/metabase/upload.clj                | 56 ++++++++++++-----
 test/metabase/upload_test.clj          | 84 ++++++++++++++++++++++++--
 3 files changed, 130 insertions(+), 19 deletions(-)

diff --git a/src/metabase/models/persisted_info.clj b/src/metabase/models/persisted_info.clj
index 384e2325eec..b50ae7e1b49 100644
--- a/src/metabase/models/persisted_info.clj
+++ b/src/metabase/models/persisted_info.clj
@@ -117,6 +117,15 @@
   ([conditions-map state]
    (t2/update! PersistedInfo conditions-map {:active false, :state state, :state_change_at :%now})))
 
+(defn invalidate!
+  "Invalidates any caches corresponding to the `conditions-map`. Equivalent to toggling the caching off and on again."
+  [conditions-map]
+  ;; We do not immediately delete the cached table, it will get clobbered during the next refresh cycle.
+  (t2/update! PersistedInfo
+              (merge {:active true} conditions-map)
+              ;; TODO perhaps we should immediately kick off a recalculation of these caches
+              {:active false, :state "creating", :state_change_at :%now}))
+
 (defn- create-row
   "Marks PersistedInfo as `creating`, these will at some point be persisted by the PersistRefresh task."
   [user-id card]
diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj
index 4a421cffd38..791c82bd1e3 100644
--- a/src/metabase/upload.clj
+++ b/src/metabase/upload.clj
@@ -22,6 +22,7 @@
    [metabase.models.data-permissions :as data-perms]
    [metabase.models.humanization :as humanization]
    [metabase.models.interface :as mi]
+   [metabase.models.persisted-info :as persisted-info]
    [metabase.models.table :as table]
    [metabase.public-settings :as public-settings]
    [metabase.public-settings.premium-features :as premium-features]
@@ -513,6 +514,40 @@
            (field->db-type driver field->new-type)
            args)))
 
+(defn- mbql? [model]
+  (= "query" (name (:query_type model "query"))))
+
+(defn- no-joins?
+  "Returns true if `query` has no explicit joins in it, otherwise false."
+  [query]
+  ;; TODO while it's unlikely (at the time of writing this) that uploaded tables have FKs, we should probably check
+  ;;      for implicit joins as well.
+  (->> (range (lib/stage-count query))
+       (not-any? (fn [stage-idx]
+                   (lib/joins query stage-idx)))))
+
+(defn- only-table-id
+  "For models that depend on only one table, return its id, otherwise return nil. Doesn't support native queries."
+  [model]
+  ; dataset_query can be empty in tests
+  (when-let [query (some-> model :dataset_query lib/->pMBQL not-empty)]
+    (when (and (mbql? model) (no-joins? query))
+      (lib/source-table-id query))))
+
+(defn- invalidate-cached-models!
+  "Invalidate the model caches for all cards whose `:based_on_upload` value resolves to the given table."
+  [table]
+  ;; NOTE: It is important that this logic is kept in sync with `model-hydrate-based-on-upload`
+  (when-let [model-ids (->> (t2/select [:model/Card :id :dataset_query]
+                                       :table_id (:id table)
+                                       :type     :model
+                                       :archived false)
+                            (filter (comp #{(:id table)} only-table-id))
+                            (map :id)
+                            seq)]
+    ;; Ideally we would do all the filtering in the query, but this would not allow us to leverage mlv2.
+    (persisted-info/invalidate! {:card_id [:in model-ids]})))
+
 (defn- update-with-csv! [database table file & {:keys [replace-rows?]}]
   (try
     (let [parse (infer-parser file)]
@@ -569,6 +604,8 @@
             (let [auto-pk-field (table-id->auto-pk-column (:id table))]
               (t2/update! :model/Field (:id auto-pk-field) {:display_name (:name auto-pk-field)})))
 
+          (invalidate-cached-models! table)
+
           (events/publish-event! :event/upload-append
                                  {:user-id  (:id @api/*current-user*)
                                   :model-id (:id table)
@@ -699,14 +736,6 @@
               (filter #(can-upload-to-table? (:db %) %))
               (map :id)))))
 
-(defn- no-joins?
-  "Returns true if `query` has no joins in it, otherwise false."
-  [query]
-  (let [all-joins (mapcat (fn [stage]
-                            (lib/joins query stage))
-                          (range (lib/stage-count query)))]
-    (empty? all-joins)))
-
 (mu/defn model-hydrate-based-on-upload
   "Batch hydrates `:based_on_upload` for each item of `models`. Assumes each item of `model` represents a model."
   [models :- [:sequential [:map
@@ -724,15 +753,12 @@
                                    (remove #(false? (:is_upload %)))
                                    (keep :table_id)
                                    set)
-        mbql?                 (fn [model] (= "query" (name (:query_type model "query"))))
         has-uploadable-table? (comp (uploadable-table-ids table-ids) :table_id)]
     (for [model models]
-      (m/assoc-some
-       model
-       :based_on_upload
-       (when-let [query (some-> model :dataset_query lib/->pMBQL not-empty)] ; dataset_query can be empty in tests
-         (when (and (mbql? model) (has-uploadable-table? model) (no-joins? query))
-           (lib/source-table-id query)))))))
+      ;; NOTE: It is important that this logic is kept in sync with `invalidate-cached-models!`
+      ;; If not, it will mean that the user could modify the table via a given model's page without seeing it update.
+      (m/assoc-some model :based_on_upload (when (has-uploadable-table? model)
+                                             (only-table-id model))))))
 
 (mi/define-batched-hydration-method based-on-upload
   :based_on_upload
diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj
index b9534ee2a2d..23fa4adb5a4 100644
--- a/test/metabase/upload_test.clj
+++ b/test/metabase/upload_test.clj
@@ -4,6 +4,7 @@
    [clojure.data.csv :as csv]
    [clojure.java.io :as io]
    [clojure.java.jdbc :as jdbc]
+   [clojure.set :as set]
    [clojure.string :as str]
    [clojure.test :refer :all]
    [flatland.ordered.map :as ordered-map]
@@ -12,6 +13,9 @@
    [metabase.driver.ddl.interface :as ddl.i]
    [metabase.driver.sql-jdbc.connection :as sql-jdbc.conn]
    [metabase.driver.util :as driver.u]
+   [metabase.lib.core :as lib]
+   [metabase.lib.metadata :as lib.metadata]
+   [metabase.lib.metadata.jvm :as lib.metadata.jvm]
    [metabase.models :refer [Field]]
    [metabase.models.data-permissions :as data-perms]
    [metabase.models.interface :as mi]
@@ -384,11 +388,13 @@
                 (is (not= (:id table-1)
                           (:id table-2)))))))))))
 
-(defn- query-table
-  [table]
-  (qp/process-query {:database (:db_id table)
+(defn- query [db-id source-table]
+  (qp/process-query {:database db-id
                      :type     :query
-                     :query    {:source-table (:id table)}}))
+                     :query    {:source-table source-table}}))
+
+(defn- query-table [table]
+  (query (:db_id table) (:id table)))
 
 (defn- column-names-for-table
   [table]
@@ -400,6 +406,9 @@
   [table]
   (mt/rows (query-table table)))
 
+(defn- rows-for-model [db-id model-id]
+  (mt/rows (query db-id (str "card__" model-id))))
+
 (def ^:private example-files
   {:comma      ["id    ,nulls,string ,bool ,number       ,date      ,datetime"
                 "2\t   ,,          a ,true ,1.1\t        ,2022-01-01,2022-01-01T00:00:00"
@@ -1412,6 +1421,73 @@
 
               (io/delete-file file))))))))
 
+
+(defn- mbql [mp table]
+  (let [table-metadata (lib.metadata/table mp (:id table))]
+    (lib/query mp table-metadata)))
+
+(defn- join-mbql [mp base-table join-table]
+  (let [base-table-metadata (lib.metadata/table mp (:id base-table))
+        join-table-metadata (lib.metadata/table mp (:id join-table))
+        ;; We use the primary keys as the join fields as we know they will exist and have compatible types.
+        pk-metadata         (fn [table]
+                              (let [field-id (t2/select-one-pk :model/Field
+                                                               :table_id (:id table)
+                                                               :semantic_type :type/PK)]
+                                (lib.metadata/field mp field-id)))
+        base-id-metadata         (pk-metadata base-table)
+        join-id-metadata         (pk-metadata join-table)]
+
+    (-> (lib/query mp base-table-metadata)
+        (lib/join (lib/join-clause join-table-metadata
+                                   [(lib/= (lib/ref base-id-metadata)
+                                           (lib/ref join-id-metadata))])))))
+
+(defn- cached-model-ids []
+  (into #{} (map :card_id) (t2/select [:model/PersistedInfo :card_id] :active true)))
+
+(deftest update-invalidate-model-cache-test
+  (mt/test-drivers (mt/normal-drivers-with-feature :uploads :persist-models)
+    (doseq [action (actions-to-test driver/*driver*)]
+      (testing (action-testing-str action)
+        (with-upload-table! [table (create-upload-table!)]
+          (let [table-id    (:id table)
+                csv-rows    ["name" "Luke Skywalker"]
+                file        (csv-file-with csv-rows)
+                other-id    (mt/id :venues)
+                other-table (t2/select-one :model/Table other-id)
+                mp          (lib.metadata.jvm/application-database-metadata-provider (:db_id table))]
+
+            (mt/with-temp [:model/Card {question-id        :id} {:table_id table-id, :dataset_query (mbql mp table)}
+                           :model/Card {model-id           :id} {:table_id table-id, :type :model, :dataset_query (mbql mp table)}
+                           :model/Card {complex-model-id   :id} {:table_id table-id, :type :model, :dataset_query (join-mbql mp table other-table)}
+                           :model/Card {archived-model-id  :id} {:table_id table-id, :type :model, :archived true, :dataset_query (mbql mp table)}
+                           :model/Card {unrelated-model-id :id} {:table_id other-id, :type :model, :dataset_query (mbql mp other-table)}
+                           :model/Card {joined-model-id    :id} {:table_id other-id, :type :model, :dataset_query (join-mbql mp other-table table)}]
+
+              (is (= #{question-id model-id complex-model-id}
+                     (into #{} (map :id) (t2/select :model/Card :table_id table-id :archived false))))
+
+              (mt/with-persistence-enabled [persist-models!]
+                (persist-models!)
+
+                (let [cached-before (cached-model-ids)
+                      _             (update-csv! action {:file file, :table-id (:id table)})
+                      cached-after  (cached-model-ids)]
+
+                  (testing "The models are cached"
+                    (let [active-model-ids #{model-id complex-model-id unrelated-model-id joined-model-id}]
+                      (is (= active-model-ids (set/intersection cached-before (conj active-model-ids archived-model-id))))))
+                  (testing "The cache is invalidated by the update"
+                    (is (not (contains? cached-after model-id))))
+                  (testing "No unwanted caches were invalidated"
+                    (is (= #{model-id} (set/difference cached-before cached-after))))
+                  (testing "We can see the new row when querying the model"
+                    (is (some (fn [[_ row-name]] (= "Luke Skywalker" row-name))
+                              (rows-for-model (:db_id table) model-id)))))))
+
+            (io/delete-file file)))))))
+
 (deftest update-mb-row-id-csv-and-table-test
   (mt/test-drivers (mt/normal-drivers-with-feature :uploads)
     (doseq [action (actions-to-test driver/*driver*)]
-- 
GitLab