From 719fc4f750f72511bf65388e60c80cdf412d978e Mon Sep 17 00:00:00 2001 From: Chris Truter <crisptrutski@users.noreply.github.com> Date: Tue, 23 Apr 2024 19:14:00 +0200 Subject: [PATCH] Invalidate model caching when updating upload table (#41653) --- src/metabase/models/persisted_info.clj | 9 +++ src/metabase/upload.clj | 56 ++++++++++++----- test/metabase/upload_test.clj | 84 ++++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 19 deletions(-) diff --git a/src/metabase/models/persisted_info.clj b/src/metabase/models/persisted_info.clj index 384e2325eec..b50ae7e1b49 100644 --- a/src/metabase/models/persisted_info.clj +++ b/src/metabase/models/persisted_info.clj @@ -117,6 +117,15 @@ ([conditions-map state] (t2/update! PersistedInfo conditions-map {:active false, :state state, :state_change_at :%now}))) +(defn invalidate! + "Invalidates any caches corresponding to the `conditions-map`. Equivalent to toggling the caching off and on again." + [conditions-map] + ;; We do not immediately delete the cached table, it will get clobbered during the next refresh cycle. + (t2/update! PersistedInfo + (merge {:active true} conditions-map) + ;; TODO perhaps we should immediately kick off a recalculation of these caches + {:active false, :state "creating", :state_change_at :%now})) + (defn- create-row "Marks PersistedInfo as `creating`, these will at some point be persisted by the PersistRefresh task." [user-id card] diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj index 4a421cffd38..791c82bd1e3 100644 --- a/src/metabase/upload.clj +++ b/src/metabase/upload.clj @@ -22,6 +22,7 @@ [metabase.models.data-permissions :as data-perms] [metabase.models.humanization :as humanization] [metabase.models.interface :as mi] + [metabase.models.persisted-info :as persisted-info] [metabase.models.table :as table] [metabase.public-settings :as public-settings] [metabase.public-settings.premium-features :as premium-features] @@ -513,6 +514,40 @@ (field->db-type driver field->new-type) args))) +(defn- mbql? [model] + (= "query" (name (:query_type model "query")))) + +(defn- no-joins? + "Returns true if `query` has no explicit joins in it, otherwise false." + [query] + ;; TODO while it's unlikely (at the time of writing this) that uploaded tables have FKs, we should probably check + ;; for implicit joins as well. + (->> (range (lib/stage-count query)) + (not-any? (fn [stage-idx] + (lib/joins query stage-idx))))) + +(defn- only-table-id + "For models that depend on only one table, return its id, otherwise return nil. Doesn't support native queries." + [model] + ; dataset_query can be empty in tests + (when-let [query (some-> model :dataset_query lib/->pMBQL not-empty)] + (when (and (mbql? model) (no-joins? query)) + (lib/source-table-id query)))) + +(defn- invalidate-cached-models! + "Invalidate the model caches for all cards whose `:based_on_upload` value resolves to the given table." + [table] + ;; NOTE: It is important that this logic is kept in sync with `model-hydrate-based-on-upload` + (when-let [model-ids (->> (t2/select [:model/Card :id :dataset_query] + :table_id (:id table) + :type :model + :archived false) + (filter (comp #{(:id table)} only-table-id)) + (map :id) + seq)] + ;; Ideally we would do all the filtering in the query, but this would not allow us to leverage mlv2. + (persisted-info/invalidate! {:card_id [:in model-ids]}))) + (defn- update-with-csv! [database table file & {:keys [replace-rows?]}] (try (let [parse (infer-parser file)] @@ -569,6 +604,8 @@ (let [auto-pk-field (table-id->auto-pk-column (:id table))] (t2/update! :model/Field (:id auto-pk-field) {:display_name (:name auto-pk-field)}))) + (invalidate-cached-models! table) + (events/publish-event! :event/upload-append {:user-id (:id @api/*current-user*) :model-id (:id table) @@ -699,14 +736,6 @@ (filter #(can-upload-to-table? (:db %) %)) (map :id))))) -(defn- no-joins? - "Returns true if `query` has no joins in it, otherwise false." - [query] - (let [all-joins (mapcat (fn [stage] - (lib/joins query stage)) - (range (lib/stage-count query)))] - (empty? all-joins))) - (mu/defn model-hydrate-based-on-upload "Batch hydrates `:based_on_upload` for each item of `models`. Assumes each item of `model` represents a model." [models :- [:sequential [:map @@ -724,15 +753,12 @@ (remove #(false? (:is_upload %))) (keep :table_id) set) - mbql? (fn [model] (= "query" (name (:query_type model "query")))) has-uploadable-table? (comp (uploadable-table-ids table-ids) :table_id)] (for [model models] - (m/assoc-some - model - :based_on_upload - (when-let [query (some-> model :dataset_query lib/->pMBQL not-empty)] ; dataset_query can be empty in tests - (when (and (mbql? model) (has-uploadable-table? model) (no-joins? query)) - (lib/source-table-id query))))))) + ;; NOTE: It is important that this logic is kept in sync with `invalidate-cached-models!` + ;; If not, it will mean that the user could modify the table via a given model's page without seeing it update. + (m/assoc-some model :based_on_upload (when (has-uploadable-table? model) + (only-table-id model)))))) (mi/define-batched-hydration-method based-on-upload :based_on_upload diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj index b9534ee2a2d..23fa4adb5a4 100644 --- a/test/metabase/upload_test.clj +++ b/test/metabase/upload_test.clj @@ -4,6 +4,7 @@ [clojure.data.csv :as csv] [clojure.java.io :as io] [clojure.java.jdbc :as jdbc] + [clojure.set :as set] [clojure.string :as str] [clojure.test :refer :all] [flatland.ordered.map :as ordered-map] @@ -12,6 +13,9 @@ [metabase.driver.ddl.interface :as ddl.i] [metabase.driver.sql-jdbc.connection :as sql-jdbc.conn] [metabase.driver.util :as driver.u] + [metabase.lib.core :as lib] + [metabase.lib.metadata :as lib.metadata] + [metabase.lib.metadata.jvm :as lib.metadata.jvm] [metabase.models :refer [Field]] [metabase.models.data-permissions :as data-perms] [metabase.models.interface :as mi] @@ -384,11 +388,13 @@ (is (not= (:id table-1) (:id table-2))))))))))) -(defn- query-table - [table] - (qp/process-query {:database (:db_id table) +(defn- query [db-id source-table] + (qp/process-query {:database db-id :type :query - :query {:source-table (:id table)}})) + :query {:source-table source-table}})) + +(defn- query-table [table] + (query (:db_id table) (:id table))) (defn- column-names-for-table [table] @@ -400,6 +406,9 @@ [table] (mt/rows (query-table table))) +(defn- rows-for-model [db-id model-id] + (mt/rows (query db-id (str "card__" model-id)))) + (def ^:private example-files {:comma ["id ,nulls,string ,bool ,number ,date ,datetime" "2\t ,, a ,true ,1.1\t ,2022-01-01,2022-01-01T00:00:00" @@ -1412,6 +1421,73 @@ (io/delete-file file)))))))) + +(defn- mbql [mp table] + (let [table-metadata (lib.metadata/table mp (:id table))] + (lib/query mp table-metadata))) + +(defn- join-mbql [mp base-table join-table] + (let [base-table-metadata (lib.metadata/table mp (:id base-table)) + join-table-metadata (lib.metadata/table mp (:id join-table)) + ;; We use the primary keys as the join fields as we know they will exist and have compatible types. + pk-metadata (fn [table] + (let [field-id (t2/select-one-pk :model/Field + :table_id (:id table) + :semantic_type :type/PK)] + (lib.metadata/field mp field-id))) + base-id-metadata (pk-metadata base-table) + join-id-metadata (pk-metadata join-table)] + + (-> (lib/query mp base-table-metadata) + (lib/join (lib/join-clause join-table-metadata + [(lib/= (lib/ref base-id-metadata) + (lib/ref join-id-metadata))]))))) + +(defn- cached-model-ids [] + (into #{} (map :card_id) (t2/select [:model/PersistedInfo :card_id] :active true))) + +(deftest update-invalidate-model-cache-test + (mt/test-drivers (mt/normal-drivers-with-feature :uploads :persist-models) + (doseq [action (actions-to-test driver/*driver*)] + (testing (action-testing-str action) + (with-upload-table! [table (create-upload-table!)] + (let [table-id (:id table) + csv-rows ["name" "Luke Skywalker"] + file (csv-file-with csv-rows) + other-id (mt/id :venues) + other-table (t2/select-one :model/Table other-id) + mp (lib.metadata.jvm/application-database-metadata-provider (:db_id table))] + + (mt/with-temp [:model/Card {question-id :id} {:table_id table-id, :dataset_query (mbql mp table)} + :model/Card {model-id :id} {:table_id table-id, :type :model, :dataset_query (mbql mp table)} + :model/Card {complex-model-id :id} {:table_id table-id, :type :model, :dataset_query (join-mbql mp table other-table)} + :model/Card {archived-model-id :id} {:table_id table-id, :type :model, :archived true, :dataset_query (mbql mp table)} + :model/Card {unrelated-model-id :id} {:table_id other-id, :type :model, :dataset_query (mbql mp other-table)} + :model/Card {joined-model-id :id} {:table_id other-id, :type :model, :dataset_query (join-mbql mp other-table table)}] + + (is (= #{question-id model-id complex-model-id} + (into #{} (map :id) (t2/select :model/Card :table_id table-id :archived false)))) + + (mt/with-persistence-enabled [persist-models!] + (persist-models!) + + (let [cached-before (cached-model-ids) + _ (update-csv! action {:file file, :table-id (:id table)}) + cached-after (cached-model-ids)] + + (testing "The models are cached" + (let [active-model-ids #{model-id complex-model-id unrelated-model-id joined-model-id}] + (is (= active-model-ids (set/intersection cached-before (conj active-model-ids archived-model-id)))))) + (testing "The cache is invalidated by the update" + (is (not (contains? cached-after model-id)))) + (testing "No unwanted caches were invalidated" + (is (= #{model-id} (set/difference cached-before cached-after)))) + (testing "We can see the new row when querying the model" + (is (some (fn [[_ row-name]] (= "Luke Skywalker" row-name)) + (rows-for-model (:db_id table) model-id))))))) + + (io/delete-file file))))))) + (deftest update-mb-row-id-csv-and-table-test (mt/test-drivers (mt/normal-drivers-with-feature :uploads) (doseq [action (actions-to-test driver/*driver*)] -- GitLab