Skip to content
Snippets Groups Projects
Unverified Commit 47aa931a authored by Braden Shepherdson's avatar Braden Shepherdson Committed by GitHub
Browse files

Serdes v2: Backfill entity_id based on identity-hash before serdes (#26118)

This makes deserialization faster since entity_id columns are indexed.
parent a5f99d0d
No related branches found
No related tags found
No related merge requests found
(ns metabase-enterprise.serialization.v2.backfill-ids
"Finds all models with `:entity_id` columns, scans them for rows whose `entity_id` is blank (NULL), and
generates a consistent entity_id for each such row based on its identity hash.
Note that cross-JVM portability is required - but that's specified for [[java.util.Random]],
so this should produce identical IDs on all platforms and JVM implementations."
(:require
[clojure.tools.logging :as log]
[metabase-enterprise.serialization.v2.models :as serdes.models]
[metabase.logger]
[metabase.models.serialization.hash :as serdes.hash]
[metabase.util :as u]
[metabase.util.i18n :refer [trs]]
[toucan.db :as db]
[toucan.models :as models]))
(defn backfill-ids-for
  "Fills in `:entity_id` for every row of `model` where it is currently NULL.

  Each new ID is produced by passing the row's [[serdes.hash/identity-hash]] to
  [[u/generate-nano-id]]. Rows that already have an `:entity_id` are not selected and so are
  never touched. Returns nil."
  [model]
  (let [pk-col (models/primary-key model)]
    ;; `seq` yields nil for an empty result, so nothing is logged or updated in that case.
    (when-let [rows (seq (db/select model :entity_id nil))]
      (log/info (trs "Backfilling entity_id for {0} rows of {1}" (pr-str (count rows)) (:name model)))
      (doseq [row rows]
        (let [eid (-> row serdes.hash/identity-hash u/generate-nano-id)]
          (db/update! model (get row pk-col) :entity_id eid))))))
(defn- has-entity-id?
  "Returns the `:entity_id` entry of `model`'s Toucan properties (truthy when the model declares one)."
  [model]
  (-> model
      models/properties
      :entity_id))
(defn backfill-ids
  "Walks every model name in `serdes.models/exported-models`, and for each model that has an
  `entity_id` property, backfills any rows whose `entity_id` is NULL with a value derived from the
  row's [[serdes.hash/identity-hash]]."
  []
  (doseq [model-name serdes.models/exported-models]
    ;; Resolve inside the loop so each model is looked up immediately before it is processed.
    (let [model (-> model-name symbol db/resolve-model)]
      (when (has-entity-id? model)
        (backfill-ids-for model)))))
......@@ -7,6 +7,7 @@
[clojure.string :as str]
[clojure.tools.logging :as log]
[medley.core :as m]
[metabase-enterprise.serialization.v2.backfill-ids :as serdes.backfill]
[metabase-enterprise.serialization.v2.models :as serdes.models]
[metabase.models :refer [Card Collection Dashboard DashboardCard]]
[metabase.models.collection :as collection]
......@@ -41,6 +42,7 @@
there."
[opts]
(log/tracef "Extracting Metabase with options: %s" (pr-str opts))
(serdes.backfill/backfill-ids)
(let [model-pred (if (:data-model-only opts)
#{"Database" "Dimension" "Field" "FieldValues" "Metric" "Segment" "Table"}
(constantly true))
......@@ -170,6 +172,7 @@
(filter #(= (first %) "Collection"))
(map second)
set))]
(serdes.backfill/backfill-ids)
(if-let [analysis (escape-analysis selected-collections)]
;; If that is non-nil, emit the report.
(escape-report analysis)
......
......@@ -2,6 +2,7 @@
"Loading is the interesting part of deserialization: integrating the maps \"ingested\" from files into the appdb.
See the detailed breakdown of the (de)serialization processes in [[metabase.models.serialization.base]]."
(:require [medley.core :as m]
[metabase-enterprise.serialization.v2.backfill-ids :as serdes.backfill]
[metabase-enterprise.serialization.v2.ingest :as serdes.ingest]
[metabase.models.serialization.base :as serdes.base]))
......@@ -59,6 +60,7 @@
[ingestion]
;; We proceed in the arbitrary order of ingest-list, deserializing all the files. Their declared dependencies guide
;; the import, and make sure all containers are imported before contents, etc.
(serdes.backfill/backfill-ids)
(let [contents (serdes.ingest/ingest-list ingestion)]
(reduce load-one {:expanding #{}
:seen #{}
......
(ns metabase-enterprise.serialization.v2.backfill-ids-test
(:require
[clojure.test :refer :all]
[metabase-enterprise.serialization.test-util :as ts]
[metabase-enterprise.serialization.v2.backfill-ids :as serdes.backfill]
[metabase.models :refer [Collection]]
[toucan.db :as db]))
;; Creates four collections, wipes their entity_ids, and checks that the backfill restores all of them.
(deftest backfill-needed-test
  (ts/with-empty-h2-app-db
    (ts/with-temp-dpc [Collection [{root-a :id} {:name "some collection"}]
                       Collection [{root-b :id} {:name "other collection"}]
                       ;; The two children share a name on purpose; only their parent differs.
                       Collection [{child-a :id} {:name     "child collection"
                                                  :location (str "/" root-a "/")}]
                       Collection [{child-b :id} {:name     "child collection"
                                                  :location (str "/" root-b "/")}]]
      (let [ids [root-a root-b child-a child-b]]
        (letfn [(current-eids []
                  (db/select-field :entity_id Collection :id [:in ids]))]
          (testing "all collections have entity_ids"
            (is (not-any? nil? (current-eids))))
          (testing "removing the entity_ids"
            (run! #(db/update! Collection % :entity_id nil) ids)
            (is (not-any? some? (current-eids))))
          (testing "backfill now recreates them"
            (serdes.backfill/backfill-ids-for Collection)
            (is (not-any? nil? (current-eids)))))))))
;; Checks that the backfill only fills in NULL entity_ids and leaves existing ones alone.
(deftest no-overwrite-test
  (ts/with-empty-h2-app-db
    (ts/with-temp-dpc [Collection [{kept-id :id kept-eid :entity_id} {:name "some collection"}]
                       Collection [{wiped-id :id} {:name "other collection"}]]
      (let [eid-of (fn [id] (db/select-one-field :entity_id Collection :id id))]
        (testing "deleting the entity_id for one of them"
          (db/update! Collection wiped-id {:entity_id nil})
          (is (= #{kept-eid nil}
                 (db/select-field :entity_id Collection))))
        (testing "backfill"
          (serdes.backfill/backfill-ids-for Collection)
          (testing "sets a blank entity_id"
            (is (some? (eid-of wiped-id))))
          (testing "does not change the original entity_id"
            (is (= kept-eid (eid-of kept-id)))))))))
;; Checks that backfilling the same row twice yields the same entity_id each time.
(deftest repeatable-test
  (ts/with-empty-h2-app-db
    (ts/with-temp-dpc [Collection [{kept-eid :entity_id} {:name "some collection"}]
                       Collection [{target-id :id} {:name "other collection"}]]
      (let [blank-target! (fn [] (db/update! Collection target-id {:entity_id nil}))
            target-eid    (fn [] (db/select-one-field :entity_id Collection :id target-id))
            every-eid     (fn [] (db/select-field :entity_id Collection))]
        (testing "deleting the entity_id for one of them"
          (blank-target!)
          (is (= #{kept-eid nil}
                 (every-eid))))
        (testing "backfilling twice"
          (serdes.backfill/backfill-ids-for Collection)
          ;; Capture the first generated ID before wiping it again.
          (let [first-eid (target-eid)]
            (blank-target!)
            (is (= #{kept-eid nil}
                   (every-eid)))
            (serdes.backfill/backfill-ids-for Collection)
            (testing "produces the same entity_id both times"
              (is (= first-eid (target-eid))))))))))
......@@ -7,7 +7,6 @@
[metabase.models :refer [Card Collection Dashboard DashboardCard Database Field FieldValues Metric Pulse
PulseChannel PulseChannelRecipient Segment Table User]]
[metabase.models.serialization.base :as serdes.base]
[metabase.models.serialization.hash :as serdes.hash]
[toucan.db :as db]))
(defn- no-labels [path]
......@@ -106,43 +105,6 @@
(is (= (format "/%d/%d/" (:id parent-dest) (:id child-dest))
(:location grandchild-dest))))))))))
;; Serializes two collections from a source DB (one with its entity_id removed) and loads them
;; into a destination DB that holds earlier versions of both, checking which ones are matched
;; and which are duplicated.
(deftest deserialization-upsert-and-dupe-test
  (testing "basic collections with their names changing, one without entity_id:"
    ;; Atoms carry values across the separate source-db / dest-db scopes.
    (let [serialized (atom nil)
          c1a        (atom nil)
          c2a        (atom nil)
          c1b        (atom nil)
          c2b        (atom nil)]
      (ts/with-source-and-dest-dbs
        (testing "serializing the two collections"
          (ts/with-source-db
            (reset! c1b (ts/create! Collection :name "Renamed Collection 1"))
            (reset! c2b (ts/create! Collection :name "Collection 2 version 2"))
            ;; Strip the entity_id and reload the row so the atom reflects what is stored.
            (db/update! Collection (:id @c2b) {:entity_id nil})
            (reset! c2b (db/select-one Collection :id (:id @c2b)))
            (is (nil? (:entity_id @c2b)))
            (reset! serialized (into [] (serdes.extract/extract-metabase {})))))

        (testing "serialization should use identity hashes where no entity_id is defined"
          (is (= #{(:entity_id @c1b)
                   (serdes.hash/identity-hash @c2b)}
                 (ids-by-model @serialized "Collection"))))

        (testing "deserializing, the name change causes a duplicated collection"
          (ts/with-dest-db
            (reset! c1a (ts/create! Collection :name "Collection 1" :entity_id (:entity_id @c1b)))
            (reset! c2a (ts/create! Collection :name "Collection 2 version 1"))
            (db/update! Collection (:id @c2a) {:entity_id nil})
            (reset! c2a (db/select-one Collection :id (:id @c2a)))
            ;; Check the destination-side row that was just reloaded (the original re-checked
            ;; the source-side @c2b here, which had already been verified above).
            (is (nil? (:entity_id @c2a)))

            (serdes.load/load-metabase (ingestion-in-memory @serialized))
            (is (= 3 (db/count Collection)) "Collection 2 versions get duplicated, since the identity-hash changed")
            (is (= #{"Renamed Collection 1"
                     "Collection 2 version 1"
                     "Collection 2 version 2"}
                   (set (db/select-field :name Collection))))))))))
(deftest deserialization-database-table-field-test
(testing "databases, tables and fields are nested in namespaces"
(let [serialized (atom nil)
......
......@@ -15,7 +15,7 @@
[metabase.config :as config]
[metabase.shared.util :as shared.u]
[metabase.util.i18n :refer [trs tru]]
[nano-id.core :refer [nano-id]]
[nano-id.core :as nano-id]
[potemkin :as p]
[ring.util.codec :as codec]
[weavejester.dependency :as dep])
......@@ -951,9 +951,21 @@
(= (email->domain email-address) domain))
(defn generate-nano-id
  "Generates a NanoID string. Usually these are used for the entity_id field of various models.

  With no arguments, returns a random NanoID.

  With `seed-str`, the argument is taken to be an identity-hash string: it is parsed as a
  hexadecimal long and used to seed a [[java.util.Random]], so the same `seed-str` produces the
  same ID every time. `Long/parseLong` throws `NumberFormatException` if `seed-str` is not valid
  hexadecimal."
  ([] (nano-id/nano-id))
  ([seed-str]
   (let [seed (Long/parseLong seed-str 16)
         rnd  (java.util.Random. seed)
         ;; Custom generator: explicit alphabet, 21-character IDs, and random bytes drawn from
         ;; the seeded RNG above rather than the library's default source.
         gen  (nano-id/custom
               "_-0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
               21
               (fn [len]
                 (let [ba (byte-array len)]
                   (.nextBytes rnd ba)
                   ba)))]
     (gen))))
(defn pick-first
"Returns a pair [match others] where match is the first element of `coll` for which `pred` returns
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment