From 3f2942346dcd4a58cf76b647495afa4039e0c9f6 Mon Sep 17 00:00:00 2001 From: Braden Shepherdson <Braden.Shepherdson@gmail.com> Date: Thu, 7 Jul 2022 16:10:18 -0400 Subject: [PATCH] YAML files for serialization (#23491) Write `storage.yaml` and `ingest.yaml` to serialize all the way to YAML files and back. Lots of generative testing to check it's isomorphic. --- .clj-kondo/config.edn | 1 + .../serialization/v2/ingest/yaml.clj | 44 +++++++ .../serialization/v2/load.clj | 6 +- .../serialization/v2/storage.clj | 15 +++ .../serialization/v2/storage/yaml.clj | 41 +++++++ .../serialization/v2/extract_test.clj | 8 +- .../serialization/v2/load_test.clj | 18 +-- .../serialization/v2/yaml_test.clj | 115 ++++++++++++++++++ src/metabase/models/serialization/base.clj | 22 ++-- src/metabase/models/setting.clj | 2 +- 10 files changed, 244 insertions(+), 28 deletions(-) create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj create mode 100644 enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj diff --git a/.clj-kondo/config.edn b/.clj-kondo/config.edn index 3b6197df019..e3152a075e6 100644 --- a/.clj-kondo/config.edn +++ b/.clj-kondo/config.edn @@ -366,6 +366,7 @@ metabase.test/with-user-in-groups clojure.core/let metabase.test.data.interface/defdataset clojure.core/def metabase.test.data.interface/defdataset-edn clojure.core/def + metabase-enterprise.serialization.test-util/with-random-dump-dir clojure.core/fn metabase.driver.mongo.util/with-mongo-connection clojure.core/let metabase.driver.mongo.query-processor/mongo-let clojure.core/let toucan.db/with-call-counting clojure.core/fn diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj new file mode 100644 index 00000000000..bf524725873 --- /dev/null +++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj @@ -0,0 +1,44 @@ +(ns metabase-enterprise.serialization.v2.ingest.yaml + (:require [clojure.java.io :as io] + [metabase-enterprise.serialization.v2.ingest :as ingest] + [yaml.core :as yaml]) + (:import java.io.File)) + +(defmulti ^:private build-metas + (fn [^File file] (.getName file))) + +(defmethod build-metas "settings.yaml" [file] + (let [settings (yaml/from-file file)] + (for [[k _] settings] + {:model "Setting" :id (name k)}))) + +(defmethod build-metas :default [^File file] + (let [model-name (-> file .getParentFile .getName) + [_ id label] (re-matches #"^([A-Za-z0-9_-]+)(?:\+(.*))?.yaml$" (.getName file))] + [(cond-> {:model model-name :id id} + label (assoc :label label))])) + +(defn- ingest-entity [root-dir {:keys [model id label] :as meta-map}] + (let [filename (if label + (str id "+" label ".yaml") + (str id ".yaml"))] + (-> (io/file root-dir model filename) + yaml/from-file + (assoc :serdes/meta meta-map)))) + +(deftype YamlIngestion [^File root-dir settings] + ingest/Ingestable + (ingest-list [_] + (eduction (comp (filter (fn [^File f] (.isFile f))) + (mapcat build-metas)) + (file-seq root-dir))) + (ingest-one [_ {:keys [model id] :as meta-map}] + (if (= "Setting" model) + {:serdes/meta meta-map :key (keyword id) :value (get settings (keyword id))} + (ingest-entity root-dir meta-map)))) + +(defn ingest-yaml + "Creates a new Ingestable on a directory of YAML files, as created by + [[metabase-enterprise.serialization.v2.storage.yaml]]." + [root-dir] + (->YamlIngestion (io/file root-dir) (yaml/from-file (io/file root-dir "settings.yaml")))) diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj index e70254ea905..39b5e98e719 100644 --- a/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj +++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj @@ -56,12 +56,12 @@ [[metabase.models.serialization.base/load-one!]] and its various overridable parts, which see. Circular dependencies are not allowed, and are detected and thrown as an error." - [{:keys [expanding ingestion seen] :as ctx} {:keys [id type] :as meta-map}] + [{:keys [expanding ingestion seen] :as ctx} {id :id model-name :model :as meta-map}] (cond - (expanding id) (throw (ex-info (format "Circular dependency on %s %s" type id) {})) + (expanding id) (throw (ex-info (format "Circular dependency on %s %s" model-name id) {})) (seen id) ctx ; Already been done, just skip it. :else (let [ingested (serdes.ingest/ingest-one ingestion meta-map) - model (db/resolve-model (symbol type)) + model (db/resolve-model (symbol model-name)) deps (serdes.base/serdes-dependencies ingested) ctx (-> ctx (update :expanding conj id) diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj new file mode 100644 index 00000000000..ed67b9efb0c --- /dev/null +++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj @@ -0,0 +1,15 @@ +(ns metabase-enterprise.serialization.v2.storage + "A multimethod entry point for storage sinks. Storage is the second phase of serialization. + See [[metabase.models.serialization.base]] for detailed documentation of the serialization process. + Implementations of storage should live in [[metabase-enterprise.serialization.v2.storage.yaml]] and similar.") + +(defmulti store-all! + "`(store-all! stream opts)` + `stream` is a reducible stream of portable maps with `:serdes/meta` keys. + `opts` is a map of options, such as the path to the root directory. + + See [[metabase.models.serialization.base]] for detailed documentation of the serialization process, and the maps in + the stream. + + Keyed on the only required key in `opts`: `{:storage/target ...}`." + (fn [_ {target :storage/target}] target)) diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj new file mode 100644 index 00000000000..733aca79dd4 --- /dev/null +++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj @@ -0,0 +1,41 @@ +(ns metabase-enterprise.serialization.v2.storage.yaml + (:require [clojure.java.io :as io] + [metabase-enterprise.serialization.v2.storage :as storage] + [yaml.core :as yaml])) + +(defn- spit-yaml + [path obj] + (apply io/make-parents path) + (spit (apply io/file path) (yaml/generate-string obj :dumper-options {:flow-style :block}))) + +(defn- store-entity! [{:keys [root-dir]} {{:keys [id model label]} :serdes/meta :as entity}] + (let [basename (if (nil? label) + (str id ".yaml") + ;; + is a legal, unescaped character on all common filesystems, but not `identity-hash` or NanoID! + (str id "+" label ".yaml")) + path [root-dir model basename]] + (spit-yaml path (dissoc entity :serdes/meta)))) + +(defn- store-settings! [{:keys [root-dir]} settings] + (let [as-map (into (sorted-map) + (for [{:keys [key value]} settings] + [key value]))] + (spit-yaml [root-dir "settings.yaml"] as-map))) + +(defmethod storage/store-all! :yaml [stream opts] + (when-not (or (string? (:root-dir opts)) + (instance? java.io.File (:root-dir opts))) + (throw (ex-info ":yaml storage requires the :root-dir option to be a string or File" + {:opts opts}))) + (let [settings (atom [])] + (doseq [entity stream] + (if (-> entity :serdes/meta :model (= "Setting")) + (swap! settings conj entity) + (store-entity! opts entity))) + (store-settings! opts @settings))) + +(defn store! + "Helper for storing a serialized database to a tree of YAML files." + [stream root-dir] + (storage/store-all! stream {:storage/target :yaml + :root-dir root-dir})) diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj index 90574451886..797408564ad 100644 --- a/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj +++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj @@ -28,7 +28,7 @@ (testing "a top-level collection is extracted correctly" (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id coll-id]))] - (is (= {:type "Collection" :id coll-eid :label coll-slug} (:serdes/meta ser))) + (is (= {:model "Collection" :id coll-eid :label coll-slug} (:serdes/meta ser))) (is (not (contains? ser :location))) (is (not (contains? ser :id))) (is (nil? (:personal_owner_id ser))) @@ -37,7 +37,7 @@ (testing "a nested collection is extracted with the right parent_id" (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id child-id]))] - (is (= {:type "Collection" :id child-eid :label child-slug} (:serdes/meta ser))) + (is (= {:model "Collection" :id child-eid :label child-slug} (:serdes/meta ser))) (is (not (contains? ser :location))) (is (not (contains? ser :id))) (is (= coll-eid (:parent_id ser))) @@ -45,7 +45,7 @@ (testing "personal collections are extracted with email as key" (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id pc-id]))] - (is (= {:type "Collection" :id pc-eid :label pc-slug} (:serdes/meta ser))) + (is (= {:model "Collection" :id pc-eid :label pc-slug} (:serdes/meta ser))) (is (not (contains? ser :location))) (is (not (contains? ser :id))) (is (nil? (:parent_id ser))) @@ -55,7 +55,7 @@ (letfn [(collections [extraction] (->> extraction (into []) (map :serdes/meta) - (filter #(= "Collection" (:type %))) + (filter #(= "Collection" (:model %))) (map :id) set))] (testing "no user specified" diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj index 864b8252b70..aae206bdcd6 100644 --- a/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj +++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj @@ -9,16 +9,16 @@ [toucan.db :as db])) (defn- ingestion-in-memory [extractions] - (let [mapped (into {} (for [{{:keys [type id]} :serdes/meta :as m} (into [] extractions)] - [[type id] m]))] + (let [mapped (into {} (for [{{:keys [model id]} :serdes/meta :as m} (into [] extractions)] + [[model id] m]))] (reify serdes.ingest/Ingestable (ingest-list [_] (eduction (map :serdes/meta) (vals mapped))) - (ingest-one [_ {:keys [type id]}] - (or (get mapped [type id]) - (throw (ex-info (format "Unknown ingestion target: %s %s" type id) - {:type type :id id :world mapped}))))))) + (ingest-one [_ {:keys [model id]}] + (or (get mapped [model id]) + (throw (ex-info (format "Unknown ingestion target: %s %s" model id) + {:model model :id id :world mapped}))))))) ;;; WARNING for test authors: [[extract/extract-metabase]] returns a lazy reducible value. To make sure you don't ;;; confound your tests with data from your dev appdb, remember to eagerly @@ -33,8 +33,8 @@ (ts/with-source-db (ts/create! Collection :name "Basic Collection" :entity_id eid1) (reset! serialized (into [] (serdes.extract/extract-metabase {}))) - (is (some (fn [{{:keys [type id]} :serdes/meta}] - (and (= type "Collection") (= id eid1))) + (is (some (fn [{{:keys [model id]} :serdes/meta}] + (and (= model "Collection") (= id eid1))) @serialized)))) (testing "loading into an empty database succeeds" @@ -113,7 +113,7 @@ (serdes.hash/identity-hash @c2b)} (->> @serialized (map :serdes/meta) - (filter #(= "Collection" (:type %))) + (filter #(= "Collection" (:model %))) (map :id) set)))) diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj new file mode 100644 index 00000000000..2377774282e --- /dev/null +++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj @@ -0,0 +1,115 @@ +(ns metabase-enterprise.serialization.v2.yaml-test + (:require [clojure.java.io :as io] + [clojure.test :refer :all] + [metabase-enterprise.serialization.test-util :as ts] + [metabase-enterprise.serialization.v2.extract :as extract] + [metabase-enterprise.serialization.v2.ingest :as ingest] + [metabase-enterprise.serialization.v2.ingest.yaml :as ingest.yaml] + [metabase-enterprise.serialization.v2.storage.yaml :as storage.yaml] + [metabase.models.collection :refer [Collection]] + [metabase.test.generate :as test-gen] + [reifyhealth.specmonstah.core :as rs] + [yaml.core :as yaml])) + +(defn- dir->file-set [dir] + (->> dir + .listFiles + (filter #(.isFile %)) + (map #(.getName %)) + set)) + +(deftest basic-dump-test + (ts/with-random-dump-dir [dump-dir] + (ts/with-empty-h2-app-db + (ts/with-temp-dpc [Collection [parent {:name "Some Collection"}] + Collection [child {:name "Child Collection" :location (format "/%d/" (:id parent))}]] + (let [export (into [] (extract/extract-metabase nil)) + parent-filename (format "%s+some_collection.yaml" (:entity_id parent)) + child-filename (format "%s+child_collection.yaml" (:entity_id child))] + (storage.yaml/store! export dump-dir) + (testing "the right files in the right places" + (is (= #{parent-filename child-filename} + (dir->file-set (io/file dump-dir "Collection"))) + "Entities go in subdirectories") + (is (= #{"settings.yaml"} + (dir->file-set (io/file dump-dir))) + "A few top-level files are expected")) + + (testing "the Collections properly exported" + (is (= (-> (into {} (Collection (:id parent))) + (dissoc :id :location) + (assoc :parent_id nil)) + (yaml/from-file (io/file dump-dir "Collection" parent-filename)))) + + (is (= (-> (into {} (Collection (:id child))) + (dissoc :id :location) + (assoc :parent_id (:entity_id parent))) + (yaml/from-file (io/file dump-dir "Collection" child-filename)))))))))) + +(deftest basic-ingest-test + (ts/with-random-dump-dir [dump-dir] + (io/make-parents dump-dir "Collection" "fake") ; Prepare the right directories. + (spit (io/file dump-dir "settings.yaml") + (yaml/generate-string {:some-key "with string value" + :another-key 7 + :blank-key nil})) + (spit (io/file dump-dir "Collection" "fake-id+the_label.yaml") + (yaml/generate-string {:some "made up" :data "here"})) + (spit (io/file dump-dir "Collection" "no-label.yaml") + (yaml/generate-string {:some "other" :data "in this one"})) + + (let [ingestable (ingest.yaml/ingest-yaml dump-dir) + meta-maps (into [] (ingest/ingest-list ingestable)) + exp-files {{:model "Collection" :id "fake-id" :label "the_label"} {:some "made up" :data "here"} + {:model "Collection" :id "no-label"} {:some "other" :data "in this one"} + {:model "Setting" :id "some-key"} {:key :some-key :value "with string value"} + {:model "Setting" :id "another-key"} {:key :another-key :value 7} + {:model "Setting" :id "blank-key"} {:key :blank-key :value nil}}] + (testing "the right set of file is returned by ingest-list" + (is (= (set (keys exp-files)) + (set meta-maps)))) + + (testing "individual reads in any order are correct" + (doseq [meta-map (->> exp-files + keys + (repeat 10) + (into [] cat) + shuffle)] + (is (= (-> exp-files + (get meta-map) + (assoc :serdes/meta meta-map)) + (ingest/ingest-one ingestable meta-map)))))))) + +(deftest e2e-storage-ingestion-test + (ts/with-random-dump-dir [dump-dir] + (ts/with-empty-h2-app-db + (test-gen/insert! {:collection [[100 {:refs {:personal_owner_id ::rs/omit}}]]}) + (let [extraction (into [] (extract/extract-metabase {})) + entities (reduce (fn [m {{:keys [model id]} :serdes/meta :as entity}] + (assoc-in m [model id] entity)) + {} extraction)] + (is (= 100 (-> entities (get "Collection") vals count))) + + (testing "storage" + (storage.yaml/store! (seq extraction) dump-dir) + (testing "for Collections" + (is (= 100 (count (dir->file-set (io/file dump-dir "Collection"))))) + (doseq [{:keys [entity_id slug] :as coll} (vals (get entities "Collection")) + :let [filename (str entity_id "+" slug ".yaml")]] + (is (= (dissoc coll :serdes/meta) + (yaml/from-file (io/file dump-dir "Collection" filename)))))) + + (testing "for settings" + (is (= (into {} (for [{:keys [key value]} (vals (get entities "Setting"))] + [key value])) + (yaml/from-file (io/file dump-dir "settings.yaml")))))) + + (testing "ingestion" + (let [ingestable (ingest.yaml/ingest-yaml dump-dir)] + (testing "ingest-list is accurate" + (is (= (into #{} (comp (map vals) cat (map :serdes/meta)) (vals entities)) + (into #{} (ingest/ingest-list ingestable))))) + + (testing "each entity matches its in-memory original" + (doseq [entity extraction] + (is (= entity (ingest/ingest-one ingestable (:serdes/meta entity)))))))))))) diff --git a/src/metabase/models/serialization/base.clj b/src/metabase/models/serialization/base.clj index c2bd53143a1..d29a3fd93ca 100644 --- a/src/metabase/models/serialization/base.clj +++ b/src/metabase/models/serialization/base.clj @@ -61,7 +61,7 @@ (fn [model _] model)) (defmulti extract-query - "Performs the select query, possibly filtered, for all the entities of this type that should be serialized. Called + "Performs the select query, possibly filtered, for all the entities of this model that should be serialized. Called from [[extract-all]]'s default implementation. `(extract-query \"ModelName\" opts)` @@ -87,8 +87,8 @@ (defmulti extract-one "Extracts a single entity retrieved from the database into a portable map with `:serdes/meta` attached. - The default implementation uses the model name as the `:type` and either `:entity_id` or [[serdes.hash/identity-hash]] - as the `:id`. It also strips off the database's numeric primary key. + The default implementation uses the model name as the `:model` and either `:entity_id` or + [[serdes.hash/identity-hash]] as the `:id`. It also strips off the database's numeric primary key. That suffices for a few simple entities, but most entities will need to override this. They should follow the pattern of: @@ -132,9 +132,9 @@ (let [model (db/resolve-model (symbol model-name)) pk (models/primary-key model)] (-> entity - (assoc :serdes/meta {:type model-name - :id (or (:entity_id entity) - (serdes.hash/identity-hash (model (get entity pk))))}) + (assoc :serdes/meta {:model model-name + :id (or (:entity_id entity) + (serdes.hash/identity-hash (model (get entity pk))))}) (dissoc pk)))) (defmethod extract-one :default [model-name entity] @@ -198,10 +198,10 @@ ;;; +----------------------------------------------------------------------------------------------------------------+ ;;; The Clojure maps from extraction and ingestion always include a special key `:serdes/meta` giving some information ;;; about the serialized entity. The value is always a map like: -;;; `{:type "ModelName" :id "entity ID or identity hash string" :label "Human-readable name"}` -;;; `:type` and `:id` are required; `:label` is optional. +;;; `{:model "ModelName" :id "entity ID or identity hash string" :label "Human-readable name"}` +;;; `:model` and `:id` are required; `:label` is optional. ;;; -;;; Many of the multimethods are keyed on the `:type` field. +;;; Many of the multimethods are keyed on the `:model` field. (defmulti load-prescan-all "Returns a reducible stream of `[entity_id identity-hash primary-key]` triples for the entire table. @@ -235,9 +235,9 @@ key])) (defn- ingested-model - "The dispatch function for several of the load multimethods: dispatching on the type of the incoming entity." + "The dispatch function for several of the load multimethods: dispatching on the model of the incoming entity." [ingested] - (-> ingested :serdes/meta :type)) + (-> ingested :serdes/meta :model)) (defmulti serdes-dependencies "Given an entity map as ingested (not a Toucan entity) returns a (possibly empty) list of its dependencies, where each diff --git a/src/metabase/models/setting.clj b/src/metabase/models/setting.clj index 0a3837ec4cd..df3f9a0a1f3 100644 --- a/src/metabase/models/setting.clj +++ b/src/metabase/models/setting.clj @@ -151,7 +151,7 @@ (defmethod serdes.base/extract-all "Setting" [_model _opts] (for [{:keys [key value]} (admin-writable-site-wide-settings :getter (partial get-value-of-type :string))] - {:serdes/meta {:type "Setting" :id (name key)} + {:serdes/meta {:model "Setting" :id (name key)} :key key :value value})) -- GitLab