From 3f2942346dcd4a58cf76b647495afa4039e0c9f6 Mon Sep 17 00:00:00 2001
From: Braden Shepherdson <Braden.Shepherdson@gmail.com>
Date: Thu, 7 Jul 2022 16:10:18 -0400
Subject: [PATCH] YAML files for serialization (#23491)

Write `storage.yaml` and `ingest.yaml` to serialize all the way to YAML files and back.

Lots of generative testing to check it's isomorphic.
---
 .clj-kondo/config.edn                         |   1 +
 .../serialization/v2/ingest/yaml.clj          |  44 +++++++
 .../serialization/v2/load.clj                 |   6 +-
 .../serialization/v2/storage.clj              |  15 +++
 .../serialization/v2/storage/yaml.clj         |  41 +++++++
 .../serialization/v2/extract_test.clj         |   8 +-
 .../serialization/v2/load_test.clj            |  18 +--
 .../serialization/v2/yaml_test.clj            | 115 ++++++++++++++++++
 src/metabase/models/serialization/base.clj    |  22 ++--
 src/metabase/models/setting.clj               |   2 +-
 10 files changed, 244 insertions(+), 28 deletions(-)
 create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj
 create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj
 create mode 100644 enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj
 create mode 100644 enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj

diff --git a/.clj-kondo/config.edn b/.clj-kondo/config.edn
index 3b6197df019..e3152a075e6 100644
--- a/.clj-kondo/config.edn
+++ b/.clj-kondo/config.edn
@@ -366,6 +366,7 @@
            metabase.test/with-user-in-groups clojure.core/let
            metabase.test.data.interface/defdataset clojure.core/def
            metabase.test.data.interface/defdataset-edn clojure.core/def
+           metabase-enterprise.serialization.test-util/with-random-dump-dir clojure.core/fn
            metabase.driver.mongo.util/with-mongo-connection clojure.core/let
            metabase.driver.mongo.query-processor/mongo-let clojure.core/let
            toucan.db/with-call-counting clojure.core/fn
diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj
new file mode 100644
index 00000000000..bf524725873
--- /dev/null
+++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/ingest/yaml.clj
@@ -0,0 +1,44 @@
+(ns metabase-enterprise.serialization.v2.ingest.yaml
+  (:require [clojure.java.io :as io]
+            [metabase-enterprise.serialization.v2.ingest :as ingest]
+            [yaml.core :as yaml])
+  (:import java.io.File))
+
+(defmulti ^:private build-metas
+  (fn [^File file] (.getName file)))
+
+(defmethod build-metas "settings.yaml" [file]
+  (let [settings (yaml/from-file file)]
+    (for [[k _] settings]
+      {:model "Setting" :id (name k)})))
+
+(defmethod build-metas :default [^File file]
+  (let [model-name   (-> file .getParentFile .getName)
+        [_ id label] (re-matches #"^([A-Za-z0-9_-]+)(?:\+(.*))?.yaml$" (.getName file))]
+    [(cond-> {:model model-name :id id}
+       label (assoc :label label))]))
+
+(defn- ingest-entity [root-dir {:keys [model id label] :as meta-map}]
+  (let [filename (if label
+                   (str id "+" label ".yaml")
+                   (str id ".yaml"))]
+    (-> (io/file root-dir model filename)
+        yaml/from-file
+        (assoc :serdes/meta meta-map))))
+
+(deftype YamlIngestion [^File root-dir settings]
+  ingest/Ingestable
+  (ingest-list [_]
+    (eduction (comp (filter (fn [^File f] (.isFile f)))
+                    (mapcat build-metas))
+              (file-seq root-dir)))
+  (ingest-one [_ {:keys [model id] :as meta-map}]
+    (if (= "Setting" model)
+      {:serdes/meta meta-map :key (keyword id) :value (get settings (keyword id))}
+      (ingest-entity  root-dir meta-map))))
+
+(defn ingest-yaml
+  "Creates a new Ingestable on a directory of YAML files, as created by
+  [[metabase-enterprise.serialization.v2.storage.yaml]]."
+  [root-dir]
+  (->YamlIngestion (io/file root-dir) (yaml/from-file (io/file root-dir "settings.yaml"))))
diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj
index e70254ea905..39b5e98e719 100644
--- a/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj
+++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/load.clj
@@ -56,12 +56,12 @@
   [[metabase.models.serialization.base/load-one!]] and its various overridable parts, which see.
 
   Circular dependencies are not allowed, and are detected and thrown as an error."
-  [{:keys [expanding ingestion seen] :as ctx} {:keys [id type] :as meta-map}]
+  [{:keys [expanding ingestion seen] :as ctx} {id :id model-name :model :as meta-map}]
   (cond
-    (expanding id) (throw (ex-info (format "Circular dependency on %s %s" type id) {}))
+    (expanding id) (throw (ex-info (format "Circular dependency on %s %s" model-name id) {}))
     (seen id)      ctx ; Already been done, just skip it.
     :else (let [ingested (serdes.ingest/ingest-one ingestion meta-map)
-                model    (db/resolve-model (symbol type))
+                model    (db/resolve-model (symbol model-name))
                 deps     (serdes.base/serdes-dependencies ingested)
                 ctx      (-> ctx
                              (update :expanding conj id)
diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj
new file mode 100644
index 00000000000..ed67b9efb0c
--- /dev/null
+++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage.clj
@@ -0,0 +1,15 @@
+(ns metabase-enterprise.serialization.v2.storage
+  "A multimethod entry point for storage sinks. Storage is the second phase of serialization.
+  See [[metabase.models.serialization.base]] for detailed documentation of the serialization process.
+  Implementations of storage should live in [[metabase-enterprise.serialization.v2.storage.yaml]] and similar.")
+
+(defmulti store-all!
+  "`(store-all! stream opts)`
+  `stream` is a reducible stream of portable maps with `:serdes/meta` keys.
+  `opts` is a map of options, such as the path to the root directory.
+
+  See [[metabase.models.serialization.base]] for detailed documentation of the serialization process, and the maps in
+  the stream.
+
+  Keyed on the only required key in `opts`: `{:storage/target ...}`."
+  (fn [_ {target :storage/target}] target))
diff --git a/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj
new file mode 100644
index 00000000000..733aca79dd4
--- /dev/null
+++ b/enterprise/backend/src/metabase_enterprise/serialization/v2/storage/yaml.clj
@@ -0,0 +1,41 @@
+(ns metabase-enterprise.serialization.v2.storage.yaml
+  (:require [clojure.java.io :as io]
+            [metabase-enterprise.serialization.v2.storage :as storage]
+            [yaml.core :as yaml]))
+
+(defn- spit-yaml
+  [path obj]
+  (apply io/make-parents path)
+  (spit (apply io/file path) (yaml/generate-string obj :dumper-options {:flow-style :block})))
+
+(defn- store-entity! [{:keys [root-dir]} {{:keys [id model label]} :serdes/meta :as entity}]
+  (let [basename (if (nil? label)
+                   (str id ".yaml")
+                   ;; + is a legal, unescaped character on all common filesystems, but not `identity-hash` or NanoID!
+                   (str id "+" label ".yaml"))
+        path [root-dir model basename]]
+    (spit-yaml path (dissoc entity :serdes/meta))))
+
+(defn- store-settings! [{:keys [root-dir]} settings]
+  (let [as-map (into (sorted-map)
+                     (for [{:keys [key value]} settings]
+                       [key value]))]
+    (spit-yaml [root-dir "settings.yaml"] as-map)))
+
+(defmethod storage/store-all! :yaml [stream opts]
+  (when-not (or (string? (:root-dir opts))
+                (instance? java.io.File (:root-dir opts)))
+    (throw (ex-info ":yaml storage requires the :root-dir option to be a string or File"
+                    {:opts opts})))
+  (let [settings (atom [])]
+    (doseq [entity stream]
+      (if (-> entity :serdes/meta :model (= "Setting"))
+        (swap! settings conj entity)
+        (store-entity! opts entity)))
+    (store-settings! opts @settings)))
+
+(defn store!
+  "Helper for storing a serialized database to a tree of YAML files."
+  [stream root-dir]
+  (storage/store-all! stream {:storage/target :yaml
+                              :root-dir       root-dir}))
diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj
index 90574451886..797408564ad 100644
--- a/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj
+++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/extract_test.clj
@@ -28,7 +28,7 @@
 
       (testing "a top-level collection is extracted correctly"
         (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id coll-id]))]
-          (is (= {:type "Collection" :id coll-eid :label coll-slug} (:serdes/meta ser)))
+          (is (= {:model "Collection" :id coll-eid :label coll-slug} (:serdes/meta ser)))
           (is (not (contains? ser :location)))
           (is (not (contains? ser :id)))
           (is (nil? (:personal_owner_id ser)))
@@ -37,7 +37,7 @@
 
       (testing "a nested collection is extracted with the right parent_id"
         (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id child-id]))]
-          (is (= {:type "Collection" :id child-eid :label child-slug} (:serdes/meta ser)))
+          (is (= {:model "Collection" :id child-eid :label child-slug} (:serdes/meta ser)))
           (is (not (contains? ser :location)))
           (is (not (contains? ser :id)))
           (is (= coll-eid (:parent_id ser)))
@@ -45,7 +45,7 @@
 
       (testing "personal collections are extracted with email as key"
         (let [ser (serdes.base/extract-one "Collection" (select-one "Collection" [:= :id pc-id]))]
-          (is (= {:type "Collection" :id pc-eid :label pc-slug} (:serdes/meta ser)))
+          (is (= {:model "Collection" :id pc-eid :label pc-slug} (:serdes/meta ser)))
           (is (not (contains? ser :location)))
           (is (not (contains? ser :id)))
           (is (nil? (:parent_id ser)))
@@ -55,7 +55,7 @@
         (letfn [(collections [extraction] (->> extraction
                                                (into [])
                                                (map :serdes/meta)
-                                               (filter #(= "Collection" (:type %)))
+                                               (filter #(= "Collection" (:model %)))
                                                (map :id)
                                                set))]
           (testing "no user specified"
diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj
index 864b8252b70..aae206bdcd6 100644
--- a/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj
+++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/load_test.clj
@@ -9,16 +9,16 @@
             [toucan.db :as db]))
 
 (defn- ingestion-in-memory [extractions]
-  (let [mapped (into {} (for [{{:keys [type id]} :serdes/meta :as m} (into [] extractions)]
-                          [[type id] m]))]
+  (let [mapped (into {} (for [{{:keys [model id]} :serdes/meta :as m} (into [] extractions)]
+                          [[model id] m]))]
     (reify
       serdes.ingest/Ingestable
       (ingest-list [_]
         (eduction (map :serdes/meta) (vals mapped)))
-      (ingest-one [_ {:keys [type id]}]
-        (or (get mapped [type id])
-            (throw (ex-info (format "Unknown ingestion target: %s %s" type id)
-                            {:type type :id id :world mapped})))))))
+      (ingest-one [_ {:keys [model id]}]
+        (or (get mapped [model id])
+            (throw (ex-info (format "Unknown ingestion target: %s %s" model id)
+                            {:model model :id id :world mapped})))))))
 
 ;;; WARNING for test authors: [[extract/extract-metabase]] returns a lazy reducible value. To make sure you don't
 ;;; confound your tests with data from your dev appdb, remember to eagerly
@@ -33,8 +33,8 @@
           (ts/with-source-db
             (ts/create! Collection :name "Basic Collection" :entity_id eid1)
             (reset! serialized (into [] (serdes.extract/extract-metabase {})))
-            (is (some (fn [{{:keys [type id]} :serdes/meta}]
-                        (and (= type "Collection") (= id eid1)))
+            (is (some (fn [{{:keys [model id]} :serdes/meta}]
+                        (and (= model "Collection") (= id eid1)))
                       @serialized))))
 
         (testing "loading into an empty database succeeds"
@@ -113,7 +113,7 @@
                    (serdes.hash/identity-hash @c2b)}
                  (->> @serialized
                       (map :serdes/meta)
-                      (filter #(= "Collection" (:type %)))
+                      (filter #(= "Collection" (:model %)))
                       (map :id)
                       set))))
 
diff --git a/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj b/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj
new file mode 100644
index 00000000000..2377774282e
--- /dev/null
+++ b/enterprise/backend/test/metabase_enterprise/serialization/v2/yaml_test.clj
@@ -0,0 +1,115 @@
+(ns metabase-enterprise.serialization.v2.yaml-test
+  (:require [clojure.java.io :as io]
+            [clojure.test :refer :all]
+            [metabase-enterprise.serialization.test-util :as ts]
+            [metabase-enterprise.serialization.v2.extract :as extract]
+            [metabase-enterprise.serialization.v2.ingest :as ingest]
+            [metabase-enterprise.serialization.v2.ingest.yaml :as ingest.yaml]
+            [metabase-enterprise.serialization.v2.storage.yaml :as storage.yaml]
+            [metabase.models.collection :refer [Collection]]
+            [metabase.test.generate :as test-gen]
+            [reifyhealth.specmonstah.core :as rs]
+            [yaml.core :as yaml]))
+
+(defn- dir->file-set [dir]
+  (->> dir
+       .listFiles
+       (filter #(.isFile %))
+       (map #(.getName %))
+       set))
+
+(deftest basic-dump-test
+  (ts/with-random-dump-dir [dump-dir]
+    (ts/with-empty-h2-app-db
+      (ts/with-temp-dpc [Collection [parent {:name "Some Collection"}]
+                         Collection [child  {:name "Child Collection" :location (format "/%d/" (:id parent))}]]
+        (let [export          (into [] (extract/extract-metabase nil))
+              parent-filename (format "%s+some_collection.yaml"  (:entity_id parent))
+              child-filename  (format "%s+child_collection.yaml" (:entity_id child))]
+          (storage.yaml/store! export dump-dir)
+          (testing "the right files in the right places"
+            (is (= #{parent-filename child-filename}
+                   (dir->file-set (io/file dump-dir "Collection")))
+                "Entities go in subdirectories")
+            (is (= #{"settings.yaml"}
+                   (dir->file-set (io/file dump-dir)))
+                "A few top-level files are expected"))
+
+          (testing "the Collections properly exported"
+            (is (= (-> (into {} (Collection (:id parent)))
+                       (dissoc :id :location)
+                       (assoc :parent_id nil))
+                   (yaml/from-file (io/file dump-dir "Collection" parent-filename))))
+
+            (is (= (-> (into {} (Collection (:id child)))
+                       (dissoc :id :location)
+                       (assoc :parent_id (:entity_id parent)))
+                   (yaml/from-file (io/file dump-dir "Collection" child-filename))))))))))
+
+(deftest basic-ingest-test
+  (ts/with-random-dump-dir [dump-dir]
+    (io/make-parents dump-dir "Collection" "fake") ; Prepare the right directories.
+    (spit (io/file dump-dir "settings.yaml")
+          (yaml/generate-string {:some-key "with string value"
+                                 :another-key 7
+                                 :blank-key nil}))
+    (spit (io/file dump-dir "Collection" "fake-id+the_label.yaml")
+          (yaml/generate-string {:some "made up" :data "here"}))
+    (spit (io/file dump-dir "Collection" "no-label.yaml")
+          (yaml/generate-string {:some "other" :data "in this one"}))
+
+    (let [ingestable (ingest.yaml/ingest-yaml dump-dir)
+          meta-maps  (into [] (ingest/ingest-list ingestable))
+          exp-files  {{:model "Collection" :id "fake-id" :label "the_label"} {:some "made up" :data "here"}
+                      {:model "Collection" :id "no-label"}                   {:some "other" :data "in this one"}
+                      {:model "Setting" :id "some-key"}                      {:key :some-key :value "with string value"}
+                      {:model "Setting" :id "another-key"}                   {:key :another-key :value 7}
+                      {:model "Setting" :id "blank-key"}                     {:key :blank-key :value nil}}]
+      (testing "the right set of file is returned by ingest-list"
+        (is (= (set (keys exp-files))
+               (set meta-maps))))
+
+      (testing "individual reads in any order are correct"
+        (doseq [meta-map (->> exp-files
+                              keys
+                              (repeat 10)
+                              (into [] cat)
+                              shuffle)]
+          (is (= (-> exp-files
+                     (get meta-map)
+                     (assoc :serdes/meta meta-map))
+                 (ingest/ingest-one ingestable meta-map))))))))
+
+(deftest e2e-storage-ingestion-test
+  (ts/with-random-dump-dir [dump-dir]
+    (ts/with-empty-h2-app-db
+      (test-gen/insert! {:collection [[100 {:refs {:personal_owner_id ::rs/omit}}]]})
+      (let [extraction (into [] (extract/extract-metabase {}))
+            entities   (reduce (fn [m {{:keys [model id]} :serdes/meta :as entity}]
+                                 (assoc-in m [model id] entity))
+                               {} extraction)]
+        (is (= 100 (-> entities (get "Collection") vals count)))
+
+        (testing "storage"
+          (storage.yaml/store! (seq extraction) dump-dir)
+          (testing "for Collections"
+            (is (= 100 (count (dir->file-set (io/file dump-dir "Collection")))))
+            (doseq [{:keys [entity_id slug] :as coll} (vals (get entities "Collection"))
+                    :let [filename (str entity_id "+" slug ".yaml")]]
+              (is (= (dissoc coll :serdes/meta)
+                     (yaml/from-file (io/file dump-dir "Collection" filename))))))
+
+          (testing "for settings"
+            (is (= (into {} (for [{:keys [key value]} (vals (get entities "Setting"))]
+                              [key value]))
+                   (yaml/from-file (io/file dump-dir "settings.yaml"))))))
+
+        (testing "ingestion"
+          (let [ingestable (ingest.yaml/ingest-yaml dump-dir)]
+            (testing "ingest-list is accurate"
+              (is (= (into #{} (comp (map vals) cat (map :serdes/meta)) (vals entities))
+                     (into #{} (ingest/ingest-list ingestable)))))
+
+            (testing "each entity matches its in-memory original"
+              (doseq [entity extraction]
+                (is (= entity (ingest/ingest-one ingestable (:serdes/meta entity))))))))))))
diff --git a/src/metabase/models/serialization/base.clj b/src/metabase/models/serialization/base.clj
index c2bd53143a1..d29a3fd93ca 100644
--- a/src/metabase/models/serialization/base.clj
+++ b/src/metabase/models/serialization/base.clj
@@ -61,7 +61,7 @@
   (fn [model _] model))
 
 (defmulti extract-query
-  "Performs the select query, possibly filtered, for all the entities of this type that should be serialized. Called
+  "Performs the select query, possibly filtered, for all the entities of this model that should be serialized. Called
   from [[extract-all]]'s default implementation.
 
   `(extract-query \"ModelName\" opts)`
@@ -87,8 +87,8 @@
 (defmulti extract-one
   "Extracts a single entity retrieved from the database into a portable map with `:serdes/meta` attached.
 
-  The default implementation uses the model name as the `:type` and either `:entity_id` or [[serdes.hash/identity-hash]]
-  as the `:id`. It also strips off the database's numeric primary key.
+  The default implementation uses the model name as the `:model` and either `:entity_id` or
+  [[serdes.hash/identity-hash]] as the `:id`. It also strips off the database's numeric primary key.
 
   That suffices for a few simple entities, but most entities will need to override this.
   They should follow the pattern of:
@@ -132,9 +132,9 @@
   (let [model (db/resolve-model (symbol model-name))
         pk    (models/primary-key model)]
     (-> entity
-        (assoc :serdes/meta {:type model-name
-                             :id   (or (:entity_id entity)
-                                       (serdes.hash/identity-hash (model (get entity pk))))})
+        (assoc :serdes/meta {:model model-name
+                             :id    (or (:entity_id entity)
+                                        (serdes.hash/identity-hash (model (get entity pk))))})
         (dissoc pk))))
 
 (defmethod extract-one :default [model-name entity]
@@ -198,10 +198,10 @@
 ;;; +----------------------------------------------------------------------------------------------------------------+
 ;;; The Clojure maps from extraction and ingestion always include a special key `:serdes/meta` giving some information
 ;;; about the serialized entity. The value is always a map like:
-;;; `{:type "ModelName" :id "entity ID or identity hash string" :label "Human-readable name"}`
-;;; `:type` and `:id` are required; `:label` is optional.
+;;; `{:model "ModelName" :id "entity ID or identity hash string" :label "Human-readable name"}`
+;;; `:model` and `:id` are required; `:label` is optional.
 ;;;
-;;; Many of the multimethods are keyed on the `:type` field.
+;;; Many of the multimethods are keyed on the `:model` field.
 
 (defmulti load-prescan-all
   "Returns a reducible stream of `[entity_id identity-hash primary-key]` triples for the entire table.
@@ -235,9 +235,9 @@
      key]))
 
 (defn- ingested-model
-  "The dispatch function for several of the load multimethods: dispatching on the type of the incoming entity."
+  "The dispatch function for several of the load multimethods: dispatching on the model of the incoming entity."
   [ingested]
-  (-> ingested :serdes/meta :type))
+  (-> ingested :serdes/meta :model))
 
 (defmulti serdes-dependencies
   "Given an entity map as ingested (not a Toucan entity) returns a (possibly empty) list of its dependencies, where each
diff --git a/src/metabase/models/setting.clj b/src/metabase/models/setting.clj
index 0a3837ec4cd..df3f9a0a1f3 100644
--- a/src/metabase/models/setting.clj
+++ b/src/metabase/models/setting.clj
@@ -151,7 +151,7 @@
 (defmethod serdes.base/extract-all "Setting" [_model _opts]
   (for [{:keys [key value]} (admin-writable-site-wide-settings
                               :getter (partial get-value-of-type :string))]
-    {:serdes/meta {:type "Setting" :id (name key)}
+    {:serdes/meta {:model "Setting" :id (name key)}
      :key key
      :value value}))
 
-- 
GitLab