Skip to content
Snippets Groups Projects
Unverified Commit 72b4c637 authored by Cam Saul's avatar Cam Saul Committed by GitHub
Browse files

Robust Snowflake test data loading (#28420)

* Robust Snowflake test data loading

* Handle invalid dates
parent 8ad2a85b
Branches
Tags
No related merge requests found
......@@ -23,7 +23,7 @@
(deftest ^:parallel ddl-statements-test
(testing "make sure we didn't break the code that is used to generate DDL statements when we add new test datasets"
(binding [test.data.snowflake/*database-prefix* "v3_"]
(binding [test.data.snowflake/*database-prefix-fn* (constantly "v3_")]
(testing "Create DB DDL statements"
(is (= "DROP DATABASE IF EXISTS \"v3_test-data\"; CREATE DATABASE \"v3_test-data\";"
(sql.tx/create-db-sql :snowflake (mt/get-dataset-definition defs/test-data)))))
......@@ -193,7 +193,7 @@
:type :native
:native {:query (str "SELECT {{filter_date}}, \"last_login\" "
(format "FROM \"%stest-data\".\"PUBLIC\".\"users\" "
test.data.snowflake/*database-prefix*)
(test.data.snowflake/*database-prefix-fn*))
"WHERE date_trunc('day', CAST(\"last_login\" AS timestamp))"
" = date_trunc('day', CAST({{filter_date}} AS timestamp))")
:template-tags {:filter_date {:name "filter_date"
......
......@@ -2,9 +2,11 @@
(:require
[clojure.java.jdbc :as jdbc]
[clojure.string :as str]
[clojure.test :refer :all]
[java-time :as t]
[metabase.driver.sql-jdbc.connection :as sql-jdbc.conn]
[metabase.driver.sql-jdbc.sync :as sql-jdbc.sync]
[metabase.driver.sql.util.unprepare :as unprepare]
[metabase.public-settings :as public-settings]
[metabase.test.data.interface :as tx]
[metabase.test.data.sql :as sql.tx]
[metabase.test.data.sql-jdbc :as sql-jdbc.tx]
......@@ -12,8 +14,11 @@
[metabase.test.data.sql-jdbc.load-data :as load-data]
[metabase.test.data.sql.ddl :as ddl]
[metabase.util :as u]
[metabase.util.date-2 :as u.date]
[metabase.util.log :as log]))
;; emit warnings when Java interop calls fall back to reflection (we do raw JDBC interop below)
(set! *warn-on-reflection* true)

;; hook the Snowflake driver into the shared SQL-JDBC test-extension machinery
(sql-jdbc.tx/add-test-extensions! :snowflake)

;; tell the test framework that Snowflake does not sort NILs first
(defmethod tx/sorts-nil-first? :snowflake [_ _] false)
......@@ -30,16 +35,62 @@
:type/Time "TIME"}]
(defmethod sql.tx/field-base-type->sql-type [:snowflake base-type] [_ _] sql-type))
(def ^:dynamic *database-prefix*
"v4_")
;;; In the past we had one shared prefix for everybody, and everybody was expected to play nice and not screw with it.
;;; This eventually led to BIG problems when one CI job would see that a dataset did not exist, and try to
;;; recreate it, and then another CI job would do the same thing at the same time, and eventually we'd end up with a
;;; half-created dataset that was missing a bunch of rows. So here is the new strategy going forward:
;;;
;;; 1. Every instance of Metabase gets their own prefix like `<current-date-utc>_<site-uuid>_` e.g. `test-data` becomes
;;; something like
;;;
;;; 2023_02_17_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data
;;;
;;; This will prevent jobs from running at the same time from stomping on each other's work.
;;;
;;; 2. To avoid filling our Snowflake account up with ephemeral data that never gets deleted, we will delete datasets
;;; following this pattern when they are two days old or older. E.g. if it is currently `2023-02-17` in UTC then we
;;; can delete any dataset that starts with `2023_02_15` or older. This is done once the first time we create a
;;; Snowflake test dataset in this process. See [[delete-old-datasets-if-needed!]] below.
;;;
;;; See this Slack thread for more info
;;; https://metaboat.slack.com/archives/CKZEMT1MJ/p1676659086280609?thread_ts=1676656964.624609&cid=CKZEMT1MJ or ask
;;; me (Cam) if you have any questions.
(defn- utc-date
  "Return the current date as a `java.time.LocalDate`, evaluated in the UTC time zone."
  []
  (-> (t/zoned-date-time)
      (u.date/with-time-zone-same-instant "UTC")
      t/local-date))
(defn- unique-prefix
  "Unique prefix for test datasets created by this instance, of the form `<current-date-utc>_<site-uuid>_` with every
  hyphen replaced by an underscore. See the comments above for the rationale. Example:

    2023_02_17_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_"
  ([]
   ;; default to today's date (UTC)
   (unique-prefix (utc-date)))
  ([local-date]
   {:pre [(instance? java.time.LocalDate local-date)]}
   ;; hyphens appear in both the ISO date and the UUID; Snowflake-friendly names use underscores instead
   (-> (str local-date "_" (public-settings/site-uuid) "_")
       (str/replace "-" "_"))))
(def ^:dynamic *database-prefix-fn*
  "Function that returns a unique prefix to use for test datasets for this instance. Dynamic so it can be rebound to
  return something fixed when we're testing the SQL we generate,
  e.g. [[metabase.driver.snowflake-test/report-timezone-test]].

  This is a function (rather than a precomputed string) because [[unique-prefix]] can't be calculated until the
  application database is initialized, since it relies on [[public-settings/site-uuid]]."
  #'unique-prefix)
;; NOTE(review): the span below contained interleaved old- and new-version diff lines (the stale docstring and the
;; old `*database-prefix*`-based body); this is the reconstructed new version.
(defn- qualified-db-name
  "Prepend `database-name` with the [[*database-prefix-fn*]] so we don't stomp on any other jobs running at the same
  time.

  Idempotent: a name that already carries the prefix is returned unchanged."
  [database-name]
  (let [prefix ((var-get #'*database-prefix-fn*))]
    ;; try not to qualify the database name twice!
    (if (str/starts-with? database-name prefix)
      database-name
      (str prefix database-name))))
(defmethod tx/dbdef->connection-details :snowflake
[_ context {:keys [database-name]}]
......@@ -74,54 +125,102 @@
[]
(sql-jdbc.conn/connection-details->spec :snowflake (tx/dbdef->connection-details :snowflake :server nil)))
(defn- existing-dataset-names
  "Set of lower-cased catalog (database) names that currently exist in the Snowflake account."
  []
  (jdbc/with-db-metadata [metadata (no-db-connection-spec)]
    ;; for whatever dumb reason the Snowflake JDBC driver always returns these as uppercase despite us making them
    ;; all lower-case
    (into #{} (map u/lower-case-en) (sql-jdbc.sync/get-catalogs metadata))))
;; NOTE(review): the span below had the deleted `(let [datasets (atom nil)] ...)` helpers interleaved into the middle
;; of this function by the diff scrape; this is the reconstructed new version with the stale lines removed.
(defn- old-dataset-name?
  "Is this dataset name prefixed by a date two days ago or older?

  If the date is invalid e.g. `2023-02-31` then we'll count it as old so it will get deleted anyway."
  [dataset-name]
  (when-let [[_ year month day] (re-matches #"^(\d{4})_(\d{2})_(\d{2}).*$" dataset-name)]
    (let [dataset-date (try
                         (t/local-date (parse-long year) (parse-long month) (parse-long day))
                         ;; invalid dates (e.g. month 13 or day 31 in February) throw; fall through to `nil`
                         (catch Throwable _
                           nil))]
      (if-not dataset-date
        ;; unparseable date prefix => treat as old so it gets cleaned up anyway
        true
        (t/before? dataset-date (u.date/add (utc-date) :day -1))))))
;; NOTE(review): removed the trailing remnant of the deleted `remove-existing-dataset!` helper that the diff scrape
;; left dangling after this test.
(deftest ^:parallel old-dataset-name?-test
  ;; names prefixed with a date two days ago or older (or an invalid date) count as old
  (are [s] (old-dataset-name? s)
    "2023_02_01_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    "2023_01_17_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    "2022_02_17_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    (str (unique-prefix (u.date/add (utc-date) :day -2)) "test-data")
    ;; if the date is invalid we should just treat it as old and delete it.
    "2022_00_00_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    "2022_13_01_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    "2022_02_31_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data")
  ;; future dates, yesterday's/today's prefix, and non-date-prefixed names are NOT old
  (are [s] (not (old-dataset-name? s))
    "2050_02_17_82e897cb_ad31_4c82_a4b6_3e9e2e1dc1cb_test-data"
    "v3_test-data"
    (str (unique-prefix) "test-data")
    (str (unique-prefix (u.date/add (utc-date) :day -1)) "test-data")))
(defn- old-dataset-names
  "Return a collection of all dataset names that are old -- prefixed with a date two days ago or older?"
  []
  ;; both the Connection and the catalogs ResultSet are Closeable, so manage them in a single `with-open`
  (with-open [conn (jdbc/get-connection (no-db-connection-spec))
              rset (.getCatalogs (.getMetaData conn))]
    (loop [old-names []]
      (if (.next rset)
        ;; for whatever dumb reason the Snowflake JDBC driver always returns these as uppercase despite us making
        ;; them all lower-case
        (let [catalog (u/lower-case-en (.getString rset "TABLE_CAT"))]
          (recur (if (old-dataset-name? catalog)
                   (conj old-names catalog)
                   old-names)))
        old-names))))
(defn- delete-old-datasets!
  "Delete any datasets prefixed by a date that is two days ago or older. See comments above."
  []
  ;; the printlns below are on purpose because we want them to show up when running tests, even on CI, to make sure
  ;; this stuff is working correctly. We can change it to `log` in the future when we're satisfied everything is
  ;; working as intended -- Cam
  #_{:clj-kondo/ignore [:discouraged-var]}
  (println "[Snowflake] deleting old datasets...")
  (let [old-datasets (old-dataset-names)]
    (when (seq old-datasets)
      (with-open [conn (jdbc/get-connection (no-db-connection-spec))
                  stmt (.createStatement conn)]
        (letfn [(drop-dataset! [dataset-name]
                  #_{:clj-kondo/ignore [:discouraged-var]}
                  (println "[Snowflake] Deleting old dataset:" dataset-name)
                  (try
                    (.execute stmt (format "DROP DATABASE \"%s\";" dataset-name))
                    ;; if this fails for some reason it's probably just because some other job tried to delete the
                    ;; dataset at the same time. No big deal. Just log this and carry on trying to delete the other
                    ;; datasets. If we don't end up deleting anything it's not the end of the world because it won't
                    ;; affect our ability to run our tests
                    (catch Throwable e
                      #_{:clj-kondo/ignore [:discouraged-var]}
                      (println "[Snowflake] Error deleting old dataset:" (ex-message e)))))]
          (run! drop-dataset! old-datasets))))))
;; one-shot guard: flips to `true` after [[delete-old-datasets!]] has run once in this process, so old-dataset
;; cleanup happens at most once per JVM
(defonce ^:private deleted-old-datasets?
  (atom false))
(defn- delete-old-datsets-if-needed!
  "Call [[delete-old-datasets!]], only if we haven't done so already (at most once per process).

  NOTE(review): \"datsets\" in the name is a typo; kept as-is so existing callers don't break."
  []
  ;; double-checked locking: cheap deref first, then take the lock and re-check so concurrent callers can't both run
  ;; the (slow) deletion
  (when-not @deleted-old-datasets?
    (locking deleted-old-datasets?
      (when-not @deleted-old-datasets?
        (delete-old-datasets!)
        (reset! deleted-old-datasets? true)))))
;; NOTE(review): the span below contained the entire pre-commit implementation (the `existing-datasets` atom-based
;; create/drop-on-failure logic) interleaved with the new one; this is the reconstructed new version.
(defmethod tx/create-db! :snowflake
  [driver db-def & options]
  ;; qualify the DB name with the unique prefix (see comments above) so concurrent CI jobs don't stomp on each other
  (let [db-def (update db-def :database-name qualified-db-name)]
    ;; clean up any old datasets that should be deleted
    (delete-old-datsets-if-needed!)
    ;; now call the default impl for SQL JDBC drivers
    (apply (get-method tx/create-db! :sql-jdbc/test-extensions) driver db-def options)))
;; NOTE(review): the span below had the old arg vector and the old `(remove-existing-dataset! ...)` tail interleaved
;; with the new lines; this is the reconstructed new version.
(defmethod tx/destroy-db! :snowflake
  [_driver {:keys [database-name]}]
  (let [database-name (qualified-db-name database-name)
        sql           (format "DROP DATABASE \"%s\";" database-name)]
    (log/infof "[Snowflake] %s" sql)
    (jdbc/execute! (no-db-connection-spec) [sql])))
;; For reasons I don't understand the Snowflake JDBC driver doesn't seem to work when trying to use parameterized
;; INSERT statements, even though the documentation suggests it should. Just go ahead and deparameterize all the
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment