diff --git a/.circleci/config.yml b/.circleci/config.yml index d4d1bc452ebb9f669e3c2d7b7d03cfb4336c5a92..3c1013fe9752e240624b1f56d62ff51ae85ad64a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -148,7 +148,7 @@ executors: - image: circleci/clojure:lein-2.8.1 - image: metabase/presto-mb-ci environment: - JAVA_TOOL_OPTIONS: "-Xmx2g" + JAVA_TOOL_OPTIONS: "-Xmx1g" sparksql: working_directory: /home/circleci/metabase/metabase/ @@ -691,7 +691,7 @@ jobs: - run: name: Test << parameters.driver >> driver << parameters.description >> environment: - DRIVERS: h2,<< parameters.driver >> + DRIVERS: << parameters.driver >> command: lein with-profile +ci,+junit,+ee test no_output_timeout: << parameters.timeout >> - store_test_results: diff --git a/modules/drivers/mongo/test/metabase/driver/mongo_test.clj b/modules/drivers/mongo/test/metabase/driver/mongo_test.clj index 218011d19e9c9cba042a695f39468cfa9f666157..53aec8737bf0352605c418c32e74097f7457aec0 100644 --- a/modules/drivers/mongo/test/metabase/driver/mongo_test.clj +++ b/modules/drivers/mongo/test/metabase/driver/mongo_test.clj @@ -11,6 +11,7 @@ [metabase.models.table :as table :refer [Table]] [metabase.query-processor :as qp] [metabase.query-processor-test :as qp.t :refer [rows]] + [metabase.sync :as sync] [metabase.test :as mt] [metabase.test.data.interface :as tx] [taoensso.nippy :as nippy] @@ -148,6 +149,8 @@ (deftest all-num-columns-test (mt/test-driver :mongo (mt/dataset all-null-columns + ;; do a full sync on the DB to get the correct special type info + (sync/sync-database! (mt/db)) (is (= [{:name "_id", :database_type "java.lang.Long", :base_type :type/Integer, :special_type :type/PK} {:name "favorite_snack", :database_type "NULL", :base_type :type/*, :special_type nil} {:name "name", :database_type "java.lang.String", :base_type :type/Text, :special_type :type/Name}] diff --git a/project.clj b/project.clj index 21eb25daa10c54497a75b8b54d783c488c3634e5..5463b6c84194335aad92e4232e37684ca55d9c0a 100644 --- a/project.clj +++ b/project.clj @@ -245,7 +245,7 @@ :format-result metabase.junit/format-result}} :ci - {:jvm-opts ["-Xmx2500m"]} + {:jvm-opts ["-Xmx2000m"]} :install {} diff --git a/src/metabase/sync.clj b/src/metabase/sync.clj index eafb841b36b81611973f268f8fc15e3172839ee1..bdd78bf37f012a5c47703ba29ec20a3baec7fcc1 100644 --- a/src/metabase/sync.clj +++ b/src/metabase/sync.clj @@ -26,20 +26,30 @@ (s/defn sync-database! :- SyncDatabaseResults "Perform all the different sync operations synchronously for `database`. - This is considered a 'full sync' in that all the different sync operations are performed at consecutively. Please - note that this function is *not* what is called by the scheduled tasks; those call different steps independently. - This function is called when a Database is first added." + By default, does a `:full` sync that performs all the different sync operations consecutively. You may instead + specify only a `:schema` sync that will sync just the schema but skip analysis. + + Please note that this function is *not* what is called by the scheduled tasks; those call different steps + independently. This function is called when a Database is first added." {:style/indent 1} - [database :- i/DatabaseInstance] - (sync-util/sync-operation :sync database (format "Sync %s" (sync-util/name-for-logging database)) - (mapv (fn [[f step-name]] (assoc (f database) :name step-name)) - [ - ;; First make sure Tables, Fields, and FK information is up-to-date - [sync-metadata/sync-db-metadata! "metadata"] - ;; Next, run the 'analysis' step where we do things like scan values of fields and update special types accordingly - [analyze/analyze-db! "analyze"] - ;; Finally, update cached FieldValues - [field-values/update-field-values! "field-values"]]))) + ([database] + (sync-database! database nil)) + + ([database :- i/DatabaseInstance + {:keys [scan], :or {scan :full}} :- (s/maybe {(s/optional-key :scan) (s/maybe (s/enum :schema :full))})] + (sync-util/sync-operation :sync database (format "Sync %s" (sync-util/name-for-logging database)) + (mapv (fn [[f step-name]] (assoc (f database) :name step-name)) + (filter + some? + [;; First make sure Tables, Fields, and FK information is up-to-date + [sync-metadata/sync-db-metadata! "metadata"] + ;; Next, run the 'analysis' step where we do things like scan values of fields and update special types + ;; accordingly + (when (= scan :full) + [analyze/analyze-db! "analyze"]) + ;; Finally, update cached FieldValues + (when (= scan :full) + [field-values/update-field-values! "field-values"])]))))) (s/defn sync-table! "Perform all the different sync operations synchronously for a given `table`." diff --git a/src/metabase/sync/util.clj b/src/metabase/sync/util.clj index b41e370efcc9f01faf033325de7976dd23295582..f1766bf59d7dd080abeedf03c985c5b752374f0c 100644 --- a/src/metabase/sync/util.clj +++ b/src/metabase/sync/util.clj @@ -55,16 +55,16 @@ {:style/indent 2} [operation database-or-id f] (fn [] - (when-not (contains? (@operation->db-ids operation) (u/get-id database-or-id)) + (when-not (contains? (@operation->db-ids operation) (u/the-id database-or-id)) (try ;; mark this database as currently syncing so we can prevent duplicate sync attempts (#2337) - (swap! operation->db-ids update operation #(conj (or % #{}) (u/get-id database-or-id))) + (swap! operation->db-ids update operation #(conj (or % #{}) (u/the-id database-or-id))) (log/debug "Sync operations in flight:" (m/filter-vals seq @operation->db-ids)) ;; do our work (f) ;; always take the ID out of the set when we are through (finally - (swap! operation->db-ids update operation #(disj % (u/get-id database-or-id)))))))) + (swap! operation->db-ids update operation #(disj % (u/the-id database-or-id)))))))) (defn- with-sync-events @@ -81,11 +81,11 @@ (fn [] (let [start-time (System/nanoTime) tracking-hash (str (java.util.UUID/randomUUID))] - (events/publish-event! begin-event-name {:database_id (u/get-id database-or-id), :custom_id tracking-hash}) + (events/publish-event! begin-event-name {:database_id (u/the-id database-or-id), :custom_id tracking-hash}) (let [return (f) total-time-ms (int (/ (- (System/nanoTime) start-time) 1000000.0))] - (events/publish-event! end-event-name {:database_id (u/get-id database-or-id) + (events/publish-event! end-event-name {:database_id (u/the-id database-or-id) :custom_id tracking-hash :running_time total-time-ms}) return))))) @@ -240,9 +240,9 @@ ;;; +----------------------------------------------------------------------------------------------------------------+ (defn db->sync-tables - "Return all the Tables that should go through the sync processes for DATABASE-OR-ID." + "Return all the Tables that should go through the sync processes for `database-or-id`." [database-or-id] - (db/select Table, :db_id (u/get-id database-or-id), :active true, :visibility_type nil)) + (db/select Table, :db_id (u/the-id database-or-id), :active true, :visibility_type nil)) ;; The `name-for-logging` function is used all over the sync code to make sure we have easy access to consistently @@ -250,8 +250,8 @@ (defprotocol ^:private NameForLogging (name-for-logging [this] - "Return an appropriate string for logging an object in sync logging messages. - Should be something like \"postgres Database 'test-data'\"")) + "Return an appropriate string for logging an object in sync logging messages. Should be something like \"postgres + Database 'test-data'\"")) (extend-protocol NameForLogging i/DatabaseInstance @@ -398,7 +398,7 @@ database :- i/DatabaseInstance {:keys [start-time end-time]} :- SyncOperationOrStepRunMetadata] {:task task-name - :db_id (u/get-id database) + :db_id (u/the-id database) :started_at start-time :ended_at end-time :duration (.toMillis (t/duration start-time end-time))}) diff --git a/test/metabase/driver/mysql_test.clj b/test/metabase/driver/mysql_test.clj index 48dc3051313eaba06dfc2775945432ebf2efc1d6..e1d85adeb1c527e2ea20983e9394665c815e0146 100644 --- a/test/metabase/driver/mysql_test.clj +++ b/test/metabase/driver/mysql_test.clj @@ -60,12 +60,14 @@ ["Empty Vending Machine" 0]]]]) (defn- db->fields [db] - (let [table-ids (db/select-ids 'Table :db_id (u/get-id db))] + (let [table-ids (db/select-ids 'Table :db_id (u/the-id db))] (set (map (partial into {}) (db/select [Field :name :base_type :special_type] :table_id [:in table-ids]))))) (deftest tiny-int-1-test (mt/test-driver :mysql (mt/dataset tiny-int-ones + ;; trigger a full sync on this database so fields are categorized correctly + (sync/sync-database! (mt/db)) (testing "By default TINYINT(1) should be a boolean" (is (= #{{:name "number-of-cans", :base_type :type/Boolean, :special_type :type/Category} {:name "id", :base_type :type/Integer, :special_type :type/PK} diff --git a/test/metabase/test/data.clj b/test/metabase/test/data.clj index 2babc816a08670df7cb230002bf78a700081953e..8871e1594cbbc1901d549ea781411bce6aaa14cc 100644 --- a/test/metabase/test/data.clj +++ b/test/metabase/test/data.clj @@ -200,8 +200,8 @@ (apply impl/the-field-id (id table-name) (map format-name (cons field-name nested-field-names))))) (defmacro dataset - "Load and sync a temporary Database defined by `dataset`, make it the current DB (for `metabase.test.data` functions - like `id` and `db`), and execute `body`. + "Create a database and load it with the data defined by `dataset`, then do a quick metadata-only sync; make it the + current DB (for `metabase.test.data` functions like `id` and `db`), and execute `body`. `dataset` can be one of the following: diff --git a/test/metabase/test/data/impl.clj b/test/metabase/test/data/impl.clj index 382bc16b40c2c317a9354adbfc5ff5c3b051e27a..e66f9ebc2f2059f3e6d3471ff6986e423bbc166d 100644 --- a/test/metabase/test/data/impl.clj +++ b/test/metabase/test/data/impl.clj @@ -1,6 +1,7 @@ (ns metabase.test.data.impl "Internal implementation of various helper functions in `metabase.test.data`." (:require [clojure.tools.logging :as log] + [clojure.tools.reader.edn :as edn] [metabase.config :as config] [metabase.driver :as driver] [metabase.models.database :refer [Database]] @@ -81,6 +82,9 @@ "Max amount of time to wait for sync to complete." (u/minutes->ms 5)) ; five minutes +(defonce ^:private reference-sync-durations + (delay (edn/read-string (slurp "test_resources/sync-durations.edn")))) + (defn- create-database! [driver {:keys [database-name table-definitions], :as database-definition}] {:pre [(seq database-name)]} (try @@ -98,16 +102,20 @@ (try ;; sync newly added DB (u/with-timeout sync-timeout-ms - (u/profile (format "Sync %s Database %s" driver database-name) - (sync/sync-database! db) - (verify-data-loaded-correctly driver database-definition db) - ;; add extra metadata for fields - (try - (add-extra-metadata! database-definition db) - (catch Throwable e - (println "Error adding extra metadata:" e))))) + (let [reference-duration (or (some-> (get @reference-sync-durations database-name) u/format-nanoseconds) + "NONE") + quick-sync? (not= database-name "test-data")] + (u/profile (format "%s %s Database %s (reference H2 duration: %s)" + (if quick-sync? "QUICK sync" "Sync") driver database-name reference-duration) + ;; only do "quick sync" for non `test-data` datasets, because it can take literally MINUTES on CI. + (sync/sync-database! db (when quick-sync? {:scan :schema})) + ;; add extra metadata for fields + (try + (add-extra-metadata! database-definition db) + (catch Throwable e + (println "Error adding extra metadata:" e)))))) ;; make sure we're returing an up-to-date copy of the DB - (Database (u/get-id db)) + (Database (u/the-id db)) (catch Throwable e (let [e (ex-info "Failed to create test database" {:driver driver @@ -115,7 +123,7 @@ :connection-details connection-details} e)] (println (u/pprint-to-str 'red (Throwable->map e))) - (db/delete! Database :id (u/get-id db)) + (db/delete! Database :id (u/the-id db)) (throw e))))) (catch Throwable e (let [message (format "Failed to create %s '%s' test database" driver database-name)] @@ -289,7 +297,7 @@ (binding [db/*disable-db-logging* true] (let [db (get-or-create-database! driver dbdef)] (assert db) - (assert (db/exists? Database :id (u/get-id db))) + (assert (db/exists? Database :id (u/the-id db))) db))))] (binding [*get-db* #(get-db-for-driver (tx/driver))] (f)))) diff --git a/test/metabase/test/data/sql_jdbc/load_data.clj b/test/metabase/test/data/sql_jdbc/load_data.clj index 850a5854949db8b24a8ec1b8921dd2f5650ef1f6..20e6d85acc5fea39171327d3bf670cf856d72ede 100644 --- a/test/metabase/test/data/sql_jdbc/load_data.clj +++ b/test/metabase/test/data/sql_jdbc/load_data.clj @@ -2,6 +2,7 @@ (:require [clojure.java.jdbc :as jdbc] [clojure.string :as str] [clojure.tools.logging :as log] + [clojure.tools.reader.edn :as edn] [medley.core :as m] [metabase.driver :as driver] [metabase.driver.sql-jdbc.execute :as sql-jdbc.execute] @@ -195,6 +196,9 @@ (jdbc/print-sql-exception-chain e) (throw e)))))) +(defonce ^:private reference-load-durations + (delay (edn/read-string (slurp "test_resources/load-durations.edn")))) + (defn create-db! "Default implementation of `create-db!` for SQL drivers." {:arglists '([driver dbdef & {:keys [skip-drop-db?]}])} @@ -211,8 +215,12 @@ ;; DB rather than on `:server` (no DB in particular) (execute/execute-sql! driver :db dbdef (str/join ";\n" statements))) ;; Now load the data for each Table - (doseq [tabledef table-definitions] - (u/profile (format "load-data for %s %s %s" (name driver) (:database-name dbdef) (:table-name tabledef)) + (doseq [tabledef table-definitions + :let [reference-duration (or (some-> (get @reference-load-durations [(:database-name dbdef) (:table-name tabledef)]) + u/format-nanoseconds) + "NONE")]] + (u/profile (format "load-data for %s %s %s (reference H2 duration: %s)" + (name driver) (:database-name dbdef) (:table-name tabledef) reference-duration) (load-data! driver dbdef tabledef)))) (defn destroy-db! diff --git a/test_resources/load-durations.edn b/test_resources/load-durations.edn new file mode 100644 index 0000000000000000000000000000000000000000..d75950de9ae0816dd5143bbc37ad9cac8a0b8903 --- /dev/null +++ b/test_resources/load-durations.edn @@ -0,0 +1,50 @@ +;; this is a map of how long it takes to load the test data for the dataset (in nanoseconds) for an in-memory H2 +;; database on Cam's local computer (Ryzen 3950x/32 GB of RAM/Java 14) +{["airports" "airport"] 131568400 + ["airports" "continent"] 2131300 + ["airports" "country"] 30031300 + ["airports" "municipality"] 61022600 + ["airports" "region"] 79085800 + ["attempted-murders" "attempts"] 29658300 + ["avian-singles" "messages"] 11632000 + ["avian-singles" "users"] 1475300 + ["basic-field-comments" "basic_field_comments"] 876400 + ["bird-flocks" "bird"] 2902800 + ["bird-flocks" "flock"] 1267600 + ["checkins_interval_15" "checkins"] 1353500 + ["checkins_interval_86400" "checkins"] 2494900 + ["checkins_interval_900" "checkins"] 1477200 + ["comment-after-sync" "comment_after_sync"] 1265800 + ["daily-bird-counts" "bird-count"] 4581800 + ["db-with-some-cruft" "acquired_toucans"] 1267800 + ["db-with-some-cruft" "south_migrationhistory"] 951300 + ["office-checkins" "checkins"] 5350200 + ["places-cam-likes" "places"] 1194400 + ["sad-toucan-incidents" "incidents"] 7324200 + ["sample-dataset" "orders"] 3439047900 + ["sample-dataset" "people"] 1922302400 + ["sample-dataset" "products"] 95636200 + ["sample-dataset" "reviews"] 292738400 + ["string-times" "times"] 2629800 + ["table_with_comment_after_sync_db" "table_with_comment_after_sync"] 997500 + ["table_with_comment_db" "table_with_comment"] 871300 + ["table_with_updated_desc_db" "table_with_updated_desc"] 818600 + ["test-data" "categories"] 10092000 + ["test-data" "checkins"] 130413900 + ["test-data" "users"] 23762400 + ["test-data" "venues"] 29380500 + ["test-data-self-referencing-user" "users"] 5118200 + ["test-data-with-null-date-checkins" "categories"] 8328000 + ["test-data-with-null-date-checkins" "checkins"] 132575300 + ["test-data-with-null-date-checkins" "users"] 6756300 + ["test-data-with-null-date-checkins" "venues"] 16336800 + ["test-data-with-time" "users"] 15280900 + ["test-data-with-timezones" "categories"] 11324400 + ["test-data-with-timezones" "checkins"] 143596000 + ["test-data-with-timezones" "users"] 7185000 + ["test-data-with-timezones" "venues"] 18375000 + ["toucan-microsecond-incidents" "incidents"] 1388100 + ["tupac-sightings" "categories"] 2228800 + ["tupac-sightings" "cities"] 17286900 + ["tupac-sightings" "sightings"] 46284700 + ["update-desc" "update_desc"] 794500} diff --git a/test_resources/sync-durations.edn b/test_resources/sync-durations.edn new file mode 100644 index 0000000000000000000000000000000000000000..be985837065167ffa44d1101ee6949dc3901aedd --- /dev/null +++ b/test_resources/sync-durations.edn @@ -0,0 +1,29 @@ +;; this is a map of how long it takes to sync the dataset (in nanoseconds) for an in-memory H2 database on Cam's local +;; computer (Ryzen 3950x/32 GB of RAM/Java 14) +{"airports" 72271700 + "attempted-murders" 41250600 + "avian-singles" 44766800 + "basic-field-comments" 36389700 + "bird-flocks" 41835800 + "checkins_interval_15" 45435500 + "checkins_interval_86400" 43530100 + "checkins_interval_900" 41998300 + "comment-after-sync" 37872400 + "daily-bird-counts" 37459400 + "db-with-some-cruft" 49419200 + "office-checkins" 41066800 + "places-cam-likes" 40787900 + "sad-toucan-incidents" 48648000 + "sample-dataset" 96251000 + "string-times" 39466300 + "table_with_comment_after_sync_db" 34791000 + "table_with_comment_db" 35052200 + "table_with_updated_desc_db" 35456200 + "test-data" 1138090600 + "test-data-self-referencing-user" 37713700 + "test-data-with-null-date-checkins" 84524900 + "test-data-with-time" 40431800 + "test-data-with-timezones" 60335800 + "toucan-microsecond-incidents" 51240600 + "tupac-sightings" 62742200 + "update-desc" 32664800}