diff --git a/src/metabase/db/liquibase.clj b/src/metabase/db/liquibase.clj index 0cc49e23235faa19696f86424eea13b6740320f0..5bc2df394b6440c3165f74aee1f99e872311dff4 100644 --- a/src/metabase/db/liquibase.clj +++ b/src/metabase/db/liquibase.clj @@ -200,13 +200,16 @@ (catch Exception e (log/error e (trs "Unable to release the Liquibase lock after a migration failure")))))) -(defn- wait-for-migration-lock-to-be-cleared +(defn- lock-service ^LockService [^Liquibase liquibase] + (.getLockService (LockServiceFactory/getInstance) (.getDatabase liquibase))) + +(defn- wait-for-migration-lock "Check and make sure the database isn't locked. If it is, sleep for 2 seconds and then retry several times. There's a chance the lock will end up clearing up so we can run migrations normally." [^Liquibase liquibase] (let [retry-counter (volatile! 0)] (u/auto-retry 5 - (when (migration-lock-exists? liquibase) + (when-not (.acquireLock (lock-service liquibase)) (Thread/sleep 2000) (vswap! retry-counter inc) (throw @@ -216,66 +219,100 @@ " " (trs "You can force-release these locks by running `java -jar metabase.jar migrate release-locks`.")))))) (if (pos? @retry-counter) - (log/warnf "Migration lock was cleared after %d retries." @retry-counter) - (log/info "No migration lock found.")))) - -(defn migrate-up-if-needed! - "Run any unrun `liquibase` migrations, if needed." - [^Liquibase liquibase ^DataSource data-source] - (log/info (trs "Checking if Database has unrun migrations...")) - (if (seq (unrun-migrations data-source)) - (do - (log/info (trs "Database has unrun migrations. Checking if migraton lock is taken...")) - (wait-for-migration-lock-to-be-cleared liquibase) - ;; while we were waiting for the lock, it was possible that another instance finished the migration(s), so make - ;; sure something still needs to be done... - (let [to-run-migrations (unrun-migrations data-source) - unrun-migrations-count (count to-run-migrations)] - (if (pos? unrun-migrations-count) - (let [^Contexts contexts nil - start-time (System/currentTimeMillis)] - (log/info (trs "Running {0} migrations ..." unrun-migrations-count)) - (doseq [^ChangeSet change to-run-migrations] - (log/tracef "To run migration %s" (.getId change))) - (.update liquibase contexts) - (log/info (trs "Migration complete in {0}" (u/format-milliseconds (- (System/currentTimeMillis) start-time))))) - (log/info - (trs "Migration lock cleared, but nothing to do here! Migrations were finished by another instance."))))) - (log/info (trs "No unrun migrations found.")))) + (log/warnf "Migration lock was acquired after %d retries." @retry-counter) + (do (log/info "No migration lock found.") + (log/info "Migration lock acquired."))))) + +(defn holding-lock? + "Check whether the given Liquibase instance is already holding the database migration lock." + [liquibase] + (.hasChangeLogLock (lock-service liquibase))) + +(def ^:private ^:dynamic *lock-depth* 0) + +(defn- assert-locked [liquibase] + (when-not (holding-lock? liquibase) + (throw (ex-info "This operation requires a hold on the liquibase migration lock." + {:lock-exists? (migration-lock-exists? liquibase) + ;; It's possible that the lock was accidentally released by an operation, or force released by + ;; another process, so its useful for debugging to know whether we were still within a locked + ;; scope. + :lock-depth *lock-depth*})))) (defn run-in-scope-locked "Run function `f` in a scope on the Liquibase instance `liquibase`. - Liquibase scopes are used to hold configuration and parameters (akin to binding dynamic variables in - Clojure). This function initializes the database and the resource accessor which are often required." + Liquibase scopes are used to hold configuration and parameters (akin to binding dynamic variables in + Clojure). This function initializes the database and the resource accessor which are often required. + In order to ensure that mutual exclusion of these scopes across all running Metabase instances, we take a lock + in the app database. It's the responsibility of inner functions which require the lock to call [[assert-locked]]." [^Liquibase liquibase f] - (let [database (.getDatabase liquibase) - ^LockService lock-service (.getLockService (LockServiceFactory/getInstance) database) - scope-objects {(.name Scope$Attr/database) database + ;; Disallow nested locking in dev and CI, in order to force a clear lexical boundary where locking begins. + ;; Inner functions that require the lock to be held should + (when (holding-lock? liquibase) + ;; In somehow we encounter this situation in production, rather take a nested lock - it is re-entrant. + (when-not config/is-prod? + (throw (LockException. "Attempted to take a Liquibase lock, but we already are holding it.")))) + (let [database (.getDatabase liquibase) + scope-objects {(.name Scope$Attr/database) database (.name Scope$Attr/resourceAccessor) (.getResourceAccessor liquibase)}] (Scope/child ^Map scope-objects (reify Scope$ScopedRunner (run [_] - (.waitForLock lock-service) + (wait-for-migration-lock liquibase) (try - (f) + (binding [*lock-depth* (inc *lock-depth*)] + (f)) (finally - (.releaseLock lock-service)))))))) + (when (zero? *lock-depth*) + (.releaseLock (lock-service liquibase)))))))))) + +(defmacro with-scope-locked + "Run `body` in a scope on the Liquibase instance `liquibase`. + Liquibase scopes are used to hold configuration and parameters (akin to binding dynamic variables in + Clojure). This function initializes the database and the resource accessor which are often required. + The underlying locks are re-entrant, so it is safe to nest these blocks." + {:style/indent 1} + [liquibase & body] + `(run-in-scope-locked ~liquibase (fn [] ~@body))) + +(defn migrate-up-if-needed! + "Run any unrun `liquibase` migrations, if needed." + [^Liquibase liquibase ^DataSource data-source] + (log/info (trs "Checking if Database has unrun migrations...")) + (if (seq (unrun-migrations data-source)) + (do + (log/info (trs "Database has unrun migrations. Checking if migration lock is taken...")) + (with-scope-locked liquibase + ;; while we were waiting for the lock, it was possible that another instance finished the migration(s), so make + ;; sure something still needs to be done... + (let [to-run-migrations (unrun-migrations data-source) + unrun-migrations-count (count to-run-migrations)] + (if (pos? unrun-migrations-count) + (let [^Contexts contexts nil + start-time (System/currentTimeMillis)] + (log/info (trs "Running {0} migrations ..." unrun-migrations-count)) + (doseq [^ChangeSet change to-run-migrations] + (log/tracef "To run migration %s" (.getId change))) + (.update liquibase contexts) + (log/info (trs "Migration complete in {0}" (u/format-milliseconds (- (System/currentTimeMillis) start-time))))) + (log/info + (trs "Migration lock cleared, but nothing to do here! Migrations were finished by another instance.")))))) + (log/info (trs "No unrun migrations found.")))) (defn update-with-change-log - "Run update with the change log instances in `liquibase`." + "Run update with the change log instances in `liquibase`. Must be called within a scope holding the liquibase lock." ([liquibase] (update-with-change-log liquibase {})) ([^Liquibase liquibase {:keys [^List change-set-filters exec-listener] :or {change-set-filters []}}] + (assert-locked liquibase) (let [change-log (.getDatabaseChangeLog liquibase) database (.getDatabase liquibase) log-iterator (ChangeLogIterator. change-log ^"[Lliquibase.changelog.filter.ChangeSetFilter;" (into-array ChangeSetFilter change-set-filters)) update-visitor (UpdateVisitor. database ^ChangeExecListener exec-listener) runtime-env (RuntimeEnvironment. database (Contexts.) nil)] - (run-in-scope-locked - liquibase - #(.run ^ChangeLogIterator log-iterator update-visitor runtime-env))))) + (.run ^ChangeLogIterator log-iterator update-visitor runtime-env)))) (mu/defn force-migrate-up-if-needed! "Force migrating up. This does three things differently from [[migrate-up-if-needed!]]: @@ -287,39 +324,47 @@ #3295." [^Liquibase liquibase :- (ms/InstanceOfClass Liquibase) ^DataSource data-source :- (ms/InstanceOfClass DataSource)] - ;; have to do this before clear the checksums else it will wait for locks to be released + ;; We should have already released the lock before consolidating the changelog, but include this statement again + ;; here to avoid depending on that non-local implementation detail. It is possible that the lock has been taken + ;; again by another process before we reach this, and it's even possible that we lose yet *another* race again + ;; between the next two lines, but we accept the risk of blocking in that latter case rather than complicating things + ;; further. (release-lock-if-needed! liquibase) + ;; This implicitly clears the lock, so it needs to execute first. (.clearCheckSums liquibase) - (when (seq (unrun-migrations data-source)) - (let [change-log (.getDatabaseChangeLog liquibase) - fail-on-errors (mapv (fn [^ChangeSet change-set] [change-set (.getFailOnError change-set)]) - (.getChangeSets change-log)) - exec-listener (proxy [AbstractChangeExecListener] [] - (willRun [^ChangeSet change-set _database-change-log _database _run-status] - (when (instance? ChangeSet change-set) - (log/info (format "Start executing migration with id %s" (.getId change-set))))) - - (runFailed [^ChangeSet change-set _database-change-log _database ^Exception e] - (log/error (u/format-color 'red "[ERROR] %s" (.getMessage e)))) - - (ran [change-set _database-change-log _database ^ChangeSet$ExecType exec-type] - (when (instance? ChangeSet change-set) - (condp = exec-type - ChangeSet$ExecType/EXECUTED - (log/info (u/format-color 'green "[SUCCESS]")) - - ChangeSet$ExecType/FAILED - (log/error (u/format-color 'red "[ERROR]")) - - (log/info (format "[%s]" (.name exec-type)))))))] - (try - (doseq [^ChangeSet change-set (.getChangeSets change-log)] - (.setFailOnError change-set false)) - (update-with-change-log liquibase {:exec-listener exec-listener}) - (finally - (doseq [[^ChangeSet change-set fail-on-error?] fail-on-errors] - (.setFailOnError change-set fail-on-error?))))))) - + (with-scope-locked liquibase + (when (seq (unrun-migrations data-source)) + (let [change-log (.getDatabaseChangeLog liquibase) + fail-on-errors (mapv (fn [^ChangeSet change-set] [change-set (.getFailOnError change-set)]) + (.getChangeSets change-log)) + exec-listener (proxy [AbstractChangeExecListener] [] + (willRun [^ChangeSet change-set _database-change-log _database _run-status] + (when (instance? ChangeSet change-set) + (log/info (format "Start executing migration with id %s" (.getId change-set))))) + + (runFailed [^ChangeSet _change-set _database-change-log _database ^Exception e] + (log/error (u/format-color 'red "[ERROR] %s" (.getMessage e)))) + + (ran [change-set _database-change-log _database ^ChangeSet$ExecType exec-type] + (when (instance? ChangeSet change-set) + (condp = exec-type + ChangeSet$ExecType/EXECUTED + (log/info (u/format-color 'green "[SUCCESS]")) + + ChangeSet$ExecType/FAILED + (log/error (u/format-color 'red "[ERROR]")) + + (log/info (format "[%s]" (.name exec-type)))))))] + (try + (doseq [^ChangeSet change-set (.getChangeSets change-log)] + (.setFailOnError change-set false)) + (update-with-change-log liquibase {:exec-listener exec-listener}) + (finally + (doseq [[^ChangeSet change-set fail-on-error?] fail-on-errors] + (.setFailOnError change-set fail-on-error?)))))))) + +(def ^:private legacy-migrations-file "migrations/000_legacy_migrations.yaml") +(def ^:private update-migrations-file "migrations/001_update_migrations.yaml") (mu/defn consolidate-liquibase-changesets! "Consolidate all previous DB migrations so they come from single file. @@ -330,16 +375,25 @@ See https://github.com/metabase/metabase/issues/3715 Also see https://github.com/metabase/metabase/pull/34400" - [conn :- (ms/InstanceOfClass java.sql.Connection)] + [conn :- (ms/InstanceOfClass java.sql.Connection) + liquibase :- (ms/InstanceOfClass Liquibase)] (let [liquibase-table-name (changelog-table-name conn) - statement (format "UPDATE %s SET FILENAME = CASE WHEN ID = ? THEN ? WHEN ID < ? THEN ? ELSE ? END" liquibase-table-name)] + conn-spec {:connection conn}] (when-not (fresh-install? conn) - (jdbc/execute! - {:connection conn} - [statement - "v00.00-000" "migrations/001_update_migrations.yaml" - "v45.00-001" "migrations/000_legacy_migrations.yaml" - "migrations/001_update_migrations.yaml"])))) + ;; Skip mutating the table if the filenames are already correct. It assumes we have never moved the boundary + ;; between the two files, i.e. that update-migrations still start from v45. + (when-not (= #{legacy-migrations-file update-migrations-file} + (->> (str "SELECT DISTINCT(FILENAME) AS filename FROM " liquibase-table-name) + (jdbc/query conn-spec) + (into #{} (map :filename)))) + (log/info "Updating liquibase table to reflect consolidated changeset filenames") + (with-scope-locked liquibase + (jdbc/execute! + conn-spec + [(format "UPDATE %s SET FILENAME = CASE WHEN ID = ? THEN ? WHEN ID < ? THEN ? ELSE ? END" liquibase-table-name) + "v00.00-000" update-migrations-file + "v45.00-001" legacy-migrations-file + update-migrations-file])))))) (defn- extract-numbers "Returns contiguous integers parsed from string s" @@ -359,13 +413,14 @@ (throw (IllegalArgumentException. (format "target version must be a number between 44 and the previous major version (%d), inclusive" (config/current-major-version))))) - ;; count and rollback only the applied change set ids which come after the target version (only the "v..." IDs need to be considered) - (let [changeset-query (format "SELECT id FROM %s WHERE id LIKE 'v%%' ORDER BY ORDEREXECUTED ASC" (changelog-table-name conn)) - changeset-ids (map :id (jdbc/query {:connection conn} [changeset-query])) - ;; IDs in changesets do not include the leading 0/1 digit, so the major version is the first number - ids-to-drop (drop-while #(not= (inc target-version) (first (extract-numbers %))) changeset-ids)] - (log/infof "Rolling back app database schema to version %d" target-version) - (.rollback liquibase (count ids-to-drop) "")))) + (with-scope-locked liquibase + ;; count and rollback only the applied change set ids which come after the target version (only the "v..." IDs need to be considered) + (let [changeset-query (format "SELECT id FROM %s WHERE id LIKE 'v%%' ORDER BY ORDEREXECUTED ASC" (changelog-table-name conn)) + changeset-ids (map :id (jdbc/query {:connection conn} [changeset-query])) + ;; IDs in changesets do not include the leading 0/1 digit, so the major version is the first number + ids-to-drop (drop-while #(not= (inc target-version) (first (extract-numbers %))) changeset-ids)] + (log/infof "Rolling back app database schema to version %d" target-version) + (.rollback liquibase (count ids-to-drop) ""))))) (defn latest-applied-major-version "Gets the latest version that was applied to the database." diff --git a/src/metabase/db/setup.clj b/src/metabase/db/setup.clj index ab4d87285f81702907152e36633a5a1acd181246..7f5c30cf470ff3b788b5fc92d322e093af7052ca 100644 --- a/src/metabase/db/setup.clj +++ b/src/metabase/db/setup.clj @@ -74,7 +74,13 @@ (log/info (trs "Setting up Liquibase...")) (liquibase/with-liquibase [liquibase conn] (try - (liquibase/consolidate-liquibase-changesets! conn) + ;; Consolidating the changeset requires the lock, so we may need to release it first. + (when (= :force direction) + (liquibase/release-lock-if-needed! liquibase)) + ;; Releasing the locks does not depend on the changesets, so we skip this step as it might require locking. + (when-not (= :release-locks direction) + (liquibase/consolidate-liquibase-changesets! conn liquibase)) + (log/info (trs "Liquibase is ready.")) (case direction :up (liquibase/migrate-up-if-needed! liquibase data-source) diff --git a/test/metabase/db/liquibase_test.clj b/test/metabase/db/liquibase_test.clj index ffd0e5d87d7cdbe1999e3083619efb883dcc87a4..f760b11dbcca30c20d4745d4ffc95718789a3e93 100644 --- a/test/metabase/db/liquibase_test.clj +++ b/test/metabase/db/liquibase_test.clj @@ -67,9 +67,9 @@ ;; fake a db where we ran all the migrations, including the legacy ones (with-redefs [liquibase/decide-liquibase-file (fn [& _args] @#'liquibase/changelog-legacy-file)] (liquibase/with-liquibase [liquibase conn] - (.update liquibase "")) - (t2/update! (liquibase/changelog-table-name conn) {:filename "migrations/000_migrations.yaml"}) - (liquibase/consolidate-liquibase-changesets! conn) + (.update liquibase "") + (t2/update! (liquibase/changelog-table-name conn) {:filename "migrations/000_migrations.yaml"}) + (liquibase/consolidate-liquibase-changesets! conn liquibase)) (testing "makes sure the change log filename are correctly set" (is (= (set (liquibase-file->included-ids "migrations/000_legacy_migrations.yaml" driver/*driver*)) (t2/select-fn-set :id (liquibase/changelog-table-name conn) :filename "migrations/000_legacy_migrations.yaml"))) diff --git a/test/metabase/db/schema_migrations_test/impl.clj b/test/metabase/db/schema_migrations_test/impl.clj index 6b640f4e4c85d4870a5fa817ccee53d9d1bff43e..bce97d9bb0ca96cf77ec0d62f2044453623f6e13 100644 --- a/test/metabase/db/schema_migrations_test/impl.clj +++ b/test/metabase/db/schema_migrations_test/impl.clj @@ -189,14 +189,12 @@ accept?) (ChangeSetFilterResult. accept? "decision according to range" (class this)))))] change-log-service (.getChangeLogService (ChangeLogHistoryServiceFactory/getInstance) database)] - (liquibase/run-in-scope-locked - liquibase - (fn [] - ;; Calling .listUnrunChangeSets has the side effect of creating the Liquibase tables - ;; and initializing checksums so that they match the ones generated in production. - (.listUnrunChangeSets liquibase nil (LabelExpression.)) - (.generateDeploymentId change-log-service) - (liquibase/update-with-change-log liquibase {:change-set-filters change-set-filters})))))) + (liquibase/with-scope-locked liquibase + ;; Calling .listUnrunChangeSets has the side effect of creating the Liquibase tables + ;; and initializing checksums so that they match the ones generated in production. + (.listUnrunChangeSets liquibase nil (LabelExpression.)) + (.generateDeploymentId change-log-service) + (liquibase/update-with-change-log liquibase {:change-set-filters change-set-filters}))))) (defn- test-migrations-for-driver [driver [start-id end-id] f] (log/debug (u/format-color 'yellow "Testing migrations for driver %s..." driver))