diff --git a/src/metabase/core.clj b/src/metabase/core.clj index 03f0c9422e13301f0ecfc036283a0cb027b8ee6f..e5e617ac75d1e75fb82f641072dd9978389e325d 100644 --- a/src/metabase/core.clj +++ b/src/metabase/core.clj @@ -84,6 +84,9 @@ (task/stop-scheduler!) (server/stop-web-server!) (prometheus/shutdown!) + ;; This timeout was chosen based on a 30s default termination grace period in Kubernetes. + (let [timeout-seconds 20] + (mdb/release-migration-locks! timeout-seconds)) (log/info "Metabase Shutdown COMPLETE")) (defenterprise ensure-audit-db-installed! diff --git a/src/metabase/db.clj b/src/metabase/db.clj index 49d3eb76f2638145bd4a9270bbefc49e5c6c07fb..054cad91f6d355da240527e6e35712ea1dd1b162 100644 --- a/src/metabase/db.clj +++ b/src/metabase/db.clj @@ -73,6 +73,13 @@ (reset! (:status mdb.connection/*application-db*) ::setup-finished)))) :done) +(defn release-migration-locks! + "Wait up to `timeout-seconds` for the current process to release all migration locks, otherwise force release them." + [timeout-seconds] + (if (db-is-set-up?) + :done + (mdb.setup/release-migration-locks! (data-source) timeout-seconds))) + (defn memoize-for-application-db "Like [[clojure.core/memoize]], but only memoizes for the current application database; memoized values will be ignored if the app DB is dynamically rebound. For TTL memoization with [[clojure.core.memoize]], set diff --git a/src/metabase/db/liquibase.clj b/src/metabase/db/liquibase.clj index adb7b6a44d5322191170a7c216c1e4dc3a6839fe..31ef2b2f35b3bf616038a2c38ac33de63ab2efae 100644 --- a/src/metabase/db/liquibase.clj +++ b/src/metabase/db/liquibase.clj @@ -31,6 +31,10 @@ (set! *warn-on-reflection* true) +(defonce ^{:doc "The set of Liquibase instances which potentially have taken locks by this process."} + potentially-locked-instances + (atom #{})) + (comment ;; load our custom migrations metabase.db.custom-migrations/keep-me) @@ -198,7 +202,7 @@ (try (force-release-locks! liquibase) (catch Exception e - (log/error e "Unable to release the Liquibase lock after a migration failure"))))) + (log/error e "Unable to release the Liquibase lock"))))) (defn- lock-service ^LockService [^Liquibase liquibase] (.getLockService (LockServiceFactory/getInstance) (.getDatabase liquibase))) @@ -206,10 +210,10 @@ (defn- wait-for-migration-lock "Check and make sure the database isn't locked. If it is, sleep for 2 seconds and then retry several times. There's a chance the lock will end up clearing up so we can run migrations normally." - [^Liquibase liquibase] + [^LockService lock-service] (let [retry-counter (volatile! 0)] (u/auto-retry 5 - (when-not (.acquireLock (lock-service liquibase)) + (when-not (.acquireLock lock-service) (Thread/sleep 2000) (vswap! retry-counter inc) (throw @@ -228,6 +232,60 @@ [liquibase] (.hasChangeLogLock (lock-service liquibase))) +(defn- wait-until [done? ^long sleep-ms timeout-ms] + (let [deadline (+ (System/nanoTime) (* 1e6 timeout-ms))] + (loop [] + (if (done?) + :done + (if (>= (System/nanoTime) deadline) + :timed-out + (do (Thread/sleep sleep-ms) + (recur))))))) + +(defn- locked-instances + "Scan through a global set of potentially locking Liquibase objects, to retrieve the corresponding Lock Service + instances, and filter by their `hasChangeLock` flag. Returns the list of locking instances in the current process." + [] + (filter holding-lock? @potentially-locked-instances)) + +(defn wait-for-all-locks + "Wait up to a maximum of `timeout-seconds` for the given Liquibase instance to release the migration lock." + [sleep-ms timeout-ms] + (log/warn (locked-instances)) + (let [done? #(empty? (locked-instances))] + (if (done?) + :none + (do (log/infof "Waiting for migration lock(s) to be released (max %.1f secs)" (/ timeout-ms 1000)) + (wait-until done? sleep-ms timeout-ms))))) + +(defn- liquibase->url [^Liquibase liquibase] + ;; Need to this cast to get access to the metadata. We currently only use JDBC app databases. + (let [conn ^JdbcConnection (.. liquibase getDatabase getConnection)] + (-> conn .getMetaData .getURL))) + +(defn release-concurrent-locks! + "Release any locks held by this process corresponding to the same database." + [conn-or-data-source] + ;; Check whether there are Liquibase locks held by the current process - we don't want to release the database locks + ;; if they are held by another server, for example if this host is part of an "old" fleet shutting down while new + ;; servers starting up, of which one is performing the database upgrade to later Metabase version. + ;; + (when-let [instances (not-empty (locked-instances))] + ;; We cannot use the existing instances to clear the locks, as their connections are blocking on their current + ;; long-running transaction. Since we cannot "clone" a connection (they have "forgotten" their password), so we + ;; will create a new Liquibase instance using a fresh database connection. + (with-liquibase [liquibase conn-or-data-source] + ;; We rely on the way that Liquibase normalizes the connection URL to check whether the blocking and fresh + ;; Liquibase instances are pointing to the same database. + (let [url (liquibase->url liquibase)] + (doseq [instance instances] + (when (= url (liquibase->url instance)) + ;; We assume that the lock is being held for the purpose of migrations, since the other cases where we take + ;; locks are very fast, and in practice this method is only called after we have waited for a while to see + ;; if the lock was released on its own. + (log/warn "Releasing liquibase lock before migrations finished") + (release-lock-if-needed! liquibase))))))) + (def ^:private ^:dynamic *lock-depth* 0) (defn- assert-locked [liquibase] @@ -235,7 +293,7 @@ (throw (ex-info "This operation requires a hold on the liquibase migration lock." {:lock-exists? (migration-lock-exists? liquibase) ;; It's possible that the lock was accidentally released by an operation, or force released by - ;; another process, so its useful for debugging to know whether we were still within a locked + ;; another process, so it's useful for debugging to know whether we were still within a locked ;; scope. :lock-depth *lock-depth*})))) @@ -253,18 +311,23 @@ (when-not config/is-prod? (throw (LockException. "Attempted to take a Liquibase lock, but we already are holding it.")))) (let [database (.getDatabase liquibase) + lock-service (lock-service liquibase) scope-objects {(.name Scope$Attr/database) database (.name Scope$Attr/resourceAccessor) (.getResourceAccessor liquibase)}] (Scope/child ^Map scope-objects (reify Scope$ScopedRunner (run [_] - (wait-for-migration-lock liquibase) + (swap! potentially-locked-instances conj liquibase) + (wait-for-migration-lock lock-service) (try (binding [*lock-depth* (inc *lock-depth*)] (f)) (finally (when (zero? *lock-depth*) - (.releaseLock (lock-service liquibase)))))))))) + (.releaseLock lock-service) + ;; There is theoretically a chance that another thread will open a new locked scope between + ;; these two statements, but in practice we do not expect concurrent usage within a process. + (swap! potentially-locked-instances disj liquibase))))))))) (defmacro with-scope-locked "Run `body` in a scope on the Liquibase instance `liquibase`. diff --git a/src/metabase/db/setup.clj b/src/metabase/db/setup.clj index 82d73299fd3e97fabf79703bdeda762921d7529f..ee326caaf0c28da621349ff2c564972e1dadcbee 100644 --- a/src/metabase/db/setup.clj +++ b/src/metabase/db/setup.clj @@ -167,6 +167,20 @@ (run-schema-migrations! data-source auto-migrate?)))) :done) +(defn release-migration-locks! + "Wait up to `timeout-seconds` for the current process to release all migration locks, otherwise force release them." + [data-source timeout-seconds] + (let [sleep-ms 100 + timeout-ms (* 1000 timeout-seconds)] + (case (liquibase/wait-for-all-locks sleep-ms timeout-ms) + :none nil + :done (log/info "Migration lock(s) have been released") + :timed-out (do (log/warn "Releasing liquibase locks on shutdown") + ;; There's an infinitesimal chance that we released the lock and another server took it between + ;; the timeout, and the mutations we now make to these lock tables - but we can't detect that. + (liquibase/release-concurrent-locks! data-source)))) + :done) + ;;;; Toucan Setup. ;;; Done at namespace load time these days. diff --git a/test/metabase/db/liquibase_test.clj b/test/metabase/db/liquibase_test.clj index f760b11dbcca30c20d4745d4ffc95718789a3e93..aef9fbc35cc58f2c183f759682458f9a3552a152 100644 --- a/test/metabase/db/liquibase_test.clj +++ b/test/metabase/db/liquibase_test.clj @@ -81,3 +81,46 @@ (set/union (set (liquibase-file->included-ids "migrations/000_legacy_migrations.yaml" driver/*driver*)) (set (liquibase-file->included-ids "migrations/001_update_migrations.yaml" driver/*driver*))))))))) + +(deftest wait-for-all-locks-test + (mt/test-drivers #{:h2 :mysql :postgres} + (mt/with-temp-empty-app-db [conn driver/*driver*] + ;; We don't need a long time for tests, keep it zippy. + (let [sleep-ms 5 + timeout-ms 10] + (liquibase/with-liquibase [liquibase conn] + (testing "Will not wait if no locks are taken" + (is (= :none (liquibase/wait-for-all-locks sleep-ms timeout-ms)))) + (testing "Will timeout if a lock is not released" + (liquibase/with-scope-locked liquibase + (is (= :timed-out (liquibase/wait-for-all-locks sleep-ms timeout-ms))))) + (testing "Will return successfully if the lock is released while we are waiting" + (let [migrate-ms 100 + timeout-ms 200 + locked (promise)] + (future + (liquibase/with-scope-locked liquibase + (deliver locked true) + (Thread/sleep migrate-ms))) + @locked + (is (= :done (liquibase/wait-for-all-locks sleep-ms timeout-ms)))))))))) + +(deftest release-all-locks-if-needed!-test + (mt/test-drivers #{:h2 :mysql :postgres} + (mt/with-temp-empty-app-db [conn driver/*driver*] + (liquibase/with-liquibase [liquibase conn] + (testing "When we release the locks from outside the migration...\n" + (let [locked (promise) + released (promise) + locked? (promise)] + (future + (liquibase/with-scope-locked liquibase + (is (liquibase/holding-lock? liquibase)) + (deliver locked true) + @released + (deliver locked? (liquibase/holding-lock? liquibase)))) + @locked + (liquibase/release-concurrent-locks! conn) + (deliver released true) + (testing "The lock was released before the migration finished" + (is (not @locked?)))))))))