From 2f6f878b43f7c47efbfcf1a9de978d8a84b7d777 Mon Sep 17 00:00:00 2001
From: github-automation-metabase
 <166700802+github-automation-metabase@users.noreply.github.com>
Date: Fri, 20 Dec 2024 17:38:00 -0500
Subject: [PATCH] Batch query_execution, audit_log and view_log truncations
 (#51490) (#51602)

Co-authored-by: Noah Moss <32746338+noahmoss@users.noreply.github.com>
---
 src/metabase/task/truncate_audit_tables.clj   | 60 ++++++++++++++-----
 .../task/truncate_audit_tables_test.clj       | 21 ++++++-
 2 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/src/metabase/task/truncate_audit_tables.clj b/src/metabase/task/truncate_audit_tables.clj
index f0babdf3eb0..8c0e928bb9f 100644
--- a/src/metabase/task/truncate_audit_tables.clj
+++ b/src/metabase/task/truncate_audit_tables.clj
@@ -6,6 +6,7 @@
    [clojurewerkz.quartzite.triggers :as triggers]
    [java-time.api :as t]
    [metabase.config :as config]
+   [metabase.db :as mdb]
    [metabase.models.setting :as setting :refer [defsetting]]
    [metabase.models.task-history :as task-history]
    [metabase.plugins.classloader :as classloader]
@@ -45,7 +46,6 @@
   :audit      :never
   :getter     (fn []
                 (let [env-var-value (setting/get-value-of-type :integer :audit-max-retention-days)]
-                  (def env-var-value env-var-value)
                   (cond
                     (nil? env-var-value)
                     default-retention-days
@@ -70,23 +70,54 @@
 Twice a day, Metabase will delete rows older than this threshold. The minimum value is 30 days (Metabase will treat entered values of 1 to 29 the same as 30).
 If set to 0, Metabase will keep all rows.")
 
+(defsetting audit-table-truncation-batch-size
+  (deferred-tru "Batch size to use for deletion of old rows for audit-related tables (like query_execution). Can be only set as an environment variable.")
+  :visibility :internal
+  :setter     :none
+  :type       :integer
+  :default    50000
+  :audit      :never
+  :export?    false)
+
+(defn- truncate-table-batched!
+  [table-name time-column]
+  (t2/query-one
+   (case (mdb/db-type)
+     (:postgres :h2)
+     {:delete-from (keyword table-name)
+      :where [:in
+              :id
+              {:select [:id]
+               :from (keyword table-name)
+               :where [:<=
+                       (keyword time-column)
+                       (t/minus (t/offset-date-time) (t/days (audit-max-retention-days)))]
+               :limit (audit-table-truncation-batch-size)}]}
+
+     (:mysql :mariadb)
+     {:delete-from (keyword table-name)
+      :where [:<=
+              (keyword time-column)
+              (t/minus (t/offset-date-time) (t/days (audit-max-retention-days)))]
+      :limit (audit-table-truncation-batch-size)})))
+
 (defn- truncate-table!
   "Given a model, deletes all rows older than the configured threshold"
   [model time-column]
   (when-not (infinite? (audit-max-retention-days))
     (let [table-name (name (t2/table-name model))]
-      (task-history/with-task-history {:task "task-history-cleanup"}
-        (try
-          (log/infof "Cleaning up %s table" table-name)
-          (let [rows-deleted (t2/delete!
-                              model
-                              time-column
-                              [:<= (t/minus (t/offset-date-time) (t/days (audit-max-retention-days)))])]
-            (if (> rows-deleted 0)
-              (log/infof "%s cleanup successful, %d rows were deleted" table-name rows-deleted)
-              (log/infof "%s cleanup successful, no rows were deleted" table-name)))
-          (catch Throwable e
-            (log/errorf e "%s cleanup failed" table-name)))))))
+      (try
+        (log/infof "Cleaning up %s table" table-name)
+        (loop [total-rows-deleted 0]
+          (let [batch-rows-deleted (truncate-table-batched! table-name time-column)]
+            ;; Only try to delete another batch if the last batch was full
+            (if (= batch-rows-deleted (audit-table-truncation-batch-size))
+              (recur (+ total-rows-deleted (long batch-rows-deleted)))
+              (if (not= total-rows-deleted 0)
+                (log/infof "%s cleanup successful, %d rows were deleted" table-name total-rows-deleted)
+                (log/infof "%s cleanup successful, no rows were deleted" table-name)))))
+        (catch Throwable e
+          (log/errorf e "%s cleanup failed" table-name))))))
 
 (defenterprise audit-models-to-truncate
   "List of models to truncate. OSS implementation only truncates `query_execution` table."
@@ -98,7 +129,8 @@ If set to 0, Metabase will keep all rows.")
   []
   (run!
    (fn [{:keys [model timestamp-col]}]
-     (truncate-table! model timestamp-col))
+     (task-history/with-task-history {:task "task-history-cleanup"}
+       (truncate-table! model timestamp-col)))
    (audit-models-to-truncate)))
 
 (jobs/defjob ^{:doc "Triggers the removal of `query_execution` rows older than the configured threshold."} TruncateAuditTables [_]
diff --git a/test/metabase/task/truncate_audit_tables_test.clj b/test/metabase/task/truncate_audit_tables_test.clj
index ddba88fabb4..858ec63a73e 100644
--- a/test/metabase/task/truncate_audit_tables_test.clj
+++ b/test/metabase/task/truncate_audit_tables_test.clj
@@ -8,8 +8,7 @@
    [metabase.task.truncate-audit-tables :as task.truncate-audit-tables]
    [metabase.test :as mt]
    [metabase.test.fixtures :as fixtures]
-   [toucan2.core :as t2]
-   [toucan2.tools.with-temp :as t2.with-temp]))
+   [toucan2.core :as t2]))
 
 (use-fixtures :once (fixtures/initialize :db))
 
@@ -45,7 +44,7 @@
 (deftest truncate-table-test
   (testing "truncate-table accurately truncates a table according to the value of the audit-max-retention-days env var"
     (mt/with-premium-features #{}
-      (t2.with-temp/with-temp
+      (mt/with-temp
         [:model/QueryExecution {qe1-id :id} (merge (query-execution-defaults)
                                                    {:started_at (t/offset-date-time)})
          ;; 31 days ago
@@ -79,3 +78,19 @@
                 (#'task.truncate-audit-tables/truncate-audit-tables!)
                 (is (= #{qe1-id}
                        (t2/select-fn-set :id :model/QueryExecution {:where [:in :id [qe1-id qe2-id qe3-id]]})))))))))))
+
+(deftest batched-deletion-test
+  (testing "Deletion is batched into multiple queries"
+    (mt/with-temporary-setting-values [audit-table-truncation-batch-size 2]
+      (mt/with-temp
+        [:model/QueryExecution {qe1-id :id} (merge (query-execution-defaults)
+                                                   {:started_at (t/minus (t/offset-date-time) (t/years 4))})
+         :model/QueryExecution {qe2-id :id} (merge (query-execution-defaults)
+                                                   {:started_at (t/minus (t/offset-date-time) (t/years 4))})
+         :model/QueryExecution {qe3-id :id} (merge (query-execution-defaults)
+                                                   {:started_at (t/minus (t/offset-date-time) (t/years 4))})]
+        (t2/with-call-count [call-count]
+          (#'task.truncate-audit-tables/truncate-table! :model/QueryExecution :started_at)
+          ;; Should make deletion queries in two batches (2 rows, then 1 row)
+          (is (= 2 (call-count)))
+          (is (nil? (t2/select-fn-set :id :model/QueryExecution {:where [:in :id [qe1-id qe2-id qe3-id]]}))))))))
-- 
GitLab