From 1f95447352a091087fbcaa61c5745d2734854d31 Mon Sep 17 00:00:00 2001
From: Chris Truter <crisptrutski@users.noreply.github.com>
Date: Fri, 27 Sep 2024 07:15:18 +0200
Subject: [PATCH] Periodically recreate the search index (#48137)

---
 .clj-kondo/config.edn              |  5 ++++-
 src/metabase/api/search.clj        |  6 +++++-
 src/metabase/search/config.clj     |  6 ++++--
 src/metabase/search/impl.clj       | 23 ++++++++++++--------
 src/metabase/task.clj              |  7 ++++++
 src/metabase/task/search_index.clj | 34 +++++++++++++++++++++---------
 test/metabase/api/search_test.clj  | 32 ++++++++++++++++++++++++++++
 test/metabase/search/impl_test.clj | 16 ++++++++++++++
 test/metabase/task_test.clj        |  7 ++++++
 9 files changed, 113 insertions(+), 23 deletions(-)

diff --git a/.clj-kondo/config.edn b/.clj-kondo/config.edn
index 401cecc5495..2a3a25c052a 100644
--- a/.clj-kondo/config.edn
+++ b/.clj-kondo/config.edn
@@ -373,7 +373,10 @@
                                      metabase.sync.util}                                            ; TODO -- consolidate these into a real API namespace.
     metabase.task                  #{metabase.task
                                      metabase.task.index-values
-                                     metabase.task.persist-refresh}                                 ; TODO -- consolidate these into a real API namespace.
+                                     metabase.task.persist-refresh
+                                     ;; We need to expose this to the metabase.search module.
+                                     ;; Ideally, it would just live inside that module.
+                                     metabase.task.search-index}                                    ; TODO -- consolidate these into a real API namespace.
     metabase.troubleshooting       #{metabase.troubleshooting}
     metabase.types                 #{metabase.types}
     metabase.upload                #{metabase.upload}
diff --git a/src/metabase/api/search.clj b/src/metabase/api/search.clj
index 0d906420047..0e9a06e5eab 100644
--- a/src/metabase/api/search.clj
+++ b/src/metabase/api/search.clj
@@ -6,6 +6,8 @@
    [metabase.public-settings :as public-settings]
    [metabase.search :as search]
    [metabase.server.middleware.offset-paging :as mw.offset-paging]
+   [metabase.task :as task]
+   [metabase.task.search-index :as task.search-index]
    [metabase.util :as u]
    [metabase.util.malli.schema :as ms]
    [ring.util.response :as response]))
@@ -47,7 +49,9 @@
 
     (search/supports-index?)
     (do
-      (search/reindex!)
+      (if (task/job-exists? task.search-index/job-key)
+        (task/trigger-now! task.search-index/job-key)
+        (search/reindex!))
       {:status-code 200})
 
     :else
diff --git a/src/metabase/search/config.clj b/src/metabase/search/config.clj
index 503fa9b804d..72bfff03bae 100644
--- a/src/metabase/search/config.clj
+++ b/src/metabase/search/config.clj
@@ -101,6 +101,8 @@
 (def SearchContext
   "Map with the various allowed search parameters, used to construct the SQL query."
   [:map {:closed true}
+   ;; display related
+   [:calculate-available-models? {:optional true} :boolean]
    ;;
    ;; required
    ;;
@@ -111,8 +113,8 @@
    [:model-ancestors?   :boolean]
    [:models             [:set SearchableModel]]
    ;; TODO this is optional only for tests, clean those up!
-   [:search-engine       {:optional true} keyword?]
-   [:search-string      [:maybe ms/NonBlankString]]
+   [:search-engine      {:optional true} keyword?]
+   [:search-string      {:optional true} [:maybe ms/NonBlankString]]
    ;;
    ;; optional
    ;;
diff --git a/src/metabase/search/impl.clj b/src/metabase/search/impl.clj
index 0397e1e45f0..27375e237ed 100644
--- a/src/metabase/search/impl.clj
+++ b/src/metabase/search/impl.clj
@@ -199,22 +199,27 @@
 
 (defmulti supported-engine? "Does this instance support the given engine?" keyword)
 
-(defmethod supported-engine? :in-place [_] true)
-(defmethod supported-engine? :fulltext [_] (search.fulltext/supported-db? (mdb/db-type)))
+(defmethod supported-engine? :search.engine/in-place [_] true)
+(defmethod supported-engine? :search.engine/fulltext [_] (search.fulltext/supported-db? (mdb/db-type)))
 
 (def ^:private default-engine :search.engine/in-place)
 
+(defn- known-engine? [engine]
+  (let [registered? #(contains? (methods supported-engine?) %)]
+    (some registered? (cons engine (ancestors engine)))))
+
 (defn- parse-engine [value]
   (or (when-not (str/blank? value)
-        (cond
-          (not (contains? (methods supported-engine?) value))
-          (log/warnf "Search-engine is unknown: %s" value)
+        (let [engine (keyword "search.engine" value)]
+          (cond
+            (not (known-engine? engine))
+            (log/warnf "Search-engine is unknown: %s" value)
 
-          (not (supported-engine? value))
-          (log/warnf "Search-engine is not supported: %s" value)
+            (not (supported-engine? engine))
+            (log/warnf "Search-engine is not supported: %s" value)
 
-          :else
-          (keyword "search.engine" value)))
+            :else
+            engine)))
       default-engine))
 
 ;; This forwarding is here for tests, we should clean those up.
diff --git a/src/metabase/task.clj b/src/metabase/task.clj
index 26db485d7cf..4c70f28c2ff 100644
--- a/src/metabase/task.clj
+++ b/src/metabase/task.clj
@@ -308,6 +308,13 @@
     (instance? JobKey x) x
     (string? x)          (JobKey. ^String x)))
 
+(defn job-exists?
+  "Check whether there is a Job with the given key."
+  [job-key]
+  (boolean
+   (when-let [s (scheduler)]
+     (qs/get-job s (->job-key job-key)))))
+
 (defn job-info
   "Get info about a specific Job (`job-key` can be either a String or `JobKey`).
 
diff --git a/src/metabase/task/search_index.clj b/src/metabase/task/search_index.clj
index 3a66099a57b..7fc5d52b99e 100644
--- a/src/metabase/task/search_index.clj
+++ b/src/metabase/task/search_index.clj
@@ -1,8 +1,8 @@
 (ns metabase.task.search-index
   (:require
    [clojurewerkz.quartzite.jobs :as jobs]
+   [clojurewerkz.quartzite.schedule.simple :as simple]
    [clojurewerkz.quartzite.triggers :as triggers]
-   [metabase.public-settings :as public-settings]
    [metabase.search :as search]
    [metabase.task :as task]
    [metabase.util.log :as log])
@@ -11,22 +11,36 @@
 
 (set! *warn-on-reflection* true)
 
-;; We need each instance to initialize on start-up currently, this will need to be refined.
-(jobs/defjob ^{DisallowConcurrentExecution false
+;; This is problematic multi-instance deployments, see below.
+(def ^:private recreated? (atom false))
+
+(def job-key
+  "Key used to define and trigger the search-index task."
+  (jobs/key "metabase.task.search-index.job"))
+
+(jobs/defjob ^{DisallowConcurrentExecution true
                :doc                        "Populate Search Index"}
   SearchIndexing [_ctx]
-  (when (public-settings/experimental-fulltext-search-enabled)
-    (log/info "Recreating search index from latest schema")
-    (search/init-index! {:force-reset? true})
-    (log/info "Populating search index")
-    (search/reindex!)
+  (when (search/supports-index?)
+    (if (not @recreated?)
+      (do (log/info "Recreating search index from the latest schema")
+          ;; Each instance in a multi-instance deployment will recreate the table the first time it is selected to run
+          ;; the job, resulting in a momentary lack of search results.
+          ;; One solution to this would be to store metadata about the index in another table, which we can use to
+          ;; determine whether it was built by another version of Metabase and should be rebuilt.
+          (search/init-index! {:force-reset? (not @recreated?)})
+          (reset! recreated? true))
+      (do (log/info "Reindexing searchable entities")
+          (search/reindex!)))
     (log/info "Done indexing.")))
 
 (defmethod task/init! ::SearchIndex [_]
   (let [job     (jobs/build
                  (jobs/of-type SearchIndexing)
-                 (jobs/with-identity (jobs/key "metabase.task.search-index.job")))
+                 (jobs/with-identity job-key))
         trigger (triggers/build
                  (triggers/with-identity (triggers/key "metabase.task.search-index.trigger"))
-                 (triggers/start-now))]
+                 (triggers/start-now)
+                 (triggers/with-schedule
+                  (simple/schedule (simple/with-interval-in-hours 1))))]
     (task/schedule-task! job trigger)))
diff --git a/test/metabase/api/search_test.clj b/test/metabase/api/search_test.clj
index 16b71cb4a6b..e73fbf36c27 100644
--- a/test/metabase/api/search_test.clj
+++ b/test/metabase/api/search_test.clj
@@ -20,13 +20,21 @@
    [metabase.models.permissions-group :as perms-group]
    [metabase.models.revision :as revision]
    [metabase.public-settings.premium-features :as premium-features]
+   [metabase.search :as search]
    [metabase.search.config :as search.config]
+   [metabase.search.fulltext :as search.fulltext]
    [metabase.search.scoring :as scoring]
    [metabase.test :as mt]
    [metabase.util :as u]
    [toucan2.core :as t2]
    [toucan2.tools.with-temp :as t2.with-temp]))
 
+(comment
+  ;; We need this to ensure the engine hierarchy is registered
+  search.fulltext/keep-me)
+
+(set! *warn-on-reflection* true)
+
 (def ^:private default-collection {:id false :name nil :authority_level nil :type nil})
 
 (def ^:private default-search-row
@@ -300,6 +308,14 @@
       (is (= 2 (:limit (search-request :crowberto :q "test" :limit "2" :offset "3"))))
       (is (= 3 (:offset (search-request :crowberto :q "test" :limit "2" :offset "3")))))))
 
+(deftest custom-engine-test
+  (when (search/supports-index?)
+    (testing "It can use an alternate search engine"
+      (with-search-items-in-root-collection "test"
+        (let [resp (search-request :crowberto :q "test" :search_engine "fulltext")]
+          ;; The index is not populated here, so there's not much interesting to assert.
+          (is (= "search.engine/fulltext" (:engine resp))))))))
+
 (deftest archived-models-test
   (testing "It returns some stuff when you get results"
     (with-search-items-in-root-collection "test"
@@ -1558,3 +1574,19 @@
                                            :name "top level col"
                                            :type "foo"}]}
                    (:collection leaf-card-response)))))))))
+
+(deftest force-reindex-test
+  (when (search/supports-index?)
+    (mt/with-temp [Card {id :id} {:name "It boggles the mind!"}]
+      (let [search-results #(:data (mt/user-http-request :rasta :get 200 "search" :q "boggle" :search_engine "fulltext"))]
+        (try
+          (t2/delete! :search_index)
+          (catch Exception _))
+        (is (empty? (search-results)))
+        (mt/user-http-request :crowberto :post 200 "search/force-reindex")
+        (is (loop [attempts-left 5]
+              (if (some (comp #{id} :id) (search-results))
+                ::success
+                (when (pos? attempts-left)
+                  (Thread/sleep 200)
+                  (recur (dec attempts-left))))))))))
diff --git a/test/metabase/search/impl_test.clj b/test/metabase/search/impl_test.clj
index c577e3ae0d2..301a851ee54 100644
--- a/test/metabase/search/impl_test.clj
+++ b/test/metabase/search/impl_test.clj
@@ -8,6 +8,7 @@
    [java-time.api :as t]
    [metabase.api.common :as api]
    [metabase.config :as config]
+   [metabase.search :as search]
    [metabase.search.config :as search.config]
    [metabase.search.impl :as search.impl]
    [metabase.search.legacy :as search.legacy]
@@ -15,6 +16,21 @@
    [toucan2.core :as t2]
    [toucan2.tools.with-temp :as t2.with-temp]))
 
+(deftest ^:parallel parse-engine-test
+  (testing "Default engine"
+    (is (= :search.engine/in-place (#'search.impl/parse-engine nil))))
+  (testing "Unknown engine resolves to the default"
+    (is (=  (#'search.impl/parse-engine nil)
+            (#'search.impl/parse-engine "vespa"))))
+  (testing "Registered engines"
+    (is (= :search.engine/in-place (#'search.impl/parse-engine "in-place")))
+    (when (search/supports-index?)
+      (is (= :search.engine/fulltext (#'search.impl/parse-engine "fulltext")))))
+  (when (search/supports-index?)
+    (testing "Subclasses"
+      (is (= :search.engine/hybrid (#'search.impl/parse-engine "hybrid")))
+      (is (= :search.engine/minimal (#'search.impl/parse-engine "minimal"))))))
+
 (deftest ^:parallel order-clause-test
   (testing "it includes all columns and normalizes the query"
     (is (= [[:case
diff --git a/test/metabase/task_test.clj b/test/metabase/task_test.clj
index 8cb13fb5554..917f3c5595e 100644
--- a/test/metabase/task_test.clj
+++ b/test/metabase/task_test.clj
@@ -63,6 +63,13 @@
      {:cron-expression     (.getCronExpression trigger)
       :misfire-instruction (.getMisfireInstruction trigger)})))
 
+(deftest job-exists?-test
+  (with-temp-scheduler-and-cleanup!
+    (is (false? (task/job-exists? (.getKey (job)))))
+    (task/schedule-task! (job) (trigger-1))
+    (is (true? (task/job-exists? (.getKey (job)))))
+    (is (false? (task/job-exists? "not-found")))))
+
 (deftest schedule-job-test
   (testing "can we schedule a job?"
     (with-temp-scheduler-and-cleanup!
-- 
GitLab