From 1f95447352a091087fbcaa61c5745d2734854d31 Mon Sep 17 00:00:00 2001 From: Chris Truter <crisptrutski@users.noreply.github.com> Date: Fri, 27 Sep 2024 07:15:18 +0200 Subject: [PATCH] Periodically recreate the search index (#48137) --- .clj-kondo/config.edn | 5 ++++- src/metabase/api/search.clj | 6 +++++- src/metabase/search/config.clj | 6 ++++-- src/metabase/search/impl.clj | 23 ++++++++++++-------- src/metabase/task.clj | 7 ++++++ src/metabase/task/search_index.clj | 34 +++++++++++++++++++++--------- test/metabase/api/search_test.clj | 32 ++++++++++++++++++++++++++++ test/metabase/search/impl_test.clj | 16 ++++++++++++++ test/metabase/task_test.clj | 7 ++++++ 9 files changed, 113 insertions(+), 23 deletions(-) diff --git a/.clj-kondo/config.edn b/.clj-kondo/config.edn index 401cecc5495..2a3a25c052a 100644 --- a/.clj-kondo/config.edn +++ b/.clj-kondo/config.edn @@ -373,7 +373,10 @@ metabase.sync.util} ; TODO -- consolidate these into a real API namespace. metabase.task #{metabase.task metabase.task.index-values - metabase.task.persist-refresh} ; TODO -- consolidate these into a real API namespace. + metabase.task.persist-refresh + ;; We need to expose this to the metabase.search module. + ;; Ideally, it would just live inside that module. + metabase.task.search-index} ; TODO -- consolidate these into a real API namespace. metabase.troubleshooting #{metabase.troubleshooting} metabase.types #{metabase.types} metabase.upload #{metabase.upload} diff --git a/src/metabase/api/search.clj b/src/metabase/api/search.clj index 0d906420047..0e9a06e5eab 100644 --- a/src/metabase/api/search.clj +++ b/src/metabase/api/search.clj @@ -6,6 +6,8 @@ [metabase.public-settings :as public-settings] [metabase.search :as search] [metabase.server.middleware.offset-paging :as mw.offset-paging] + [metabase.task :as task] + [metabase.task.search-index :as task.search-index] [metabase.util :as u] [metabase.util.malli.schema :as ms] [ring.util.response :as response])) @@ -47,7 +49,9 @@ (search/supports-index?) (do - (search/reindex!) + (if (task/job-exists? task.search-index/job-key) + (task/trigger-now! task.search-index/job-key) + (search/reindex!)) {:status-code 200}) :else diff --git a/src/metabase/search/config.clj b/src/metabase/search/config.clj index 503fa9b804d..72bfff03bae 100644 --- a/src/metabase/search/config.clj +++ b/src/metabase/search/config.clj @@ -101,6 +101,8 @@ (def SearchContext "Map with the various allowed search parameters, used to construct the SQL query." [:map {:closed true} + ;; display related + [:calculate-available-models? {:optional true} :boolean] ;; ;; required ;; @@ -111,8 +113,8 @@ [:model-ancestors? :boolean] [:models [:set SearchableModel]] ;; TODO this is optional only for tests, clean those up! - [:search-engine {:optional true} keyword?] - [:search-string [:maybe ms/NonBlankString]] + [:search-engine {:optional true} keyword?] + [:search-string {:optional true} [:maybe ms/NonBlankString]] ;; ;; optional ;; diff --git a/src/metabase/search/impl.clj b/src/metabase/search/impl.clj index 0397e1e45f0..27375e237ed 100644 --- a/src/metabase/search/impl.clj +++ b/src/metabase/search/impl.clj @@ -199,22 +199,27 @@ (defmulti supported-engine? "Does this instance support the given engine?" keyword) -(defmethod supported-engine? :in-place [_] true) -(defmethod supported-engine? :fulltext [_] (search.fulltext/supported-db? (mdb/db-type))) +(defmethod supported-engine? :search.engine/in-place [_] true) +(defmethod supported-engine? :search.engine/fulltext [_] (search.fulltext/supported-db? (mdb/db-type))) (def ^:private default-engine :search.engine/in-place) +(defn- known-engine? [engine] + (let [registered? #(contains? (methods supported-engine?) %)] + (some registered? (cons engine (ancestors engine))))) + (defn- parse-engine [value] (or (when-not (str/blank? value) - (cond - (not (contains? (methods supported-engine?) value)) - (log/warnf "Search-engine is unknown: %s" value) + (let [engine (keyword "search.engine" value)] + (cond + (not (known-engine? engine)) + (log/warnf "Search-engine is unknown: %s" value) - (not (supported-engine? value)) - (log/warnf "Search-engine is not supported: %s" value) + (not (supported-engine? engine)) + (log/warnf "Search-engine is not supported: %s" value) - :else - (keyword "search.engine" value))) + :else + engine))) default-engine)) ;; This forwarding is here for tests, we should clean those up. diff --git a/src/metabase/task.clj b/src/metabase/task.clj index 26db485d7cf..4c70f28c2ff 100644 --- a/src/metabase/task.clj +++ b/src/metabase/task.clj @@ -308,6 +308,13 @@ (instance? JobKey x) x (string? x) (JobKey. ^String x))) +(defn job-exists? + "Check whether there is a Job with the given key." + [job-key] + (boolean + (when-let [s (scheduler)] + (qs/get-job s (->job-key job-key))))) + (defn job-info "Get info about a specific Job (`job-key` can be either a String or `JobKey`). diff --git a/src/metabase/task/search_index.clj b/src/metabase/task/search_index.clj index 3a66099a57b..7fc5d52b99e 100644 --- a/src/metabase/task/search_index.clj +++ b/src/metabase/task/search_index.clj @@ -1,8 +1,8 @@ (ns metabase.task.search-index (:require [clojurewerkz.quartzite.jobs :as jobs] + [clojurewerkz.quartzite.schedule.simple :as simple] [clojurewerkz.quartzite.triggers :as triggers] - [metabase.public-settings :as public-settings] [metabase.search :as search] [metabase.task :as task] [metabase.util.log :as log]) @@ -11,22 +11,36 @@ (set! *warn-on-reflection* true) -;; We need each instance to initialize on start-up currently, this will need to be refined. -(jobs/defjob ^{DisallowConcurrentExecution false +;; This is problematic multi-instance deployments, see below. +(def ^:private recreated? (atom false)) + +(def job-key + "Key used to define and trigger the search-index task." + (jobs/key "metabase.task.search-index.job")) + +(jobs/defjob ^{DisallowConcurrentExecution true :doc "Populate Search Index"} SearchIndexing [_ctx] - (when (public-settings/experimental-fulltext-search-enabled) - (log/info "Recreating search index from latest schema") - (search/init-index! {:force-reset? true}) - (log/info "Populating search index") - (search/reindex!) + (when (search/supports-index?) + (if (not @recreated?) + (do (log/info "Recreating search index from the latest schema") + ;; Each instance in a multi-instance deployment will recreate the table the first time it is selected to run + ;; the job, resulting in a momentary lack of search results. + ;; One solution to this would be to store metadata about the index in another table, which we can use to + ;; determine whether it was built by another version of Metabase and should be rebuilt. + (search/init-index! {:force-reset? (not @recreated?)}) + (reset! recreated? true)) + (do (log/info "Reindexing searchable entities") + (search/reindex!))) (log/info "Done indexing."))) (defmethod task/init! ::SearchIndex [_] (let [job (jobs/build (jobs/of-type SearchIndexing) - (jobs/with-identity (jobs/key "metabase.task.search-index.job"))) + (jobs/with-identity job-key)) trigger (triggers/build (triggers/with-identity (triggers/key "metabase.task.search-index.trigger")) - (triggers/start-now))] + (triggers/start-now) + (triggers/with-schedule + (simple/schedule (simple/with-interval-in-hours 1))))] (task/schedule-task! job trigger))) diff --git a/test/metabase/api/search_test.clj b/test/metabase/api/search_test.clj index 16b71cb4a6b..e73fbf36c27 100644 --- a/test/metabase/api/search_test.clj +++ b/test/metabase/api/search_test.clj @@ -20,13 +20,21 @@ [metabase.models.permissions-group :as perms-group] [metabase.models.revision :as revision] [metabase.public-settings.premium-features :as premium-features] + [metabase.search :as search] [metabase.search.config :as search.config] + [metabase.search.fulltext :as search.fulltext] [metabase.search.scoring :as scoring] [metabase.test :as mt] [metabase.util :as u] [toucan2.core :as t2] [toucan2.tools.with-temp :as t2.with-temp])) +(comment + ;; We need this to ensure the engine hierarchy is registered + search.fulltext/keep-me) + +(set! *warn-on-reflection* true) + (def ^:private default-collection {:id false :name nil :authority_level nil :type nil}) (def ^:private default-search-row @@ -300,6 +308,14 @@ (is (= 2 (:limit (search-request :crowberto :q "test" :limit "2" :offset "3")))) (is (= 3 (:offset (search-request :crowberto :q "test" :limit "2" :offset "3"))))))) +(deftest custom-engine-test + (when (search/supports-index?) + (testing "It can use an alternate search engine" + (with-search-items-in-root-collection "test" + (let [resp (search-request :crowberto :q "test" :search_engine "fulltext")] + ;; The index is not populated here, so there's not much interesting to assert. + (is (= "search.engine/fulltext" (:engine resp)))))))) + (deftest archived-models-test (testing "It returns some stuff when you get results" (with-search-items-in-root-collection "test" @@ -1558,3 +1574,19 @@ :name "top level col" :type "foo"}]} (:collection leaf-card-response))))))))) + +(deftest force-reindex-test + (when (search/supports-index?) + (mt/with-temp [Card {id :id} {:name "It boggles the mind!"}] + (let [search-results #(:data (mt/user-http-request :rasta :get 200 "search" :q "boggle" :search_engine "fulltext"))] + (try + (t2/delete! :search_index) + (catch Exception _)) + (is (empty? (search-results))) + (mt/user-http-request :crowberto :post 200 "search/force-reindex") + (is (loop [attempts-left 5] + (if (some (comp #{id} :id) (search-results)) + ::success + (when (pos? attempts-left) + (Thread/sleep 200) + (recur (dec attempts-left)))))))))) diff --git a/test/metabase/search/impl_test.clj b/test/metabase/search/impl_test.clj index c577e3ae0d2..301a851ee54 100644 --- a/test/metabase/search/impl_test.clj +++ b/test/metabase/search/impl_test.clj @@ -8,6 +8,7 @@ [java-time.api :as t] [metabase.api.common :as api] [metabase.config :as config] + [metabase.search :as search] [metabase.search.config :as search.config] [metabase.search.impl :as search.impl] [metabase.search.legacy :as search.legacy] @@ -15,6 +16,21 @@ [toucan2.core :as t2] [toucan2.tools.with-temp :as t2.with-temp])) +(deftest ^:parallel parse-engine-test + (testing "Default engine" + (is (= :search.engine/in-place (#'search.impl/parse-engine nil)))) + (testing "Unknown engine resolves to the default" + (is (= (#'search.impl/parse-engine nil) + (#'search.impl/parse-engine "vespa")))) + (testing "Registered engines" + (is (= :search.engine/in-place (#'search.impl/parse-engine "in-place"))) + (when (search/supports-index?) + (is (= :search.engine/fulltext (#'search.impl/parse-engine "fulltext"))))) + (when (search/supports-index?) + (testing "Subclasses" + (is (= :search.engine/hybrid (#'search.impl/parse-engine "hybrid"))) + (is (= :search.engine/minimal (#'search.impl/parse-engine "minimal")))))) + (deftest ^:parallel order-clause-test (testing "it includes all columns and normalizes the query" (is (= [[:case diff --git a/test/metabase/task_test.clj b/test/metabase/task_test.clj index 8cb13fb5554..917f3c5595e 100644 --- a/test/metabase/task_test.clj +++ b/test/metabase/task_test.clj @@ -63,6 +63,13 @@ {:cron-expression (.getCronExpression trigger) :misfire-instruction (.getMisfireInstruction trigger)}))) +(deftest job-exists?-test + (with-temp-scheduler-and-cleanup! + (is (false? (task/job-exists? (.getKey (job))))) + (task/schedule-task! (job) (trigger-1)) + (is (true? (task/job-exists? (.getKey (job))))) + (is (false? (task/job-exists? "not-found"))))) + (deftest schedule-job-test (testing "can we schedule a job?" (with-temp-scheduler-and-cleanup! -- GitLab