From 775d9c941d938d4aa78f54f16c88da4806f89937 Mon Sep 17 00:00:00 2001 From: Chris Truter <crisptrutski@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:56:15 +0200 Subject: [PATCH] Some polish around experiment search (#47986) --- dev/src/dev/search.clj | 33 +++++++++++---------- src/metabase/search.clj | 20 ++++++------- src/metabase/search/postgres/core.clj | 23 ++++++++------ src/metabase/search/postgres/index.clj | 12 ++++---- test/metabase/search/postgres/core_test.clj | 19 +++++++----- 5 files changed, 60 insertions(+), 47 deletions(-) diff --git a/dev/src/dev/search.clj b/dev/src/dev/search.clj index 7472bfe5d67..58dc51aa29d 100644 --- a/dev/src/dev/search.clj +++ b/dev/src/dev/search.clj @@ -25,37 +25,38 @@ (t2/count :search_index) ;; doesn't work, need to drop to lower level postgres functions - (basic-view (search.postgres/hybrid "satis:*")) + (basic-view (#'search.postgres/hybrid "satis:*")) ;; nope, neither get it as the lexeme is not similar enough - (basic-view (search.postgres/hybrid "satisfactory")) + (basic-view (#'search.postgres/hybrid "satisfactory")) (basic-view (legacy-results "satisfactory")) (defn- mini-bench [n engine search-term & args] #_{:clj-kondo/ignore [:discouraged-var]} - (time - (dotimes [_ n] - (vec - (apply - (case engine - :index-only search.index/search - :legacy legacy-results - :hybrid search.postgres/hybrid - :hybrid-multi search.postgres/hybrid-multi) - search-term - args))))) + (let [f (case engine + :index-only search.index/search + :legacy legacy-results + :hybrid @#'search.postgres/hybrid + :hybrid-multi @#'search.postgres/hybrid-multi + :minimal @#'search.postgres/minimal)] + (time + (dotimes [_ n] + (doall (apply f search-term args)))))) (mini-bench 500 :legacy nil) (mini-bench 500 :legacy "sample") ;; 30x speed-up for test-data on my machine (mini-bench 500 :index-only "sample") + ;; No noticeaable degradation, without permissions and filters + (mini-bench 500 :minimal "sample") ;; but joining to the "hydrated query" reverses the advantage (mini-bench 100 :legacy nil) (mini-bench 100 :legacy "sample") ;; slower than fetching everything... (mini-bench 100 :hybrid "sample") - ;; doing both filters... still a little bit more overhead with the join + ;; using index + LIKE on the join ... still a little bit more overhead (mini-bench 100 :hybrid "sample" {:search-string "sample"}) - ;; oh! this monstrocity is actually 2x faaster than baseline B-) - (mini-bench 100 :hybrid-multi "sample")) + ;; oh! this monstrocity is actually 2x faster than baseline B-) + (mini-bench 100 :hybrid-multi "sample") + (mini-bench 100 :minimal "sample")) diff --git a/src/metabase/search.clj b/src/metabase/search.clj index 077040c259b..52940b1c480 100644 --- a/src/metabase/search.clj +++ b/src/metabase/search.clj @@ -27,17 +27,17 @@ [] (= :postgres (metabase.db/db-type))) +(def ^:private default-engine :in-place) + (defn- query-fn [search-engine] - (case search-engine - :fulltext (if (is-postgres?) - search.postgres/search - (do (log/warn ":fulltext search not supported for your AppDb, using :in-place") - search.impl/in-place)) - :minimal (if (is-postgres?) - search.postgres/search-minimal - (do (log/warn ":minimal search not supported for your AppDb, using :in-place") - search.impl/in-place)) - :in-place search.impl/in-place)) + (or + (case search-engine + :fulltext (when (is-postgres?) search.postgres/search) + :minimal (when (is-postgres?) search.postgres/search) + :in-place search.impl/in-place) + + (log/warnf "%s search not supported for your AppDb, using %s" search-engine default-engine) + default-engine)) (defn supports-index? "Does this instance support a search index, e.g. has the right kind of AppDb" diff --git a/src/metabase/search/postgres/core.clj b/src/metabase/search/postgres/core.clj index de50c0ebc10..2ab4c0962e6 100644 --- a/src/metabase/search/postgres/core.clj +++ b/src/metabase/search/postgres/core.clj @@ -40,7 +40,7 @@ :archived? archived? :model-ancestors? true}))) -(defn hybrid +(defn- hybrid "Use the index for appling the search string, but rely on the legacy code path for rendering the display data, applying permissions, additional filtering, etc. @@ -60,7 +60,7 @@ (sql/format {:quoted true}) t2/reducible-query)) -(defn hybrid-multi +(defn- hybrid-multi "Perform multiple legacy searches to see if its faster. Perverse!" [search-term & {:as search-ctx}] (when-not @#'search.index/initialized? @@ -88,17 +88,22 @@ (map :legacy_input) (map #(json/parse-string % keyword)))) -(defn search-minimal - "Perform a basic search that only uses the index" - [search-ctx] - (minimal (:search-string search-ctx) - (dissoc search-ctx :search-string))) +(def ^:private default-engine hybrid-multi) + +(defn- search-fn [search-engine] + (case search-engine + :hybrid hybrid + :hubrid-multi hybrid-multi + :minimal minimal + :fulltext default-engine + default-engine)) (defn search "Return a reducible-query corresponding to searching the entities via a tsvector." [search-ctx] - (hybrid-multi (:search-string search-ctx) - (dissoc search-ctx :search-string))) + (let [f (search-fn (:search-engine search-ctx))] + (f (:search-string search-ctx) + (dissoc search-ctx :search-string)))) (defn init! "Ensure that the search index exists, and has been populated with all the entities." diff --git a/src/metabase/search/postgres/index.clj b/src/metabase/search/postgres/index.clj index 50f3eb8d405..ba593d0f5fa 100644 --- a/src/metabase/search/postgres/index.clj +++ b/src/metabase/search/postgres/index.clj @@ -90,7 +90,7 @@ :collection_id :database_id :display_data - :legacy_data + :legacy_input :table_id :archived]) (update :display_data json/generate-string) @@ -175,10 +175,12 @@ [search-term] {:select [:model_id :model] :from [active-table] - :where [:raw - "search_vector @@ to_tsquery('" - tsv-language "', " - [:lift (to-tsquery-expr search-term)] ")"]}) + :where (if-not search-term + [:= [:inline 1] [:inline 1]] + [:raw + "search_vector @@ to_tsquery('" + tsv-language "', " + [:lift (to-tsquery-expr search-term)] ")"])}) (defn search "Use the index table to search for records." diff --git a/test/metabase/search/postgres/core_test.clj b/test/metabase/search/postgres/core_test.clj index af8550ea5a1..f296eefd240 100644 --- a/test/metabase/search/postgres/core_test.clj +++ b/test/metabase/search/postgres/core_test.clj @@ -9,7 +9,7 @@ [toucan2.realize :as t2.realize])) (def ^:private hybrid - (comp t2.realize/realize search.postgres/hybrid)) + (comp t2.realize/realize #'search.postgres/hybrid)) (def ^:private hybrid-multi #'search.postgres/hybrid-multi) @@ -18,14 +18,19 @@ #_{:clj-kondo/ignore [:metabase/test-helpers-use-non-thread-safe-functions]} (defmacro with-setup [& body] `(when (is-postgres?) + ;; TODO add more extensive data to search (mt/dataset ~'test-data (search.postgres/init! true) ~@body))) +(def ^:private example-terms + "Search queries which should give consistent, non-trivial results across engines, for the test data." + [#_nil "data" "dash" "peop" "venue" "rasta"]) + (deftest hybrid-test (with-setup (testing "consistent results between all searches for certain queries\n" - (doseq [term ["satisfaction" "e-commerce" "example" "rasta" "new" "revenue" "collection"]] + (doseq [term example-terms] (testing (str "consistent results, but not ordering\n" term) (is (= (set (legacy-results term)) (set (hybrid term))))))))) @@ -47,19 +52,19 @@ (deftest hybrid-multi-test (with-setup - (testing "consistent results between both hybrid implementations" - (doseq [term ["satisfaction" "e-commerce" "example" "rasta" "new" "revenue" "collection"]] + (testing "consistent results between both hybrid implementations\n" + (doseq [term example-terms] (testing term (is (= (hybrid term) (hybrid-multi term)))))))) (defn- remove-time [m] - (dissoc m :create_at)) + (dissoc m :created_at :updated_at :last_edited_at)) (deftest minimal-test (with-setup - (testing "consistent results between both hybrid implementations" - (doseq [term ["satisfaction" "e-commerce" "example" "new" "revenue"]] + (testing "consistent results between both hybrid implementations\n" + (doseq [term example-terms] (testing term ;; Timestamps are not strings after round trip, but this doesn't matter (is (= (map remove-time (hybrid term)) -- GitLab