Skip to content
Snippets Groups Projects
Unverified Commit 162437a2 authored by Chris Truter's avatar Chris Truter Committed by GitHub
Browse files

Delete Hybrid Index Search (#50444)

parent 7828b5e2
Branches
Tags
No related merge requests found
(ns dev.search
(:require
[metabase.api.common :as api]
[metabase.search :as search]
[metabase.search.postgres.core :as search.postgres]
[metabase.search.postgres.index :as search.index]
[metabase.search.postgres.index-test :refer [legacy-results]]
[toucan2.core :as t2]))
(defn- basic-view [xs]
(mapv (juxt :model :id :name) xs))
(comment
(#'search.index/drop-table! @#'search.index/*active-table*)
(search.index/reset-index!)
(search.index/maybe-create-pending!)
(search.index/activate-pending!)
{:initialized? @@#'search.index/initialized? :reindexing? @@#'search.index/reindexing?}
(zipmap [:active :next :retired] (map #'search.index/exists?
[@#'search.index/*active-table*
@#'search.index/pending-table
@#'search.index/retired-table]))
(search.postgres/init! true)
(search.postgres/init! false)
(t2/count search.index/*active-table*)
;; doesn't work, need to drop to lower level postgres functions
(basic-view (#'search.postgres/hybrid "satis:*"))
;; nope, neither get it as the lexeme is not similar enough
(basic-view (#'search.postgres/hybrid "satisfactory"))
(basic-view (legacy-results "satisfactory"))
(defn- mini-bench [n engine search-term & args]
#_{:clj-kondo/ignore [:discouraged-var]}
(let [f (case (keyword "search.engine" (name engine))
:search.engine/index-only search.index/search
:search.engine/legacy legacy-results
:search.engine/hybrid @#'search.postgres/hybrid
:search.engine/fulltext @#'search.postgres/fulltext)]
(time
(dotimes [_ n]
(doall (apply f search-term args))))))
(mini-bench 500 :legacy nil)
(mini-bench 500 :legacy "sample")
;; 30x speed-up for test-data on my machine
(mini-bench 500 :index-only "sample")
;; No noticeable degradation, without permissions and filters
(mini-bench 500 :fulltext "sample")
;; but joining to the "hydrated query" reverses the advantage
(mini-bench 100 :legacy nil)
(mini-bench 100 :legacy "sample")
;; slower than fetching everything...
(mini-bench 100 :hybrid "sample")
;; using index + LIKE on the join ... still a little bit more overhead
(mini-bench 100 :hybrid "sample" {:search-string "sample"})
(mini-bench 100 :minimal "sample"))
(defn- test-search [user search-string & [search-engine]]
(let [user-id (:id user)
user-perms #{"/"}]
(binding [api/*current-user* (atom user)
api/*current-user-id* user-id
api/*is-superuser?* true
api/*current-user-permissions-set* (atom user-perms)]
(search/search
(search/search-context
{:archived nil
:created-at nil
:created-by #{}
:current-user-id user-id
:is-superuser? true
:current-user-perms user-perms
:filter-items-in-personal-collection nil
:last-edited-at nil
:last-edited-by #{}
:limit 50
:model-ancestors? nil
:models search/all-models
:offset 0
:search-engine (some-> search-engine name)
:search-native-query nil
:search-string search-string
:table-db-id nil
:verified nil
:ids nil})))))
(comment
(require '[clj-async-profiler.core :as prof])
(prof/serve-ui 8081)
(let [user (t2/select-one :model/User :is_superuser true)]
(prof/profile
#_{:clj-kondo/ignore [:discouraged-var]}
(time
(count
(dotimes [_ 1000]
(test-search user "trivia"))))))
(let [user (t2/select-one :model/User :is_superuser true)]
(prof/profile
#_{:event :alloc}
#_{:clj-kondo/ignore [:discouraged-var]}
(time
(count
(dotimes [_ 1000]
(test-search user "trivia" :minimal)))))))
(ns metabase.search.postgres.core
(:require
[cheshire.core :as json]
[honey.sql :as sql]
[honey.sql.helpers :as sql.helpers]
[metabase.api.common :as api]
[metabase.search.config :as search.config]
[metabase.search.filter :as search.filter]
[metabase.search.legacy :as search.legacy]
[metabase.search.permissions :as search.permissions]
[metabase.search.postgres.index :as search.index]
[metabase.search.postgres.ingestion :as search.ingestion]
......@@ -18,63 +15,11 @@
(set! *warn-on-reflection* true)
(defn- user-params [search-ctx]
(cond
(:current-user-id search-ctx)
(select-keys search-ctx [:is-superuser? :current-user-id :current-user-perms])
api/*current-user-id*
{:is-superuser? api/*is-superuser?*
:current-user-id api/*current-user-id*
:current-user-perms @api/*current-user-permissions-set*}
:else
{:is-superuser? true
;; this does not matter, we won't use it.
:current-user-id 1
:current-user-perms #{"/"}}))
(defn- in-place-query [{:keys [models search-term archived?] :as search-ctx}]
(search.legacy/full-search-query
(merge
(user-params search-ctx)
{:search-string search-term
:models (or models
(if api/*current-user-id*
search.config/all-models
;; For REPL convenience, skip these models as
;; they require the user to be initialized.
(disj search.config/all-models "indexed-entity")))
:archived? archived?
:model-ancestors? true})))
(defn- hybrid
"Use the index for using the search string, but rely on the legacy code path for rendering
the display data, applying permissions, additional filtering, etc.
NOTE: this is less efficient than legacy search even. We plan to replace it with something
less feature complete but much faster."
[search-term & {:as search-ctx}]
(when-not @#'search.index/initialized?
(throw (ex-info "Search index is not initialized. Use [[init!]] to ensure it exists."
{:search-engine :postgres})))
(-> (sql.helpers/with [:index-query (search.index/search-query search-term search-ctx)]
[:source-query (in-place-query search-ctx)])
(sql.helpers/select :sq.*)
(sql.helpers/from [:source-query :sq])
(sql.helpers/join [:index-query :iq] [:and
[:= :sq.model :iq.model]
[:= :sq.id :iq.model_id]])
(sql/format {:quoted true})
t2/reducible-query))
(defn- parse-datetime [s]
(when s
(OffsetDateTime/parse s)))
(defn- rehydrate [context index-row]
;; Useful for debugging scoring
#_(dissoc index-row :legacy_input :created_at :updated_at :last_edited_at)
(-> (merge
(json/parse-string (:legacy_input index-row) keyword)
(select-keys index-row [:total_score :pinned]))
......@@ -120,8 +65,7 @@
(defn- search-fn [search-engine]
(case search-engine
:search.engine/hybrid hybrid
:search.engine/fulltext fulltext
:search.engine/fulltext fulltext
default-engine))
(defn search
......
......@@ -278,8 +278,8 @@
(defn search
"Use the index table to search for records."
[search-term & [search-ctx]]
(map (juxt :model_id :model)
(t2/query (search-query search-term search-ctx))))
(map (juxt :model :name)
(t2/query (search-query search-term search-ctx [:model :name]))))
(defn reset-index!
"Ensure we have a blank slate; in case the table schema or stored data format has changed."
......
(ns metabase.search.postgres.core-test
(:require
[clojure.test :refer [deftest is testing]]
[macaw.util :as u]
[metabase.db :as mdb]
[metabase.models]
[metabase.search :as search]
[metabase.search.postgres.core :as search.postgres]
[metabase.search.postgres.index-test :refer [legacy-results]]
[metabase.test :as mt]
[toucan2.core :as t2]
[toucan2.realize :as t2.realize]))
(comment
;; We load this to ensure all the search-models are registered
metabase.models/keep-me)
(def ^:private hybrid
(comp t2.realize/realize #'search.postgres/hybrid))
#_{:clj-kondo/ignore [:metabase/test-helpers-use-non-thread-safe-functions]}
(defmacro with-setup [& body]
`(when (= :postgres (mdb/db-type))
;; TODO add more extensive data to search
(mt/dataset ~'test-data
(mt/with-temp [:model/User {user-id# :id} {:email "someone@somewhere.com"}]
(t2/insert! :model/Collection {:name "Some Collection" :personal_owner_id user-id#})
(search.postgres/init! true)
~@body))))
(def ^:private example-terms
"Search queries which should give consistent, non-trivial results across engines, for the test data."
[#_nil "data" "dash" "peop" "venue" "rasta"])
(deftest hybrid-test
(with-setup
(testing "consistent results between all searches for certain queries\n"
(doseq [term example-terms]
(testing (str "consistent results, but not ordering\n" term)
(is (= (set (legacy-results term))
(set (hybrid term)))))))))
(deftest permissions-test
(with-setup
;; Rasta Toucan has friends, like Lucky Pidgeon
;; ... plus any additional ones that leaked in from dev or other tests
(is (< 1 (count (hybrid "collection"))))
(testing "Rasta can only see his own collections"
(is (->> {:current-user-id (mt/user->id :rasta)
:is-superuser? false
:current-user-perms #{"/none/"}}
(hybrid "collection")
(map :name)
(not-any? #{"Some Collection"}))))))
(defn- normalize* [xs]
(into #{}
(map (comp #(dissoc % :bookmark :pinned :total_score :scores)
u/strip-nils
#(update % :archived boolean)))
xs))
(deftest fulltext-test
(with-setup
(testing "consistent results with index-based implementations\n"
(doseq [term (take 1 example-terms)]
(testing term
(is (= (normalize* (hybrid term))
(normalize* (#'search.postgres/fulltext
term
{:current-user-id (mt/user->id :crowberto)
:is-superuser? true
:archived? false
:current-user-perms #{"/"}
:model-ancestors? false
:models search/all-models
:search-string term})))))))))
......@@ -3,29 +3,15 @@
[clojure.test :refer [deftest is testing]]
[java-time.api :as t]
[metabase.db :as mdb]
[metabase.search.postgres.core :as search.postgres]
[metabase.search.postgres.index :as search.index]
[metabase.search.postgres.ingestion :as search.ingestion]
[metabase.search.test-util :as search.tu]
[metabase.test :as mt]
[metabase.util :as u]
[toucan2.core :as t2]))
(set! *warn-on-reflection* true)
(defn legacy-results
"Use the source tables directly to search for records."
[search-term & {:as opts}]
(-> (assoc opts :search-engine :search.engine/in-place :search-term search-term)
(#'search.postgres/in-place-query)
t2/query))
(def legacy-models
"Just the identity of the matches"
(comp (partial mapv (juxt :id :model)) legacy-results))
(defn- legacy-hits [term]
(count (legacy-results term)))
(defn- index-hits [term]
(count (search.index/search term)))
......@@ -87,28 +73,18 @@
(t2/update! :model/Database db-id {:name alternate-name})
(is (= alternate-name (db-name-fn)))))))
(deftest consistent-subset-test
(with-index
(testing "It's consistent with in-place search on various full words"
(doseq [term ["e-commerce" "example" "rasta" "new" "collection" "revenue"]]
(testing term
(is (= (set (legacy-models term))
(set (search.index/search term)))))))))
(deftest partial-word-test
(with-index
(testing "It does not match partial words"
;; does not include revenue
(is (< (index-hits "venue")
;; but this one does
(legacy-hits "venue"))))
(is (= #{"venues"} (into #{} (comp (map second) (map u/lower-case-en)) (search.index/search "venue")))))
;; no longer works without english dictionary
#_(testing "Unless their lexemes are matching"
(doseq [[a b] [["revenue" "revenues"]
["collect" "collection"]]]
(is (= (search.index/search a)
(search.index/search b)))))
;; no longer works without using the english dictionary
(testing "Unless their lexemes are matching"
(doseq [[a b] [["revenue" "revenues"]
["collect" "collection"]]]
(is (= (search.index/search a)
(search.index/search b)))))
(testing "Or we match a completion of the final word"
(is (seq (search.index/search "sat")))
......@@ -120,14 +96,6 @@
(deftest either-test
(with-index
(testing "legacy search does not understand stop words or logical operators"
(is (= 3 (legacy-hits "satisfaction")))
;; Add some slack because things are leaking into this "test" database :-(
(is (<= 2 (legacy-hits "or")))
(is (<= 4 (legacy-hits "its the satisfaction of it")))
(is (<= 1 (legacy-hits "user")))
(is (<= 6 (legacy-hits "satisfaction or user"))))
(testing "We get results for both terms"
(is (= 3 (index-hits "satisfaction")))
(is (<= 1 (index-hits "user"))))
......@@ -155,10 +123,7 @@
(is (= 2 (index-hits "revenue")))
(is (= #_1 2 (index-hits "projected revenue")))
(testing "only sometimes do these occur sequentially in a phrase"
(is (= 1 (index-hits "\"projected revenue\""))))
(testing "legacy search has a bunch of results"
(is (= 3 (legacy-hits "projected revenue")))
(is (= 0 (legacy-hits "\"projected revenue\""))))))
(is (= 1 (index-hits "\"projected revenue\""))))))
;; lower level search expression tests
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment