-
Chris Truter authoredChris Truter authored
Code owners
Assign users and groups as approvers for specific file changes. Learn more.
index.clj 10.51 KiB
(ns metabase.search.postgres.index
(:require
[cheshire.core :as json]
[clojure.string :as str]
[honey.sql.helpers :as sql.helpers]
[metabase.search.spec :as search.spec]
[metabase.util :as u]
[toucan2.core :as t2]))
(def ^:private active-table :search_index)
(def ^:private pending-table :search_index_next)
(def ^:private retired-table :search_index_retired)
(defonce ^:private initialized? (atom false))
(defonce ^:private reindexing? (atom false))
(def ^:private tsv-language "simple")
(defn- exists? [table-name]
(t2/exists? :information_schema.tables :table_name (name table-name)))
(defn- drop-table! [table-name]
(boolean
(when (exists? table-name)
(t2/query (sql.helpers/drop-table table-name)))))
(defn- rename-table! [old new]
(when (and (exists? old) (not (exists? new)))
(-> (sql.helpers/alter-table old)
(sql.helpers/rename-table new)
t2/query)))
(defn maybe-create-pending!
"Create a non-active search index table."
[]
(when (not @reindexing?)
(when-not (exists? pending-table)
(-> (sql.helpers/create-table pending-table)
(sql.helpers/with-columns
[[:id :bigint [:primary-key] [:raw "GENERATED BY DEFAULT AS IDENTITY"]]
;; entity
[:model_id :int :not-null]
[:model [:varchar 254] :not-null] ;; TODO We could shrink this to just what we need.
[:name :text]
;; search
[:search_vector :tsvector :not-null]
[:with_native_query_vector :tsvector :not-null]
;; results
[:display_data :text :not-null]
[:legacy_input :text :not-null]
;; scoring related
[:dashboardcard_count :int]
[:last_viewed_at :timestamp]
[:model_rank :int :not-null]
[:official_collection :boolean]
[:pinned :boolean]
[:verified :boolean]
[:view_count :int]
;; permission related entities
[:collection_id :int]
[:database_id :int]
;; filter related
[:archived :boolean :not-null [:default false]]
[:creator_id :int]
[:last_edited_at :timestamp]
[:last_editor_id :int]
[:model_created_at :timestamp]
[:model_updated_at :timestamp]
;; useful for tracking the speed and age of the index
[:created_at :timestamp
[:default [:raw "CURRENT_TIMESTAMP"]]
:not-null]
[:updated_at :timestamp :not-null]])
t2/query)
;; TODO I strongly suspect that there are more indexes that would help performance, we should examine EXPLAIN.
(let [idx-prefix (str/replace (str (name active-table) "_" (random-uuid)) #"-" "_")
table-name (name pending-table)]
(doseq [stmt ["CREATE UNIQUE INDEX IF NOT EXISTS %s_identity_idx ON %s (model, model_id)"
"CREATE INDEX IF NOT EXISTS %s_tsvector_idx ON %s USING gin (search_vector)"
"CREATE INDEX IF NOT EXISTS %s_native_tsvector_idx ON %s USING gin (with_native_query_vector)"
;; Spam all the indexes for now, let's see if they get used on Stats / Ephemeral envs.
"CREATE INDEX IF NOT EXISTS %s_model_archived_idx ON %s (model, archived)"
"CREATE INDEX IF NOT EXISTS %s_archived_idx ON %s (archived)"]]
(t2/query (format stmt idx-prefix table-name)))))
(reset! reindexing? true)))
(defn activate-pending!
"Make the pending index active if it exists. Returns true if it did so."
[]
;; ... just in case it wasn't cleaned up last time.
(drop-table! retired-table)
(when (exists? pending-table)
(t2/with-transaction [_conn]
(rename-table! active-table retired-table)
(rename-table! pending-table active-table))
(reset! reindexing? false)
(drop-table! retired-table)
true))
(defn- entity->entry [entity]
(-> entity
(select-keys
;; remove attrs that get aliased
(remove #{:id :created_at :updated_at :native_query}
(conj search.spec/attr-columns
:model :model_rank
:display_data :legacy_input)))
(update :display_data json/generate-string)
(update :legacy_input json/generate-string)
(assoc
:updated_at :%now
:model_id (:id entity)
:model_created_at (:created_at entity)
:model_updated_at (:updated_at entity)
:search_vector [:||
[:setweight [:to_tsvector [:inline tsv-language] [:cast (:name entity) :text]]
[:inline "A"]]
[:setweight [:to_tsvector
[:inline tsv-language]
[:cast
(:searchable_text entity "")
:text]]
[:inline "B"]]]
:with_native_query_vector [:||
[:setweight [:to_tsvector [:inline tsv-language] [:cast (:name entity) :text]]
[:inline "A"]]
[:setweight [:to_tsvector
[:inline tsv-language]
[:cast
(str (:searchable_text entity) " " (:native_query entity))
:text]]
[:inline "B"]]])))
(defn- upsert! [table entry]
(t2/query
{:insert-into table
:values [entry]
:on-conflict [:model :model_id]
:do-update-set entry}))
(defn- batch-upsert! [table entries]
(when (seq entries)
(t2/query
;; The cost of dynamically calculating these keys should be small compared to the IO cost, so unoptimized.
(let [update-keys (vec (disj (set (keys (first entries))) :id :model :model_id))
excluded-kw (fn [column] (keyword (str "excluded." (name column))))]
{:insert-into table
:values entries
:on-conflict [:model :model_id]
:do-update-set (zipmap update-keys (map excluded-kw update-keys))}))))
(defn update!
"Create the given search index entries"
[entity]
(let [entry (entity->entry entity)]
(when @initialized?
(upsert! active-table entry))
(when @reindexing?
(upsert! pending-table entry))))
(defn delete!
"Remove any entries corresponding directly to a given model instance."
[id search-models]
;; In practice, we expect this to be 1-1, but the data model does not preclude it.
(when (seq search-models)
(when @initialized?
(t2/delete! active-table :model_id id :model [:in search-models]))
(when @reindexing?
(t2/delete! pending-table :model_id id :model [:in search-models]))))
(defn- quote* [s]
(str "'" (str/replace s "'" "''") "'"))
(defn- process-phrase [word-or-phrase]
;; a phrase is quoted even if the closing quotation mark has not been typed yet
(cond
;; trailing quotation mark
(= word-or-phrase "\"") nil
;; quoted phrases must be matched sequentially
(str/starts-with? word-or-phrase "\"")
(as-> word-or-phrase <>
;; remove the quote mark(s)
(str/replace <> #"^\"|\"$" "")
(str/trim <>)
(str/split <> #"\s+")
(map quote* <>)
(str/join " <-> " <>))
;; negation
(str/starts-with? word-or-phrase "-")
(str "!" (quote* (subs word-or-phrase 1)))
;; just a regular word
:else
(quote* word-or-phrase)))
(defn- split-preserving-quotes
"Break up the words in the search input, preserving quoted and partially quoted segments."
[s]
(re-seq #"\"[^\"]*(?:\"|$)|[^\s\"]+|\s+" (u/lower-case-en s)))
(defn- process-clause [words-and-phrases]
(->> words-and-phrases
(remove #{"and"})
(map process-phrase)
(remove str/blank?)
(str/join " & ")))
(defn- complete-last-word
"Add wildcards at the end of the final word, so that we match ts completions."
[expression]
(str/replace expression #"(\S+)(?=\s*$)" "$1:*"))
(defn- to-tsquery-expr
"Given the user input, construct a query in the Postgres tsvector query language."
[input]
(str
(when input
(let [trimmed (str/trim input)
complete? (not (str/ends-with? trimmed "\""))
;; TODO also only complete if search-typeahead-enabled and the context is the search palette
maybe-complete (if complete? complete-last-word identity)]
(->> (split-preserving-quotes trimmed)
(remove str/blank?)
(partition-by #{"or"})
(remove #(= (first %) "or"))
(map process-clause)
(str/join " | ")
maybe-complete)))))
(defn batch-update!
"Create the given search index entries in bulk"
[entities]
(let [entries (map entity->entry entities)]
(when @initialized?
(batch-upsert! active-table entries))
(when @reindexing?
(batch-upsert! pending-table entries))
(count entries)))
(defn search-query
"Query fragment for all models corresponding to a query parameter `:search-term`."
([search-term search-ctx]
(search-query search-term search-ctx [:model_id :model]))
([search-term search-ctx select-items]
{:select select-items
:from [active-table]
;; Using a join allows us to share the query expression between our SELECT and WHERE clauses.
:join [[[:raw "to_tsquery('"
tsv-language "', "
[:lift (to-tsquery-expr search-term)] ")"]
:query] [:= 1 1]]
:where (if (str/blank? search-term)
[:= [:inline 1] [:inline 1]]
[:raw
(str
(if (:search-native-query search-ctx)
"with_native_query_vector"
"search_vector")
" @@ query")])}))
(defn search
"Use the index table to search for records."
[search-term & [search-ctx]]
(map (juxt :model_id :model)
(t2/query (search-query search-term search-ctx))))
(defn reset-index!
"Ensure we have a blank slate; in case the table schema or stored data format has changed."
[]
(reset! reindexing? false)
(drop-table! pending-table)
(maybe-create-pending!)
(activate-pending!)
(reset! initialized? true))
(defn ensure-ready!
"Ensure the index is ready to be populated. Return false if it was already ready."
[force-recreation?]
(if (or force-recreation? (not (exists? active-table)))
(reset-index!)
(reset! initialized? true)))