From a536c2d7bf24d111cace04abeedff5f33ac751d5 Mon Sep 17 00:00:00 2001
From: John Swanson <john.swanson@metabase.com>
Date: Fri, 26 Apr 2024 15:47:05 -0700
Subject: [PATCH] Add `collection.effective_ancestors` to model search results
 (#41746)

* Add `collection.effective_ancestors` to search

Fixes #41909

The ultimate goal here is to be able to display a UI that showed the entire (effective) path to each collection.

This was a bit more painful than I'd hoped. The primary issue is that we
tied two things together:

- scoring results

- "serializing" results (really, partially serializing results, because
we do more work to shape the data later)

Why is this a problem? The `top-scorers` function took a transducer and
returned the *serialized* top scorers. Later, when we need to operate
on *all* the top results (in this case, to collect all the collection
IDs and names that we know of), we're looking at the serialized results.

In our case, we needed some data (collection location) that wasn't
included after serialization. We could add it, of course, but it's
frustrating to need to change the API of what data is *returned* from
the endpoint for purely internal purposes (changing the data available
during *processing*).

I ended up pulling serialization out of the scoring function. At some
point down the road, we should go ahead and move the `serialize`
function out of the `search.scoring` namespace entirely - there's really
no reason for it to belong there as far as I can tell.
---
 src/metabase/api/search.clj           | 59 ++++++++++++++++++++++++---
 src/metabase/search/config.clj        |  3 ++
 src/metabase/search/scoring.clj       | 26 +++++++-----
 test/metabase/api/search_test.clj     | 46 ++++++++++++++++++++-
 test/metabase/search/filter_test.clj  |  1 +
 test/metabase/search/scoring_test.clj | 10 +++--
 6 files changed, 125 insertions(+), 20 deletions(-)

diff --git a/src/metabase/api/search.clj b/src/metabase/api/search.clj
index 9bbe1211745..7d9b87bd001 100644
--- a/src/metabase/api/search.clj
+++ b/src/metabase/api/search.clj
@@ -332,7 +332,8 @@
                                (filter (fn [[_k v]] (= v :text)))
                                (map first)
                                (remove #{:collection_authority_level :moderated_status
-                                         :initial_sync_status :pk_ref :location}))
+                                         :initial_sync_status :pk_ref :location
+                                         :collection_location}))
         case-clauses      (as-> columns-to-search <>
                             (map (fn [col] [:like [:lower col] match]) <>)
                             (interleave <> (repeat [:inline 0]))
@@ -442,6 +443,47 @@
                    :last_editor_common_name (get user-id->common-name last_editor_id)))
           results)))
 
+(defn add-dataset-collection-hierarchy
+  "Adds `collection_effective_ancestors` to *datasets* in the search results."
+  [search-results]
+  (let [;; this helper function takes a search result (with `collection_id` and `collection_location`) and returns the
+        ;; effective location of the result.
+        result->loc (fn [{:keys [collection_id collection_location]}]
+                      (:effective_location
+                       (t2/hydrate
+                        (if (nil? collection_id)
+                          collection/root-collection
+                          {:location collection_location})
+                        :effective_location)))
+        ;; a map of collection-ids to collection info
+        col-id->name&loc (into {}
+                               (for [item search-results
+                                     :when (= (:model item) "dataset")]
+                                 [(:collection_id item)
+                                  {:name (:collection_name item)
+                                   :effective_location (result->loc item)}]))
+        ;; the set of all collection IDs where we *don't* know the collection name. For example, if `col-id->name&loc`
+        ;; contained `{1 {:effective_location "/2/" :name "Foo"}}`, we need to look up the name of collection `2`.
+        to-fetch         (into #{} (comp (keep :effective_location)
+                                         (mapcat collection/location-path->ids)
+                                         ;; already have these names
+                                         (remove col-id->name&loc))
+                               (vals col-id->name&loc))
+        ;; the complete map of collection IDs to names
+        id->name         (merge (if (seq to-fetch)
+                                  (t2/select-pk->fn :name :model/Collection :id [:in to-fetch])
+                                  {})
+                                (update-vals col-id->name&loc :name))
+        annotate         (fn [x]
+                           (cond-> x
+                             (= (:model x) "dataset")
+                             (assoc :collection_effective_ancestors
+                                    (if-let [loc (result->loc x)]
+                                      (->> (collection/location-path->ids loc)
+                                           (map (fn [id] {:id id :name (id->name id)})))
+                                      []))))]
+    (map annotate search-results)))
+
 (mu/defn ^:private search
   "Builds a search query that includes all the searchable entities and runs it"
   [search-ctx :- SearchContext]
@@ -467,8 +509,10 @@
                             (map #(update % :pk_ref json/parse-string))
                             (map (partial scoring/score-and-result (:search-string search-ctx)))
                             (filter #(pos? (:score %))))
-        total-results      (hydrate-user-metadata
-                            (scoring/top-results reducible-results search.config/max-filtered-results xf))
+        total-results       (cond->> (scoring/top-results reducible-results search.config/max-filtered-results xf)
+                              true hydrate-user-metadata
+                              (:model-ancestors? search-ctx) (add-dataset-collection-hierarchy)
+                              true (map scoring/serialize))
         add-perms-for-col  (fn [item]
                              (cond-> item
                                (mi/instance-of? :model/Collection item)
@@ -502,6 +546,7 @@
            filter-items-in-personal-collection
            offset
            search-string
+           model-ancestors?
            table-db-id
            search-native-query
            verified]}      :- [:map {:closed true}
@@ -517,6 +562,7 @@
                                [:offset                              {:optional true} [:maybe ms/Int]]
                                [:table-db-id                         {:optional true} [:maybe ms/PositiveInt]]
                                [:search-native-query                 {:optional true} [:maybe boolean?]]
+                               [:model-ancestors?                    {:optional true} [:maybe boolean?]]
                                [:verified                            {:optional true} [:maybe true?]]]] :- SearchContext
   (when (some? verified)
     (premium-features/assert-has-any-features
@@ -526,7 +572,8 @@
         ctx    (cond-> {:search-string      search-string
                         :current-user-perms @api/*current-user-permissions-set*
                         :archived?          (boolean archived)
-                        :models             models}
+                        :models             models
+                        :model-ancestors?   (boolean model-ancestors?)}
                  (some? created-at)                          (assoc :created-at created-at)
                  (seq created-by)                            (assoc :created-by created-by)
                  (some? filter-items-in-personal-collection) (assoc :filter-items-in-personal-collection filter-items-in-personal-collection)
@@ -587,7 +634,7 @@
 
   A search query that has both filters applied will only return models and cards."
   [q archived context created_at created_by table_db_id models last_edited_at last_edited_by
-   filter_items_in_personal_collection search_native_query verified]
+   filter_items_in_personal_collection model_ancestors search_native_query verified]
   {q                                   [:maybe ms/NonBlankString]
    archived                            [:maybe :boolean]
    table_db_id                         [:maybe ms/PositiveInt]
@@ -598,6 +645,7 @@
    created_by                          [:maybe (ms/QueryVectorOf ms/PositiveInt)]
    last_edited_at                      [:maybe ms/NonBlankString]
    last_edited_by                      [:maybe (ms/QueryVectorOf ms/PositiveInt)]
+   model_ancestors                     [:maybe :boolean]
    search_native_query                 [:maybe true?]
    verified                            [:maybe true?]}
   (api/check-valid-page-params mw.offset-paging/*limit* mw.offset-paging/*offset*)
@@ -617,6 +665,7 @@
                              :models              models-set
                              :limit               mw.offset-paging/*limit*
                              :offset              mw.offset-paging/*offset*
+                             :model-ancestors?    model_ancestors
                              :search-native-query search_native_query
                              :verified            verified}))
         duration   (- (System/currentTimeMillis) start-time)
diff --git a/src/metabase/search/config.clj b/src/metabase/search/config.clj
index 2f3a741f3d5..d7f60bc8b54 100644
--- a/src/metabase/search/config.clj
+++ b/src/metabase/search/config.clj
@@ -99,6 +99,7 @@
    [:map {:closed true}
     [:search-string                                        [:maybe ms/NonBlankString]]
     [:archived?                                            :boolean]
+    [:model-ancestors?                                     :boolean]
     [:current-user-perms                                   [:set perms.u/PathSchema]]
     [:models                                               [:set SearchableModel]]
     [:filter-items-in-personal-collection {:optional true} [:enum "only" "exclude"]]
@@ -138,6 +139,7 @@
    :collection_id       :integer
    :collection_name     :text
    :collection_type     :text
+   :collection_location :text
    :collection_authority_level :text
    ;; returned for Card and Dashboard
    :collection_position :integer
@@ -270,6 +272,7 @@
   [_]
   (conj default-columns :collection_id :collection_position :dataset_query :display :creator_id
         [:collection.name :collection_name]
+        [:collection.location :collection_location]
         [:collection.authority_level :collection_authority_level]
         bookmark-col dashboardcard-count-col))
 
diff --git a/src/metabase/search/scoring.clj b/src/metabase/search/scoring.clj
index e8e2a798923..111316c9a57 100644
--- a/src/metabase/search/scoring.clj
+++ b/src/metabase/search/scoring.clj
@@ -301,12 +301,12 @@
      (max (- stale-time days-ago) 0)
      stale-time)))
 
-(defn- serialize
+(defn serialize
   "Massage the raw result from the DB and match data into something more useful for the client"
-  [result all-scores relevant-scores]
-  (let [{:keys [name display_name collection_id collection_name collection_authority_level collection_type]} result
-        matching-columns            (into #{} (remove nil? (map :column relevant-scores)))
-        match-context-thunk         (first (keep :match-context-thunk relevant-scores))]
+  [{:as result :keys [all-scores relevant-scores name display_name collection_id collection_name
+                      collection_authority_level collection_type collection_effective_ancestors]}]
+  (let [matching-columns    (into #{} (remove nil? (map :column relevant-scores)))
+        match-context-thunk (first (keep :match-context-thunk relevant-scores))]
     (-> result
         (assoc
          :name           (if (and (contains? matching-columns :display_name) display_name)
@@ -316,14 +316,20 @@
                                     (empty?
                                      (remove matching-columns search.config/displayed-columns)))
                            (match-context-thunk))
-         :collection     {:id              collection_id
-                          :name            collection_name
-                          :authority_level collection_authority_level
-                          :type            collection_type}
+         :collection     (merge {:id              collection_id
+                                 :name            collection_name
+                                 :authority_level collection_authority_level
+                                 :type            collection_type}
+                                (when collection_effective_ancestors
+                                  {:effective_ancestors collection_effective_ancestors}))
          :scores          all-scores)
         (update :dataset_query #(some-> % json/parse-string mbql.normalize/normalize))
         (dissoc
+         :all-scores
+         :relevant-scores
+         :collection_effective_ancestors
          :collection_id
+         :collection_location
          :collection_name
          :collection_type
          :display_name))))
@@ -397,7 +403,7 @@
                           0
                           text-matches)))
       {:score total-score
-       :result (serialize result all-scores relevant-scores)}
+       :result (assoc result :all-scores all-scores :relevant-scores relevant-scores)}
       {:score 0})))
 
 (defn compare-score
diff --git a/test/metabase/api/search_test.clj b/test/metabase/api/search_test.clj
index 32f5bc6a624..a2b84d4540a 100644
--- a/test/metabase/api/search_test.clj
+++ b/test/metabase/api/search_test.clj
@@ -141,7 +141,10 @@
 
 (defn- default-results-with-collection []
   (on-search-types #{"dashboard" "pulse" "card" "dataset" "action"}
-                   #(assoc % :collection {:id true, :name (if (= (:model %) "action") nil true) :authority_level nil :type nil})
+                   #(assoc % :collection {:id true,
+                                          :name (if (= (:model %) "action") nil true)
+                                          :authority_level nil
+                                          :type nil})
                    (default-search-results)))
 
 (defn- do-with-search-items [search-string in-root-collection? f]
@@ -919,6 +922,7 @@
                                                 :archived?          false
                                                 :models             search.config/all-models
                                                 :current-user-perms #{"/"}
+                                                :model-ancestors?   false
                                                 :limit-int          100}))]
           ;; warm it up, in case the DB call depends on the order of test execution and it needs to
           ;; do some initialization
@@ -1658,3 +1662,43 @@
              (->> (mt/user-http-request :lucky :get 200 "search" :archived true :q "test")
                   :data
                   (map :name)))))))
+
+(deftest model-ancestors-gets-ancestor-collections
+  (testing "Collection names are correct"
+    (mt/with-temp [Collection {top-col-id :id} {:name "top level col" :location "/"}
+                   Collection {mid-col-id :id} {:name "middle level col" :location (str "/" top-col-id "/")}
+                   Card {leaf-card-id :id} {:type :model :collection_id mid-col-id :name "leaf model"}
+                   Card {top-card-id :id} {:type :model :collection_id top-col-id :name "top model"}]
+      (is (= #{[leaf-card-id [{:name "top level col" :id top-col-id}]]
+               [top-card-id []]}
+             (->> (mt/user-http-request :rasta :get 200 "search" :model_ancestors true :q "model" :models ["dataset"])
+                  :data
+                  (map (juxt :id #(get-in % [:collection :effective_ancestors])))
+                  (into #{}))))))
+  (testing "Models not in a collection work correctly"
+    (mt/with-temp [Card {card-id :id} {:type :model
+                                       :name "model"
+                                       :collection_id nil}]
+      (is (= #{[card-id []]}
+             (->> (mt/user-http-request :rasta :get 200 "search" :model_ancestors true :q "model" :models ["dataset"])
+                  :data
+                  (map (juxt :id #(get-in % [:collection :effective_ancestors])))
+                  (into #{}))))))
+  (testing "Non-models don't get collection_ancestors"
+    (mt/with-temp [Card _ {:name "question"
+                           :collection_id nil}]
+      (is (not (contains? (->> (mt/user-http-request :rasta :get 200 "search" :model_ancestors true :q "question" :models ["dataset" "card"])
+                               :data
+                               (filter #(= (:name %) "question"))
+                               first
+                               :collection)
+                          :effective_ancestors)))))
+  (testing "If `model_parents` is not passed, it doesn't get populated"
+    (mt/with-temp [Collection {top-col-id :id} {:name "top level col" :location "/"}
+                   Collection {mid-col-id :id} {:name "middle level col" :location (str "/" top-col-id "/")}
+                   Card _ {:type :model :collection_id mid-col-id :name "leaf model"}
+                   Card _ {:type :model :collection_id top-col-id :name "top model"}]
+      (is (= [nil nil]
+             (->> (mt/user-http-request :rasta :get 200 "search" :model_ancestors false :q "model" :models ["dataset"])
+                  :data
+                  (map #(get-in % [:collection :effective_ancestors]))))))))
diff --git a/test/metabase/search/filter_test.clj b/test/metabase/search/filter_test.clj
index ac449840ae4..70bbb3ba3be 100644
--- a/test/metabase/search/filter_test.clj
+++ b/test/metabase/search/filter_test.clj
@@ -11,6 +11,7 @@
   {:search-string       nil
    :archived?           false
    :models             search.config/all-models
+   :model-ancestors?   false
    :current-user-perms #{"/"}})
 
 (deftest ^:parallel ->applicable-models-test
diff --git a/test/metabase/search/scoring_test.clj b/test/metabase/search/scoring_test.clj
index aaf168aef4d..af80593f174 100644
--- a/test/metabase/search/scoring_test.clj
+++ b/test/metabase/search/scoring_test.clj
@@ -280,11 +280,13 @@
                   :database 1}
           result {:name          "card"
                   :model         "card"
-                  :dataset_query (json/generate-string query)}]
-      (is (= query (-> result (#'scoring/serialize {} {}) :dataset_query)))))
+                  :dataset_query (json/generate-string query)
+                  :all-scores {}
+                  :relevant-scores {}}]
+      (is (= query (-> result scoring/serialize :dataset_query)))))
   (testing "Doesn't error on other models without a query"
-    (is (nil? (-> {:name "dash" :model "dashboard"}
-                  (#'scoring/serialize {} {})
+    (is (nil? (-> {:name "dash" :model "dashboard" :all-scores {} :relevant-scores {}}
+                  (#'scoring/serialize)
                   :dataset_query)))))
 
 (deftest ^:parallel force-weight-test
-- 
GitLab