From afa50d0feb28a18f04a3d6dfd1297aefb7593031 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cam=20Sa=C3=BCl?= <cammsaul@gmail.com>
Date: Fri, 13 Nov 2015 14:52:18 -0800
Subject: [PATCH] Mongo Query Processor 2.0 :scream_cat:

---
 circle.yml                                    |  6 +++
 src/metabase/db/metadata_queries.clj          | 38 +++++++--------
 src/metabase/driver/mongo/query_processor.clj | 48 +++++++++++--------
 .../driver/query_processor/annotate.clj       |  3 +-
 src/metabase/driver/sync.clj                  | 37 ++++++--------
 src/metabase/util.clj                         | 17 +++++--
 test/metabase/driver/query_processor_test.clj | 15 ++++--
 test/metabase/models/field_test.clj           |  2 +-
 8 files changed, 94 insertions(+), 72 deletions(-)

diff --git a/circle.yml b/circle.yml
index 1d764718a56..22afd4a0aeb 100644
--- a/circle.yml
+++ b/circle.yml
@@ -8,6 +8,12 @@ machine:
     version: 2.7.3
 dependencies:
   override:
+    - sudo apt-get purge mongodb-org*
+    - sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 7F0CEB10
+    - echo "deb http://repo.mongodb.org/apt/ubuntu precise/mongodb-org/3.0 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-3.0.list
+    - sudo apt-get update
+    - sudo apt-get install -y mongodb-org
+    - sudo service mongod restart
     - lein deps
     - pip install awscli==1.7.3
 database:
diff --git a/src/metabase/db/metadata_queries.clj b/src/metabase/db/metadata_queries.clj
index 5ba4501207f..df601b41961 100644
--- a/src/metabase/db/metadata_queries.clj
+++ b/src/metabase/db/metadata_queries.clj
@@ -1,37 +1,35 @@
 (ns metabase.db.metadata-queries
   "Predefined QP queries for getting metadata about an external database."
   (:require [metabase.driver :as driver]
+            [metabase.driver.sync :as sync]
             [metabase.util :as u]))
 
-;; TODO - These queries have to be evaluated by the query processor and macroexpanded at runtime every time they're ran.
-;; It would be more efficient if we could let the QP could macroexpand normally for predefined queries like these
-
 (defn- field-query [field query]
-  (->> (driver/process-query
-        {:type :query
-         :database ((u/deref-> field :table :db) :id)
-         :query (assoc query
-                       :source_table ((u/deref-> field :table) :id))})
-       :data
-       :rows))
+  (-> (driver/process-query
+       {:type     :query
+        :database ((u/deref-> field :table :db) :id)
+        :query    (assoc query
+                         :source_table ((u/deref-> field :table) :id))})
+      :data
+      :rows))
 
 (defn field-distinct-values
-  "Return the distinct values of FIELD."
+  "Return the distinct values of FIELD.
+   This is used to create a `FieldValues` object for `:category` Fields."
   [{field-id :id :as field}]
-  (->> (field-query field {:aggregation ["rows"]  ; should we add a limit here? In case someone is dumb and tries to get millions of distinct values?
-                           :breakout [field-id]}) ; or should we let them do it
-       (map first)))
+  (mapv first (field-query field {:breakout [field-id]
+                                  :limit    sync/low-cardinality-threshold})))
 
 (defn field-distinct-count
   "Return the distinct count of FIELD."
   [{field-id :id :as field}]
-  (->> (field-query field {:aggregation ["distinct" field-id]})
-       first
-       first))
+  (-> (field-query field {:aggregation ["distinct" field-id]})
+      first
+      first))
 
 (defn field-count
   "Return the count of FIELD."
   [{field-id :id :as field}]
-  (->> (field-query field {:aggregation ["count" field-id]})
-       first
-       first))
+  (-> (field-query field {:aggregation ["count" field-id]})
+      first
+      first))
diff --git a/src/metabase/driver/mongo/query_processor.clj b/src/metabase/driver/mongo/query_processor.clj
index b4a12d2a557..7f684cd8ccb 100644
--- a/src/metabase/driver/mongo/query_processor.clj
+++ b/src/metabase/driver/mongo/query_processor.clj
@@ -285,7 +285,7 @@
    :distinct "count"
    :sum      "sum"})
 
-(defn- ag-type->group-by-clause [{:keys [aggregation-type field]}]
+(defn- aggregation->rvalue [{:keys [aggregation-type field]}]
   (if-not field
     (case aggregation-type
       :count {$sum 1})
@@ -298,24 +298,32 @@
       :sum      {$sum (->rvalue field)})))
 
 (defn- handle-breakout+aggregation [{breakout-fields :breakout, {ag-type :aggregation-type, :as aggregation} :aggregation} pipeline]
-  (when (and ag-type
-             (not= ag-type :rows))
-    (let [ag-field-name (ag-type->field-name ag-type)]
-      (filter identity
-              [(when (seq breakout-fields)
-                 {$project {"_id"      "$_id"
-                            "___group" (into {} (for [field breakout-fields] ; create a totally sweet made-up column called __group
-                                                  {(->lvalue field) (->rvalue field)}))}}) ; to store the fields we'd like to group by
-               {$group {"_id"         (when (seq breakout-fields)
-                                        "$___group")
-                        ag-field-name (ag-type->group-by-clause aggregation)}}
-               {$sort {"_id" 1}}
-               {$project (merge {"_id"         false
-                                 ag-field-name (if (= ag-type :distinct)
-                                                 {$size "$count"} ; HACK
-                                                 true)}
-                                (into {} (for [field breakout-fields]
-                                           {(->lvalue field) (format "$_id.%s" (->lvalue field))})))}]))))
+  (let [aggregation? (and ag-type
+                          (not= ag-type :rows))
+        breakout?    (seq breakout-fields)]
+    (when (or aggregation? breakout?)
+      (let [ag-field-name (ag-type->field-name ag-type)]
+        (filter identity
+                [ ;; create a totally sweet made-up column called __group to store the fields we'd like to group by
+                 (when breakout?
+                   {$project {"_id"      "$_id"
+                              "___group" (into {} (for [field breakout-fields]
+                                                    {(->lvalue field) (->rvalue field)}))}})
+                 ;; Now project onto the __group and the aggregation rvalue
+                 {$group (merge {"_id" (when breakout?
+                                         "$___group")}
+                                (when aggregation
+                                  {ag-field-name (aggregation->rvalue aggregation)}))}
+                 ;; Sort by _id (___group)
+                 {$sort {"_id" 1}}
+                 ;; now project back to the fields we expect
+                 {$project (merge {"_id" false}
+                                  (when aggregation?
+                                    {ag-field-name (if (= ag-type :distinct)
+                                                     {$size "$count"} ; HACK
+                                                     true)})
+                                  (into {} (for [field breakout-fields]
+                                             {(->lvalue field) (format "$_id.%s" (->lvalue field))})))}])))))
 
 
 ;;; ### order-by
@@ -412,6 +420,6 @@
       (log-monger-form generated-pipeline)
       (->> (with-mongo-connection [_ database]
              (mc/aggregate *mongo-connection* source-table-name generated-pipeline
-                           :allow-disk-use (not (config/is-test?))))
+                           :allow-disk-use true))
            unescape-names
            unstringify-dates))))
diff --git a/src/metabase/driver/query_processor/annotate.clj b/src/metabase/driver/query_processor/annotate.clj
index 775dfb011d3..b368c248694 100644
--- a/src/metabase/driver/query_processor/annotate.clj
+++ b/src/metabase/driver/query_processor/annotate.clj
@@ -113,7 +113,6 @@
   (let [expected-keys (set (map :field-name fields))
         _             (assert (every? keyword? expected-keys))
         missing-keys  (set/difference actual-keys expected-keys)]
-
     (when (seq missing-keys)
       (log/warn (u/format-color 'yellow "There are fields we weren't expecting in the results: %s\nExpected: %s\nActual: %s"
                   missing-keys expected-keys actual-keys)))
@@ -206,7 +205,7 @@
                                            :destination_id [not= nil]))))
   ;; Fetch the destination Fields referenced by the ForeignKeys
   ([fields fk-ids id->dest-id]
-   (when (seq (vals id->dest-id))
+   (when (seq id->dest-id)
      (fk-field->dest-fn fields fk-ids id->dest-id (sel :many :id->fields [Field :id :name :display_name :table_id :description :base_type :special_type :preview_display]
                                                        :id [in (vals id->dest-id)]))))
   ;; Return a function that will return the corresponding destination Field for a given Field
diff --git a/src/metabase/driver/sync.clj b/src/metabase/driver/sync.clj
index e2144f8bdad..6ae37cc0783 100644
--- a/src/metabase/driver/sync.clj
+++ b/src/metabase/driver/sync.clj
@@ -357,31 +357,22 @@
 
 ;; ## sync-field
 
-(defmacro ^:private sync-field->>
-  "Like `->>`, but wrap each form with `try-apply`, and pass FIELD along to the next if the previous form returned `nil`."
-  [field & fns]
-  `(->> ~field
-        ~@(->> fns
-               (map (fn [f]
-                      (let [[f & args] (if (list? f) f [f])]
-                        `((fn [field#]
-                             (or (u/try-apply ~f ~@args field#)
-                                 field#)))))))))
-
 (defn- sync-field!
   "Sync the metadata for FIELD, marking urls, categories, etc. when applicable."
   [driver field]
-  {:pre [driver
-         field]}
-  (sync-field->> field
-                 (maybe-driver-specific-sync-field! driver)
-                 set-field-display-name-if-needed!
-                 (mark-url-field! driver)
-                 (mark-no-preview-display-field! driver)
-                 mark-category-field-or-update-field-values!
-                 (mark-json-field! driver)
-                 auto-assign-field-special-type-by-name!
-                 (sync-field-nested-fields! driver)))
+  {:pre [driver field]}
+  (loop [field field, [f & more] [(partial maybe-driver-specific-sync-field! driver)
+                                  set-field-display-name-if-needed!
+                                  (partial mark-url-field! driver)
+                                  (partial mark-no-preview-display-field! driver)
+                                  mark-category-field-or-update-field-values!
+                                  (partial mark-json-field! driver)
+                                  auto-assign-field-special-type-by-name!
+                                  (partial sync-field-nested-fields! driver)]]
+    (let [field (or (u/try-apply f field)
+                    field)]
+      (when (seq more)
+        (recur field more)))))
 
 
 ;; Each field-syncing function below should return FIELD with any updates that we made, or nil.
@@ -453,7 +444,7 @@
 
 ;; ### mark-category-field-or-update-field-values!
 
-(def ^:const ^:private low-cardinality-threshold
+(def ^:const low-cardinality-threshold
   "Fields with less than this many distinct values should automatically be marked with `special_type = :category`."
   40)
 
diff --git a/src/metabase/util.clj b/src/metabase/util.clj
index 1545b32d866..9b3a6756f35 100644
--- a/src/metabase/util.clj
+++ b/src/metabase/util.clj
@@ -166,7 +166,7 @@
        :hour    (trunc-with-format "yyyy-MM-dd'T'HH:00:00")
        :day     (trunc-with-format "yyyy-MM-dd")
        :week    (let [day-of-week (date-extract :day-of-week date)
-                      date        (relative-date :day (- day-of-week) date)]
+                      date        (relative-date :day (- (dec day-of-week)) date)]
                   (trunc-with-format "yyyy-MM-dd" date))
        :month   (trunc-with-format "yyyy-MM")
        :quarter (let [year    (date-extract :year date)
@@ -426,9 +426,20 @@
                                  (pprint-to-str (filtered-stacktrace e))))))))))
 
 (defn try-apply
-  "Like `apply`, but wraps F inside a `try-catch` block and logs exceptions caught."
+  "Like `apply`, but wraps F inside a `try-catch` block and logs exceptions caught.
+   (This is actaully more flexible than `apply` -- the last argument doesn't have to be
+   a sequence:
+
+     (try-apply vector :a :b [:c :d]) -> [:a :b :c :d]
+     (apply vector :a :b [:c :d])     -> [:a :b :c :d]
+     (try-apply vector :a :b :c :d)   -> [:a :b :c :d]
+     (apply vector :a :b :c :d)       -> Not ok - :d is not a sequence
+
+   This allows us to use `try-apply` in more situations than we'd otherwise be able to."
   [^clojure.lang.IFn f & args]
-  (apply (wrap-try-catch f) args))
+  (apply (wrap-try-catch f) (concat (butlast args) (if (sequential? (last args))
+                                                     (last args)
+                                                     [(last args)]))))
 
 (defn wrap-try-catch!
   "Re-intern FN-SYMB as a new fn that wraps the original with a `try-catch`. Intended for debugging.
diff --git a/test/metabase/driver/query_processor_test.clj b/test/metabase/driver/query_processor_test.clj
index 19718a5fa7c..b4bd00560d9 100644
--- a/test/metabase/driver/query_processor_test.clj
+++ b/test/metabase/driver/query_processor_test.clj
@@ -430,10 +430,20 @@
      breakout user_id
      order user_id+))
 
+;; ### BREAKOUT w/o AGGREGATION
+;; This should act as a "distinct values" query and return ordered results
+(qp-expect-with-all-datasets
+    {:cols    [(checkins-col :user_id)]
+     :columns [(format-name "user_id")]
+     :rows    [[1] [2] [3] [4] [5] [6] [7] [8] [9] [10]]}
+  (Q breakout user_id of checkins
+     limit 10))
+
+
 ;; ### "BREAKOUT" - MULTIPLE COLUMNS W/ IMPLICT "ORDER_BY"
 ;; Fields should be implicitly ordered :ASC for all the fields in `breakout` that are not specified in `order_by`
 (qp-expect-with-all-datasets
-    {:rows    [[1 1 1] [1 5 1] [1 7 1] [1 10 1] [1 13 1] [1 16 1] [1 26 1] [1 31 1] [1 35 1] [1 36 1]],
+    {:rows    [[1 1 1] [1 5 1] [1 7 1] [1 10 1] [1 13 1] [1 16 1] [1 26 1] [1 31 1] [1 35 1] [1 36 1]]
      :columns [(format-name "user_id")
                (format-name "venue_id")
                "count"]
@@ -821,8 +831,7 @@
    [897 "Wearing a Biggie Shirt"]
    [499 "In the Expa Office"]]
   (Q dataset tupac-sightings
-     return rows
-     of sightings
+     return rows of sightings
      fields id category_id->categories.name
      order timestamp-
      limit 10))
diff --git a/test/metabase/models/field_test.clj b/test/metabase/models/field_test.clj
index 7ead0d66632..05bb383ab55 100644
--- a/test/metabase/models/field_test.clj
+++ b/test/metabase/models/field_test.clj
@@ -8,7 +8,7 @@
 ;; Check that setting a Field's special_type to :category will cause a corresponding FieldValues to be created asynchronously
 (expect
     [nil
-     75
+     40
      :done]
   (let [orig-special-type      (sel :one :field [Field :special_type] :id (id :categories :name))
         set-field-special-type (fn [special-type]
-- 
GitLab