From e26723ec222465d3879d733fa07d356f560bd73e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cam=20Sa=C3=BCl?= <cammsaul@gmail.com>
Date: Tue, 26 Jan 2016 15:10:29 -0800
Subject: [PATCH] Run Druid unit tests on CI :persevere:

---
 circle.yml                                    |   4 +-
 src/metabase/driver/druid/query_processor.clj |   2 +-
 src/metabase/models/database.clj              |   6 +-
 test/metabase/api/database_test.clj           |  76 +++++-----
 test/metabase/test/data/druid.clj             | 137 +++++++++++-------
 5 files changed, 128 insertions(+), 97 deletions(-)

diff --git a/circle.yml b/circle.yml
index 40f37811b53..8302f70e66f 100644
--- a/circle.yml
+++ b/circle.yml
@@ -25,11 +25,11 @@ test:
     # 0) runs unit tests w/ H2 local DB. Runs against H2, Mongo, MySQL
     # 1) runs unit tests w/ Postgres local DB. Runs against H2, SQL Server
     # 2) runs unit tests w/ MySQL local DB. Runs against H2, Postgres, SQLite
-    # 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift
+    # 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift, Druid
     # 4) runs Eastwood linter & Bikeshed linter && ./bin/reflection-linter
     # 5) runs JS linter + JS test
     # 6) runs lein uberjar. (We don't run bin/build because we're not really concerned about `npm install` (etc) in this test, which runs elsewhere)
-    - case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac:
+    - case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift,druid lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac:
         parallel: true
 deployment:
   master:
diff --git a/src/metabase/driver/druid/query_processor.clj b/src/metabase/driver/druid/query_processor.clj
index c4610f23ba7..6b4ed1f4175 100644
--- a/src/metabase/driver/druid/query_processor.clj
+++ b/src/metabase/driver/druid/query_processor.clj
@@ -61,7 +61,7 @@
 (def ^:private ^:const query-type->default-query
   (let [defaults {:intervals   ["-5000/5000"]
                   :granularity :all
-                  :context     {:timeout 5000}}]
+                  :context     {:timeout 60000}}]
     {::select     (merge defaults {:queryType  :select
                                    :pagingSpec {:threshold 100 #_qp/absolute-max-results}})
      ::timeseries (merge defaults {:queryType :timeseries})
diff --git a/src/metabase/models/database.clj b/src/metabase/models/database.clj
index 5f5380e5e5c..3b8fcd083f7 100644
--- a/src/metabase/models/database.clj
+++ b/src/metabase/models/database.clj
@@ -14,9 +14,9 @@
 
 (defn- post-select [{:keys [engine] :as database}]
   (if-not engine database
-                 (assoc database :features (if-let [driver ((resolve 'metabase.driver/engine->driver) engine)]
-                                             ((resolve 'metabase.driver/features) driver)
-                                             []))))
+          (assoc database :features (or (when-let [driver ((resolve 'metabase.driver/engine->driver) engine)]
+                                          (seq ((resolve 'metabase.driver/features) driver)))
+                                        []))))
 
 (defn- pre-cascade-delete [{:keys [id]}]
   (cascade-delete 'Card  :database_id id)
diff --git a/test/metabase/api/database_test.clj b/test/metabase/api/database_test.clj
index 9d47c7a66cf..d6f86e666f1 100644
--- a/test/metabase/api/database_test.clj
+++ b/test/metabase/api/database_test.clj
@@ -40,7 +40,7 @@
       :is_full_sync    true
       :organization_id nil
       :description     nil
-      :features        (mapv name (metabase.driver/features (metabase.driver/engine->driver (:engine db))))})))
+      :features        (mapv name (driver/features (driver/engine->driver (:engine db))))})))
 
 
 ;; # DB LIFECYCLE ENDPOINTS
@@ -71,7 +71,7 @@
          :is_full_sync    false
          :organization_id nil
          :description     nil
-         :features        (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))})
+         :features        (mapv name (driver/features (driver/engine->driver :postgres)))})
     (create-db db-name false)))
 
 ;; ## DELETE /api/database/:id
@@ -113,41 +113,41 @@
 ;; Database details *should not* come back for Rasta since she's not a superuser
 (let [db-name (str "A" (random-name))] ; make sure this name comes before "test-data"
   (expect-eval-actual-first
-      (set (filter identity
-                   (conj (for [engine datasets/all-valid-engines]
-                           (datasets/when-testing-engine engine
-                             (match-$ (get-or-create-test-data-db! (driver/engine->driver engine))
-                               {:created_at      $
-                                :engine          (name $engine)
-                                :id              $
-                                :updated_at      $
-                                :name            "test-data"
-                                :is_sample       false
-                                :is_full_sync    true
-                                :organization_id nil
-                                :description     nil
-                                :features        (mapv name (metabase.driver/features (metabase.driver/engine->driver engine)))})))
-                         (match-$ (sel :one Database :name db-name)
-                           {:created_at      $
-                            :engine          "postgres"
-                            :id              $
-                            :updated_at      $
-                            :name            $
-                            :is_sample       false
-                            :is_full_sync    true
-                            :organization_id nil
-                            :description     nil
-                            :features        (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))}))))
-    (do
-      ;; Delete all the randomly created Databases we've made so far
-      (cascade-delete Database :id [not-in (set (filter identity
-                                                        (for [engine datasets/all-valid-engines]
-                                                          (datasets/when-testing-engine engine
-                                                            (:id (get-or-create-test-data-db! (driver/engine->driver engine)))))))])
-      ;; Add an extra DB so we have something to fetch besides the Test DB
-      (create-db db-name)
-      ;; Now hit the endpoint
-      (set ((user->client :rasta) :get 200 "database")))))
+      (set (filter identity (conj (for [engine datasets/all-valid-engines]
+                                    (datasets/when-testing-engine engine
+                                      (match-$ (get-or-create-test-data-db! (driver/engine->driver engine))
+                                        {:created_at      $
+                                         :engine          (name $engine)
+                                         :id              $
+                                         :updated_at      $
+                                         :name            "test-data"
+                                         :is_sample       false
+                                         :is_full_sync    true
+                                         :organization_id nil
+                                         :description     nil
+                                         :features        (mapv name (driver/features (driver/engine->driver engine)))})))
+                                  ;; (?) I don't remember why we have to do this for postgres but not any other of the bonus drivers
+                                  (match-$ (sel :one Database :name db-name)
+                                    {:created_at      $
+                                     :engine          "postgres"
+                                     :id              $
+                                     :updated_at      $
+                                     :name            $
+                                     :is_sample       false
+                                     :is_full_sync    true
+                                     :organization_id nil
+                                     :description     nil
+                                     :features        (mapv name (driver/features (driver/engine->driver :postgres)))}))))
+      (do
+        ;; Delete all the randomly created Databases we've made so far
+        (cascade-delete Database :id [not-in (set (filter identity
+                                                          (for [engine datasets/all-valid-engines]
+                                                            (datasets/when-testing-engine engine
+                                                              (:id (get-or-create-test-data-db! (driver/engine->driver engine)))))))])
+        ;; Add an extra DB so we have something to fetch besides the Test DB
+        (create-db db-name)
+        ;; Now hit the endpoint
+        (set ((user->client :rasta) :get 200 "database")))))
 
 
 ;; ## GET /api/meta/table/:id/query_metadata
@@ -163,7 +163,7 @@
        :is_full_sync    true
        :organization_id nil
        :description     nil
-       :features        (mapv name (metabase.driver/features (metabase.driver/engine->driver :h2)))
+       :features        (mapv name (driver/features (driver/engine->driver :h2)))
        :tables [(match-$ (Table (id :categories))
                   {:description     nil
                    :entity_type     nil
diff --git a/test/metabase/test/data/druid.clj b/test/metabase/test/data/druid.clj
index 96527912555..0d2b1766940 100644
--- a/test/metabase/test/data/druid.clj
+++ b/test/metabase/test/data/druid.clj
@@ -1,19 +1,74 @@
 (ns metabase.test.data.druid
-  (:require [cheshire.core :as json]
-            [clojure.java.io :as io]
+  (:require [clojure.java.io :as io]
+            [cheshire.core :as json]
+            [environ.core :refer [env]]
             [metabase.driver.druid :as druid]
-            (metabase.test.data dataset-definitions
+            (metabase.test.data [dataset-definitions :as defs]
                                 [datasets :as datasets]
                                 [interface :as i])
             [metabase.test.util :refer [resolve-private-fns]]
             [metabase.util :as u])
   (:import metabase.driver.druid.DruidDriver))
 
-(def ^:private ^:const temp-dir (System/getProperty "java.io.tmpdir"))
-(def ^:private ^:const source-filename "checkins.json")
+(defn- database->connection-details [& _]
+  {:host (or (env :mb-druid-host)
+             (throw (Exception. "In order to test Druid, you must specify `MB_DRUID_HOST`.")))
+   :port (Integer/parseInt (or (env :mb-druid-port)
+                               (throw (Exception. "In order to test Druid, you must specify `MB_DRUID_PORT`."))))})
+
+(extend DruidDriver
+  i/IDatasetLoader
+  (merge i/IDatasetLoaderDefaultsMixin
+         {:engine                       (constantly :druid)
+          :database->connection-details database->connection-details
+          :create-db!                   (constantly "nil")
+          :destroy-db!                  (constantly nil)}))
+
+
+
+;;; Setting Up a Server w/ Druid Test Data
+
+;; Unfortunately the process of loading test data onto an external server for CI purposes is a little involved. Before testing against Druid, you'll need to perform the following steps:
+;; For EC2 instances, make sure to expose ports `8082` & `8090` for Druid while loading data. Once data has finished loading, you only need to expose port `8082`.
+;;
+;; 1.  Setup Zookeeper
+;;     1A.  Download & extract Zookeeper from `http://zookeeper.apache.org/releases.html#download`
+;;     1B.  Create `zookeeper/conf/zoo.cfg` -- see the Getting Started Guide: `http://zookeeper.apache.org/doc/r3.4.6/zookeeperStarted.html`
+;;     1C.  `zookeeper/bin/zkServer.sh start`
+;;     1D.  `zookeeper/bin/zkServer.sh status` (to make sure it started correctly)
+;; 2.  Setup Druid
+;;     2A.  Download & extract Druid from `http://druid.io/downloads.html`
+;;     2B.  `cp druid/run_druid_server.sh druid/run_historical.sh` and bump the `-Xmx` setting to `6g` or so
+;;     2C.  `cd druid && ./run_druid_server.sh coordinator`
+;;     2D.  `cd druid && ./run_druid_server.sh broker`
+;;     2E.  `cd druid && ./run_historical.sh historical`
+;;     2F.  `cd druid && ./run_druid_server.sh overlord`
+;; 3.  Generate flattened test data file. Optionally pick a <filename>
+;;     3A.  From this namespace in the REPL, run `(generate-json-for-batch-ingestion <filename>)`
+;;     3B.  `scp` or otherwise upload this file to the box running druid (if applicable)
+;; 4.  Launch Druid Indexing Task
+;;     4A.  Run the indexing task on the remote instance.
+;;
+;;            (run-indexing-task <remote-host> :base-dir <dir-where-you-uploaded-file>, :filename <file>)
+;;            e.g.
+;;            (run-indexing-task "http://ec2-52-90-109-199.compute-1.amazonaws.com", :base-dir "/home/ec2-user", :filename "checkins.json")
+;;
+;;          The task will keep you apprised of its progress until it completes (takes 1-2 minutes)
+;;     4B.  Keep an eye on `<host>:8082/druid/v2/datasources`. (e.g. "http://ec2-52-90-109-199.compute-1.amazonaws.com:8082/druid/v2/datasources")
+;;          This endpoint will return an empty array until the broker knows about the newly ingested segments. When it returns an array with the string `"checkins"` you're ready to
+;;          run the tests.
+;;     4C.  Kill the `overlord` process once the data has finished loading.
+;; 5.  Running Tests
+;;     5A.  You can run tests like `ENGINES=druid MB_DRUID_PORT=8082 MB_DRUID_HOST=http://ec2-52-90-109-199.compute-1.amazonaws.com lein test`
+
+(def ^:private ^:const default-filename "Default filename for batched ingestion data file."
+  "checkins.json")
+
+
+;;; Generating Data File
 
 (defn- flattened-test-data []
-  (let [dbdef    (i/flatten-dbdef metabase.test.data.dataset-definitions/test-data "checkins")
+  (let [dbdef    (i/flatten-dbdef defs/test-data "checkins")
         tabledef (first (:table-definitions dbdef))]
     (->> (:rows tabledef)
          (map (partial zipmap (map :field-name (:field-definitions tabledef))))
@@ -29,7 +84,21 @@
         (json/generate-stream row writer)
         (.append writer \newline)))))
 
-(def ^:private ^:const indexing-task
+(defn- generate-json-for-batch-ingestion
+  "Generate the file to be used for a batched data ingestion for Druid."
+  ([]
+   (generate-json-for-batch-ingestion default-filename))
+  ([filename]
+   (write-dbdef-to-json (flattened-test-data) filename)))
+
+
+;;; Running Indexing Task
+
+(defn- indexing-task
+  "Create a batched ingestion task dictionary."
+  [{:keys [base-dir filename]
+    :or   {base-dir "/home/ec2-user"
+           filename default-filename}}]
   {:type :index
    :spec {:dataSchema {:dataSource      "checkins"
                        :parser          {:type      :string
@@ -53,18 +122,20 @@
                                          :intervals          ["2000/2016"]}}
           :ioConfig   {:type     :index
                        :firehose {:type    :local
-                                  :baseDir temp-dir
-                                  :filter  source-filename}}}})
+                                  :baseDir base-dir
+                                  :filter  filename}}}})
 
-(def ^:private ^:const indexer-endpoint "http://localhost:8090/druid/indexer/v1/task")
 (def ^:private ^:const indexer-timeout-seconds
   "Maximum number of seconds we should wait for the indexing task to finish before deciding it's failed."
-  120)
+  180)
 
 (resolve-private-fns metabase.driver.druid GET POST)
 
-(defn- run-indexing-task []
-  (let [{:keys [task]} (POST indexer-endpoint, :body indexing-task)
+(defn- run-indexing-task
+  "Run a batched ingestion task on HOST."
+  [host & {:as indexing-task-args}]
+  (let [indexer-endpoint (str host ":8090/druid/indexer/v1/task")
+        {:keys [task]} (POST indexer-endpoint, :body (indexing-task indexing-task-args))
         status-url     (str indexer-endpoint "/" task "/status")]
     (println "STATUS URL: " (str indexer-endpoint "/" task "/status"))
     (loop [remaining-seconds indexer-timeout-seconds]
@@ -77,43 +148,3 @@
             (throw (Exception. (str "Indexing task failed:\n" (u/pprint-to-str status)))))
           (Thread/sleep 1000)
           (recur (dec remaining-seconds)))))))
-
-(defn- setup-druid-test-data* []
-  (println (u/format-color 'blue "Loading druid test data..."))
-  (write-dbdef-to-json (flattened-test-data) (str temp-dir "/" source-filename))
-  (run-indexing-task))
-
-#_(defn- setup-druid-test-data
-  {:expectations-options :before-run}
-  []
-  (datasets/when-testing-engine :druid
-    (setup-druid-test-data*)))
-
-;; TODO - needs to wait until http://localhost:8082/druid/v2/datasources/checkins?interval=-5000/5000 returns data
-#_{:dimensions [:venue_name
-              :venue_category_name
-              :user_password
-              :venue_longitude
-              :user_name
-              :id
-              :venue_latitude
-              :user_last_login
-              :venue_price]
- :metrics [:count]}
-
-
-(defn- database->connection-details [this context dbdef]
-  {:host "http://localhost"
-   :port 8082})
-
-(extend DruidDriver
-  i/IDatasetLoader
-  (merge i/IDatasetLoaderDefaultsMixin
-         {:engine                       (constantly :druid)
-          :database->connection-details database->connection-details
-          :create-db!                   (constantly "nil")
-          :destroy-db!                  (constantly nil)}))
-
-
-;; TODO - don't log druid query during sync
-;; TODO - make `:paging` a feature?
-- 
GitLab