Commit e26723ec authored by Cam Saül

Run Druid unit tests on CI :persevere:

parent d913b59a
@@ -25,11 +25,11 @@ test:
# 0) runs unit tests w/ H2 local DB. Runs against H2, Mongo, MySQL
# 1) runs unit tests w/ Postgres local DB. Runs against H2, SQL Server
# 2) runs unit tests w/ MySQL local DB. Runs against H2, Postgres, SQLite
# 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift
# 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift, Druid
# 4) runs Eastwood linter & Bikeshed linter && ./bin/reflection-linter
# 5) runs JS linter + JS test
# 6) runs lein uberjar. (We don't run bin/build because we're not really concerned about `npm install` (etc) in this test, which runs elsewhere)
- case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac:
- case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift,druid lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac:
parallel: true
deployment:
master:
......
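The `case` statement above fans the test job out across CircleCI containers keyed on `$CIRCLE_NODE_INDEX`; this commit adds `druid` to node 3's ENGINES. A rough Clojure sketch of the same dispatch table (hypothetical names; the linter/JS/uberjar nodes 4-6 are omitted):

(def ^:private node-index->test-engines
  "Hypothetical sketch of the fan-out in the circle.yml `case` above:
  CircleCI node index -> ENGINES value passed to `lein test`."
  {0 "h2,mongo,mysql"       ; app DB: H2
   1 "h2,sqlserver"         ; app DB: Postgres
   2 "h2,postgres,sqlite"   ; app DB: MySQL
   3 "h2,redshift,druid"})  ; app DB: H2 -- druid added by this commit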
@@ -61,7 +61,7 @@
(def ^:private ^:const query-type->default-query
(let [defaults {:intervals ["-5000/5000"]
:granularity :all
:context {:timeout 5000}}]
:context {:timeout 60000}}]
{::select (merge defaults {:queryType :select
:pagingSpec {:threshold 100 #_qp/absolute-max-results}})
::timeseries (merge defaults {:queryType :timeseries})
......
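;; The timeout bump above (5000 ms -> 60000 ms) lives in the shared `defaults`
;; map, so every Druid query type inherits it via `merge`. A minimal REPL
;; sketch of how the ::select entry composes (names from the hunk above):
(let [defaults {:intervals   ["-5000/5000"]
                :granularity :all
                :context     {:timeout 60000}}]
  (merge defaults {:queryType  :select
                   :pagingSpec {:threshold 100}}))
;; => {:intervals ["-5000/5000"], :granularity :all,
;;     :context {:timeout 60000}, :queryType :select, :pagingSpec {:threshold 100}}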
@@ -14,9 +14,9 @@
(defn- post-select [{:keys [engine] :as database}]
(if-not engine database
(assoc database :features (if-let [driver ((resolve 'metabase.driver/engine->driver) engine)]
((resolve 'metabase.driver/features) driver)
[]))))
(assoc database :features (or (when-let [driver ((resolve 'metabase.driver/engine->driver) engine)]
(seq ((resolve 'metabase.driver/features) driver)))
[]))))
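;; The rewrite above guards two cases: the engine may not resolve to a driver,
;; and a resolved driver may report no features. `seq` returns nil for an empty
;; collection, so both cases fall through `or` to the [] default. A minimal
;; sketch of the idiom (feature lists here are hypothetical):
(defn- features-or-empty [driver-features]
  (or (seq driver-features) []))

(features-or-empty [:foreign-keys :set-timezone]) ;; => (:foreign-keys :set-timezone)
(features-or-empty [])                            ;; => []
(features-or-empty nil)                           ;; => []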
(defn- pre-cascade-delete [{:keys [id]}]
(cascade-delete 'Card :database_id id)
......
@@ -40,7 +40,7 @@
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (metabase.driver/features (metabase.driver/engine->driver (:engine db))))})))
:features (mapv name (driver/features (driver/engine->driver (:engine db))))})))
;; # DB LIFECYCLE ENDPOINTS
@@ -71,7 +71,7 @@
:is_full_sync false
:organization_id nil
:description nil
:features (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))})
:features (mapv name (driver/features (driver/engine->driver :postgres)))})
(create-db db-name false)))
;; ## DELETE /api/database/:id
@@ -113,41 +113,41 @@
;; Database details *should not* come back for Rasta since she's not a superuser
(let [db-name (str "A" (random-name))] ; make sure this name comes before "test-data"
(expect-eval-actual-first
(set (filter identity
(conj (for [engine datasets/all-valid-engines]
(datasets/when-testing-engine engine
(match-$ (get-or-create-test-data-db! (driver/engine->driver engine))
{:created_at $
:engine (name $engine)
:id $
:updated_at $
:name "test-data"
:is_sample false
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (metabase.driver/features (metabase.driver/engine->driver engine)))})))
(match-$ (sel :one Database :name db-name)
{:created_at $
:engine "postgres"
:id $
:updated_at $
:name $
:is_sample false
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))}))))
(do
;; Delete all the randomly created Databases we've made so far
(cascade-delete Database :id [not-in (set (filter identity
(for [engine datasets/all-valid-engines]
(datasets/when-testing-engine engine
(:id (get-or-create-test-data-db! (driver/engine->driver engine)))))))])
;; Add an extra DB so we have something to fetch besides the Test DB
(create-db db-name)
;; Now hit the endpoint
(set ((user->client :rasta) :get 200 "database")))))
(set (filter identity (conj (for [engine datasets/all-valid-engines]
(datasets/when-testing-engine engine
(match-$ (get-or-create-test-data-db! (driver/engine->driver engine))
{:created_at $
:engine (name $engine)
:id $
:updated_at $
:name "test-data"
:is_sample false
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (driver/features (driver/engine->driver engine)))})))
;; (?) I don't remember why we have to do this for postgres but not for any of the other bonus drivers
(match-$ (sel :one Database :name db-name)
{:created_at $
:engine "postgres"
:id $
:updated_at $
:name $
:is_sample false
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (driver/features (driver/engine->driver :postgres)))}))))
(do
;; Delete all the randomly created Databases we've made so far
(cascade-delete Database :id [not-in (set (filter identity
(for [engine datasets/all-valid-engines]
(datasets/when-testing-engine engine
(:id (get-or-create-test-data-db! (driver/engine->driver engine)))))))])
;; Add an extra DB so we have something to fetch besides the Test DB
(create-db db-name)
;; Now hit the endpoint
(set ((user->client :rasta) :get 200 "database")))))
;; ## GET /api/meta/table/:id/query_metadata
@@ -163,7 +163,7 @@
:is_full_sync true
:organization_id nil
:description nil
:features (mapv name (metabase.driver/features (metabase.driver/engine->driver :h2)))
:features (mapv name (driver/features (driver/engine->driver :h2)))
:tables [(match-$ (Table (id :categories))
{:description nil
:entity_type nil
......
(ns metabase.test.data.druid
(:require [cheshire.core :as json]
[clojure.java.io :as io]
(:require [clojure.java.io :as io]
[cheshire.core :as json]
[environ.core :refer [env]]
[metabase.driver.druid :as druid]
(metabase.test.data dataset-definitions
(metabase.test.data [dataset-definitions :as defs]
[datasets :as datasets]
[interface :as i])
[metabase.test.util :refer [resolve-private-fns]]
[metabase.util :as u])
(:import metabase.driver.druid.DruidDriver))
(def ^:private ^:const temp-dir (System/getProperty "java.io.tmpdir"))
(def ^:private ^:const source-filename "checkins.json")
(defn- database->connection-details [& _]
{:host (or (env :mb-druid-host)
(throw (Exception. "In order to test Druid, you must specify `MB_DRUID_HOST`.")))
:port (Integer/parseInt (or (env :mb-druid-port)
(throw (Exception. "In order to test Druid, you must specify `MB_DRUID_PORT`."))))})
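;; With the two variables exported (environ maps MB_DRUID_HOST -> :mb-druid-host),
;; the fn above yields a details map; e.g., assuming the local host/port used
;; elsewhere in this file:
;;
;;   MB_DRUID_HOST=http://localhost MB_DRUID_PORT=8082
;;   (database->connection-details)
;;   ;; => {:host "http://localhost", :port 8082}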
(extend DruidDriver
i/IDatasetLoader
(merge i/IDatasetLoaderDefaultsMixin
{:engine (constantly :druid)
:database->connection-details database->connection-details
:create-db! (constantly nil)
:destroy-db! (constantly nil)}))
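;; `extend` takes plain maps of keyword -> fn, which is what makes the
;; defaults-mixin pattern above work: shared implementations get merged with
;; driver-specific overrides. A self-contained sketch of the pattern
;; (Greeter & friends are hypothetical, not Metabase code):
(defprotocol Greeter
  (greet    [this])
  (farewell [this]))

(def greeter-defaults-mixin
  {:farewell (constantly "bye")})

(defrecord TerseGreeter [])

(extend TerseGreeter
  Greeter
  (merge greeter-defaults-mixin
         {:greet (constantly "hi")}))

(greet (->TerseGreeter))    ;; => "hi"
(farewell (->TerseGreeter)) ;; => "bye"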
;;; Setting Up a Server w/ Druid Test Data
;; Unfortunately, the process of loading test data onto an external server for CI purposes is a little involved. Before testing against Druid, you'll need to perform the following steps:
;; (For EC2 instances, make sure to expose ports `8082` & `8090` while loading data. Once the data has finished loading, only port `8082` needs to stay open.)
;;
;; 1. Setup Zookeeper
;; 1A. Download & extract Zookeeper from `http://zookeeper.apache.org/releases.html#download`
;; 1B. Create `zookeeper/conf/zoo.cfg` -- see the Getting Started Guide: `http://zookeeper.apache.org/doc/r3.4.6/zookeeperStarted.html`
;; 1C. `zookeeper/bin/zkServer.sh start`
;; 1D. `zookeeper/bin/zkServer.sh status` (to make sure it started correctly)
;; 2. Setup Druid
;; 2A. Download & extract Druid from `http://druid.io/downloads.html`
;; 2B. `cp druid/run_druid_server.sh druid/run_historical.sh` and bump the `-Xmx` setting to `6g` or so
;; 2C. `cd druid && ./run_druid_server.sh coordinator`
;; 2D. `cd druid && ./run_druid_server.sh broker`
;; 2E. `cd druid && ./run_historical.sh historical`
;; 2F. `cd druid && ./run_druid_server.sh overlord`
;; 3. Generate flattened test data file. Optionally pick a <filename>
;; 3A. From this namespace in the REPL, run `(generate-json-for-batch-ingestion <filename>)`
;; 3B. `scp` or otherwise upload this file to the box running druid (if applicable)
;; 4. Launch Druid Indexing Task
;; 4A. Run the indexing task on the remote instance.
;;
;; (run-indexing-task <remote-host> :base-dir <dir-where-you-uploaded-file>, :filename <file>)
;; e.g.
;; (run-indexing-task "http://ec2-52-90-109-199.compute-1.amazonaws.com", :base-dir "/home/ec2-user", :filename "checkins.json")
;;
;; The task will keep you apprised of its progress until it completes (takes 1-2 minutes).
;; 4B. Keep an eye on `<host>:8082/druid/v2/datasources` (e.g. "http://ec2-52-90-109-199.compute-1.amazonaws.com:8082/druid/v2/datasources"); see the REPL sketch after this list.
;; This endpoint will return an empty array until the broker knows about the newly ingested segments. When it returns an array containing the string `"checkins"`, you're
;; ready to run the tests.
;; 4C. Kill the `overlord` process once the data has finished loading.
;; 5. Running Tests
;; 5A. You can run tests like `ENGINES=druid MB_DRUID_PORT=8082 MB_DRUID_HOST=http://ec2-52-90-109-199.compute-1.amazonaws.com lein test`
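;; One way to poll the step 4B endpoint from the REPL, using nothing but
;; `slurp` (host is the EC2 example above; adjust to your instance):
;;
;;   (slurp "http://ec2-52-90-109-199.compute-1.amazonaws.com:8082/druid/v2/datasources")
;;   ;; => "[]"              ; broker doesn't see the new segments yet
;;   ;; => "[\"checkins\"]"  ; segments loaded -- ready to run the tests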
(def ^:private ^:const default-filename "Default filename for batched ingestion data file."
"checkins.json")
;;; Generating Data File
(defn- flattened-test-data []
(let [dbdef (i/flatten-dbdef metabase.test.data.dataset-definitions/test-data "checkins")
(let [dbdef (i/flatten-dbdef defs/test-data "checkins")
tabledef (first (:table-definitions dbdef))]
(->> (:rows tabledef)
(map (partial zipmap (map :field-name (:field-definitions tabledef))))
@@ -29,7 +84,21 @@
(json/generate-stream row writer)
(.append writer \newline)))))
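;; The truncated hunk above is the tail of write-dbdef-to-json: one JSON object
;; per line, which is the shape Druid's local firehose ingests. A self-contained
;; sketch of the same newline-delimited-JSON pattern (row data is hypothetical):
(require '[cheshire.core :as json]
         '[clojure.java.io :as io])

(defn- write-rows-as-ndjson! [rows filename]
  (with-open [writer (io/writer filename)]
    (doseq [row rows]
      (json/generate-stream row writer)
      (.append writer \newline))))

(write-rows-as-ndjson! [{:id 1, :venue_name "Cam's Coffee Shop"}
                        {:id 2, :venue_name "Old Town Bar"}]
                       "/tmp/checkins-sample.json")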
(def ^:private ^:const indexing-task
(defn- generate-json-for-batch-ingestion
"Generate the file to be used for a batched data ingestion for Druid."
([]
(generate-json-for-batch-ingestion default-filename))
([filename]
(write-dbdef-to-json (flattened-test-data) filename)))
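;; REPL usage (step 3A of the walkthrough above):
;;
;;   (generate-json-for-batch-ingestion)                 ; writes checkins.json
;;   (generate-json-for-batch-ingestion "my-data.json")  ; custom filename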
;;; Running Indexing Task
(defn- indexing-task
"Create a batched ingestion task dictionary."
[{:keys [base-dir filename]
:or {base-dir "/home/ec2-user"
filename default-filename}}]
{:type :index
:spec {:dataSchema {:dataSource "checkins"
:parser {:type :string
@@ -53,18 +122,20 @@
:intervals ["2000/2016"]}}
:ioConfig {:type :index
:firehose {:type :local
:baseDir temp-dir
:filter source-filename}}}})
:baseDir base-dir
:filter filename}}}})
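;; Example invocation of indexing-task above, with the base-dir/filename from
;; the EC2 walkthrough; only the :ioConfig part of the result is shown:
(get-in (indexing-task {:base-dir "/home/ec2-user"
                        :filename "checkins.json"})
        [:spec :ioConfig])
;; => {:type :index, :firehose {:type :local, :baseDir "/home/ec2-user", :filter "checkins.json"}}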
(def ^:private ^:const indexer-endpoint "http://localhost:8090/druid/indexer/v1/task")
(def ^:private ^:const indexer-timeout-seconds
"Maximum number of seconds we should wait for the indexing task to finish before deciding it's failed."
120)
180)
(resolve-private-fns metabase.driver.druid GET POST)
(defn- run-indexing-task []
(let [{:keys [task]} (POST indexer-endpoint, :body indexing-task)
(defn- run-indexing-task
"Run a batched ingestion task on HOST."
[host & {:as indexing-task-args}]
(let [indexer-endpoint (str host ":8090/druid/indexer/v1/task")
{:keys [task]} (POST indexer-endpoint, :body (indexing-task indexing-task-args))
status-url (str indexer-endpoint "/" task "/status")]
(println "STATUS URL: " (str indexer-endpoint "/" task "/status"))
(loop [remaining-seconds indexer-timeout-seconds]
@@ -77,43 +148,3 @@
(throw (Exception. (str "Indexing task failed:\n" (u/pprint-to-str status)))))
(Thread/sleep 1000)
(recur (dec remaining-seconds)))))))
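;; The hunk header above elides the middle of this loop; its shape is a
;; poll-until-terminal-state timer against status-url. A minimal sketch of that
;; pattern (get-status is a hypothetical stand-in for the GET on status-url):
(defn- poll-until-done [get-status timeout-seconds]
  (loop [remaining-seconds timeout-seconds]
    (when (<= remaining-seconds 0)
      (throw (Exception. "Timed out waiting for the indexing task.")))
    (case (get-status)
      "SUCCESS" :done
      "FAILED"  (throw (Exception. "Indexing task failed."))
      (do (Thread/sleep 1000)
          (recur (dec remaining-seconds))))))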
(defn- setup-druid-test-data* []
(println (u/format-color 'blue "Loading druid test data..."))
(write-dbdef-to-json (flattened-test-data) (str temp-dir "/" source-filename))
(run-indexing-task))
#_(defn- setup-druid-test-data
{:expectations-options :before-run}
[]
(datasets/when-testing-engine :druid
(setup-druid-test-data*)))
;; TODO - needs to wait until http://localhost:8082/druid/v2/datasources/checkins?interval=-5000/5000 returns data
#_{:dimensions [:venue_name
:venue_category_name
:user_password
:venue_longitude
:user_name
:id
:venue_latitude
:user_last_login
:venue_price]
:metrics [:count]}
(defn- database->connection-details [this context dbdef]
{:host "http://localhost"
:port 8082})
(extend DruidDriver
i/IDatasetLoader
(merge i/IDatasetLoaderDefaultsMixin
{:engine (constantly :druid)
:database->connection-details database->connection-details
:create-db! (constantly nil)
:destroy-db! (constantly nil)}))
;; TODO - don't log druid query during sync
;; TODO - make `:paging` a feature?