From e26723ec222465d3879d733fa07d356f560bd73e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cam=20Sa=C3=BCl?= <cammsaul@gmail.com> Date: Tue, 26 Jan 2016 15:10:29 -0800 Subject: [PATCH] Run Druid unit tests on CI :persevere: --- circle.yml | 4 +- src/metabase/driver/druid/query_processor.clj | 2 +- src/metabase/models/database.clj | 6 +- test/metabase/api/database_test.clj | 76 +++++----- test/metabase/test/data/druid.clj | 137 +++++++++++------- 5 files changed, 128 insertions(+), 97 deletions(-) diff --git a/circle.yml b/circle.yml index 40f37811b53..8302f70e66f 100644 --- a/circle.yml +++ b/circle.yml @@ -25,11 +25,11 @@ test: # 0) runs unit tests w/ H2 local DB. Runs against H2, Mongo, MySQL # 1) runs unit tests w/ Postgres local DB. Runs against H2, SQL Server # 2) runs unit tests w/ MySQL local DB. Runs against H2, Postgres, SQLite - # 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift + # 3) runs unit tests w/ H2 local DB. Runs against H2, Redshift, Druid # 4) runs Eastwood linter & Bikeshed linter && ./bin/reflection-linter # 5) runs JS linter + JS test # 6) runs lein uberjar. (We don't run bin/build because we're not really concerned about `npm install` (etc) in this test, which runs elsewhere) - - case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac: + - case $CIRCLE_NODE_INDEX in 0) ENGINES=h2,mongo,mysql lein test ;; 1) ENGINES=h2,sqlserver MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) ENGINES=h2,postgres,sqlite MB_DB_TYPE=mysql MB_DB_DBNAME=circle_test MB_DB_PORT=3306 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 3) ENGINES=h2,redshift,druid lein test ;; 4) lein eastwood 2>&1 | grep -v Reflection && lein bikeshed 2>&1 | grep -v Reflection && ./bin/reflection-linter ;; 5) npm install && npm run lint && npm run build && npm run test ;; 6) lein uberjar ;; esac: parallel: true deployment: master: diff --git a/src/metabase/driver/druid/query_processor.clj b/src/metabase/driver/druid/query_processor.clj index c4610f23ba7..6b4ed1f4175 100644 --- a/src/metabase/driver/druid/query_processor.clj +++ b/src/metabase/driver/druid/query_processor.clj @@ -61,7 +61,7 @@ (def ^:private ^:const query-type->default-query (let [defaults {:intervals ["-5000/5000"] :granularity :all - :context {:timeout 5000}}] + :context {:timeout 60000}}] {::select (merge defaults {:queryType :select :pagingSpec {:threshold 100 #_qp/absolute-max-results}}) ::timeseries (merge defaults {:queryType :timeseries}) diff --git a/src/metabase/models/database.clj b/src/metabase/models/database.clj index 5f5380e5e5c..3b8fcd083f7 100644 --- a/src/metabase/models/database.clj +++ b/src/metabase/models/database.clj @@ -14,9 +14,9 @@ (defn- post-select [{:keys [engine] :as database}] (if-not engine database - (assoc database :features (if-let [driver ((resolve 'metabase.driver/engine->driver) engine)] - ((resolve 'metabase.driver/features) driver) - [])))) + (assoc database :features (or (when-let [driver ((resolve 
'metabase.driver/engine->driver) engine)] + (seq ((resolve 'metabase.driver/features) driver))) + [])))) (defn- pre-cascade-delete [{:keys [id]}] (cascade-delete 'Card :database_id id) diff --git a/test/metabase/api/database_test.clj b/test/metabase/api/database_test.clj index 9d47c7a66cf..d6f86e666f1 100644 --- a/test/metabase/api/database_test.clj +++ b/test/metabase/api/database_test.clj @@ -40,7 +40,7 @@ :is_full_sync true :organization_id nil :description nil - :features (mapv name (metabase.driver/features (metabase.driver/engine->driver (:engine db))))}))) + :features (mapv name (driver/features (driver/engine->driver (:engine db))))}))) ;; # DB LIFECYCLE ENDPOINTS @@ -71,7 +71,7 @@ :is_full_sync false :organization_id nil :description nil - :features (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))}) + :features (mapv name (driver/features (driver/engine->driver :postgres)))}) (create-db db-name false))) ;; ## DELETE /api/database/:id @@ -113,41 +113,41 @@ ;; Database details *should not* come back for Rasta since she's not a superuser (let [db-name (str "A" (random-name))] ; make sure this name comes before "test-data" (expect-eval-actual-first - (set (filter identity - (conj (for [engine datasets/all-valid-engines] - (datasets/when-testing-engine engine - (match-$ (get-or-create-test-data-db! (driver/engine->driver engine)) - {:created_at $ - :engine (name $engine) - :id $ - :updated_at $ - :name "test-data" - :is_sample false - :is_full_sync true - :organization_id nil - :description nil - :features (mapv name (metabase.driver/features (metabase.driver/engine->driver engine)))}))) - (match-$ (sel :one Database :name db-name) - {:created_at $ - :engine "postgres" - :id $ - :updated_at $ - :name $ - :is_sample false - :is_full_sync true - :organization_id nil - :description nil - :features (mapv name (metabase.driver/features (metabase.driver/engine->driver :postgres)))})))) - (do - ;; Delete all the randomly created Databases we've made so far - (cascade-delete Database :id [not-in (set (filter identity - (for [engine datasets/all-valid-engines] - (datasets/when-testing-engine engine - (:id (get-or-create-test-data-db! (driver/engine->driver engine)))))))]) - ;; Add an extra DB so we have something to fetch besides the Test DB - (create-db db-name) - ;; Now hit the endpoint - (set ((user->client :rasta) :get 200 "database"))))) + (set (filter identity (conj (for [engine datasets/all-valid-engines] + (datasets/when-testing-engine engine + (match-$ (get-or-create-test-data-db! (driver/engine->driver engine)) + {:created_at $ + :engine (name $engine) + :id $ + :updated_at $ + :name "test-data" + :is_sample false + :is_full_sync true + :organization_id nil + :description nil + :features (mapv name (driver/features (driver/engine->driver engine)))}))) + ;; (?) I don't remember why we have to do this for postgres but not any other of the bonus drivers + (match-$ (sel :one Database :name db-name) + {:created_at $ + :engine "postgres" + :id $ + :updated_at $ + :name $ + :is_sample false + :is_full_sync true + :organization_id nil + :description nil + :features (mapv name (driver/features (driver/engine->driver :postgres)))})))) + (do + ;; Delete all the randomly created Databases we've made so far + (cascade-delete Database :id [not-in (set (filter identity + (for [engine datasets/all-valid-engines] + (datasets/when-testing-engine engine + (:id (get-or-create-test-data-db! 
(driver/engine->driver engine)))))))])
+    ;; Add an extra DB so we have something to fetch besides the Test DB
+    (create-db db-name)
+    ;; Now hit the endpoint
+    (set ((user->client :rasta) :get 200 "database")))))
 
 
 ;; ## GET /api/meta/table/:id/query_metadata
@@ -163,7 +163,7 @@
           :is_full_sync true
           :organization_id nil
           :description nil
-          :features (mapv name (metabase.driver/features (metabase.driver/engine->driver :h2)))
+          :features (mapv name (driver/features (driver/engine->driver :h2)))
           :tables [(match-$ (Table (id :categories))
                      {:description nil
                       :entity_type nil
diff --git a/test/metabase/test/data/druid.clj b/test/metabase/test/data/druid.clj
index 96527912555..0d2b1766940 100644
--- a/test/metabase/test/data/druid.clj
+++ b/test/metabase/test/data/druid.clj
@@ -1,19 +1,74 @@
 (ns metabase.test.data.druid
-  (:require [cheshire.core :as json]
-            [clojure.java.io :as io]
+  (:require [clojure.java.io :as io]
+            [cheshire.core :as json]
+            [environ.core :refer [env]]
             [metabase.driver.druid :as druid]
-            (metabase.test.data dataset-definitions
+            (metabase.test.data [dataset-definitions :as defs]
                                 [datasets :as datasets]
                                 [interface :as i])
             [metabase.test.util :refer [resolve-private-fns]]
             [metabase.util :as u])
   (:import metabase.driver.druid.DruidDriver))
 
-(def ^:private ^:const temp-dir (System/getProperty "java.io.tmpdir"))
-(def ^:private ^:const source-filename "checkins.json")
+(defn- database->connection-details [& _]
+  {:host (or (env :mb-druid-host)
+             (throw (Exception. "In order to test Druid, you must specify `MB_DRUID_HOST`.")))
+   :port (Integer/parseInt (or (env :mb-druid-port)
+                               (throw (Exception. "In order to test Druid, you must specify `MB_DRUID_PORT`."))))})
+
+(extend DruidDriver
+  i/IDatasetLoader
+  (merge i/IDatasetLoaderDefaultsMixin
+         {:engine                       (constantly :druid)
+          :database->connection-details database->connection-details
+          :create-db!                   (constantly nil)
+          :destroy-db!                  (constantly nil)}))
+
+
+
+;;; Setting Up a Server w/ Druid Test Data
+
+;; Unfortunately the process of loading test data onto an external server for CI purposes is a little involved. Before testing against Druid, you'll need to perform the following steps:
+;; For EC2 instances, make sure to expose ports `8082` & `8090` for Druid while loading data. Once data has finished loading, you only need to expose port `8082`.
+;;
+;; 1. Set up Zookeeper
+;;    1A. Download & extract Zookeeper from `http://zookeeper.apache.org/releases.html#download`
+;;    1B. Create `zookeeper/conf/zoo.cfg` -- see the Getting Started Guide: `http://zookeeper.apache.org/doc/r3.4.6/zookeeperStarted.html`
+;;    1C. `zookeeper/bin/zkServer.sh start`
+;;    1D. `zookeeper/bin/zkServer.sh status` (to make sure it started correctly)
+;; 2. Set up Druid
+;;    2A. Download & extract Druid from `http://druid.io/downloads.html`
+;;    2B. `cp druid/run_druid_server.sh druid/run_historical.sh` and bump the `-Xmx` setting to `6g` or so
+;;    2C. `cd druid && ./run_druid_server.sh coordinator`
+;;    2D. `cd druid && ./run_druid_server.sh broker`
+;;    2E. `cd druid && ./run_historical.sh historical`
+;;    2F. `cd druid && ./run_druid_server.sh overlord`
+;; 3. Generate the flattened test data file. Optionally pick a <filename>
+;;    3A. From this namespace in the REPL, run `(generate-json-for-batch-ingestion <filename>)`
+;;    3B. `scp` or otherwise upload this file to the box running druid (if applicable)
+;; 4. Launch Druid Indexing Task
+;;    4A. Run the indexing task on the remote instance.
+;; +;; (run-indexing-task <remote-host> :base-dir <dir-where-you-uploaded-file>, :filename <file>) +;; e.g. +;; (run-indexing-task "http://ec2-52-90-109-199.compute-1.amazonaws.com", :base-dir "/home/ec2-user", :filename "checkins.json") +;; +;; The task will keep you apprised of its progress until it completes (takes 1-2 minutes) +;; 4B. Keep an eye on `<host>:8082/druid/v2/datasources`. (e.g. "http://ec2-52-90-109-199.compute-1.amazonaws.com:8082/druid/v2/datasources") +;; This endpoint will return an empty array until the broker knows about the newly ingested segments. When it returns an array with the string `"checkins"` you're ready to +;; run the tests. +;; 4C. Kill the `overlord` process once the data has finished loading. +;; 5. Running Tests +;; 5A. You can run tests like `ENGINES=druid MB_DRUID_PORT=8082 MB_DRUID_HOST=http://ec2-52-90-109-199.compute-1.amazonaws.com lein test` + +(def ^:private ^:const default-filename "Default filename for batched ingestion data file." + "checkins.json") + + +;;; Generating Data File (defn- flattened-test-data [] - (let [dbdef (i/flatten-dbdef metabase.test.data.dataset-definitions/test-data "checkins") + (let [dbdef (i/flatten-dbdef defs/test-data "checkins") tabledef (first (:table-definitions dbdef))] (->> (:rows tabledef) (map (partial zipmap (map :field-name (:field-definitions tabledef)))) @@ -29,7 +84,21 @@ (json/generate-stream row writer) (.append writer \newline))))) -(def ^:private ^:const indexing-task +(defn- generate-json-for-batch-ingestion + "Generate the file to be used for a batched data ingestion for Druid." + ([] + (generate-json-for-batch-ingestion default-filename)) + ([filename] + (write-dbdef-to-json (flattened-test-data) filename))) + + +;;; Running Indexing Task + +(defn- indexing-task + "Create a batched ingestion task dictionary." + [{:keys [base-dir filename] + :or {base-dir "/home/ec2-user" + filename default-filename}}] {:type :index :spec {:dataSchema {:dataSource "checkins" :parser {:type :string @@ -53,18 +122,20 @@ :intervals ["2000/2016"]}} :ioConfig {:type :index :firehose {:type :local - :baseDir temp-dir - :filter source-filename}}}}) + :baseDir base-dir + :filter filename}}}}) -(def ^:private ^:const indexer-endpoint "http://localhost:8090/druid/indexer/v1/task") (def ^:private ^:const indexer-timeout-seconds "Maximum number of seconds we should wait for the indexing task to finish before deciding it's failed." - 120) + 180) (resolve-private-fns metabase.driver.druid GET POST) -(defn- run-indexing-task [] - (let [{:keys [task]} (POST indexer-endpoint, :body indexing-task) +(defn- run-indexing-task + "Run a batched ingestion task on HOST." + [host & {:as indexing-task-args}] + (let [indexer-endpoint (str host ":8090/druid/indexer/v1/task") + {:keys [task]} (POST indexer-endpoint, :body (indexing-task indexing-task-args)) status-url (str indexer-endpoint "/" task "/status")] (println "STATUS URL: " (str indexer-endpoint "/" task "/status")) (loop [remaining-seconds indexer-timeout-seconds] @@ -77,43 +148,3 @@ (throw (Exception. 
(str "Indexing task failed:\n" (u/pprint-to-str status))))) (Thread/sleep 1000) (recur (dec remaining-seconds))))))) - -(defn- setup-druid-test-data* [] - (println (u/format-color 'blue "Loading druid test data...")) - (write-dbdef-to-json (flattened-test-data) (str temp-dir "/" source-filename)) - (run-indexing-task)) - -#_(defn- setup-druid-test-data - {:expectations-options :before-run} - [] - (datasets/when-testing-engine :druid - (setup-druid-test-data*))) - -;; TODO - needs to wait until http://localhost:8082/druid/v2/datasources/checkins?interval=-5000/5000 returns data -#_{:dimensions [:venue_name - :venue_category_name - :user_password - :venue_longitude - :user_name - :id - :venue_latitude - :user_last_login - :venue_price] - :metrics [:count]} - - -(defn- database->connection-details [this context dbdef] - {:host "http://localhost" - :port 8082}) - -(extend DruidDriver - i/IDatasetLoader - (merge i/IDatasetLoaderDefaultsMixin - {:engine (constantly :druid) - :database->connection-details database->connection-details - :create-db! (constantly "nil") - :destroy-db! (constantly nil)})) - - -;; TODO - don't log druid query during sync -;; TODO - make `:paging` a feature? -- GitLab