Skip to content
Snippets Groups Projects
Commit 0dd374d4 authored by Cam Saül's avatar Cam Saül
Browse files

Merge pull request #800 from metabase/sample-data

Sample Dataset
Made updates as discussed this morning.
parents d5daf6b2 bd683fa7
No related branches found
No related tags found
No related merge requests found
......@@ -23,4 +23,5 @@ profiles.clj
/node_modules/
/.babel_cache
/coverage
/resources/sample-dataset.db.trace.db
/deploy/artifacts/*
#! /bin/bash
# Build the Metabase uberjar. Every step is chained with `&&`, so a failing step
# stops the build:
#   1. `npm install` downloads the JavaScript dependencies.
#   2. webpack bundles the frontend assets; minification (`-p`) is skipped when
#      CI_DISABLE_WEBPACK_MINIFICATION is set to a non-empty value.
#   3. the sample dataset is generated unless resources/sample-dataset.db.mv.db exists.
#   4. `lein uberjar` builds the JAR.
echo "Running 'npm install' to download javascript dependencies..." &&
npm install &&
if [ -n "$CI_DISABLE_WEBPACK_MINIFICATION" ]; then
    # No `-p` flag in this branch: assets are bundled but NOT minified, so the
    # message must not claim otherwise.
    echo "Running 'webpack' to assemble frontend assets (minification disabled)..."
    ./node_modules/webpack/bin/webpack.js
else
    echo "Running 'webpack -p' to assemble and minify frontend assets..."
    ./node_modules/webpack/bin/webpack.js -p
fi &&
if [ -f resources/sample-dataset.db.mv.db ]; then
    echo "Sample Dataset already generated."
else
    echo "Running 'lein generate-sample-dataset' to generate the sample dataset..."
    lein generate-sample-dataset
fi &&
echo "Running 'lein uberjar'..." &&
lein uberjar
......@@ -20,8 +20,8 @@ test:
# 2) runs Eastwood linter
# 3) Bikeshed linter
# 4) runs JS linter + JS test
# 5) runs lein uberjar
- case $CIRCLE_NODE_INDEX in 0) MB_TEST_DATASETS=h2,mongo,postgres,mysql lein test ;; 1) MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) lein eastwood ;; 3) lein bikeshed --max-line-length 240 ;; 4) npm run lint && npm run build && npm run test ;; 5) CI_DISABLE_WEBPACK_MINIFICATION=1 lein uberjar ;; esac:
# 5) runs ./build-uberjar
- case $CIRCLE_NODE_INDEX in 0) MB_TEST_DATASETS=h2,mongo,postgres,mysql lein test ;; 1) MB_DB_TYPE=postgres MB_DB_DBNAME=circle_test MB_DB_PORT=5432 MB_DB_USER=ubuntu MB_DB_HOST=localhost lein test ;; 2) lein eastwood ;; 3) lein bikeshed --max-line-length 240 ;; 4) npm run lint && npm run build && npm run test ;; 5) CI_DISABLE_WEBPACK_MINIFICATION=1 ./build-uberjar ;; esac:
parallel: true
deployment:
master:
......
......@@ -36,7 +36,7 @@ Then run the HTTP server with
Check that the project can compile successfully with
lein uberjar
./build-uberjar
Run the linters with
......
(ns leiningen.npm
(:use clojure.java.shell))
(defn npm
  "Leiningen task that shells out to `npm install`.
  Prints npm's stdout when it exits with status 0, otherwise prints its stderr.
  `projects` and `args` are accepted but not used."
  [projects & args]
  ;; TODO - some better validations such as checking if `npm` is available
  (println "Running `npm install` to download javascript dependencies")
  (let [{:keys [exit out err]} (sh "npm" "install")]
    (println (if (zero? exit) out err))))
\ No newline at end of file
(ns leiningen.webpack
(:require [clojure.java.shell :refer :all]))
;; Set the CI_DISABLE_WEBPACK_MINIFICATION environment variable to skip minification which takes ~6+ minutes on CircleCI
(defn webpack
  "Leiningen task that runs the project-local webpack binary found under
  `(:root projects)`.

  Assets are minified (`-p`) unless the CI_DISABLE_WEBPACK_MINIFICATION
  environment variable is set. Prints webpack's stdout when it exits with
  status 0, otherwise prints its stderr. `args` is accepted but not used."
  [projects & args]
  ;; TODO - some better validations such as checking that we have webpack available
  (let [minify? (not (System/getenv "CI_DISABLE_WEBPACK_MINIFICATION"))
        ;; Only append `-p` when minifying. The previous code passed an empty
        ;; string as an extra argument when minification was disabled.
        command (cond-> [(str (:root projects) "/node_modules/webpack/bin/webpack.js")]
                  minify? (conj "-p"))]
    (println (if minify?
               "Running `webpack -p` to assemble and minify frontend assets"
               "Running `webpack` to assemble frontend assets (minification disabled)"))
    (let [{:keys [exit out err]} (apply sh command)]
      (println (if (zero? exit) out err)))))
......@@ -5,7 +5,8 @@
:description "Metabase Community Edition"
:url "http://metabase.com/"
:min-lein-version "2.5.0"
:aliases {"test" ["with-profile" "+expectations" "expectations"]}
:aliases {"test" ["with-profile" "+expectations" "expectations"]
"generate-sample-dataset" ["with-profile" "+generate-sample-dataset" "run"]}
:dependencies [[org.clojure/clojure "1.7.0"]
[org.clojure/core.logic "0.8.10"]
[org.clojure/core.match "0.3.0-alpha4"] ; optimized pattern matching library for Clojure
......@@ -90,5 +91,9 @@
"-Dmb.jetty.port=3010"
"-Dmb.api.key=test-api-key"
"-Xverify:none"]} ; disable bytecode verification when running tests so they start slightly faster
:uberjar {:aot :all
:prep-tasks ^:replace ["npm" "webpack" "javac" "compile"]}})
:uberjar {:aot :all}
:generate-sample-dataset {:dependencies [[faker "0.2.2"] ; Fake data generator -- port of Perl/Ruby
[incanter/incanter-core "1.5.6"]] ; Statistical functions like normal distributions
:source-paths ["sample_dataset"]
:global-vars {*warn-on-reflection* false}
:main ^:skip-aot metabase.sample-dataset.generate}})
File added
(ns metabase.sample-dataset.generate
(:require [clojure.math.numeric-tower :as math]
[clojure.string :as s]
(faker [address :as address]
[company :as company]
[lorem :as lorem]
[internet :as internet]
[name :as name])
[incanter.distributions :as dist]
(korma [core :as k]
[db :as kdb]))
(:import java.util.Date))
;; Default base path of the generated H2 database, built from the JVM's `user.dir`
;; system property. `-main` falls back to this when no filename argument is given;
;; `create-h2-db` appends the `.mv.db` / `.trace.db` suffixes when deleting old files.
(def ^:private ^:const sample-dataset-filename
(str (System/getProperty "user.dir") "/resources/sample-dataset.db"))
(defn- normal-distribution-rand
  "Draw one random sample from a normal distribution with the given `mean`
  and standard deviation `sd`.

  The second argument was previously named `median`, but it is passed straight
  through as the second argument of `dist/normal-distribution`, which is the
  standard deviation."
  [mean sd]
  (dist/draw (dist/normal-distribution mean sd)))
(defn- normal-distribution-rand-int
  "Like `normal-distribution-rand`, but rounds the sample to the nearest integer.
  `sd` is the standard deviation (the parameter was previously misnamed `median`)."
  [mean sd]
  (math/round (normal-distribution-rand mean sd)))
;;; ## PEOPLE
(defn- random-latitude
  "Return a uniformly random latitude in the range [-90, 90)."
  []
  (- (* 180 (rand)) 90))
(defn- random-longitude
  "Return a uniformly random longitude in the range [-180, 180)."
  []
  (- (* 360 (rand)) 180))
(defn ^Date years-ago
  "Return a new `Date` for the current instant with its year moved back by `n`."
  [n]
  (let [now         (Date.)
        target-year (- (.getYear now) n)]
    (doto now
      (.setYear target-year))))
(defn ^Date random-date-between
  "Return a new `Date` at a random millisecond offset from `min`, where the
  offset is drawn from the span between `min` and `max`."
  [^Date min ^Date max]
  (let [start  (.getTime min)
        span   (- (.getTime max) start)
        offset (long (rand span))]
    (Date. (long (+ offset start)))))
(defn- random-person
  "Build a map describing one fake person: name, email, a random UUID as the
  password, birth date between 60 and 18 years ago, address fields, random
  coordinates, an acquisition source, and a creation date within the last year."
  []
  (let [given-name  (name/first-name)
        family-name (name/last-name)
        now         (Date.)
        sources     ["Google" "Twitter" "Facebook" "Organic" "Affiliate"]]
    {:name       (format "%s %s" given-name family-name)
     :email      (internet/free-email (format "%s.%s" given-name family-name))
     :password   (str (java.util.UUID/randomUUID))
     :birth_date (random-date-between (years-ago 60) (years-ago 18))
     :address    (address/street-address)
     :city       (address/city)
     ;; keep only the first five characters of the generated zip code
     :zip        (apply str (take 5 (address/zip-code)))
     :state      (address/us-state-abbr)
     :latitude   (random-latitude)
     :longitude  (random-longitude)
     :source     (rand-nth sources)
     :created_at (random-date-between (years-ago 1) now)}))
;;; ## PRODUCTS
(defn- random-company-name
  "Return the first name produced by `company/names`."
  []
  (-> (company/names)
      first))
(defn- random-price
  "Return a random price in [`min`, `max`) as a floating-point number with a
  whole number of cents above `min`."
  [min max]
  (let [cents (rand-int (* 100 (- max min)))]
    (+ min (/ cents 100.0))))
;; Word lists used by `random-product-name`: one adjective, one material and one
;; product are picked at random and joined with spaces.
;;
;; Entries are symbols inside quoted vectors, so every entry must be a single
;; token. `Heavy Duty` was previously read as two separate symbols (`Heavy` and
;; `Duty`); it is now the single symbol `Heavy-Duty`.
(def ^:private ^:const product-names
  {:adjective '[Small, Ergonomic, Rustic, Intelligent, Gorgeous, Incredible, Fantastic, Practical, Sleek, Awesome, Enormous, Mediocre, Synergistic, Heavy-Duty, Lightweight, Aerodynamic, Durable]
   :material  '[Steel, Wooden, Concrete, Plastic, Cotton, Granite, Rubber, Leather, Silk, Wool, Linen, Marble, Iron, Bronze, Copper, Aluminum, Paper]
   :product   '[Chair, Car, Computer, Gloves, Pants, Shirt, Table, Shoes, Hat, Plate, Knife, Bottle, Coat, Lamp, Keyboard, Bag, Bench, Clock, Watch, Wallet]})
(defn- random-product-name
  "Return a product title of the form \"<adjective> <material> <product>\",
  with each word picked at random from `product-names`."
  []
  (let [pick (fn [category] (rand-nth (get product-names category)))]
    (format "%s %s %s"
            (pick :adjective)
            (pick :material)
            (pick :product))))
;; Function of 12 single-digit numbers that returns the check digit: the digits
;; are weighted alternately by 1 and 3, summed, and the check digit is whatever
;; brings that sum up to a multiple of 10.
(def ^:private ean-checksum
  (let [weights (take 12 (cycle [1 3]))]
    (fn [digits]
      {:pre  [(= 12 (count digits))
              (= 12 (count (apply str digits)))] ; each element must print as one character
       :post [(= 1 (count (str %)))]}
      (let [weighted-sum (reduce + (map * digits weights))
            remainder    (mod weighted-sum 10)]
        ;; the outer `mod` maps a remainder of 0 to a check digit of 0 rather than 10
        (mod (- 10 remainder) 10)))))
(defn- random-ean
  "Return a 13-character string: 12 random digits followed by the check digit
  computed by `ean-checksum`."
  []
  {:post [(= (count %) 13)]}
  (let [payload     (vec (repeatedly 12 #(rand-int 10)))
        check-digit (ean-checksum payload)]
    (apply str (conj payload check-digit))))
(defn- random-product
  "Build a map describing one fake product: EAN, title, category, vendor, a
  price in [12, 100), and a creation date within the last year."
  []
  (let [categories ["Widget" "Gizmo" "Gadget" "Doohickey"]
        one-year-ago (years-ago 1)]
    {:ean        (random-ean)
     :title      (random-product-name)
     :category   (rand-nth categories)
     :vendor     (random-company-name)
     :price      (random-price 12 100)
     :created_at (random-date-between one-year-ago (Date.))}))
;;; ## ORDERS
;; Lookup table from two-letter state/territory abbreviation (string) to a tax
;; rate expressed as a fraction (e.g. 0.075). `random-order` looks up the
;; customer's :state here and asserts that an entry exists, so every abbreviation
;; that can appear in a person's :state needs an entry.
(def ^:private ^:const state->tax-rate
{"AK" 0.0
"AL" 0.04
"AR" 0.065
"AZ" 0.056
"CA" 0.075
"CO" 0.029
"CT" 0.0635
"DC" 0.0575
"DE" 0.0
"FL" 0.06
"GA" 0.04
"HI" 0.04
"IA" 0.06
"ID" 0.06
"IL" 0.0625
"IN" 0.07
"KS" 0.065
"KY" 0.06
"LA" 0.04
"MA" 0.0625
"MD" 0.06
"ME" 0.055
"MI" 0.06
"MN" 0.06875
"MO" 0.04225
"MS" 0.07
"MT" 0.0
"NC" 0.0475
"ND" 0.05
"NE" 0.055
"NH" 0.0
"NJ" 0.07
"NM" 0.05125
"NV" 0.0685
"NY" 0.04
"OH" 0.0575
"OK" 0.045
"OR" 0.0
"PA" 0.06
"RI" 0.07
"SC" 0.06
"SD" 0.04
"TN" 0.07
"TX" 0.0625
"UT" 0.047
"VA" 0.043
"VT" 0.06
"WA" 0.065
"WI" 0.05
"WV" 0.06
"WY" 0.04
;; Territories / Associated States / Armed Forces - just give these all zero
;; These might come back from address/us-state-abbr
"AA" 0.0 ; Armed Forces - Americas
"AE" 0.0 ; Armed Forces - Europe
"AP" 0.0 ; Armed Forces - Pacific
"AS" 0.0 ; American Samoa
"FM" 0.0 ; Federated States of Micronesia
"GU" 0.0 ; Guam
"MH" 0.0 ; Marshall Islands
"MP" 0.0 ; Northern Mariana Islands
"PR" 0.0 ; Puerto Rico
"PW" 0.0 ; Palau
"VI" 0.0 ; Virgin Islands
})
(defn- max-date
  "Return a new `Date` equal to the latest of `dates`."
  [& dates]
  {:pre  [(every? (partial instance? Date) dates)]
   :post [(instance? Date %)]}
  (let [millis (map (fn [^Date d] (.getTime d)) dates)
        latest (long (apply max millis))]
    (Date. latest)))
(defn- min-date
  "Return a new `Date` equal to the earliest of `dates`."
  [& dates]
  {:pre  [(every? (partial instance? Date) dates)]
   :post [(instance? Date %)]}
  (let [millis   (map (fn [^Date d] (.getTime d)) dates)
        earliest (long (apply min millis))]
    (Date. earliest)))
(defn random-order
  "Build a map describing one order of `product` placed by `person`.

  `person` must have a string :state with an entry in `state->tax-rate`, and
  `product` must have a numeric :price. The result holds :user_id, :product_id,
  :subtotal, :tax, :total and :created_at.

  Fixes relative to the previous version:
  * :tax is now `price * tax-rate` truncated to two decimal places. The tax rate
    used to be looked up but never applied, so :tax always equalled the price.
  * :created_at is drawn between the LATER of the person's and product's
    creation dates and now, so an order can no longer predate the customer or
    the product it refers to.
  * The `^Person` hint on `person` was dropped; no such class is defined or
    imported in this file."
  [{:keys [state], :as person} {:keys [price], :as product}]
  {:pre  [(string? state)
          (number? price)]
   :post [(map? %)]}
  (let [tax-rate (state->tax-rate state)
        _        (assert tax-rate
                         (format "No tax rate found for state '%s'." state))
        ;; scale to cents, truncate, and scale back => two decimal places
        tax      (-> (* price tax-rate 100.0)
                     int
                     (/ 100.0))
        earliest (max-date (:created_at person) (:created_at product))]
    {:user_id    (:id person)
     :product_id (:id product)
     :subtotal   price
     :tax        tax
     :total      (+ price tax)
     :created_at (random-date-between earliest (Date.))}))
;;; ## REVIEWS
(defn random-review
  "Build a map describing one fake review of `product`. The rating is drawn
  from a pool in which 4s and 5s are far more frequent than lower scores, and
  the review date falls between the product's creation date and now."
  [product]
  (let [rating-pool [1 1
                     2 2 2
                     3 3
                     4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
                     5 5 5 5 5 5 5 5 5 5 5 5 5]
        listed-at   (:created_at product)]
    {:product_id (:id product)
     :reviewer   (internet/user-name)
     :rating     (rand-nth rating-pool)
     :body       (first (lorem/paragraphs))
     :created_at (random-date-between listed-at (Date.))}))
(defn- create-randoms
  "Call the zero-argument function `f` `n` times and return a vector of the
  resulting maps, each given a sequential :id starting at 1."
  [n f]
  (mapv (fn [id obj]
          (assoc obj :id id))
        (iterate inc 1)
        (repeatedly n f)))
(defn- product-add-reviews
  "Attach a random number of reviews to `product` under :reviews, and set
  :rating to the mean review rating truncated to one decimal place (0.0 when
  there are no reviews)."
  [product]
  (let [review-count (max 0 (normal-distribution-rand-int 5 4))
        product-id   (:id product)
        reviews      (mapv (fn [review]
                             (assoc review :product_id product-id))
                           (repeatedly review-count #(random-review product)))
        mean-rating  (if (empty? reviews)
                       0.0
                       (/ (reduce + (map :rating reviews))
                          (count reviews)))
        ;; scale by 10, truncate, and scale back => one decimal place
        truncated    (/ (int (* mean-rating 10.0)) 10.0)]
    (assoc product
           :reviews reviews
           :rating  truncated)))
(defn- person-add-orders
  "Attach a random number of orders to `person` under :orders, each for a
  product picked at random from `products`. When the drawn order count is zero
  the person is returned unchanged, without an :orders key."
  [products person]
  {:pre  [(sequential? products)
          (map? person)]
   :post [(map? %)]}
  (let [order-count (max 0 (normal-distribution-rand-int 5 10))
        new-order   (fn [] (random-order person (rand-nth products)))]
    (cond-> person
      (pos? order-count) (assoc :orders (vec (repeatedly order-count new-order))))))
(defn create-random-data
  "Generate the whole random dataset. Takes keyword options `:people` (default
  2500) and `:products` (default 200) giving the number of each to create.
  Returns a map with vectors under :people, :products, :reviews and :orders;
  the nested :orders / :reviews are stripped from people and products and
  returned flattened under their own keys."
  [& {:keys [people products]
      :or   {people 2500 products 200}}]
  {:post [(map? %)
          (= (count (:people %)) people)
          (= (count (:products %)) products)
          (every? keyword? (keys %))
          (every? sequential? (vals %))]}
  (println (format "Generating random data: %d people, %d products..." people products))
  (let [reviewed-products (->> (create-randoms products random-product)
                               (mapv product-add-reviews))
        ordering-people   (->> (create-randoms people random-person)
                               (mapv (fn [person]
                                       (person-add-orders reviewed-products person))))
        without           (fn [k rows] (mapv #(dissoc % k) rows))
        flattened         (fn [k rows] (vec (mapcat k rows)))]
    {:people   (without :orders ordering-people)
     :products (without :reviews reviewed-products)
     :reviews  (flattened :reviews reviewed-products)
     :orders   (flattened :orders ordering-people)}))
;;; # LOADING THE DATA
(defn- create-table-sql
  "Return a CREATE TABLE statement for `table-name` (a keyword). `field->type`
  maps column keywords to SQL type strings; every column is declared NOT NULL.
  Table and column names are upper-cased and double-quoted, and an
  auto-incrementing BIGINT \"ID\" primary key column is always added."
  [table-name field->type]
  {:pre  [(keyword? table-name)
          (map? field->type)
          (every? keyword? (keys field->type))
          (every? string? (vals field->type))]
   :post [(string? %)]}
  (let [column-defs (for [[field type] field->type]
                      (format "\"%s\" %s NOT NULL" (s/upper-case (name field)) type))]
    (format "CREATE TABLE \"%s\" (\"ID\" BIGINT AUTO_INCREMENT, %s, PRIMARY KEY (\"ID\"));"
            (s/upper-case (name table-name))
            (s/join ", " column-defs))))
;; Schema of the sample dataset: table keyword -> {column keyword -> SQL type string}.
;; `create-h2-db` passes each entry to `create-table-sql`, which upper-cases the
;; names, declares every column NOT NULL and adds the "ID" primary key column
;; itself (so "ID" is not listed here).
(def ^:private ^:const tables
{:people {:name "VARCHAR(255)"
:email "VARCHAR(255)"
:password "VARCHAR(255)"
:birth_date "DATE"
:address "VARCHAR(255)"
:zip "CHAR(5)"
:city "VARCHAR(255)"
:state "CHAR(2)"
:latitude "FLOAT"
:longitude "FLOAT"
:source "VARCHAR(255)"
:created_at "DATETIME"}
:products {:ean "CHAR(13)"
:title "VARCHAR(255)"
:category "VARCHAR(255)"
:vendor "VARCHAR(255)"
:price "FLOAT"
:rating "FLOAT"
:created_at "DATETIME"}
:orders {:user_id "INTEGER"
:product_id "INTEGER"
:subtotal "FLOAT"
:tax "FLOAT"
:total "FLOAT"
:created_at "DATETIME"}
:reviews {:product_id "INTEGER"
:reviewer "VARCHAR(255)"
:rating "SMALLINT"
:body "TEXT"
:created_at "DATETIME"}})
;; Foreign-key relationships between the sample tables. For each entry
;; `create-h2-db` issues an ALTER TABLE on :source-table adding a constraint
;; from :field to the "ID" column of :dest-table. Names are already upper-case
;; because they are spliced into the SQL as-is.
(def ^:private ^:const fks
[{:source-table "ORDERS"
:field "USER_ID"
:dest-table "PEOPLE"}
{:source-table "ORDERS"
:field "PRODUCT_ID"
:dest-table "PRODUCTS"}
{:source-table "REVIEWS"
:field "PRODUCT_ID"
:dest-table "PRODUCTS"}])
;; Create the sample H2 database at `filename` (base path, without the `.mv.db`
;; suffix) and fill it with `data`, a map of table keyword -> sequence of row maps.
;; The one-argument arity generates the data with `create-random-data`.
;;
;; Steps, in order: delete any existing `.mv.db` / `.trace.db` files, create the
;; tables listed in `tables`, add the constraints listed in `fks`, insert the
;; rows, then create a GUEST user with SELECT access to each table in `data`.
(defn create-h2-db
([filename]
(create-h2-db filename (create-random-data)))
([filename data]
(println "Deleting existing db...")
;; :silently => no exception if the file does not exist
(clojure.java.io/delete-file (str filename ".mv.db") :silently)
(clojure.java.io/delete-file (str filename ".trace.db") :silently)
(println "Creating db...")
(let [db (kdb/h2 {:db (format "file:%s;UNDO_LOG=0;CACHE_SIZE=131072;QUERY_CACHE_SIZE=128;COMPRESS=TRUE;MULTI_THREADED=TRUE;MVCC=TRUE;DEFRAG_ALWAYS=TRUE;MAX_COMPACT_TIME=5000;ANALYZE_AUTO=100"
filename)
:make-pool? false})]
(doseq [[table-name field->type] (seq tables)]
(k/exec-raw db (create-table-sql table-name field->type)))
;; Add FK constraints
(println "Adding FKs...")
(doseq [{:keys [source-table field dest-table]} fks]
(k/exec-raw db (format "ALTER TABLE \"%s\" ADD CONSTRAINT \"FK_%s_%s_%s\" FOREIGN KEY (\"%s\") REFERENCES \"%s\" (\"ID\");"
source-table
source-table field dest-table
field
dest-table)))
;; Insert the data
;; NOTE(review): the FK constraints already exist at this point and tables are
;; filled in the iteration order of `data`, so this presumably relies on the
;; referenced tables (people, products) being iterated before orders/reviews -- confirm.
(println "Inserting data...")
(doseq [[table rows] (seq data)]
(assert (keyword? table))
(assert (sequential? rows))
(let [entity (-> (k/create-entity (s/upper-case (name table)))
(k/database db))]
;; all rows of a table go into a single insert, with keys converted to upper-case column names
(k/insert entity (k/values (for [row rows]
(->> (for [[k v] (seq row)]
[(s/upper-case (name k)) v])
(into {})))))))
;; Create the 'GUEST' user
(println "Preparing database for export...")
(k/exec-raw db "CREATE USER GUEST PASSWORD 'guest';")
(doseq [table (keys data)]
(k/exec-raw db (format "GRANT SELECT ON %s TO GUEST;" (s/upper-case (name table)))))
(println "Done."))))
(defn -main
  "Entry point: generate the sample dataset and write it to the path given as
  the first argument, or to `sample-dataset-filename` when none is given."
  [& [filename]]
  (let [target (or filename sample-dataset-filename)]
    (println (format "Writing sample dataset to %s..." target))
    (create-h2-db target)))
......@@ -16,6 +16,7 @@
[medley.core :as medley]
(metabase [config :as config]
[db :as db]
[driver :as driver]
[routes :as routes]
[setup :as setup]
[task :as task])
......@@ -23,6 +24,7 @@
[log-api-call :refer :all]
[format :refer :all])
(metabase.models [setting :refer [defsetting]]
[database :refer [Database]]
[user :refer [User]])))
;; ## CONFIG
......@@ -134,6 +136,29 @@
(.stop ^org.eclipse.jetty.server.Server @jetty-instance)
(reset! jetty-instance nil)))
;; :name given to the Database row created for the sample dataset; also used to
;; check whether that row already exists.
(def ^:private ^:const sample-dataset-name "Sample Dataset")
;; Resource name of the sample H2 database file, looked up through the context ClassLoader.
(def ^:private ^:const sample-dataset-filename "sample-dataset.db.mv.db")
(defn- add-sample-dataset!
  "Register the bundled sample dataset as a Database and sync it, unless a
  Database named `sample-dataset-name` already exists. Any failure is logged
  rather than rethrown."
  []
  (when-not (db/exists? Database :name sample-dataset-name)
    (try
      (log/info "Loading sample dataset...")
      ;; hunt down the sample dataset DB file inside the current JAR
      (if-let [resource (.getResource (.getContextClassLoader (Thread/currentThread))
                                      sample-dataset-filename)]
        (let [db-path (-> (.getPath resource)
                          (s/replace #"^file:" "zip:")   ; to connect to an H2 DB inside a JAR just replace file: with zip:
                          (s/replace #"\.mv\.db$" ""))   ; strip the .mv.db suffix from the path
              ;; specify the GUEST user account created for the DB
              h2-file (str db-path ";USER=GUEST;PASSWORD=guest")
              sample  (db/ins Database
                        :name    sample-dataset-name
                        :details {:db h2-file}
                        :engine  :h2)]
          (driver/sync-database! sample))
        (log/error (format "Can't load sample dataset: the DB file '%s' can't be found by the ClassLoader." sample-dataset-filename)))
      (catch Throwable e
        (log/error (format "Failed to load sample dataset: %s" (.getMessage e)))))))
(defn -main
"Launch Metabase in standalone mode."
......@@ -142,7 +167,10 @@
(try
;; run our initialization process
(init)
;; add the sample dataset DB if applicable
(add-sample-dataset!)
;; launch embedded webserver
(start-jetty)
(catch Exception e
(.printStackTrace e)
(log/error "Metabase Initialization FAILED: " (.getMessage e)))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment