Skip to content
Snippets Groups Projects
Commit ade674b1 authored by Simon Belak's avatar Simon Belak
Browse files

Incorporated all of @camsaul's suggestions

+ no longer returning datetimes as longs
parent 1c29091e
Branches
Tags
No related merge requests found
......@@ -60,7 +60,7 @@
[environ "1.1.0"] ; easy environment management
[hiccup "1.0.5"] ; HTML templating
[honeysql "0.8.2"] ; Transform Clojure data structures to SQL
[kixi/stats "0.3.8"] ; Various statistic measures implemented as transducers
[kixi/stats "0.3.8"] ; Various statistic measures implemented as transducers
[log4j/log4j "1.2.17" ; logging framework
:exclusions [javax.mail/mail
javax.jms/jms
......@@ -88,7 +88,7 @@
[toucan "1.0.3" ; Model layer, hydration, and DB utilities
:exclusions [honeysql]]]
:repositories [["bintray" "https://dl.bintray.com/crate/crate"]] ; Repo for Crate JDBC driver
:plugins [[lein-environ "1.1.0"] ; easy access to environment variables
:plugins [[lein-environ "1.1.0"] ; easy access to environment variables
[lein-ring "0.11.0" ; start the HTTP server with 'lein ring server'
:exclusions [org.clojure/clojure]]] ; TODO - should this be a dev dependency ?
:main ^:skip-aot metabase.core
......
(ns metabase.api.fingerprint
(:require [compojure.core :refer [GET POST PUT]]
(:require [compojure.core :refer [GET]]
[metabase.api.common :as api]
[metabase.fingerprinting :as fingerprinting]
(metabase.models [field :refer [Field]]
[table :refer [Table]]
[segment :refer [Segment]]
[card :refer [Card]])
[metabase.models.card :refer [Card]]
[metabase.models.field :refer [Field]]
[metabase.models.segment :refer [Segment]]
[metabase.models.table :refer [Table]]
[schema.core :as s]))
(def ^:private ^:const MaxQueryCost (s/maybe (s/enum "cache"
"sample"
"full-scan"
"joins")))
(def ^:private ^:const MaxComputationCost (s/maybe (s/enum "linear"
"unbounded"
"yolo")))
(def ^:private ^:const Resolution (s/maybe (s/enum "month"
"day"
"raw")))
;; See metabase.fingerprinting/fingerprint for description of these settings.
;; Schema accepting either nil or exactly one of the listed strings.
;; NOTE(review): presumably validates the `max_query_cost` endpoint parameter -- confirm against the endpoint definitions.
(def ^:private ^:const MaxQueryCost
(s/maybe (s/enum "cache"
"sample"
"full-scan"
"joins")))
;; Schema accepting either nil or exactly one of the listed strings.
;; NOTE(review): presumably validates the `max_computation_cost` endpoint parameter -- confirm against the endpoint definitions.
(def ^:private ^:const MaxComputationCost
(s/maybe (s/enum "linear"
"unbounded"
"yolo")))
;; Schema accepting either nil or exactly one of the listed strings.
;; NOTE(review): presumably validates the `resolution` endpoint parameter -- confirm against the endpoint definitions.
(def ^:private ^:const Resolution
(s/maybe (s/enum "month"
"day"
"raw")))
(defn- max-cost
[query computation]
......@@ -74,7 +80,7 @@
(map (partial api/read-check Field))
(apply fingerprinting/multifield-fingerprint
{:max-cost (max-cost max_query_cost max_computation_cost)
:resolution (keyword resolution)})))
:resolution (or (keyword resolution) :day)})))
(api/defendpoint GET "/compare/fields/:id1/:id2"
"Get comparison fingerprints for `Field`s with ID1 and ID2."
......
......@@ -2,17 +2,18 @@
"Fingerprinting (feature extraction) for various models."
(:require [bigml.histogram.core :as hist]
[bigml.sketchy.hyper-loglog :as hyper-loglog]
(clj-time [coerce :as t.coerce]
[core :as t]
[format :as t.format]
[periodic :as t.periodic])
[kixi.stats.math :as math]
[clj-time.coerce :as t.coerce]
[clj-time.core :as t]
[clj-time.format :as t.format]
[clj-time.periodic :as t.periodic]
[kixi.stats.core :as stats]
[kixi.stats.math :as math]
[medley.core :as m]
[metabase.db.metadata-queries :as metadata]
(metabase.models [card :refer [Card]]
[field :refer [Field]]
[segment :refer [Segment]]
[table :refer [Table]])
[metabase.models.card :refer [Card]]
[metabase.models.field :refer [Field]]
[metabase.models.segment :refer [Segment]]
[metabase.models.table :refer [Table]]
[redux.core :as redux]
[tide.core :as tide]))
......@@ -68,7 +69,7 @@
(map (juxt :mean :count))
bins))))
(def ^{:arglist '([histogram])} nil-count
(defn nil-count
  "Number of nil values recorded in `histogram`, read from the `:count` of
  its missing bin."
  [histogram]
  (:count (hist/missing-bin histogram)))
......@@ -99,7 +100,7 @@
[field]
(if (sequential? field)
(mapv field-type field)
[(:base_type field) (or (:special_type field) :type/Nil)]))
[(:base_type field) (or (:special_type field) :type/*)]))
(def ^:private Num [:type/Number :type/*])
(def ^:private DateTime [:type/DateTime :type/*])
......@@ -107,25 +108,25 @@
(def ^:private Any [:type/* :type/*])
(def ^:private Text [:type/Text :type/*])
(def linear-computation? ^:private ^{:arglist '([max-cost])}
;; Returns `:linear` (truthy) when the `:computation` entry of `max-cost` is
;; `:linear`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} linear-computation?
  (comp #{:linear} :computation))
(def unbounded-computation? ^:private ^{:arglist '([max-cost])}
;; Returns the `:computation` entry of `max-cost` (truthy) when it is
;; `:unbounded` or `:yolo`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} unbounded-computation?
  (comp #{:unbounded :yolo} :computation))
(def yolo-computation? ^:private ^{:arglist '([max-cost])}
;; Returns `:yolo` (truthy) when the `:computation` entry of `max-cost` is
;; `:yolo`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} yolo-computation?
  (comp #{:yolo} :computation))
(def cache-only? ^:private ^{:arglist '([max-cost])}
;; Returns `:cache` (truthy) when the `:query` entry of `max-cost` is
;; `:cache`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} cache-only?
  (comp #{:cache} :query))
(def sample-only? ^:private ^{:arglist '([max-cost])}
;; Returns `:sample` (truthy) when the `:query` entry of `max-cost` is
;; `:sample`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} sample-only?
  (comp #{:sample} :query))
(def full-scan? ^:private ^{:arglist '([max-cost])}
;; Returns the `:query` entry of `max-cost` (truthy) when it is `:full-scan`
;; or `:joins`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
(def ^:private ^{:arglists '([max-cost])} full-scan?
  (comp #{:full-scan :joins} :query))
(def alow-joins? ^:private ^{:arglist '([max-cost])}
;; Returns `:joins` (truthy) when the `:query` entry of `max-cost` is
;; `:joins`, nil otherwise.
;; The metadata is placed before the var name: written after the name it is
;; attached to the init form, leaving the var public and without `:arglists`.
;; NOTE(review): the name looks like a typo of `allow-joins?`; it is kept
;; unchanged so existing references keep resolving.
(def ^:private ^{:arglists '([max-cost])} alow-joins?
  (comp #{:joins} :query))
(defmulti fingerprinter
......@@ -197,9 +198,13 @@
;; Multiplier that turns milliseconds into days: one over the number of
;; milliseconds in a day (1000 ms * 60 s * 60 min * 24 h).
(def ^:private ^:const timestamp-truncation-factor
  (/ 1 (* 1000 60 60 24)))
(def ^:private ^{:arglist '([t])} truncate-timestamp
(def ^:private ^{:arglists '([t])} truncate-timestamp
"Truncate UNIX timestamp from ms to days."
(partial * timestamp-truncation-factor))
(comp long (partial * timestamp-truncation-factor)))
(defn- pad-timestamp
  "Scale a day-truncated timestamp `t` back up to a UNIX timestamp in
  milliseconds by multiplying with the reciprocal of
  `timestamp-truncation-factor`; the result is coerced to a long."
  [t]
  (long (* (/ timestamp-truncation-factor) t)))
(defn- fill-timeseries
"Given a coll of `[DateTime, Any]` pairs with periodicity `step` fill missing
......@@ -213,8 +218,7 @@
[t (ts-index t 0)])))
(some-> ts
ffirst
(/ timestamp-truncation-factor)
long
pad-timestamp
t.coerce/from-long
(t.periodic/periodic-seq step)))))
......@@ -225,44 +229,46 @@
[resolution ts]
(let [period (case resolution
:month 12
:day 52)]
:day 52)]
(when (>= (count ts) (* 2 period))
(tide/decompose period ts))))
(defmethod fingerprinter [DateTime Num]
[{:keys [max-cost resolution]} _]
(redux/post-complete
(redux/pre-step
(redux/fuse {:linear-regression (stats/simple-linear-regression first second)
:series (if (= resolution :raw)
conj
(redux/post-complete
conj
(partial fill-timeseries
(case resolution
:month (t/months 1)
:day (t/days 1)))))})
(fn [[x y]]
[(-> x t.format/parse t.coerce/to-long truncate-timestamp) y]))
(fn [{:keys [series linear-regression]}]
(let [ys-r (not-empty (reverse (map second series)))
{:keys [trend seasonal reminder]}
(when (and (not= resolution :raw)
(unbounded-computation? max-cost))
(decompose-timeseries resolution series))]
(merge {:series series
:linear-regression linear-regression
:trend trend
:seasonal seasonal
:reminder reminder}
(case resolution
:month {:YoY (growth (first ys-r) (nth ys-r 11))
:YoY-previous (growth (second ys-r) (nth ys-r 12))
:MoM (growth (first ys-r) (second ys-r))
:MoM-previous (growth (second ys-r) (nth ys-r 2))}
:day {:DoD (growth (first ys-r) (second ys-r))
:DoD-previous (growth (second ys-r) (nth ys-r 2))}
:raw nil))))))
;; Fall back to `:raw` when no resolution is supplied; the `case` forms below
;; have no default clause and would otherwise throw on nil.
(let [resolution (or resolution :raw)]
  (redux/post-complete
   (redux/pre-step
    (redux/fuse {:linear-regression (stats/simple-linear-regression first second)
                 ;; For `:raw` the points are collected as-is; otherwise the
                 ;; collected points are passed through `fill-timeseries`
                 ;; with a step of one month or one day.
                 :series            (if (= resolution :raw)
                                      conj
                                      (redux/post-complete
                                       conj
                                       (partial fill-timeseries
                                                (case resolution
                                                  :month (t/months 1)
                                                  :day   (t/days 1)))))})
    ;; Parse x, convert it to a long and truncate it before it reaches the
    ;; reducers above.
    (fn [[x y]]
      [(-> x t.format/parse t.coerce/to-long truncate-timestamp) y]))
   (fn [{:keys [series linear-regression]}]
     (let [;; y values, most recent first; nil when the series is empty.
           ys-r  (->> series (map second) reverse not-empty)
           ;; Bounds-safe lookup: a series shorter than the requested offset
           ;; yields nil instead of throwing IndexOutOfBoundsException. This
           ;; matches what an empty series already produced (`ys-r` is nil
           ;; then, and `nth` on nil returns nil).
           ;; NOTE(review): `growth` already receives nils for an empty
           ;; series, so it presumably tolerates them -- confirm.
           nth-y (fn [i] (nth ys-r i nil))
           {:keys [trend seasonal reminder]}
           (when (and (not= resolution :raw)
                      (unbounded-computation? max-cost))
             (decompose-timeseries resolution series))]
       (merge {;; Turn the truncated timestamps back into datetimes.
               :series            (for [[x y] series]
                                    [(t.coerce/from-long (pad-timestamp x)) y])
               :linear-regression linear-regression
               :trend             trend
               :seasonal          seasonal
               :reminder          reminder}
              (case resolution
                :month {:YoY          (growth (first ys-r) (nth-y 11))
                        :YoY-previous (growth (second ys-r) (nth-y 12))
                        :MoM          (growth (first ys-r) (second ys-r))
                        :MoM-previous (growth (second ys-r) (nth-y 2))}
                :day   {:DoD          (growth (first ys-r) (second ys-r))
                        :DoD-previous (growth (second ys-r) (nth-y 2))}
                :raw   nil)))))))
(defmethod fingerprinter [Category Any]
[opts [x y]]
......@@ -302,11 +308,14 @@
t.format/parse)
(fn [{:keys [histogram histogram-hour histogram-day histogram-month
histogram-quarter]}]
(let [nil-count (nil-count histogram)]
{:min (hist/minimum histogram)
:max (hist/maximum histogram)
:histogram (bins histogram)
:percentiles (apply hist/percentiles histogram percentiles)
(let [nil-count (nil-count histogram)
->datetime (comp t.coerce/from-long long)]
{:min (->datetime (hist/minimum histogram))
:max (->datetime (hist/maximum histogram))
:histogram (m/map-keys ->datetime (bins histogram))
:percentiles (m/map-vals ->datetime
(apply hist/percentiles histogram
percentiles))
:histogram-hour (bins histogram-hour)
:histogram-day (bins histogram-day)
:histogram-month (bins histogram-month)
......
......@@ -5,8 +5,6 @@
(derive :type/Dictionary :type/Collection)
(derive :type/Array :type/Collection)
(derive :type/Nil :type/*)
;;; Numeric Types
(derive :type/Number :type/*)
......
(ns metabase.fingerprinting-test
(:require (clj-time [coerce :as t.coerce]
[core :as t])
(:require [clj-time.coerce :as t.coerce]
[clj-time.core :as t]
[expectations :refer :all]
[metabase.fingerprinting :refer :all :as f]
[metabase.fingerprinting :as f :refer :all]
[redux.core :as redux]))
(def numbers [0.1 0.4 0.2 nil 0.5 0.3 0.51 0.55 0.22])
(def datetimes [(t/date-time 2016 1) (t/date-time 2016 2) nil (t/date-time 2016 5)
(t/date-time 2016 7 23) (t/date-time 2016 10 2)])
(def categories [:foo :baz :bar :bar nil :foo])
;; Numeric fixture; one of the entries is nil.
(def ^:private numbers [0.1 0.4 0.2 nil 0.5 0.3 0.51 0.55 0.22])
;; Datetime fixture (all dates in 2016); one of the entries is nil.
(def ^:private datetimes [(t/date-time 2016 1) (t/date-time 2016 2) nil
(t/date-time 2016 5) (t/date-time 2016 7 23)
(t/date-time 2016 10 2)])
;; Keyword fixture with repeated values; one of the entries is nil.
(def ^:private categories [:foo :baz :bar :bar nil :foo])
(def hist (transduce identity histogram (take 100 (cycle numbers))))
(def hist-c (transduce identity histogram-categorical (take 100 (cycle categories))))
;; Result of reducing the first 100 values of the cycled `numbers` fixture
;; with the `histogram` reducing function.
(def ^:private hist (transduce identity histogram (take 100 (cycle numbers))))
;; Result of reducing the first 100 values of the cycled `categories` fixture
;; with the `histogram-categorical` reducing function.
(def ^:private hist-c (transduce identity histogram-categorical
(take 100 (cycle categories))))
(expect
[2
......@@ -40,7 +42,7 @@
[(bins hist)])
(expect
b [100.0
[100.0
11]
[(total-count hist)
(nil-count hist)])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment