From d43dfd4ad3a6ddb54d5181b942c7159e534d575c Mon Sep 17 00:00:00 2001
From: Simon Belak <simon.belak@gmail.com>
Date: Fri, 10 Nov 2017 17:12:44 +0100
Subject: [PATCH] Add output sampling

---
Review notes (this area is ignored when the patch is applied):
 * triangle-area now returns the absolute area. The signed value made
   `max-key` favour points lying on one side of the left/right-center line.
 * largest-triangle-three-buckets appends the `[tail]` bucket with `concat`.
   `conj` on the seq returned by `partition` prepends, which placed `tail`
   directly after `head` in the output and skipped the last real bucket.
 * `(last points)` became `(peek points)`; `points` is always a vector.
 * `^Integer` hints on vars holding Long literals were changed to `^Long`.
 * All fixes are in-place: the number of added/removed lines per hunk is
   unchanged, so hunk headers and the diffstat below still hold.
 * Leading whitespace of context lines was reconstructed. If a hunk fails to
   match, apply with `git am --ignore-whitespace`.
 * NOTE(review): the TREND series now reads `(first series)`/`(last series)`;
   confirm it is never reached with an empty series.

 .../src/metabase/xray/containers/CardXRay.jsx |  2 +-
 src/metabase/feature_extraction/core.clj      |  1 -
 .../feature_extraction/feature_extractors.clj | 82 ++++++++++++++++++-
 .../feature_extractors_test.clj               | 19 +++++
 4 files changed, 98 insertions(+), 6 deletions(-)

diff --git a/frontend/src/metabase/xray/containers/CardXRay.jsx b/frontend/src/metabase/xray/containers/CardXRay.jsx
index c4fa552d160..e2c7469a12f 100644
--- a/frontend/src/metabase/xray/containers/CardXRay.jsx
+++ b/frontend/src/metabase/xray/containers/CardXRay.jsx
@@ -118,7 +118,7 @@ class CardXRay extends Component {
                                         series={[
                                             {
                                                 card: xray.features.model,
-                                                data: xray.dataset
+                                                data: xray.features.series
                                             },
                                             {
                                                 card: {
diff --git a/src/metabase/feature_extraction/core.clj b/src/metabase/feature_extraction/core.clj
index c06930af582..16f11cd98f7 100644
--- a/src/metabase/feature_extraction/core.clj
+++ b/src/metabase/feature_extraction/core.clj
@@ -160,7 +160,6 @@
                                          (ensure-aligment fields cols rows)))
                         {:model card
                          :table (Table (:table_id card))})
-     :dataset     dataset
      :sample?     (sampled? opts dataset)
      :comparables (comparables card)}))
 
diff --git a/src/metabase/feature_extraction/feature_extractors.clj b/src/metabase/feature_extraction/feature_extractors.clj
index a603bf44934..d1b472a52c9 100644
--- a/src/metabase/feature_extraction/feature_extractors.clj
+++ b/src/metabase/feature_extraction/feature_extractors.clj
@@ -97,10 +97,83 @@
                                       :strategy :num-bins})]
           (h/equidistant-bins min-value max-value bin-width histogram))))))
 
+(defn- triangle-area
+  "Return the (unsigned) area of the triangle specified by vertices `[x1, y1]`,
+   `[x2, y2]`, and `[x3, y3]`.
+   http://mathworld.wolfram.com/TriangleArea.html"
+  [[x1 y1] [x2 y2] [x3 y3]]
+  (Math/abs (* 0.5 (+ (* (- x2) y1)
+                      (* x3 y1)
+                      (* x1 y2)
+                      (* (- x3) y2)
+                      (* (- x1) y3)
+                      (* x2 y3)))))
+
+(defn largest-triangle-three-buckets
+  "Downsample series `series` to (approximately) `target-size` data points using
+   Largest-Triangle-Three-Buckets algorithm. Series needs to be at least
+   2*`target-size` long for the algorithm to make sense. If it is not, the
+   original series is returned unchanged.
+
+   Note: this is true downsampling (selecting just some points, always keeping
+   the first and the last one), with no smoothing performed.
+   https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf"
+  [target-size series]
+  (let [current-size (count series)]
+    (if (< current-size (* 2 target-size))
+      series
+      (let [[head & body] series
+            tail          (last body)
+            body          (butlast body)
+            bucket-size   (-> (/ current-size target-size) Math/floor int)]
+        (conj (->> (concat (partition bucket-size body) [[tail]]) ; `tail` bucket goes last
+                   (partition 2 1)
+                   (reduce
+                    (fn [points [middle right]]
+                      (let [left         (peek points)
+                            right-center (transduce identity
+                                                    (redux/juxt
+                                                     ((map first) stats/mean)
+                                                     ((map second) stats/mean))
+                                                    right)]
+                        (conj points (apply max-key (partial triangle-area
+                                                             left
+                                                             right-center)
+                                            middle))))
+                    [head]))
+              tail)))))
+
+(defn saddles
+  "Returns the number of saddles (direction changes of `y`) in a series of `[x y]` points."
+  [series]
+  (->> series
+       (partition 2 1)
+       (partition-by (fn [[[_ y1] [_ y2]]]
+                       (>= y2 y1)))
+       rest
+       count))
+
+; The largest dataset returned will therefore be 2*target-1 points as we need at
+; least 2 points per bucket before we can apply downsampling.
+(def ^:private ^Long datapoint-target-smooth 100)
+(def ^:private ^Long datapoint-target-noisy  300)
+
+(def ^:private ^Double noisiness-threshold 0.05)
+
+(defn- target-size
+  [series]
+  (if (some-> series
+              saddles
+              (safe-divide (count series))
+              (> noisiness-threshold))
+    datapoint-target-noisy
+    datapoint-target-smooth))
+
 (defn- series->dataset
   ([fields series] (series->dataset identity fields series))
   ([keyfn fields series]
-   {:rows    (for [[x y] series]
+   {:rows    (for [[x y] (largest-triangle-three-buckets (target-size series)
+                                                         series)]
                [(keyfn x) y])
     :columns (map :name fields)
     :cols    (map #(dissoc % :remapped_from) fields)}))
@@ -407,7 +480,7 @@
                            (->> series
                                 (partition 2 1)
                                 (map (fn [[[_ y1] [x y2]]]
-                                       [x (growth y2 y1)]))))
+                                       [x (or (growth y2 y1) 0)]))))
      :seasonal-decomposition
      (when (and resolution
                 (costs/unbounded-computation? max-cost))
@@ -432,14 +505,15 @@
                     {:name         "TREND"
                      :display_name "Linear regression trend"
                      :base_type    :type/Float}]
-                   (for [[x y] series]
+                   ; 2 points fully define a line
+                   (for [[x y] [(first series) (last series)]]
                      [x (+ (* slope x) offset)])))
 
 (defmethod x-ray [DateTime Num]
   [{:keys [field series] :as features}]
   (let [x-field (first field)]
     (-> features
-        (dissoc :series)
+        (update :series (partial series->dataset from-double field))
         (update :growth-series (partial series->dataset from-double
                                         [x-field
                                          {:name         "GROWTH"
diff --git a/test/metabase/feature_extraction/feature_extractors_test.clj b/test/metabase/feature_extraction/feature_extractors_test.clj
index baa38010253..395b0df016c 100644
--- a/test/metabase/feature_extraction/feature_extractors_test.clj
+++ b/test/metabase/feature_extraction/feature_extractors_test.clj
@@ -3,6 +3,7 @@
              [core :as t]
              [coerce :as t.coerce]]
             [expectations :refer :all]
+            [medley.core :as m]
             [metabase.feature-extraction.feature-extractors :refer :all :as fe]
             [metabase.feature-extraction.histogram :as h]
             [redux.core :as redux]))
@@ -180,3 +181,21 @@
        :type)
    (-> (->features {:base_type :type/NeverBeforeSeen} numbers)
        :type)])
+
+(expect
+  [0 1 3 0]
+  [(#'fe/saddles [[1 1] [2 2] [3 3]])
+   (#'fe/saddles [[1 1] [2 2] [3 -2]])
+   (#'fe/saddles [[1 1] [2 2] [3 -2] [4 5] [5 2]])
+   (#'fe/saddles nil)])
+
+(expect
+  [(var-get #'fe/datapoint-target-smooth)
+   (var-get #'fe/datapoint-target-noisy)]
+  [(#'fe/target-size (m/indexed (range 10)))
+   (#'fe/target-size (m/indexed (repeatedly 1000 rand)))])
+
+(expect
+  [32 10]
+  [(count (largest-triangle-three-buckets 30 (m/indexed (repeatedly 1000 rand))))
+   (count (largest-triangle-three-buckets 30 (m/indexed (repeatedly 10 rand))))])
-- 
GitLab