Skip to content
Snippets Groups Projects
Commit d43dfd4a authored by Simon Belak's avatar Simon Belak
Browse files

Add output sampling

parent 95a0e331
No related branches found
No related tags found
No related merge requests found
......@@ -118,7 +118,7 @@ class CardXRay extends Component {
series={[
{
card: xray.features.model,
data: xray.dataset
data: xray.features.series
},
{
card: {
......
......@@ -160,7 +160,6 @@
(ensure-aligment fields cols rows)))
{:model card
:table (Table (:table_id card))})
:dataset dataset
:sample? (sampled? opts dataset)
:comparables (comparables card)}))
......
......@@ -97,10 +97,83 @@
:strategy :num-bins})]
(h/equidistant-bins min-value max-value bin-width histogram))))))
(defn- triangle-area
  "Return the (non-negative) area of the triangle specified by vertices
  `[x1, y1]`, `[x2, y2]`, and `[x3, y3]`.

  The determinant formula below yields a *signed* value whose sign depends on
  the orientation (clockwise vs. counter-clockwise) of the vertices, so the
  absolute value is taken to get an actual area that can safely be compared.

  http://mathworld.wolfram.com/TriangleArea.html"
  [[x1 y1] [x2 y2] [x3 y3]]
  (Math/abs (* 0.5 (+ (* (- x2) y1)
                      (* x3 y1)
                      (* x1 y2)
                      (* (- x3) y2)
                      (* (- x1) y3)
                      (* x2 y3)))))
(defn largest-triangle-three-buckets
  "Downsample series `series` to (approximately) `target-size` data points using
  Largest-Triangle-Three-Buckets algorithm. Series needs to be at least
  2*`target-size` long for the algorithm to make sense. If it is not, the
  original series is returned unchanged.

  The first and the last point of `series` are always kept. The points in
  between are split into consecutive buckets of equal size, and from each
  bucket the point forming the largest triangle with the previously selected
  point and the centroid of the following bucket is selected. Trailing points
  that do not fill a whole bucket are not considered.

  Note: this is true downsampling (selecting just some points), with no
  smoothing performed.

  https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf"
  [target-size series]
  (let [current-size (count series)]
    (if (< current-size (* 2 target-size))
      series
      (let [[head & body] series
            tail          (last body)
            body          (butlast body)
            bucket-size   (-> (/ current-size target-size) Math/floor int)
            ;; `partition` returns a seq and `conj` on a seq *prepends*, which
            ;; would put the sentinel bucket holding `tail` before the first
            ;; real bucket. `concat` keeps it after the last real bucket, so
            ;; every real bucket gets sampled and points stay in series order.
            buckets       (concat (partition bucket-size body) [[tail]])]
        (conj (->> buckets
                   (partition 2 1)
                   (reduce
                    (fn [points [middle right]]
                      (let [;; `points` is always a vector, so `peek` gives the
                            ;; most recently selected point in constant time.
                            left         (peek points)
                            ;; centroid (mean x, mean y) of the next bucket
                            right-center (transduce identity
                                                    (redux/juxt
                                                     ((map first) stats/mean)
                                                     ((map second) stats/mean))
                                                    right)]
                        (conj points (apply max-key (partial triangle-area
                                                             left
                                                             right-center)
                                            middle))))
                    [head]))
              tail)))))
(defn saddles
  "Returns the number of saddles in a given series, i.e. how many times the
  y-values switch between non-decreasing and decreasing when walking the
  `[x y]` points of `series` in order."
  [series]
  (let [non-decreasing? (fn [[[_ y-prev] [_ y-next]]]
                          (>= y-next y-prev))
        ;; consecutive steps grouped into runs going in the same direction
        runs            (partition-by non-decreasing? (partition 2 1 series))]
    ;; every run after the first one starts with a change of direction
    (count (rest runs))))
;; Target number of data points a series is downsampled to. Downsampling only
;; kicks in once there are at least 2 points per bucket (2*target points), so
;; the largest dataset returned without sampling has 2*target-1 points.
;; Integer literals are read as `java.lang.Long`, hence the `^Long` hints.
(def ^:private ^Long datapoint-target-smooth 100)
(def ^:private ^Long datapoint-target-noisy 300)
;; Ratio of saddles to series length above which a series is treated as noisy.
(def ^:private ^Double noisiness-threshold 0.05)
(defn- target-size
  "Pick the downsampling target for `series`: `datapoint-target-noisy` when the
  ratio of saddles to series length exceeds `noisiness-threshold`, otherwise
  (including when the ratio cannot be computed) `datapoint-target-smooth`."
  [series]
  (let [noisiness (some-> series
                          saddles
                          (safe-divide (count series)))]
    (if (and (some? noisiness)
             (> noisiness noisiness-threshold))
      datapoint-target-noisy
      datapoint-target-smooth)))
(defn- series->dataset
  "Turn `series` (a sequence of `[x y]` pairs) into a dataset map with `:rows`,
  `:columns` and `:cols`.

  The series is downsampled with `largest-triangle-three-buckets` to the size
  chosen by `target-size` before rows are built. `keyfn` (default `identity`)
  is applied to each x value. `:columns` holds the names of `fields` and
  `:cols` the fields themselves with `:remapped_from` removed."
  ([fields series] (series->dataset identity fields series))
  ([keyfn fields series]
   {:rows    (for [[x y] (largest-triangle-three-buckets (target-size series)
                                                         series)]
               [(keyfn x) y])
    :columns (map :name fields)
    :cols    (map #(dissoc % :remapped_from) fields)}))
......@@ -407,7 +480,7 @@
(->> series
(partition 2 1)
(map (fn [[[_ y1] [x y2]]]
[x (growth y2 y1)]))))
[x (or (growth y2 y1) 0)]))))
:seasonal-decomposition
(when (and resolution
(costs/unbounded-computation? max-cost))
......@@ -432,14 +505,15 @@
{:name "TREND"
:display_name "Linear regression trend"
:base_type :type/Float}]
(for [[x y] series]
; 2 points fully define a line
(for [[x y] [(first series) (last series)]]
[x (+ (* slope x) offset)])))
(defmethod x-ray [DateTime Num]
[{:keys [field series] :as features}]
(let [x-field (first field)]
(-> features
(dissoc :series)
(update :series (partial series->dataset from-double field))
(update :growth-series (partial series->dataset from-double
[x-field
{:name "GROWTH"
......
......@@ -3,6 +3,7 @@
[core :as t]
[coerce :as t.coerce]]
[expectations :refer :all]
[medley.core :as m]
[metabase.feature-extraction.feature-extractors :refer :all :as fe]
[metabase.feature-extraction.histogram :as h]
[redux.core :as redux]))
......@@ -180,3 +181,21 @@
:type)
(-> (->features {:base_type :type/NeverBeforeSeen} numbers)
:type)])
;; `saddles` counts changes of direction: none for a monotonic series, one per
;; reversal otherwise, and zero when there is no series at all.
(expect
  [0 1 3 0]
  (mapv #'fe/saddles [[[1 1] [2 2] [3 3]]
                      [[1 1] [2 2] [3 -2]]
                      [[1 1] [2 2] [3 -2] [4 5] [5 2]]
                      nil]))
;; A monotonic series gets the smooth target; a series of random values is
;; expected to get the noisy one.
(expect
  (mapv var-get [#'fe/datapoint-target-smooth
                 #'fe/datapoint-target-noisy])
  (mapv #'fe/target-size [(m/indexed (range 10))
                          (m/indexed (repeatedly 1000 rand))]))
;; 1000 points with target 30: first point + one point for each of the 30
;; buckets + last point = 32. 10 points is fewer than 2*30, so the series is
;; returned as is.
(expect
  [32 10]
  (mapv (fn [size]
          (count (largest-triangle-three-buckets
                  30
                  (m/indexed (repeatedly size rand)))))
        [1000 10]))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment