Skip to content
Snippets Groups Projects
Commit e9f4d8f1 authored by Simon Belak's avatar Simon Belak
Browse files

Add descriptions, tidy up some features.

parent 85fe5f76
No related branches found
No related tags found
No related merge requests found
......@@ -5,7 +5,8 @@
[metabase.fingerprinting
[comparison :as comparison]
[costs :as costs]
[fingerprinters :as f]]
[fingerprinters :as f]
[feature-descriptions :refer [add-descriptions]]]
[medley.core :as m]
[metabase.models
[card :refer [Card]]
......@@ -140,14 +141,6 @@
(comparison/fingerprint-distance a b))])
a b))}))
(def ^:private ^{:arglists '([fingerprint])} add-descriptions
(partial m/map-kv (fn [k v]
(if (#{:field :type :table :card :segment} k)
[k v]
[k {:value v
:label k
:descripttion k}]))))
(defn- trim-decimals
[decimal-places fingerprint]
(postwalk
......
(ns metabase.fingerprinting.feature-descriptions
  "Descriptions of all the fingerprint features exposed as x-rays."
  ;; The ns macro expects keyword-headed reference clauses: `(:require ...)`,
  ;; not `(require ...)`. The symbol form is rejected by the ns spec in
  ;; Clojure 1.9+.
  (:require [medley.core :as m]))
(def ^:private descriptions
  "Map from fingerprint feature key to its human-readable metadata.

  Every entry carries a `:label`; `:description` and `:link` are optional.
  Feature keys that do not appear here are passed through untouched by
  `add-descriptions`."
  {:histogram              {:label       "Distribution"
                            :description "Distribution of values."
                            :link        "https://en.wikipedia.org/wiki/Probability_mass_function"}
   ;; No description text was ever supplied for percentiles. A bare
   ;; `:description` key with no value makes the map literal unreadable
   ;; (odd number of forms), so the key is omitted.
   :percentiles            {:label "Percentiles"
                            :link  "https://en.wikipedia.org/wiki/Percentile"}
   :sum                    {:label       "Sum"
                            :description "Sum of all values."}
   :sum-of-squares         {:label       "Sum of squares"
                            :description "Sum of squares of all values."}
   :%>mean                 {:label "Share of values greater than mean."}
   :cv                     {:label       "Coefficient of variation"
                            :description "Ratio between mean and standard deviation. Used as a dispersion measure."
                            :link        "https://en.wikipedia.org/wiki/Coefficient_of_variation"}
   :range-vs-sd            {:label "Ratio between standard deviation and range of values."}
   :mean-median-spread     {:label       "Relative mean-median spread"
                            :description "The lower the ratio, the more symmetric the distribution."}
   :range                  {:label       "Range"
                            :description "Range between the smallest and the largest value."}
   :cardinality            {:label       "Cardinality"
                            :description "Number of different values."}
   :min                    {:label "Minimal value"}
   :max                    {:label "Maximal value"}
   :mean                   {:label       "Mean"
                            :description "Mean (expected) value."
                            :link        "https://en.wikipedia.org/wiki/Mean"}
   :median                 {:label       "Median"
                            :description "Value separating the data set in two equal halves -- the \"middle\" value."
                            :link        "https://en.wikipedia.org/wiki/Median"}
   :var                    {:label       "Variance"
                            :description "Measure of how far the values are spread from the mean."
                            :link        "https://en.wikipedia.org/wiki/Variance"}
   :sd                     {:label       "Standard deviation"
                            :description "Measure of how far the values are spread from the mean."
                            :link        "https://en.wikipedia.org/wiki/Standard_deviation"}
   :count                  {:label       "Count"
                            :description "Number of rows in the dataset."}
   :kurtosis               {:label       "Kurtosis"
                            :description "Descriptor of the shape of the distribution. Measures tail extremity (outliers)"
                            :link        "https://en.wikipedia.org/wiki/Kurtosis"}
   :skewness               {:label       "Skewness"
                            :description "Measure of asymmetry of the distribution."
                            :link        "https://en.wikipedia.org/wiki/Skewness"}
   :entropy                {:label       "Entropy"
                            :description "Measure of unpredictability of the state (ie. of its average information content)."
                            :link        "https://en.wikipedia.org/wiki/Entropy_(information_theory)"}
   :linear-regression      {:label       "Linear regression"
                            :description "Slope and intercept of a linear function fit to data."
                            :link        "https://en.wikipedia.org/wiki/Linear_regression"}
   :correlation            {:label       "Correlation"
                            :description "The quality of a least squares fitting -- the extent to which two variables have a linear relationship with each other."
                            :link        "http://mathworld.wolfram.com/CorrelationCoefficient.html"}
   :covariance             {:label       "Covariance"
                            :description "A measure of the joint variability."
                            :link        "https://en.wikipedia.org/wiki/Covariance"}
   :seasonal-decomposition {:label       "Seasonal decomposition"
                            :description "Decomposes time series into seasonal, trend, and residual components."
                            :link        "http://www.stat.washington.edu/courses/stat527/s13/readings/Cleveland_JASA_1979.pdf"}
   :earliest               {:label "The earliest value"}
   :latest                 {:label "The latest value"}
   :histogram-hour         {:label "Distribution of hours in a day"}
   :histogram-day          {:label "Distribution of days of week"}
   :histogram-month        {:label "Distribution of months"}
   :histogram-quarter      {:label "Distribution of quarters"}})
(defn add-descriptions
  "Decorate the features of `fingerprint` with their human-readable metadata.

  For every key that has an entry in `descriptions`, the value is replaced by
  that entry with the original value stored under `:value`. Keys without an
  entry keep their value unchanged."
  [fingerprint]
  (m/map-kv (fn [feature value]
              (if-let [description (get descriptions feature)]
                [feature (assoc description :value value)]
                [feature value]))
            fingerprint))
......@@ -2,7 +2,7 @@
"Fingerprinting (feature extraction) for various models."
(:require [bigml.histogram.core :as h.impl]
[bigml.sketchy.hyper-loglog :as hyper-loglog]
[clojure.math.numeric-tower :refer [ceil expt floor]] ;;;;;; temp!
[clojure.math.numeric-tower :refer [ceil expt floor round]] ;;;;;; temp!
[clj-time
[coerce :as t.coerce]
[core :as t]
......@@ -140,28 +140,31 @@
(defn- equidistant-bins
[histogram]
(let [{:keys [min max]} (h.impl/bounds histogram)
bin-width (h/optimal-bin-width histogram)
{:keys [min-value num-bins bin-width]} (nicer-breakout
{:min-value min
:max-value max
:num-bins (calculate-num-bins
min max bin-width)
:strategy :num-bins})]
(->> min-value
(iterate (partial + bin-width))
(take (inc num-bins))
(map (fn [x]
[x (h.impl/sum histogram x)]))
(partition 2 1)
(map (fn [[[x s1] [_ s2]]]
[x (- s2 s1)])))))
(if (h/categorical? histogram)
(-> histogram h.impl/bins first :target :counts)
(let [{:keys [min max]} (h.impl/bounds histogram)
bin-width (h/optimal-bin-width histogram)
{:keys [min-value num-bins bin-width]} (nicer-breakout
{:min-value min
:max-value max
:num-bins (calculate-num-bins
min max bin-width)
:strategy :num-bins})]
(->> min-value
(iterate (partial + bin-width))
(take (inc num-bins))
(map (fn [x]
[x (h.impl/sum histogram x)]))
(partition 2 1)
(map (fn [[[x s1] [_ s2]]]
[x (- s2 s1)]))))))
(defn- histogram->dataset
([field histogram] (histogram->dataset identity field histogram))
([keyfn field histogram]
{:rows (for [[k v] (equidistant-bins histogram)]
[(keyfn k) v])
{:rows (let [norm (/ (h.imp/total-count histogram))]
(for [[k v] (equidistant-bins histogram)]
[(keyfn k) (* v norm)]))
:columns [(:name field) "SHARE"]
:cols [field
{:name "SHARE"
......@@ -201,7 +204,7 @@
(dissoc fingerprint :type :field :has-nils?))
(defmethod fingerprinter Num
[_ field]
[{:keys [max-cost]} field]
(redux/post-complete
(redux/fuse {:histogram h/histogram
:cardinality cardinality
......@@ -214,45 +217,47 @@
(if (pos? (h/total-count histogram))
(let [nil-count (h/nil-count histogram)
total-count (h/total-count histogram)
unique% (/ cardinality (max total-count 1))
uniqueness (/ cardinality (max total-count 1))
var (or (h.impl/variance histogram) 0)
sd (math/sqrt var)
min (h.impl/minimum histogram)
max (h.impl/maximum histogram)
mean (h.impl/mean histogram)
median (h.impl/median histogram)
span (- max min)]
{:histogram histogram
:percentiles (apply h.impl/percentiles histogram percentiles)
:sum sum
:sum-of-squares sum-of-squares
:positive-definite? (>= min 0)
:%>mean (- 1 ((h.impl/cdf histogram) mean))
:cardinality-vs-count unique%
:var>sd? (> var sd)
:nil% (/ nil-count (clojure.core/max total-count 1))
:has-nils? (pos? nil-count)
:0<=x<=1? (<= 0 min max 1)
:-1<=x<=1? (<= -1 min max 1)
:cv (safe-divide mean sd)
:span-vs-sd (safe-divide span sd)
:mean-median-spread (safe-divide span (- mean median))
:min-vs-max (safe-divide min max)
:span span
:cardinality cardinality
:min min
:max max
:mean mean
:median median
:var var
:sd sd
:count total-count
:kurtosis kurtosis
:skewness skewness
:all-distinct? (>= unique% (- 1 cardinality-error))
:entropy (h/entropy histogram)
:type Num
:field field})
range (- max min)]
(merge
{:histogram histogram
:percentiles (apply h.impl/percentiles histogram percentiles)
:positive-definite? (>= min 0)
:%>mean (- 1 ((h.impl/cdf histogram) mean))
:uniqueness uniqueness
:var>sd? (> var sd)
:nil% (/ nil-count (clojure.core/max total-count 1))
:has-nils? (pos? nil-count)
:0<=x<=1? (<= 0 min max 1)
:-1<=x<=1? (<= -1 min max 1)
:cv (safe-divide sd mean)
:range-vs-sd (safe-divide sd range)
:mean-median-spread (safe-divide (- mean median) range)
:min-vs-max (safe-divide min max)
:range range
:cardinality cardinality
:min min
:max max
:mean mean
:median median
:var var
:sd sd
:count total-count
:kurtosis kurtosis
:skewness skewness
:all-distinct? (>= unique% (- 1 cardinality-error))
:entropy (h/entropy histogram)
:type Num
:field field}
(when (costs/full-scan? max-cost)
{:sum sum
:sum-of-squares sum-of-squares})))
{:count 0
:type Num
:field field}))))
......@@ -261,11 +266,14 @@
[fingerprint]
(select-keys fingerprint
[:histogram :mean :median :min :max :sd :count :kurtosis
:skewness :entropy :nil% :cardinality-vs-count :span]))
:skewness :entropy :nil% :uniqueness :range :min-vs-max]))
(defmethod x-ray Num
[{:keys [field] :as fingerprint}]
(update fingerprint :histogram (partial histogram->dataset field)))
(-> fingerprint
(update :histogram (partial histogram->dataset field))
(dissoc :has-nils? :var>sd? :0<=x<=1? :-1<=x<=1? :all-distinct?
:positive-definite? :var>sd? :uniqueness :min-vs-max)))
(defmethod fingerprinter [Num Num]
[_ field]
......@@ -389,7 +397,7 @@
(defn- quarter
[dt]
(-> (t/month dt) (/ 3) Math/ceil long))
(-> dt t/month (/ 3) Math/ceil long))
(defmethod fingerprinter DateTime
[_ field]
......@@ -409,7 +417,7 @@
histogram-quarter]}]
(let [nil-count (h/nil-count histogram)
total-count (h/total-count histogram)]
{:earlies (h.impl/minimum histogram)
{:earliest (h.impl/minimum histogram)
:latest (h.impl/maximum histogram)
:histogram histogram
:percentiles (apply h.impl/percentiles histogram percentiles)
......@@ -428,11 +436,45 @@
[fingerprint]
(dissoc fingerprint :type :percentiles :field :has-nils?))
(defn- round-to-month
  "Round `dt` to the nearest month boundary: dates up to and including the
  15th go to the start of their own month, later dates go to the start of the
  following month."
  [dt]
  (let [month-start (t/floor dt t/month)]
    (if (<= (t/day dt) 15)
      month-start
      ;; Add one month to the floored date rather than building a date from
      ;; `(inc (t/month dt))`: that produced month 13 for dates after
      ;; 15 December and threw. `t/plus` rolls over into January of the
      ;; next year.
      (t/plus month-start (t/months 1)))))
(defn- month-frequencies
  "Return a map from month number (1-12) to how many times that calendar
  month occurs between `earliest` and `latest`, both ends included, after
  each bound has been rounded with `round-to-month`."
  [earliest latest]
  ;; The first binding used to be spelled `earilest` and was fed `latest`, so
  ;; `earliest` was never rounded and the binding was dead.
  (let [earliest    (round-to-month earliest)
        latest      (round-to-month latest)
        start-month (t/month earliest)
        duration    (t/in-months (t/interval earliest latest))]
    ;; Walk zero-based month indices from the start month for `duration`
    ;; further months, wrapping around the year with `mod`, then shift back
    ;; to 1-based month numbers.
    (->> (range (dec start-month) (+ start-month duration))
         (map #(inc (mod % 12)))
         frequencies)))
(defn- quarter-frequencies
  "Return a map from quarter number (1-4) to how many times that quarter
  occurs between `earliest` and `latest`, both ends included, after each
  bound has been rounded with `round-to-month`. The span is measured in
  months and rounded to whole quarters."
  [earliest latest]
  ;; The first binding used to be spelled `earilest` and was fed `latest`, so
  ;; `earliest` was never rounded and the binding was dead.
  (let [earliest      (round-to-month earliest)
        latest        (round-to-month latest)
        start-quarter (quarter earliest)
        duration      (round (/ (t/in-months (t/interval earliest latest)) 3))]
    ;; Walk zero-based quarter indices from the start quarter for `duration`
    ;; further quarters, wrapping with `mod`, then shift back to 1-based.
    (->> (range (dec start-quarter) (+ start-quarter duration))
         (map #(inc (mod % 4)))
         frequencies)))
(defn- weigh-periodicity
  "Rescale the `[key value]` rows of `card` by how often each key's period
  occurs.

  `weights` maps a row key to its number of occurrences. Each value is
  multiplied by `baseline / weight`, where `baseline` is the smallest weight,
  so keys that occur more often are scaled down relative to the rarest one.

  Rows whose key has no entry in `weights` are left as they are, and an empty
  `weights` map returns `card` unchanged. Both cases previously threw: a
  missing key gave a nil divisor, and `(apply min nil)` is an arity error."
  [weights card]
  (if (empty? weights)
    card
    (let [baseline (apply min (vals weights))]
      (update card :rows (partial map (fn [[k v]]
                                        (if-let [weight (weights k)]
                                          [k (* v (/ baseline weight))]
                                          [k v])))))))

;; The call sites in this namespace spell the name `weight-periodicity`.
;; Keep that spelling resolvable without renaming the function itself.
(def ^:private weight-periodicity weigh-periodicity)
(defmethod x-ray DateTime
[{:keys [field] :as fingerprint}]
[{:keys [field earliest latest] :as fingerprint}]
(let [earliest (from-double earliest)
latest (from-double latest)])
(-> fingerprint
(update :earlies from-double)
(update :latest from-double)
(assoc :earliest earliest)
(assoc :latest latest)
(update :histogram (partial histogram->dataset from-double field))
(update :percentiles (partial m/map-vals from-double))
(update :histogram-hour (partial histogram->dataset
......@@ -445,16 +487,22 @@
:display_name "Day of week"
:base_type :type/Integer
:special_type :type/Category}))
(update :histogram-month (partial histogram->dataset
{:name "MONTH"
:display_name "Month of year"
:base_type :type/Integer
:special_type :type/Category}))
(update :histogram-quarter (partial histogram->dataset
{:name "QUARTER"
:display_name "Quarter of year"
:base_type :type/Integer
:special_type :type/Category}))))
(update :histogram-month (comp
(partial weight-periodicity
(month-frequencies earliest latest))
(partial histogram->dataset
{:name "MONTH"
:display_name "Month of year"
:base_type :type/Integer
:special_type :type/Category})))
(update :histogram-quarter (comp
(partial weight-periodicity
(quarter-frequencies earliest latest))
(partial histogram->dataset
{:name "QUARTER"
:display_name "Quarter of year"
:base_type :type/Integer
:special_type :type/Category})))))
(defmethod fingerprinter Category
[_ field]
......@@ -464,16 +512,16 @@
(fn [{:keys [histogram cardinality]}]
(let [nil-count (h/nil-count histogram)
total-count (h/total-count histogram)
unique% (/ cardinality (max total-count 1))]
{:histogram histogram
:cardinality-vs-count unique%
:nil% (/ nil-count (max total-count 1))
:has-nils? (pos? nil-count)
:cardinality cardinality
:count total-count
:entropy (h/entropy histogram)
:type Category
:field field}))))
uniqueness (/ cardinality (max total-count 1))]
{:histogram histogram
:uniqueness uniqueness
:nil% (/ nil-count (max total-count 1))
:has-nils? (pos? nil-count)
:cardinality cardinality
:count total-count
:entropy (h/entropy histogram)
:type Category
:field field}))))
(defmethod comparison-vector Category
[fingerprint]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment