Skip to content
Snippets Groups Projects
Commit d2310915 authored by Simon Belak's avatar Simon Belak
Browse files

Add zeros insight. Add docstrings to insights. Minor refactor

parent 7ebd3218
Branches
Tags
No related merge requests found
......@@ -43,13 +43,13 @@ export class NormalRangeInsight extends Component {
}
}
export class GapsInsight extends Component {
static insightType = "gaps"
static title = "Gaps in the data"
export class NilsInsight extends Component {
static insightType = "nils"
static title = "Missing data"
static icon = "warning"
render() {
const { mode, quality, filter, features: { table } } = this.props
const { quality, filter, features: { table } } = this.props
const viewAllRowsUrl = table && Question.create()
.query()
......@@ -62,9 +62,36 @@ export class GapsInsight extends Component {
// construct the question with filter
return (
<InsightText>
You have { quality } { mode } values in your data.
You have { quality } missing (null) values in your data.
<span> </span>
{ table && <span><Link to={viewAllRowsUrl}>View all rows</Link> with { mode } value.</span> }
{ table && <span><Link to={viewAllRowsUrl}>View all rows</Link> with missing value.</span> }
</InsightText>
)
}
}
export class ZerosInsight extends Component {
static insightType = "zeros"
static title = "0s in your data"
static icon = "warning"
render() {
const { quality, filter, features: { table } } = this.props
const viewAllRowsUrl = table && Question.create()
.query()
// imitate the required hydrated metadata format
.setTable({ ...table, database: { id: table.db_id }})
.addFilter(filter)
.question()
.getUrl()
// construct the question with filter
return (
<InsightText>
You have { quality } zeros in your data. They may be standins for missing data or indicate some other abnormality.
<span> </span>
{ table && <span><Link to={viewAllRowsUrl}>View all rows</Link> with zeros.</span> }
</InsightText>
)
}
......@@ -185,9 +212,11 @@ export class RegimeChangeInsight extends Component {
const INSIGHT_COMPONENTS = [
// any field
NilsInsight,
// numeric fields
NormalRangeInsight,
GapsInsight,
ZerosInsight,
// timeseries
NoisinessInsight,
VariationTrendInsight,
......
No preview for this file type
......@@ -212,7 +212,7 @@
:subtotal price
:tax tax
:quantity (random-price 1 5)
:discount (sometimes 0.1 #(random-price 1 10))
:discount (sometimes 0.1 #(random-price 0 10))
:total (+ price tax)
:created_at (random-date-between (min-date (:created_at person) (:created_at product)) (u/relative-date :year 2))}))
......
......@@ -170,7 +170,7 @@
(let [nil-count (h/nil-count histogram)
total-count (h/total-count histogram)]
(merge {:histogram histogram
:nil% (/ nil-count (max total-count 1))
:nil% (math/safe-divide nil-count total-count)
:has-nils? (pos? nil-count)
:count total-count
:entropy (h/entropy histogram)}
......@@ -179,10 +179,10 @@
(defn- cardinality-extractor
[{:keys [cardinality histogram]}]
(let [uniqueness (/ cardinality (max (h/total-count histogram) 1))]
(let [uniqueness (math/safe-divide cardinality (h/total-count histogram))]
{:uniqueness uniqueness
:cardinality cardinality
:all-distinct? (>= uniqueness (- 1 cardinality-error))}))
:all-distinct? (some-> uniqueness (>= (- 1 cardinality-error)))}))
(defn- field-metadata-extractor
[field]
......@@ -200,7 +200,8 @@
{:histogram h/histogram
:cardinality cardinality
:kurtosis (redux/pre-step stats/kurtosis (somef double))
:skewness (redux/pre-step stats/skewness (somef double))}
:skewness (redux/pre-step stats/skewness (somef double))
:zeros (redux/with-xform stats/count (filter (somef zero?)))}
(when (costs/full-scan? max-cost)
{:sum (redux/with-xform + (keep (somef double)))
:sum-of-squares (redux/with-xform + (keep (somef #(Math/pow % 2))))})
......@@ -210,7 +211,7 @@
histogram-extractor
cardinality-extractor
(field-metadata-extractor field)
(fn [{:keys [histogram histogram-categorical kurtosis skewness sum
(fn [{:keys [histogram histogram-categorical kurtosis skewness sum zeros
sum-of-squares]}]
(let [var (h.impl/variance histogram)
sd (some-> var Math/sqrt)
......@@ -239,12 +240,13 @@
:skewness skewness
:sum sum
:sum-of-squares sum-of-squares
:zero% (math/safe-divide zeros (h/total-count histogram))
:histogram (or histogram-categorical histogram)})))))
(defmethod comparison-vector Num
[features]
(select-keys features
[:histogram :mean :median :min :max :sd :count :kurtosis
[:histogram :mean :median :min :max :sd :count :kurtosis :zero%
:skewness :entropy :nil% :uniqueness :range :min-vs-max]))
(defmethod x-ray Num
......@@ -252,7 +254,8 @@
(-> features
(update :histogram (partial histogram->dataset field))
(assoc :insights ((merge-juxt insights/normal-range
insights/gaps)
insights/zeros
insights/nils)
features))
(dissoc :has-nils? :var>sd? :0<=x<=1? :-1<=x<=1? :all-distinct?
:positive-definite? :var>sd? :uniqueness :min-vs-max)))
......@@ -511,6 +514,6 @@
(field-metadata-extractor field)
(fn [{:keys [total-count nil-count]}]
{:count total-count
:nil% (/ nil-count (max total-count 1))
:nil% (math/safe-divide nil-count total-count)
:has-nils? (pos? nil-count)
:type nil}))))
......@@ -47,15 +47,28 @@
(+ (impl/total-count histogram)
(nil-count histogram)))
(defn iqr
  "Describe the interquartile range of a non-categorical histogram.

  Returns a map holding the first quartile (`:q1`), the third quartile (`:q3`)
  and their difference (`:iqr`), or nil when `empty?` reports the histogram as
  empty. A categorical histogram violates the precondition.
  https://en.wikipedia.org/wiki/Interquartile_range"
  [^Histogram histogram]
  {:pre [(not (categorical? histogram))]}
  (when-not (empty? histogram)
    (let [{lower 0.25 upper 0.75} (impl/percentiles histogram 0.25 0.75)
          spread                  (- upper lower)]
      {:iqr spread
       :q1  lower
       :q3  upper})))
(defn optimal-bin-width
"Determine optimal bin width (and consequently number of bins) for a given
histogram using Freedman-Diaconis rule.
https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule"
[^Histogram histogram]
{:pre [(not (categorical? histogram))]}
(when-not (empty? histogram)
(let [{first-q 0.25 third-q 0.75} (impl/percentiles histogram 0.25 0.75)]
(* 2 (- third-q first-q) (math/pow (impl/total-count histogram) (/ -3))))))
(some-> histogram
iqr
:iqr
(* 2 (math/pow (impl/total-count histogram)
(/ -3)))))
(defn equidistant-bins
"Split histogram into `bin-width` wide bins. If `bin-width` is not given use
......
(ns metabase.feature-extraction.insights
"Data insights -- morsels of prepackaged analysis."
(:require [bigml.histogram.core :as h]
[clojure.math.numeric-tower :as num]
(:require [clojure.math.numeric-tower :as num]
[distributions.core :as d]
[kixi.stats.core :as stats]
[redux.core :as redux]
[metabase.feature-extraction
[histogram :as h]
[math :as math]
[timeseries :as ts]]))
......@@ -17,25 +17,35 @@
{~(keyword insight) insight#})))
(definsight normal-range
""
"What is the normal (expected) range for this data?
We define normal as being within interquartile range.
https://en.wikipedia.org/wiki/Interquartile_range"
[histogram]
(let [{lower 0.25 upper 0.75} (h/percentiles histogram 0.25 0.75)]
{:lower lower
:upper upper}))
(let [{:keys [q1 q3]} (h/iqr histogram)]
{:lower q1
:upper q3}))
(definsight gaps
""
(definsight nils
"Are there any nils in the data?"
[nil% field]
(when (pos? nil%)
{:mode :nils
:quality (if (< nil% 0.1)
{:quality (if (< nil% 0.1)
:some
:many)
:filter [[:IS_NULL [:field-id (:id field)]]]}))
:filter [:IS_NULL [:field-id (:id field)]]}))
(definsight zeros
  "Are there any 0s in the data?

  Yields nil when `zero%` is nil or not positive. Otherwise yields a map with
  `:quality` (`:some` when `zero%` is below 0.1, else `:many`) and a `:filter`
  clause matching rows where the field identified by `(:id field)` equals 0."
  [zero% field]
  ;; NOTE(review): `zero%` is computed with `math/safe-divide`, which presumably
  ;; returns nil when the denominator is 0 -- confirm. `pos?` throws on nil, so
  ;; the check is nil-guarded.
  (when (some-> zero% pos?)
    {:quality (if (< zero% 0.1)
                :some
                :many)
     :filter  [:= [:field-id (:id field)] 0]}))
(definsight autocorrelation
"
Template: Your data has a [strong/mild] autocorrelation at lag [lag]."
"Is there a significant autocorrelation at lag up to period length?
https://en.wikipedia.org/wiki/Autocorrelation"
[series resolution]
(let [{:keys [autocorrelation lag]} (math/autocorrelation
{:max-lag (or (some-> resolution
......@@ -50,7 +60,8 @@
:lag lag})))
(definsight noisiness
""
"Is the data noisy?
We determine noisiness by the relative number of saddles in the series."
[series resolution]
(let [saddles% (/ (math/saddles series) (max (count series) 1))]
(when (> saddles% 0.1)
......@@ -60,8 +71,9 @@
:recommended-resolution (ts/higher-resolution resolution)})))
(definsight variation-trend
"
https://en.wikipedia.org/wiki/Variance"
"Is there a consistent trend in changes of variation from one period to the
next?
https://en.wikipedia.org/wiki/Variance"
[resolution series]
(when resolution
(->> series
......@@ -104,7 +116,8 @@
:decreasing)})))))))))
(definsight seasonality
""
"Is there a seasonal component to the changes in data?
https://www.wessa.net/download/stl.pdf"
[seasonal-decomposition]
(when seasonal-decomposition
(let [diff (transduce identity stats/mean
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment