From 22601dbacce22ed9ab98e1f900c170f63f3a0e1f Mon Sep 17 00:00:00 2001
From: Simon Belak <simon@metabase.com>
Date: Tue, 26 May 2020 19:55:47 +0200
Subject: [PATCH] Harden fingerprinting (#12573)

---
 .../analyze/fingerprint/fingerprinters.clj    | 93 ++++++++++++-------
 .../sync/analyze/fingerprint/insights.clj     | 26 +++---
 .../fingerprint/fingerprinters_test.clj       | 14 ++-
 .../analyze/fingerprint/insights_test.clj     | 66 +++++++------
 4 files changed, 123 insertions(+), 76 deletions(-)

diff --git a/src/metabase/sync/analyze/fingerprint/fingerprinters.clj b/src/metabase/sync/analyze/fingerprint/fingerprinters.clj
index 47ac84fef33..9f0a4c97cd3 100644
--- a/src/metabase/sync/analyze/fingerprint/fingerprinters.clj
+++ b/src/metabase/sync/analyze/fingerprint/fingerprinters.clj
@@ -6,13 +6,14 @@
             [kixi.stats
              [core :as stats]
              [math :as math]]
+            [medley.core :as m]
             [metabase.models.field :as field]
             [metabase.sync.analyze.classifiers.name :as classify.name]
             [metabase.sync.util :as sync-util]
             [metabase.util :as u]
             [metabase.util
              [date-2 :as u.date]
-             [i18n :refer [trs]]]
+             [i18n :refer [deferred-trs trs]]]
             [redux.core :as redux])
   (:import com.bigml.histogram.Histogram
            com.clearspring.analytics.stream.cardinality.HyperLogLogPlus
@@ -53,6 +54,46 @@
    (.offer acc x)
    acc))
 
+(defmacro robust-map
+  "Wrap each map value in try-catch block."
+  [& kvs]
+  `(hash-map ~@(apply concat (for [[k v] (partition 2 kvs)]
+                               `[~k (try
+                                      ~v
+                                      (catch Throwable _#))]))))
+
+(defmacro ^:private with-reduced-error
+  [msg & body]
+  `(let [result# (sync-util/with-error-handling ~msg ~@body)]
+     (if (instance? Throwable result#)
+       (reduced result#)
+       result#)))
+
+(defn with-error-handling
+  "Wrap `rf` in an error-catching transducer."
+  [rf msg]
+  (fn
+    ([] (with-reduced-error msg (rf)))
+    ([acc]
+     (unreduced
+      (if (or (reduced? acc)
+              (instance? Throwable acc))
+        acc
+        (with-reduced-error msg (rf acc)))))
+    ([acc e] (with-reduced-error msg (rf acc e)))))
+
+(defn robust-fuse
+  "Like `redux/fuse` but wraps every reducing fn in `with-error-handling` and returns `nil` for
+   that fn if an error has been encountered during transducing."
+  [kfs]
+  (redux/fuse (m/map-kv-vals (fn [k f]
+                               (redux/post-complete
+                                (with-error-handling f (deferred-trs "Error reducing {0}" (name k)))
+                                (fn [result]
+                                  (when-not (instance? Throwable result)
+                                    result))))
+                             kfs)))
+
 (defmulti fingerprinter
   "Return a fingerprinter transducer for a given field based on the field's type."
   {:arglists '([field])}
@@ -68,8 +109,8 @@
 
 (def ^:private global-fingerprinter
   (redux/post-complete
-   (redux/fuse {:distinct-count cardinality
-                :nil%           (stats/share nil?)})
+   (robust-fuse {:distinct-count cardinality
+                 :nil%           (stats/share nil?)})
    (partial hash-map :global)))
 
 (defmethod fingerprinter :default
@@ -101,26 +142,6 @@
      (merge global-fingerprint
             type-fingerprint))))
 
-(defmacro ^:private with-reduced-error
-  [msg & body]
-  `(let [result# (sync-util/with-error-handling ~msg ~@body)]
-     (if (instance? Throwable result#)
-       (reduced result#)
-       result#)))
-
-(defn with-error-handling
-  "Wrap `rf` in an error-catching transducer."
-  [rf msg]
-  (fn
-    ([] (with-reduced-error msg (rf)))
-    ([acc]
-     (unreduced
-      (if (or (reduced? acc)
-              (instance? Throwable acc))
-        acc
-        (with-reduced-error msg (rf acc)))))
-    ([acc e] (with-reduced-error msg (rf acc e)))))
-
 (defmacro ^:private deffingerprinter
   [field-type transducer]
   (let [field-type (if (vector? field-type)
@@ -168,8 +189,8 @@
 
 (deffingerprinter :type/DateTime
   ((map ->temporal)
-   (redux/fuse {:earliest earliest
-                :latest   latest})))
+   (robust-fuse {:earliest earliest
+                 :latest   latest})))
 
 (defn- histogram
   "Transducer that summarizes numerical data with a histogram."
@@ -177,17 +198,25 @@
   ([^Histogram histogram] histogram)
   ([^Histogram histogram x] (hist/insert-simple! histogram x)))
 
+(defn real-number?
+  "Is `x` a real number (i.e. not a `NaN` or an `Infinity`)?"
+  [x]
+  (and (number? x)
+       (not (Double/isNaN x))
+       (not (Double/isInfinite x))))
+
 (deffingerprinter :type/Number
   (redux/post-complete
-   histogram
+   ((filter real-number?) histogram)
    (fn [h]
      (let [{q1 0.25 q3 0.75} (hist/percentiles h 0.25 0.75)]
-       {:min (hist/minimum h)
+       (robust-map
+        :min (hist/minimum h)
         :max (hist/maximum h)
         :avg (hist/mean h)
         :sd  (some-> h hist/variance math/sqrt)
         :q1  q1
-        :q3  q3}))))
+        :q3  q3)))))
 
 (defn- valid-serialized-json?
   "Is x a serialized JSON dictionary or array."
@@ -199,10 +228,10 @@
   ((map str) ; we cast to str to support `field-literal` type overwriting:
              ; `[:field-literal "A_NUMBER" :type/Text]` (which still
              ; returns numbers in the result set)
-   (redux/fuse {:percent-json   (stats/share valid-serialized-json?)
-                :percent-url    (stats/share u/url?)
-                :percent-email  (stats/share u/email?)
-                :average-length ((map count) stats/mean)})))
+   (robust-fuse {:percent-json   (stats/share valid-serialized-json?)
+                 :percent-url    (stats/share u/url?)
+                 :percent-email  (stats/share u/email?)
+                 :average-length ((map count) stats/mean)})))
 
 (defn fingerprint-fields
   "Return a transducer for fingerprinting a resultset with fields `fields`."
diff --git a/src/metabase/sync/analyze/fingerprint/insights.clj b/src/metabase/sync/analyze/fingerprint/insights.clj
index a32f59be8ab..70a56f3e872 100644
--- a/src/metabase/sync/analyze/fingerprint/insights.clj
+++ b/src/metabase/sync/analyze/fingerprint/insights.clj
@@ -99,25 +99,19 @@
 
 (def ^:private ^:const ^Long validation-set-size 20)
 
-(defn- real-number?
-  [x]
-  (and (number? x)
-       (not (Double/isNaN x))
-       (not (Double/isInfinite x))))
-
 (defn- best-fit
   "Fit curves from `trendline-function-families` and pick the one with the smallest RMSE.
    To keep the operation single pass we collect a small validation set as we go using reservoir
    sampling, and use it to calculate RMSE."
   [fx fy]
   (redux/post-complete
-   (redux/fuse
+   (f/robust-fuse
     {:fits           (->> (for [{:keys [x-link-fn y-link-fn formula model]} trendline-function-families]
                             (redux/post-complete
                              (stats/simple-linear-regression (comp (stats/somef x-link-fn) fx)
                                                              (comp (stats/somef y-link-fn) fy))
                              (fn [[offset slope]]
-                               (when (every? real-number? [offset slope])
+                               (when (every? f/real-number? [offset slope])
                                  {:model   (model offset slope)
                                   :formula (formula offset slope)}))))
                           (apply redux/juxt))
@@ -133,7 +127,7 @@
               (map #(assoc % :mae (transduce identity
                                              (mae (comp (:model %) first) second)
                                              validation-set)))
-              (filter (comp real-number? :mae))
+              (filter (comp f/real-number? :mae))
               not-empty
               (apply min-key :mae)
               :formula))))
@@ -202,17 +196,19 @@
                (redux/post-complete
                 (let [y-position (:position number-col)
                       yfn        #(nth % y-position)]
-                  (redux/juxt ((map yfn) (last-n 2))
-                              ((map xfn) (last-n 2))
-                              (stats/simple-linear-regression xfn yfn)
-                              (best-fit xfn yfn)))
+                  ((filter (comp f/real-number? yfn))
+                   (redux/juxt ((map yfn) (last-n 2))
+                               ((map xfn) (last-n 2))
+                               (stats/simple-linear-regression xfn yfn)
+                               (best-fit xfn yfn))))
                 (fn [[[y-previous y-current] [x-previous x-current] [offset slope] best-fit]]
                   (let [unit         (if (or (nil? (:unit datetime))
                                              (->> datetime :unit mbql.u/normalize-token (= :default)))
                                        (infer-unit x-previous x-current)
                                        (:unit datetime))
                         show-change? (valid-period? x-previous x-current unit)]
-                    {:last-value     y-current
+                    (f/robust-map
+                     :last-value     y-current
                      :previous-value (when show-change?
                                        y-previous)
                      :last-change    (when show-change?
@@ -221,7 +217,7 @@
                      :offset         offset
                      :best-fit       best-fit
                      :col            (:name number-col)
-                     :unit           unit})))))
+                     :unit           unit))))))
       (trs "Error generating timeseries insight keyed by: {0}"
            (sync-util/name-for-logging (field/map->FieldInstance datetime))))))
 
diff --git a/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj b/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj
index e317425b6d4..8b7d8cebb31 100644
--- a/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj
+++ b/test/metabase/sync/analyze/fingerprint/fingerprinters_test.clj
@@ -42,7 +42,19 @@
                                  :sd  1.0}}}
          (transduce identity
                     (fingerprinter (field/map->FieldInstance {:base_type :type/Number}))
-                    [1.0 2.0 3.0]))))
+                    [1.0 2.0 3.0])))
+  (testing "We should robustly survive weird values such as NaN, Infinity, and nil"
+    (is (= {:global {:distinct-count 7
+                     :nil%           0.25}
+            :type   {:type/Number {:avg 2.0
+                                   :min 1.0
+                                   :max 3.0
+                                   :q1  1.25
+                                   :q3  2.75
+                                   :sd  1.0}}}
+           (transduce identity
+                      (fingerprinter (field/map->FieldInstance {:base_type :type/Number}))
+                      [1.0 2.0 3.0 Double/NaN Double/POSITIVE_INFINITY Double/NEGATIVE_INFINITY nil nil])))))
 
 (deftest fingerprint-string-values-test
   (is (= {:global {:distinct-count 5
diff --git a/test/metabase/sync/analyze/fingerprint/insights_test.clj b/test/metabase/sync/analyze/fingerprint/insights_test.clj
index 79f1cbbc34a..a25b74e8653 100644
--- a/test/metabase/sync/analyze/fingerprint/insights_test.clj
+++ b/test/metabase/sync/analyze/fingerprint/insights_test.clj
@@ -1,6 +1,5 @@
 (ns metabase.sync.analyze.fingerprint.insights-test
   (:require [clojure.test :refer :all]
-            [expectations :refer :all]
             [metabase.sync.analyze.fingerprint.insights :as i :refer :all]))
 
 (def ^:private cols [{:base_type :type/DateTime} {:base_type :type/Number}])
@@ -25,13 +24,6 @@
                  first
                  :last-value))))))
 
-(expect
-  (transduce identity
-             (insights [{:base_type :type/DateTime :unit :year}
-                        {:base_type :type/Integer}])
-             [["2014-01-01T00:00:00Z" 100]
-              ["2015-01-01T00:00:00Z" 200]]))
-
 (defn- inst->day
   [t]
   (some-> t (#'i/->millis-from-epoch) (#'i/ms->day)))
@@ -97,26 +89,44 @@
                    ["2018-12-02",179,3311]
                    ["2018-12-03",144,2525]])
 
-(expect
-  [{:last-value     144,
-    :previous-value 179,
-    :last-change    -0.19553072625698323,
-    :slope          -7.671473413418271,
-    :offset         137234.92983406168,
-    :best-fit       [:* 1.5672560913548484E227 [:exp [:* -0.02899533549378612 :x]]],
-    :unit           :day,
-    :col            nil}
-   {:last-value     2525,
-    :previous-value 3311,
-    :last-change    -0.2373905164602839,
-    :slope          -498.764272733624,
-    :offset         8915371.843617931,
-    :best-fit       [:+ 8915371.843617931 [:* -498.764272733624 :x]],
-    :col            nil,
-    :unit           :day}]
-  (transduce identity
-             (insights [{:base_type :type/DateTime} {:base_type :type/Number} {:base_type :type/Number}])
-             ts))
+(deftest timeseries-insight-test
+  (is (= [{:last-value     144,
+           :previous-value 179,
+           :last-change    -0.19553072625698323,
+           :slope          -7.671473413418271,
+           :offset         137234.92983406168,
+           :best-fit       [:* 1.5672560913548484E227 [:exp [:* -0.02899533549378612 :x]]],
+           :unit           :day,
+           :col            nil}
+          {:last-value     2525,
+           :previous-value 3311,
+           :last-change    -0.2373905164602839,
+           :slope          -498.764272733624,
+           :offset         8915371.843617931,
+           :best-fit       [:+ 8915371.843617931 [:* -498.764272733624 :x]],
+           :col            nil,
+           :unit           :day}]
+         (transduce identity
+                    (insights [{:base_type :type/DateTime}
+                               {:base_type :type/Number}
+                               {:base_type :type/Number}])
+                    ts)))
+  (testing "We should robustly survive weird values such as NaN, Infinity, and nil"
+    (is (= [{:last-value     20.0
+             :previous-value 10.0
+             :last-change    1.0
+             :slope          10.0
+             :offset         -178350.0
+             :best-fit       [:+ -178350.0 [:* 10.0 :x]]
+             :unit           :day
+             :col            nil}]
+           (transduce identity
+                      (insights [{:base_type :type/DateTime} {:base_type :type/Number}])
+                      [["2018-11-01" 10.0]
+                       ["2018-11-02" 20.0]
+                       ["2018-11-03" nil]
+                       ["2018-11-05" Double/NaN]
+                       ["2018-11-08" Double/POSITIVE_INFINITY]])))))
 
 (deftest change-test
   (is (= 0.0 (change 1 1)))
-- 
GitLab