From ef0065d8c41bc206fc43dc5ff00d6dc02b30eb1f Mon Sep 17 00:00:00 2001
From: Simon Belak <simon.belak@gmail.com>
Date: Wed, 8 Nov 2017 00:49:43 +0100
Subject: [PATCH] Add regularization to trend fitting and correctly handle NaNs

---
 .../feature_extraction/feature_extractors.clj | 31 +++++++++++++------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/src/metabase/feature_extraction/feature_extractors.clj b/src/metabase/feature_extraction/feature_extractors.clj
index d02876e8386..d7d3e5bd6b4 100644
--- a/src/metabase/feature_extraction/feature_extractors.clj
+++ b/src/metabase/feature_extraction/feature_extractors.clj
@@ -329,26 +329,39 @@
     (field-metadata-extractor field)
     (fn [{:keys [series linear-regression power-law-regression
                  log-linear-regression] :as features}]
-      (let [best-fit   (transduce
+      (let [; We add a small regularization penalty to more complex curves to
+            ; prevent technically correct but nonsense solutions.
+            lambda     0.1
+            regularize (fn [penalty]
+                         (fn [ssr]
+                           (if (Double/isNaN ssr)
+                             Double/MAX_VALUE
+                             (+ ssr (* lambda penalty)))))
+            best-fit   (transduce
                         identity
                         (redux/post-complete
                          (redux/fuse
                           {:linear-regression
                            (let [[a b] linear-regression]
-                             (math/ssr (fn [x]
-                                         (+ a (* b x)))))
+                             (redux/post-complete
+                              (math/ssr (fn [x]
+                                          (+ a (* b x))))
+                              (regularize 0)))
                            :power-law-regression
                            (let [[a b] power-law-regression]
-                             (math/ssr (fn [x]
-                                         (* (Math/exp a) (Math/pow x b)))))
+                             (redux/post-complete
+                              (math/ssr (fn [x]
+                                          (* (Math/exp a) (Math/pow x b))))
+                              (regularize 2)))
                            :log-linear-regression
                            (let [[a b] log-linear-regression]
-                             (math/ssr (fn [x]
-                                         (+ a (* b (Math/log x))))))})
+                             (redux/post-complete
+                              (math/ssr (fn [x]
+                                          (+ a (* b (Math/log x)))))
+                              (regularize 1)))})
                          (fn [fits]
-                           (let [[model ssr] (apply min-key val fits)]
+                           (let [model (key (apply min-key val fits))]
                              {:model  model
-                              :ssr    ssr
                               :params (features model)})))
                         series)
             resolution (infer-resolution query series)
-- 
GitLab