diff --git a/src/metabase/feature_extraction/feature_extractors.clj b/src/metabase/feature_extraction/feature_extractors.clj index d02876e83861f0540bbafd1e9cfcf4e4a8249a5a..d7d3e5bd6b41e06769eb1b882a1a0db153d533f3 100644 --- a/src/metabase/feature_extraction/feature_extractors.clj +++ b/src/metabase/feature_extraction/feature_extractors.clj @@ -329,26 +329,39 @@ (field-metadata-extractor field) (fn [{:keys [series linear-regression power-law-regression log-linear-regression] :as features}] - (let [best-fit (transduce + (let [; We add a small regularization penalty to more complex curves to + ; prevent technically correct but nonsense solutions. + lambda 0.1 + regularize (fn [penalty] + (fn [ssr] + (if (Double/isNaN ssr) + Double/MAX_VALUE + (+ ssr (* lambda penalty))))) + best-fit (transduce identity (redux/post-complete (redux/fuse {:linear-regression (let [[a b] linear-regression] - (math/ssr (fn [x] - (+ a (* b x))))) + (redux/post-complete + (math/ssr (fn [x] + (+ a (* b x)))) + (regularize 0))) :power-law-regression (let [[a b] power-law-regression] - (math/ssr (fn [x] - (* (Math/exp a) (Math/pow x b))))) + (redux/post-complete + (math/ssr (fn [x] + (* (Math/exp a) (Math/pow x b)))) + (regularize 2))) :log-linear-regression (let [[a b] log-linear-regression] - (math/ssr (fn [x] - (+ a (* b (Math/log x))))))}) + (redux/post-complete + (math/ssr (fn [x] + (+ a (* b (Math/log x))))) + (regularize 1)))}) (fn [fits] - (let [[model ssr] (apply min-key val fits)] + (let [model (key (apply min-key val fits))] {:model model - :ssr ssr :params (features model)}))) series) resolution (infer-resolution query series)