From 28a737bc23fc812499006b5273a44418f931e4ef Mon Sep 17 00:00:00 2001 From: Oleksandr Yakushev <alex@bytopia.org> Date: Thu, 5 Sep 2024 12:50:33 +0300 Subject: [PATCH] perf: Add more timeseries insights optimizations (#47651) * perf: [insights] Optimize last-2 allocations * perf: [insights] Optimize best-fit allocations when creating validation sample * perf: [insights] Optimize simple-linear-regression --- src/metabase/analyze/fingerprint/insights.clj | 77 ++++++++++--------- 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/src/metabase/analyze/fingerprint/insights.clj b/src/metabase/analyze/fingerprint/insights.clj index 3779ba5b252..f375b051df6 100644 --- a/src/metabase/analyze/fingerprint/insights.clj +++ b/src/metabase/analyze/fingerprint/insights.clj @@ -19,17 +19,19 @@ (set! *warn-on-reflection* true) (defn- last-2 [] - (fn - ([] []) - ([acc] - (let [cnt (count acc)] - (cond (= cnt 0) [nil nil] - (= cnt 1) [nil (nth acc 0)] - :else acc))) - ([acc x] - (if (< (count acc) 2) - (conj acc x) - [(nth acc 1) x])))) + (let [none (Object.)] + (fn + ([] (object-array [none none])) + ([^objects acc] + (let [a (aget acc 0) + b (aget acc 1)] + (cond (identical? b none) [nil nil] + (identical? a none) [nil b] + :else [a b]))) + ([^objects acc, x] + (aset acc 0 (aget acc 1)) + (aset acc 1 x) + acc)))) (defn change "Relative difference between `x1` an `x2`." @@ -47,8 +49,8 @@ "Transducer that samples a fixed number `n` of samples consistently. https://en.wikipedia.org/wiki/Reservoir_sampling. Uses java.util.Random with a seed of `n` to ensure a consistent sample if a dataset has not changed. - The returned instance is mutable, so don't reuse it." - [n] + The returned instance is mutable, so don't reuse it. `f` is invoked on the element before adding it to the sample." + [n f] (let [n (int n) rng (Random. n) counter (int-array 1) ;; A box for a mutable primitive int. @@ -66,13 +68,14 @@ idx (.nextInt rng c+1)] (aset counter 0 c+1) (cond - (< c n) (aset reservoir c x) - (< idx n) (aset reservoir idx x))))))) + (< c n) (aset reservoir c (f x)) + (< idx n) (aset reservoir idx (f x)))))))) (defn- simple-linear-regression "Faster and more efficient implementation of `kixi.stats.estimate/simple-linear-regression`. Computes some of squares - on each step, and on the completing step returns `[offset slope]`." - [fx fy] + on each step, and on the completing step returns `[offset slope]`. Additionally accepts `x-link-fn` and `y-link-fn` + functions which should be double->double." + [fx, fy, ^clojure.lang.IFn$DD x-link-fn, ^clojure.lang.IFn$DD y-link-fn] (fn ([] (double-array 6)) ([^doubles arr e] @@ -80,8 +83,8 @@ y (fy e)] (if (or (nil? x) (nil? y)) arr - (let [x (double x) - y (double y) + (let [x (.invokePrim x-link-fn (double x)) + y (.invokePrim y-link-fn (double y)) c (aget arr 0) mx (aget arr 1) my (aget arr 2) @@ -118,34 +121,38 @@ (math/abs (- (fy x) (fy-hat x)))))) stats/mean)) +(defn- double-identity ^double [^double x] x) + +(defn- double-log ^double [^double x] (Math/log x)) + (def ^:private trendline-function-families ;; http://mathworld.wolfram.com/LeastSquaresFitting.html - [{:x-link-fn identity - :y-link-fn identity + [{:x-link-fn double-identity + :y-link-fn double-identity :model (fn [offset slope] (fn [x] (+ offset (* slope x)))) :formula (fn [offset slope] [:+ offset [:* slope :x]])} ;; http://mathworld.wolfram.com/LeastSquaresFittingExponential.html - {:x-link-fn identity - :y-link-fn math/log + {:x-link-fn double-identity + :y-link-fn double-log :model (fn [offset slope] (fn [x] (* (math/exp offset) (math/exp (* slope x))))) :formula (fn [offset slope] [:* (math/exp offset) [:exp [:* slope :x]]])} ;; http://mathworld.wolfram.com/LeastSquaresFittingLogarithmic.html - {:x-link-fn math/log - :y-link-fn identity + {:x-link-fn double-log + :y-link-fn double-identity :model (fn [offset slope] (fn [x] - (+ offset (* slope (math/log x))))) + (+ offset (* slope (double-log x))))) :formula (fn [offset slope] [:+ offset [:* slope [:log :x]]])} ;; http://mathworld.wolfram.com/LeastSquaresFittingPowerLaw.html - {:x-link-fn math/log - :y-link-fn math/log + {:x-link-fn double-log + :y-link-fn double-log :model (fn [offset slope] (fn [x] (* (math/exp offset) (math/pow x slope)))) @@ -163,19 +170,15 @@ (fingerprinters/robust-fuse {:fits (->> (for [{:keys [x-link-fn y-link-fn formula model]} trendline-function-families] (redux/post-complete - (simple-linear-regression #(some-> (fx %) x-link-fn) - #(some-> (fy %) y-link-fn)) + (simple-linear-regression fx fy x-link-fn y-link-fn) (fn [[offset slope]] (when (every? u/real-number? [offset slope]) {:model (model offset slope) :formula (formula offset slope)})))) redux/juxt*) - :validation-set ((keep (fn [row] - (let [x (fx row) - y (fy row)] - (when (and x y) - [x y])))) - (reservoir-sample validation-set-size))}) + :validation-set ((filter (fn [row] (and (fx row) (fy row)))) + (reservoir-sample validation-set-size + (fn [row] [(fx row) (fy row)])))}) (fn [{:keys [validation-set fits]}] (some->> fits (remove nil?) @@ -254,7 +257,7 @@ ((filter (comp u/real-number? yfn)) (redux/juxt ((map yfn) (last-2)) ((map xfn) (last-2)) - (simple-linear-regression xfn yfn) + (simple-linear-regression xfn yfn double-identity double-identity) (best-fit xfn yfn)))) (fn [[[y-previous y-current] [x-previous x-current] [offset slope] best-fit-equation]] (let [unit (let [unit (some-> datetime :unit mbql.u/normalize-token)] -- GitLab