diff --git a/src/metabase/analyze/fingerprint/insights.clj b/src/metabase/analyze/fingerprint/insights.clj index 89932634b7cfdbd2138d1ff7a4c8dd93a9be76e4..ccdd3c37f6a9496618f9cdce612be154c43db2d0 100644 --- a/src/metabase/analyze/fingerprint/insights.clj +++ b/src/metabase/analyze/fingerprint/insights.clj @@ -14,8 +14,11 @@ [metabase.util.date-2 :as u.date] [redux.core :as redux]) (:import + (java.util Random) (java.time Instant LocalDate LocalDateTime LocalTime OffsetDateTime OffsetTime ZonedDateTime))) +(set! *warn-on-reflection* true) + (defn- last-n [n] (fn @@ -40,19 +43,21 @@ :else (/ (- x2 x1) x1))))) (defn reservoir-sample - "Transducer that samples a fixed number `n` of samples. - https://en.wikipedia.org/wiki/Reservoir_sampling" + "Transducer that samples a fixed number `n` of samples consistently. + https://en.wikipedia.org/wiki/Reservoir_sampling. Uses java.util.Random + with a seed of `n` to ensure a consistent sample if a dataset has not changed." [n] - (fn - ([] [[] 0]) - ([[reservoir c] x] - (let [c (inc c) - idx (rand-int c)] - (cond - (<= c n) [(conj reservoir x) c] - (< idx n) [(assoc reservoir idx x) c] - :else [reservoir c]))) - ([[reservoir _]] reservoir))) + (let [rng (Random. n)] + (fn + ([] [[] 0]) + ([[reservoir c] x] + (let [c (inc c) + idx (.nextInt rng c)] + (cond + (<= c n) [(conj reservoir x) c] + (< idx n) [(assoc reservoir idx x) c] + :else [reservoir c]))) + ([[reservoir _]] reservoir)))) (defn mae "Given two functions: (fÅ· input) and (fy input), returning the predicted and actual values of y @@ -135,10 +140,9 @@ :formula)))) (defn- timeseries? - [{:keys [numbers datetimes others]}] + [{:keys [numbers datetimes]}] (and (pos? (count numbers)) - (= (count datetimes) 1) - (empty? others))) + (= (count datetimes) 1))) ;; We downsize UNIX timestamps to lessen the chance of overflows and numerical instabilities. (def ^Long ^:const ^:private ms-in-a-day (* 1000 60 60 24)) @@ -188,7 +192,7 @@ x-position (:position datetime) xfn #(some-> % (nth x-position) - ;; at this point in the pipeline, dates are still stings + ;; at this point in the pipeline, dates are still strings fingerprinters/->temporal ->millis-from-epoch ms->day)] diff --git a/test/metabase/analyze/fingerprint/insights_test.clj b/test/metabase/analyze/fingerprint/insights_test.clj index 981605433ed92a965ef7807b9e74885a13cdccf0..a748d6469876beda007f7c43c8bf18e7a29c1090 100644 --- a/test/metabase/analyze/fingerprint/insights_test.clj +++ b/test/metabase/analyze/fingerprint/insights_test.clj @@ -96,6 +96,8 @@ ["2018-12-02",179,3311] ["2018-12-03",144,2525]]) +(def ^:private larger-ts (concat ts ts ts ts)) + (defn- round-to-precision "Round (presumably floating-point) `number` to a precision of `sig-figures`. Returns a `Double`. @@ -160,6 +162,27 @@ ["2018-11-05" Double/NaN] ["2018-11-08" Double/POSITIVE_INFINITY]]))))) +(defn- prepeatedly + "basic parallel version of `repeatedly`." + [n f] + (let [futures (doall (repeatedly n #(future (f))))] + (map deref futures))) + +(deftest consistent-timeseries-insight-test + (testing "Timeseries insights remain stable when sampling. (#44349)" + (let [insights (fn [] + (-> (transduce identity + (insights/insights [{:base_type :type/DateTime} + {:base_type :type/Number} + {:base_type :type/Number}]) + ;; intentionally make a dataset larger than + ;; `insights/validation-set-size` to induce the random sampling + larger-ts) + ; This value varies between machines (M1 Macs? JVMs?) so round it to avoid test failures. + (update-in [0 :best-fit 1] #(round-to-precision 6 %))))] + (is (= 1 + (count (distinct (prepeatedly 100 insights)))))))) + (deftest change-test (is (= 0.0 (insights/change 1 1))) (is (= -0.5 (insights/change 1 2))) @@ -171,3 +194,15 @@ (is (= 1.0 (insights/change -1 -2))) (is (= nil (insights/change -1 0))) (is (= 1.0 (insights/change 0 -1)))) + +(deftest insights-with-custom-epxression-columns-test + (testing "If valid timeseries columns exist, insights should be computed even with custom expressions. (#46244)" + (is (some? + (transduce identity + (insights/insights [{:base_type :type/DateTime} + {:base_type :type/Number} + ;; Any column with a base type that is not number or temporal previously + ;; prevented timeseries insights from being calculated + {:base_type :type/Text}]) + [["2024-08-09" 10.0 "weekday"] + ["2024-08-10" 20.0 "weekend"]])))))