From 46191072d0cd59331f0c3cc64f51f3f323410bfe Mon Sep 17 00:00:00 2001
From: Braden Shepherdson <braden@metabase.com>
Date: Wed, 6 Nov 2024 09:18:27 -0500
Subject: [PATCH] [testing] Add filters to random query generators (#49448)

Adds filters to generative testing for queries.

There's a lot of new code here, but don't panic, it's quite straightforward:
generating filter values, in the right shapes for various filter expressions.
---
 src/metabase/lib/filter.cljc                  |   8 +-
 src/metabase/lib/schema/filter.cljc           |   4 +-
 .../lib/schema/temporal_bucketing.cljc        |   7 +
 test/metabase/lib/test_util/generators.cljc   |  53 ++-
 .../lib/test_util/generators/filters.cljc     | 308 ++++++++++++++++++
 .../lib/test_util/generators/util.cljc        |   9 +
 6 files changed, 371 insertions(+), 18 deletions(-)
 create mode 100644 test/metabase/lib/test_util/generators/filters.cljc
 create mode 100644 test/metabase/lib/test_util/generators/util.cljc

diff --git a/src/metabase/lib/filter.cljc b/src/metabase/lib/filter.cljc
index 85375041737..a7807b6f9f0 100644
--- a/src/metabase/lib/filter.cljc
+++ b/src/metabase/lib/filter.cljc
@@ -310,10 +310,10 @@
 (lib.common/defop not-null [x])
 (lib.common/defop is-empty [x])
 (lib.common/defop not-empty [x])
-(lib.common/defop starts-with [whole part])
-(lib.common/defop ends-with [whole part])
-(lib.common/defop contains [whole part])
-(lib.common/defop does-not-contain [whole part])
+(lib.common/defop starts-with [whole & parts])
+(lib.common/defop ends-with [whole & parts])
+(lib.common/defop contains [whole & parts])
+(lib.common/defop does-not-contain [whole & parts])
 (lib.common/defop relative-time-interval [x value bucket offset-value offset-bucket])
 (lib.common/defop time-interval [x amount unit])
 (lib.common/defop segment [segment-id])
diff --git a/src/metabase/lib/schema/filter.cljc b/src/metabase/lib/schema/filter.cljc
index 86a614e1006..65590bf1dc9 100644
--- a/src/metabase/lib/schema/filter.cljc
+++ b/src/metabase/lib/schema/filter.cljc
@@ -137,7 +137,9 @@
 (mr/def ::operator
   [:map
    [:lib/type [:= :operator/filter]]
-   [:short [:enum := :!= :inside :between :< :> :<= :>= :is-null :not-null :is-empty :not-empty :contains :does-not-contain :starts-with :ends-with]]
+   [:short [:enum
+            := :!= :inside :between :< :> :<= :>= :is-null :not-null :is-empty :not-empty
+            :contains :does-not-contain :starts-with :ends-with :time-interval :relative-time-interval]]
    ;; this is used for display name and it depends on the arguments to the filter clause itself... e.g.
    ;;
    ;; number_a < number_b
diff --git a/src/metabase/lib/schema/temporal_bucketing.cljc b/src/metabase/lib/schema/temporal_bucketing.cljc
index de31c615ad0..4ab966cac95 100644
--- a/src/metabase/lib/schema/temporal_bucketing.cljc
+++ b/src/metabase/lib/schema/temporal_bucketing.cljc
@@ -99,6 +99,13 @@
                 :decode/normalize common/normalize-keyword}]
         time-bucketing-units))
 
+(def ordered-datetime-truncation-units
+  "Valid datetime bucketing units for truncation operations.
+  The front end shows the options in this order."
+  (into []
+        (distinct)
+        (concat ordered-time-truncation-units ordered-date-truncation-units)))
+
 (def ordered-datetime-bucketing-units
   "Valid datetime bucketing units for either truncation or extraction operations.
   The front end shows the options in this order."
diff --git a/test/metabase/lib/test_util/generators.cljc b/test/metabase/lib/test_util/generators.cljc
index e4ff2af9d81..dec0c2e596c 100644
--- a/test/metabase/lib/test_util/generators.cljc
+++ b/test/metabase/lib/test_util/generators.cljc
@@ -4,7 +4,10 @@
    [clojure.test :refer [deftest is testing]]
    [metabase.lib.breakout :as lib.breakout]
    [metabase.lib.core :as lib]
+   [metabase.lib.equality :as lib.equality]
    [metabase.lib.test-metadata :as meta]
+   [metabase.lib.test-util.generators.filters :as gen.filters]
+   [metabase.lib.test-util.generators.util :as gen.u]
    [metabase.util :as u]))
 
 ;; NOTE: Being able to *execute* these queries and grok the results would actually be really powerful, if we can
@@ -77,14 +80,6 @@
   (swap! step-kinds assoc kind step-def))
 
 ;; Helpers =======================================================================================
-(defn- choose
-  "Uniformly chooses among a seq of options.
-
-  Returns nil if the list is empty! This is handy for choose-and-do vs. do-nothing while writing the next steps."
-  [xs]
-  (when-not (empty? xs)
-    (rand-nth xs)))
-
 (defn- choose-stage
   "Chooses a stage to operator on. 80% act on -1, 20% chooses a stage by index (which might be the last stage)."
   [query]
@@ -117,9 +112,9 @@
 ;; TODO: If exhaustion is a goal, we should think about making these into generators or otherwise reifying the choices.
 (defmethod next-steps* :aggregate [query _aggregate]
   (let [stage-number (choose-stage query)
-        operator     (choose (lib/available-aggregation-operators query stage-number))
+        operator     (gen.u/choose (lib/available-aggregation-operators query stage-number))
         agg          (if (:requires-column? operator)
-                       (lib/aggregation-clause operator (choose (:columns operator)))
+                       (lib/aggregation-clause operator (gen.u/choose (:columns operator)))
                        (lib/aggregation-clause operator))]
     [:aggregate stage-number agg]))
 
@@ -128,11 +123,11 @@
 
 (defmethod next-steps* :breakout [query _breakout]
   (let [stage-number (choose-stage query)
-        column       (choose (lib/breakoutable-columns query stage-number))
+        column       (gen.u/choose (lib/breakoutable-columns query stage-number))
         ;; If this is a temporal column, we need to choose a unit for it. Nil if it's not temporal.
         ;; TODO: Don't always bucket/bin! We should sometimes choose the "Don't bin" option etc.
-        bucket       (choose (lib/available-temporal-buckets query stage-number column))
-        binning      (choose (lib/available-binning-strategies query stage-number column))
+        bucket       (gen.u/choose (lib/available-temporal-buckets query stage-number column))
+        binning      (gen.u/choose (lib/available-binning-strategies query stage-number column))
         brk-column   (cond-> column
                        bucket  (lib/with-temporal-bucket bucket)
                        binning (lib/with-binning binning))]
@@ -167,6 +162,38 @@
           (is (= (count after-breakouts)
                  (count before-breakouts))))))))
 
+;; Filters =======================================================================================
+(add-step {:kind :filter})
+
+(defmethod next-steps* :filter [query _filter]
+  (let [stage-number (choose-stage query)]
+    [:filter stage-number (gen.filters/gen-filter (lib/filterable-columns query stage-number))]))
+
+(defmethod run-step* :filter [query [_filter stage-number filter-clause]]
+  (lib/filter query stage-number filter-clause))
+
+(defmethod before-and-after :filter [before after [_filter stage-number filter-clause]]
+  (let [before-filters (lib/filters before stage-number)
+        after-filters  (lib/filters after  stage-number)]
+    (testing (str ":filter stage " stage-number
+                  "\n\nBefore query\n" (u/pprint-to-str before)
+                  "\n\nAfter query\n" (u/pprint-to-str after)
+                  "\n\nwith filter clause\n" (u/pprint-to-str filter-clause)
+                  "\n")
+      (if (some #(lib.equality/= % filter-clause) before-filters)
+        (testing "with an existing, equivalent filter"
+          (testing "does not add a new one"
+            (is (= (count before-filters)
+                   (count after-filters)))))
+        (testing "with a new filter"
+          (testing "adds it to the end of the list"
+            (is (= (count after-filters)
+                   (inc (count before-filters))))
+            (is (=? filter-clause (last after-filters))))
+          (testing (str `lib/filter-operator " returns the right op")
+            (is (= (first filter-clause)
+                   (:short (lib/filter-operator after stage-number (last after-filters)))))))))))
+
 ;; Generator internals ===========================================================================
 (defn- run-step
   "Applies a step, returning the updated context."
diff --git a/test/metabase/lib/test_util/generators/filters.cljc b/test/metabase/lib/test_util/generators/filters.cljc
new file mode 100644
index 00000000000..e9e06f19ba5
--- /dev/null
+++ b/test/metabase/lib/test_util/generators/filters.cljc
@@ -0,0 +1,308 @@
+(ns metabase.lib.test-util.generators.filters
+  (:require
+   [metabase.lib.core :as lib]
+   [metabase.lib.options :as lib.options]
+   [metabase.lib.schema.temporal-bucketing :as lib.schema.temporal-bucketing]
+   [metabase.lib.test-util.generators.util :as gen.u]
+   [metabase.lib.types.isa :as lib.types.isa]
+   [metabase.util.time :as u.time]))
+
+;; Filter values =================================================================================
+(defn- gen-int []
+  (- (rand-int 2000000) 1000000))
+
+(def ^:private valid-ascii
+  (mapv char (range 0x20 0x7f)))
+
+(def ^:private han-unicode
+  (mapv char (range 0x4e00 0xa000)))
+
+(defn- gen-string
+  ([]
+   (if (< (rand) 0.1)
+     (gen-string han-unicode 40)
+     (gen-string valid-ascii 70)))
+  ([symbols max-len]
+   (apply str (repeatedly (inc (rand-int max-len)) #(rand-nth symbols)))))
+
+(defn- gen-time []
+  (u.time/local-time (rand-int 24) (rand-int 60) (rand-int 60) (rand-int 1000000000)))
+
+(defn- gen-time:minute []
+  (u.time/local-time (rand-int 24) (rand-int 60) 0 0))
+
+(defn- gen-date []
+  ;; Random day of the year, from 2000-01-01 through 2037-12-31.
+  ;; Avoids 2038 because of the 32-bit timestamp overflow.
+  ;; TODO: Always adds 0-364 days, so it can't return Dec 31st of a leap year. I don't think it matters, but I will
+  ;; highlight it.
+  (u.time/add (u.time/local-date (+ 2000 (rand-int 38)) 1 1) ;; Jan 1, from 2000 through 2037
+              :day
+              (rand-int 365)))
+
+(defn- gen-datetime []
+  (u.time/local-date-time (gen-date) (gen-time)))
+
+(defn- gen-datetime:minute []
+  (u.time/local-date-time (gen-date) (gen-time:minute)))
+
+(defn- gen-latitude []
+  ;; +/- 75 degrees is a generous but plausible range for latitudes.
+  (* 150 (- (rand) 0.5)))
+
+(defn- gen-longitude []
+  ;; +/- 180 degrees
+  (- (* 360 (rand))
+     180))
+
+(def ^:private fake-categories
+  (vec (for [i (range 1 41)]
+         (str "Fake Category " i))))
+
+(defn- gen-category []
+  ;; Just some made-up values that are clearly not random strings for debugging.
+  (rand-nth fake-categories))
+
+(defn- rand-column-value [{:keys [effective-type] :as column}]
+  (cond
+    ;; Numeric PKs and FKs are always integers.
+    (and (lib.types.isa/id? column)
+         (lib.types.isa/numeric? column))              (abs (gen-int))
+    (#{:type/BigInteger :type/Integer} effective-type) (gen-int)
+    (lib.types.isa/category? column)                   (gen-category)
+    (lib.types.isa/latitude? column)                   (gen-latitude)
+    (lib.types.isa/longitude? column)                  (gen-longitude)
+    (lib.types.isa/numeric? column)                    (cond-> (gen-int)
+                                                         (< (rand) 0.5) (+ (rand)))
+    (lib.types.isa/string-or-string-like? column)      (gen-string)
+    (lib.types.isa/time? column)                       (gen-time)
+    (lib.types.isa/date-without-time? column)          (gen-date)
+    (lib.types.isa/date-or-datetime? column)           (gen-datetime)
+    :else (throw (ex-info " !!! Not sure what values to generate for column" {:effective-type effective-type
+                                                                              :column         column}))))
+
+;; Filter clauses ================================================================================
+(def ^:private ^:dynamic *filterable-columns* nil)
+
+(defmulti ^:private gen-filter-clause
+  (fn [_column operator]
+    (:short operator)))
+
+;; Binary operators like :<
+(doseq [[op f] [[:<  lib/<]
+                [:<= lib/<=]
+                [:>  lib/>]
+                [:>= lib/>=]]]
+  (defmethod gen-filter-clause op [column _op]
+    (f column (rand-column-value column))))
+
+;; Multi-value operators like := and :starts-with
+(doseq [[op f] [[:=                lib/=]
+                [:!=               lib/!=]
+                [:starts-with      lib/starts-with]
+                [:ends-with        lib/ends-with]
+                [:contains         lib/contains]
+                [:does-not-contain lib/does-not-contain]]]
+  (defmethod gen-filter-clause op [column _op]
+    (apply f column (repeatedly (inc (rand-int 4))
+                                #(rand-column-value column)))))
+
+(defmethod gen-filter-clause :between [column _op]
+  (let [lo (rand-column-value column)
+        hi (rand-column-value column)]
+    ;; TODO: Maybe make the LHS arg be the smaller? Right now they're random.
+    ;; Probably they can be `sort`ed, but that might fail on weird types.
+    (lib/between column lo hi)))
+
+;; Unary operators like `:is-empty`.
+(doseq [[op f] [[:is-empty  lib/is-empty]
+                [:not-empty lib/not-empty]
+                [:is-null   lib/is-null]
+                [:not-null  lib/not-null]]]
+  (defmethod gen-filter-clause op [column _op]
+    (f column)))
+
+(defn- skipped-operator? [op]
+  (#{:inside} (:short op)))
+
+(defn- units-from [min-unit]
+  (drop-while #(not= % min-unit) lib.schema.temporal-bucketing/ordered-datetime-truncation-units))
+
+(defn- gen-filter:unit
+  "If the optional second argument is provided, returns a range unit that is at least as large as this minimum."
+  ([column]
+   (gen-filter:unit column (if (lib.types.isa/date-without-time? column)
+                             :day
+                             :minute)))
+  ([_column min-unit]
+   (gen.u/choose (units-from min-unit))))
+
+(defn- gen-filter:relative-date-current [column]
+  (lib/time-interval column :current (gen-filter:unit column)))
+
+(defn- gen-filter:relative-date-nearby [column]
+  (let [past-future (gen.u/choose [+ -])
+        unit        (gen-filter:unit column)
+        n           (gen.u/choose (range 1 20))]
+    (cond-> (lib/time-interval column (past-future n) unit)
+      (< (rand) 0.2) (lib.options/update-options assoc :include-current true))))
+
+(defn- gen-filter:relative-date-offset [column]
+  ;; Only one past-future, since both offsets have to point in the same direction, at least in the UI.
+  (let [past-future (gen.u/choose [+ -])
+        range-unit  (gen-filter:unit column)
+        range-n     (gen.u/choose (range 1 20))
+        offset-unit (gen-filter:unit column range-unit)
+        offset-n    (gen.u/choose (range 1 6))]
+    (lib/relative-time-interval column
+                                (past-future range-n)  range-unit
+                                (past-future offset-n) offset-unit)))
+
+(defn- gen-filter:relative-date [column]
+  ;; Current: day, week, month, quarter, year.
+  ;; Previous: N minutes/hours/days/weeks/months/quarters/years
+  ;; Next: N minutes/hours/days/weeks/months/quarters/years
+  ;; Plus optionally either "include this <unit>" or "M <larger-unit>s from now/ago"
+  (let [r (rand)]
+    (cond
+      (< r 0.2) (gen-filter:relative-date-current column)    ;; "This month"
+      (< r 0.4) (gen-filter:relative-date-offset column)     ;; "Next 3 months starting from 2 years ago"
+      :else     (gen-filter:relative-date-nearby column))))  ;; ""
+
+(defmulti ^:private gen-filter:exclude-date-options identity)
+
+(defmethod gen-filter:exclude-date-options :hour-of-day [_unit]
+  (range 0 24))
+
+(defmethod gen-filter:exclude-date-options :day-of-week [_unit]
+  (take 7 (iterate #(u.time/add % :day 1) (u.time/local-date))))
+
+(defn- jan1 []
+  (u.time/truncate (u.time/local-date) :year))
+
+(defmethod gen-filter:exclude-date-options :month-of-year [_unit]
+  (->> (jan1)
+       (iterate #(u.time/add % :month 1))
+       (take 12)))
+
+(defmethod gen-filter:exclude-date-options :quarter-of-year [_unit]
+  (->> (jan1)
+       (iterate #(u.time/add % :month 3))
+       (take 4)))
+
+(defn- gen-filter:exclude-date [column]
+  ;; Excludes are stored currently as:
+  ;; [:!= {} [:field {:temporal-unit :month-of-year} 123] "2024-02-01" "2024-03-01"]
+  ;; to exclude February and March.
+  ;; The allowed units are: :day-of-week, :month-of-year, and :quarter-of-year; plus :hour-of-day for datetimes.
+  ;; Hours are 0-based numbers, the others use exemplar dates - the first of a month, the nearest Thursday to today.
+  (let [units    (cond-> [:day-of-week :month-of-year :quarter-of-year]
+                   (not (lib.types.isa/date-without-time? column)) (conj :hour-of-day))
+        unit     (gen.u/choose units)
+        opts     (vec (gen-filter:exclude-date-options unit))
+        ;; Always one option, plus 40% chance of more.
+        selected (loop [sel #{(rand-nth opts)}]
+                   (if (< (rand) 0.4)
+                     (recur (conj sel (rand-nth opts)))
+                     sel))
+        ;; But if that selected everything, drop one at random.
+        selected (cond-> selected
+                   (= (count selected) (count opts)) (disj (rand-nth opts)))]
+    (apply lib/!= (lib/with-temporal-bucket column unit) selected)))
+
+(defn- specify-time? [column]
+  (and (not (lib.types.isa/date-without-time? column))
+       (< (rand) 0.2)))
+
+(defn- gen-filter:date-binary [column operator]
+  (if (specify-time? column)
+    (operator (lib/with-temporal-bucket column :minute) (gen-datetime:minute))
+    (operator column (gen-date))))
+
+(defn- gen-filter:date-between [column]
+  ;; TODO: Swap the arguments to put the earlier one on the left.
+  (if (specify-time? column)
+    (lib/between (lib/with-temporal-bucket column :minute)
+                 (gen-datetime:minute) (gen-datetime:minute))
+    (lib/between column (gen-date) (gen-date))))
+
+(defn- gen-filter:date [column]
+  ;; - 30% relative date ranges
+  ;; - 20% exclude
+  ;; - 50% before/after/on/between
+  (let [r (rand)]
+    (cond
+      (< r 0.20) (gen-filter:relative-date column)
+      (< r 0.30) (gen-filter:relative-date-offset column)
+      (< r 0.50) (gen-filter:exclude-date column)
+      (< r 0.60) (gen-filter:date-binary column lib/<)
+      (< r 0.70) (gen-filter:date-binary column lib/>)
+      (< r 0.85) (gen-filter:date-binary column lib/=)
+      :else      (gen-filter:date-between column))))
+
+(defn- gen-filter:datetime [column]
+  ;; TODO: There's actually no difference right now. Clean this up?
+  (gen-filter:date column))
+
+(defn- gen-filter:generic [column]
+  (when-let [operator (some->> (:operators column)
+                               (remove skipped-operator?)
+                               rand-nth)]
+    (gen-filter-clause column operator)))
+
+(defn- gen-filter:inside [col1 col2]
+  (let [[lat lon] (if (lib.types.isa/latitude? col1)
+                    [col1 col2]
+                    [col2 col1])
+        [lat-min lat-max] (sort (repeatedly 2 gen-latitude))
+        [lon-min lon-max] (sort (repeatedly 2 gen-longitude))]
+    ;; Yes, this is really the argument order for an `:inside` clause.
+    (lib/inside lat lon lat-max lon-min lat-min lon-max)))
+
+(defn- gen-filter:coordinate [column]
+  (let [counterpart? (if (lib.types.isa/latitude? column)
+                       lib.types.isa/longitude?
+                       lib.types.isa/latitude?)
+        counterparts (filter counterpart? *filterable-columns*)]
+    (if (and (seq counterparts)
+             (< (rand) 0.5))
+      ;; If we found a coordinate pair, generate an :inside filter 50% of the time.
+      (gen-filter:inside column (gen.u/choose counterparts))
+      ;; Otherwise, generic filter on the original column.
+      (gen-filter:generic column))))
+
+(defn- ^:private gen-filter*
+  ([] (gen-filter* 0))
+  ([recursion-depth]
+   (let [column (rand-nth *filterable-columns*)]
+     (cond
+       (lib.types.isa/coordinate? column)        (gen-filter:coordinate column)
+       (lib.types.isa/date-without-time? column) (gen-filter:date column)
+       (lib.types.isa/date-or-datetime? column)  (gen-filter:datetime column)
+       :else
+       (let [result (gen-filter:generic column)]
+         ;; Sometimes we pick a column with no filter operators, and result is nil. Recur in that case to roll again.
+         (if (= result ::no-operators)
+           (if (>= recursion-depth 20)
+             (throw (ex-info "Deep recusion in gen-filter* - no filterable columns, or none with valid :operators?"
+                             {:filterable-columns *filterable-columns*}))
+             (recur (inc recursion-depth)))
+           result))))))
+
+(doseq [[op f] [[:and lib/and]
+                [:or  lib/or]]]
+  (defmethod gen-filter-clause op [_column _op]
+    (->> (repeatedly gen-filter*)
+         (filter identity)
+         (take (+ 2 (rand-int 3)))
+         (apply f))))
+
+(defn gen-filter
+  "Given the [[lib/filterable-columns]] for our query and stage, returns a random filter for that stage.
+
+  That includes `:and` and `:or` compound filters, numbers, strings, categories, datetimes, and coordinates.
+  Coordinates include the specialized `:inside` expression. Datetimes generate relative, excluding, Before, After,
+  Between and On filters, just like the FE."
+  [filterable-columns]
+  (binding [*filterable-columns* filterable-columns]
+    (gen-filter*)))
diff --git a/test/metabase/lib/test_util/generators/util.cljc b/test/metabase/lib/test_util/generators/util.cljc
new file mode 100644
index 00000000000..db44c24467d
--- /dev/null
+++ b/test/metabase/lib/test_util/generators/util.cljc
@@ -0,0 +1,9 @@
+(ns metabase.lib.test-util.generators.util)
+
+(defn choose
+  "Uniformly chooses among a seq of options.
+
+  Returns nil if the list is empty! This is handy for choose-and-do vs. do-nothing while writing the next steps."
+  [xs]
+  (when-not (empty? xs)
+    (rand-nth xs)))
-- 
GitLab