From 46191072d0cd59331f0c3cc64f51f3f323410bfe Mon Sep 17 00:00:00 2001 From: Braden Shepherdson <braden@metabase.com> Date: Wed, 6 Nov 2024 09:18:27 -0500 Subject: [PATCH] [testing] Add filters to random query generators (#49448) Adds filters to generative testing for queries. There's a lot of new code here, but don't panic, it's quite straightforward: generating filter values, in the right shapes for various filter expressions. --- src/metabase/lib/filter.cljc | 8 +- src/metabase/lib/schema/filter.cljc | 4 +- .../lib/schema/temporal_bucketing.cljc | 7 + test/metabase/lib/test_util/generators.cljc | 53 ++- .../lib/test_util/generators/filters.cljc | 308 ++++++++++++++++++ .../lib/test_util/generators/util.cljc | 9 + 6 files changed, 371 insertions(+), 18 deletions(-) create mode 100644 test/metabase/lib/test_util/generators/filters.cljc create mode 100644 test/metabase/lib/test_util/generators/util.cljc diff --git a/src/metabase/lib/filter.cljc b/src/metabase/lib/filter.cljc index 85375041737..a7807b6f9f0 100644 --- a/src/metabase/lib/filter.cljc +++ b/src/metabase/lib/filter.cljc @@ -310,10 +310,10 @@ (lib.common/defop not-null [x]) (lib.common/defop is-empty [x]) (lib.common/defop not-empty [x]) -(lib.common/defop starts-with [whole part]) -(lib.common/defop ends-with [whole part]) -(lib.common/defop contains [whole part]) -(lib.common/defop does-not-contain [whole part]) +(lib.common/defop starts-with [whole & parts]) +(lib.common/defop ends-with [whole & parts]) +(lib.common/defop contains [whole & parts]) +(lib.common/defop does-not-contain [whole & parts]) (lib.common/defop relative-time-interval [x value bucket offset-value offset-bucket]) (lib.common/defop time-interval [x amount unit]) (lib.common/defop segment [segment-id]) diff --git a/src/metabase/lib/schema/filter.cljc b/src/metabase/lib/schema/filter.cljc index 86a614e1006..65590bf1dc9 100644 --- a/src/metabase/lib/schema/filter.cljc +++ b/src/metabase/lib/schema/filter.cljc @@ -137,7 +137,9 @@ (mr/def ::operator [:map [:lib/type [:= :operator/filter]] - [:short [:enum := :!= :inside :between :< :> :<= :>= :is-null :not-null :is-empty :not-empty :contains :does-not-contain :starts-with :ends-with]] + [:short [:enum + := :!= :inside :between :< :> :<= :>= :is-null :not-null :is-empty :not-empty + :contains :does-not-contain :starts-with :ends-with :time-interval :relative-time-interval]] ;; this is used for display name and it depends on the arguments to the filter clause itself... e.g. ;; ;; number_a < number_b diff --git a/src/metabase/lib/schema/temporal_bucketing.cljc b/src/metabase/lib/schema/temporal_bucketing.cljc index de31c615ad0..4ab966cac95 100644 --- a/src/metabase/lib/schema/temporal_bucketing.cljc +++ b/src/metabase/lib/schema/temporal_bucketing.cljc @@ -99,6 +99,13 @@ :decode/normalize common/normalize-keyword}] time-bucketing-units)) +(def ordered-datetime-truncation-units + "Valid datetime bucketing units for truncation operations. + The front end shows the options in this order." + (into [] + (distinct) + (concat ordered-time-truncation-units ordered-date-truncation-units))) + (def ordered-datetime-bucketing-units "Valid datetime bucketing units for either truncation or extraction operations. The front end shows the options in this order." diff --git a/test/metabase/lib/test_util/generators.cljc b/test/metabase/lib/test_util/generators.cljc index e4ff2af9d81..dec0c2e596c 100644 --- a/test/metabase/lib/test_util/generators.cljc +++ b/test/metabase/lib/test_util/generators.cljc @@ -4,7 +4,10 @@ [clojure.test :refer [deftest is testing]] [metabase.lib.breakout :as lib.breakout] [metabase.lib.core :as lib] + [metabase.lib.equality :as lib.equality] [metabase.lib.test-metadata :as meta] + [metabase.lib.test-util.generators.filters :as gen.filters] + [metabase.lib.test-util.generators.util :as gen.u] [metabase.util :as u])) ;; NOTE: Being able to *execute* these queries and grok the results would actually be really powerful, if we can @@ -77,14 +80,6 @@ (swap! step-kinds assoc kind step-def)) ;; Helpers ======================================================================================= -(defn- choose - "Uniformly chooses among a seq of options. - - Returns nil if the list is empty! This is handy for choose-and-do vs. do-nothing while writing the next steps." - [xs] - (when-not (empty? xs) - (rand-nth xs))) - (defn- choose-stage "Chooses a stage to operator on. 80% act on -1, 20% chooses a stage by index (which might be the last stage)." [query] @@ -117,9 +112,9 @@ ;; TODO: If exhaustion is a goal, we should think about making these into generators or otherwise reifying the choices. (defmethod next-steps* :aggregate [query _aggregate] (let [stage-number (choose-stage query) - operator (choose (lib/available-aggregation-operators query stage-number)) + operator (gen.u/choose (lib/available-aggregation-operators query stage-number)) agg (if (:requires-column? operator) - (lib/aggregation-clause operator (choose (:columns operator))) + (lib/aggregation-clause operator (gen.u/choose (:columns operator))) (lib/aggregation-clause operator))] [:aggregate stage-number agg])) @@ -128,11 +123,11 @@ (defmethod next-steps* :breakout [query _breakout] (let [stage-number (choose-stage query) - column (choose (lib/breakoutable-columns query stage-number)) + column (gen.u/choose (lib/breakoutable-columns query stage-number)) ;; If this is a temporal column, we need to choose a unit for it. Nil if it's not temporal. ;; TODO: Don't always bucket/bin! We should sometimes choose the "Don't bin" option etc. - bucket (choose (lib/available-temporal-buckets query stage-number column)) - binning (choose (lib/available-binning-strategies query stage-number column)) + bucket (gen.u/choose (lib/available-temporal-buckets query stage-number column)) + binning (gen.u/choose (lib/available-binning-strategies query stage-number column)) brk-column (cond-> column bucket (lib/with-temporal-bucket bucket) binning (lib/with-binning binning))] @@ -167,6 +162,38 @@ (is (= (count after-breakouts) (count before-breakouts)))))))) +;; Filters ======================================================================================= +(add-step {:kind :filter}) + +(defmethod next-steps* :filter [query _filter] + (let [stage-number (choose-stage query)] + [:filter stage-number (gen.filters/gen-filter (lib/filterable-columns query stage-number))])) + +(defmethod run-step* :filter [query [_filter stage-number filter-clause]] + (lib/filter query stage-number filter-clause)) + +(defmethod before-and-after :filter [before after [_filter stage-number filter-clause]] + (let [before-filters (lib/filters before stage-number) + after-filters (lib/filters after stage-number)] + (testing (str ":filter stage " stage-number + "\n\nBefore query\n" (u/pprint-to-str before) + "\n\nAfter query\n" (u/pprint-to-str after) + "\n\nwith filter clause\n" (u/pprint-to-str filter-clause) + "\n") + (if (some #(lib.equality/= % filter-clause) before-filters) + (testing "with an existing, equivalent filter" + (testing "does not add a new one" + (is (= (count before-filters) + (count after-filters))))) + (testing "with a new filter" + (testing "adds it to the end of the list" + (is (= (count after-filters) + (inc (count before-filters)))) + (is (=? filter-clause (last after-filters)))) + (testing (str `lib/filter-operator " returns the right op") + (is (= (first filter-clause) + (:short (lib/filter-operator after stage-number (last after-filters))))))))))) + ;; Generator internals =========================================================================== (defn- run-step "Applies a step, returning the updated context." diff --git a/test/metabase/lib/test_util/generators/filters.cljc b/test/metabase/lib/test_util/generators/filters.cljc new file mode 100644 index 00000000000..e9e06f19ba5 --- /dev/null +++ b/test/metabase/lib/test_util/generators/filters.cljc @@ -0,0 +1,308 @@ +(ns metabase.lib.test-util.generators.filters + (:require + [metabase.lib.core :as lib] + [metabase.lib.options :as lib.options] + [metabase.lib.schema.temporal-bucketing :as lib.schema.temporal-bucketing] + [metabase.lib.test-util.generators.util :as gen.u] + [metabase.lib.types.isa :as lib.types.isa] + [metabase.util.time :as u.time])) + +;; Filter values ================================================================================= +(defn- gen-int [] + (- (rand-int 2000000) 1000000)) + +(def ^:private valid-ascii + (mapv char (range 0x20 0x7f))) + +(def ^:private han-unicode + (mapv char (range 0x4e00 0xa000))) + +(defn- gen-string + ([] + (if (< (rand) 0.1) + (gen-string han-unicode 40) + (gen-string valid-ascii 70))) + ([symbols max-len] + (apply str (repeatedly (inc (rand-int max-len)) #(rand-nth symbols))))) + +(defn- gen-time [] + (u.time/local-time (rand-int 24) (rand-int 60) (rand-int 60) (rand-int 1000000000))) + +(defn- gen-time:minute [] + (u.time/local-time (rand-int 24) (rand-int 60) 0 0)) + +(defn- gen-date [] + ;; Random day of the year, from 2000-01-01 through 2037-12-31. + ;; Avoids 2038 because of the 32-bit timestamp overflow. + ;; TODO: Always adds 0-364 days, so it can't return Dec 31st of a leap year. I don't think it matters, but I will + ;; highlight it. + (u.time/add (u.time/local-date (+ 2000 (rand-int 38)) 1 1) ;; Jan 1, from 2000 through 2037 + :day + (rand-int 365))) + +(defn- gen-datetime [] + (u.time/local-date-time (gen-date) (gen-time))) + +(defn- gen-datetime:minute [] + (u.time/local-date-time (gen-date) (gen-time:minute))) + +(defn- gen-latitude [] + ;; +/- 75 degrees is a generous but plausible range for latitudes. + (* 150 (- (rand) 0.5))) + +(defn- gen-longitude [] + ;; +/- 180 degrees + (- (* 360 (rand)) + 180)) + +(def ^:private fake-categories + (vec (for [i (range 1 41)] + (str "Fake Category " i)))) + +(defn- gen-category [] + ;; Just some made-up values that are clearly not random strings for debugging. + (rand-nth fake-categories)) + +(defn- rand-column-value [{:keys [effective-type] :as column}] + (cond + ;; Numeric PKs and FKs are always integers. + (and (lib.types.isa/id? column) + (lib.types.isa/numeric? column)) (abs (gen-int)) + (#{:type/BigInteger :type/Integer} effective-type) (gen-int) + (lib.types.isa/category? column) (gen-category) + (lib.types.isa/latitude? column) (gen-latitude) + (lib.types.isa/longitude? column) (gen-longitude) + (lib.types.isa/numeric? column) (cond-> (gen-int) + (< (rand) 0.5) (+ (rand))) + (lib.types.isa/string-or-string-like? column) (gen-string) + (lib.types.isa/time? column) (gen-time) + (lib.types.isa/date-without-time? column) (gen-date) + (lib.types.isa/date-or-datetime? column) (gen-datetime) + :else (throw (ex-info " !!! Not sure what values to generate for column" {:effective-type effective-type + :column column})))) + +;; Filter clauses ================================================================================ +(def ^:private ^:dynamic *filterable-columns* nil) + +(defmulti ^:private gen-filter-clause + (fn [_column operator] + (:short operator))) + +;; Binary operators like :< +(doseq [[op f] [[:< lib/<] + [:<= lib/<=] + [:> lib/>] + [:>= lib/>=]]] + (defmethod gen-filter-clause op [column _op] + (f column (rand-column-value column)))) + +;; Multi-value operators like := and :starts-with +(doseq [[op f] [[:= lib/=] + [:!= lib/!=] + [:starts-with lib/starts-with] + [:ends-with lib/ends-with] + [:contains lib/contains] + [:does-not-contain lib/does-not-contain]]] + (defmethod gen-filter-clause op [column _op] + (apply f column (repeatedly (inc (rand-int 4)) + #(rand-column-value column))))) + +(defmethod gen-filter-clause :between [column _op] + (let [lo (rand-column-value column) + hi (rand-column-value column)] + ;; TODO: Maybe make the LHS arg be the smaller? Right now they're random. + ;; Probably they can be `sort`ed, but that might fail on weird types. + (lib/between column lo hi))) + +;; Unary operators like `:is-empty`. +(doseq [[op f] [[:is-empty lib/is-empty] + [:not-empty lib/not-empty] + [:is-null lib/is-null] + [:not-null lib/not-null]]] + (defmethod gen-filter-clause op [column _op] + (f column))) + +(defn- skipped-operator? [op] + (#{:inside} (:short op))) + +(defn- units-from [min-unit] + (drop-while #(not= % min-unit) lib.schema.temporal-bucketing/ordered-datetime-truncation-units)) + +(defn- gen-filter:unit + "If the optional second argument is provided, returns a range unit that is at least as large as this minimum." + ([column] + (gen-filter:unit column (if (lib.types.isa/date-without-time? column) + :day + :minute))) + ([_column min-unit] + (gen.u/choose (units-from min-unit)))) + +(defn- gen-filter:relative-date-current [column] + (lib/time-interval column :current (gen-filter:unit column))) + +(defn- gen-filter:relative-date-nearby [column] + (let [past-future (gen.u/choose [+ -]) + unit (gen-filter:unit column) + n (gen.u/choose (range 1 20))] + (cond-> (lib/time-interval column (past-future n) unit) + (< (rand) 0.2) (lib.options/update-options assoc :include-current true)))) + +(defn- gen-filter:relative-date-offset [column] + ;; Only one past-future, since both offsets have to point in the same direction, at least in the UI. + (let [past-future (gen.u/choose [+ -]) + range-unit (gen-filter:unit column) + range-n (gen.u/choose (range 1 20)) + offset-unit (gen-filter:unit column range-unit) + offset-n (gen.u/choose (range 1 6))] + (lib/relative-time-interval column + (past-future range-n) range-unit + (past-future offset-n) offset-unit))) + +(defn- gen-filter:relative-date [column] + ;; Current: day, week, month, quarter, year. + ;; Previous: N minutes/hours/days/weeks/months/quarters/years + ;; Next: N minutes/hours/days/weeks/months/quarters/years + ;; Plus optionally either "include this <unit>" or "M <larger-unit>s from now/ago" + (let [r (rand)] + (cond + (< r 0.2) (gen-filter:relative-date-current column) ;; "This month" + (< r 0.4) (gen-filter:relative-date-offset column) ;; "Next 3 months starting from 2 years ago" + :else (gen-filter:relative-date-nearby column)))) ;; "" + +(defmulti ^:private gen-filter:exclude-date-options identity) + +(defmethod gen-filter:exclude-date-options :hour-of-day [_unit] + (range 0 24)) + +(defmethod gen-filter:exclude-date-options :day-of-week [_unit] + (take 7 (iterate #(u.time/add % :day 1) (u.time/local-date)))) + +(defn- jan1 [] + (u.time/truncate (u.time/local-date) :year)) + +(defmethod gen-filter:exclude-date-options :month-of-year [_unit] + (->> (jan1) + (iterate #(u.time/add % :month 1)) + (take 12))) + +(defmethod gen-filter:exclude-date-options :quarter-of-year [_unit] + (->> (jan1) + (iterate #(u.time/add % :month 3)) + (take 4))) + +(defn- gen-filter:exclude-date [column] + ;; Excludes are stored currently as: + ;; [:!= {} [:field {:temporal-unit :month-of-year} 123] "2024-02-01" "2024-03-01"] + ;; to exclude February and March. + ;; The allowed units are: :day-of-week, :month-of-year, and :quarter-of-year; plus :hour-of-day for datetimes. + ;; Hours are 0-based numbers, the others use exemplar dates - the first of a month, the nearest Thursday to today. + (let [units (cond-> [:day-of-week :month-of-year :quarter-of-year] + (not (lib.types.isa/date-without-time? column)) (conj :hour-of-day)) + unit (gen.u/choose units) + opts (vec (gen-filter:exclude-date-options unit)) + ;; Always one option, plus 40% chance of more. + selected (loop [sel #{(rand-nth opts)}] + (if (< (rand) 0.4) + (recur (conj sel (rand-nth opts))) + sel)) + ;; But if that selected everything, drop one at random. + selected (cond-> selected + (= (count selected) (count opts)) (disj (rand-nth opts)))] + (apply lib/!= (lib/with-temporal-bucket column unit) selected))) + +(defn- specify-time? [column] + (and (not (lib.types.isa/date-without-time? column)) + (< (rand) 0.2))) + +(defn- gen-filter:date-binary [column operator] + (if (specify-time? column) + (operator (lib/with-temporal-bucket column :minute) (gen-datetime:minute)) + (operator column (gen-date)))) + +(defn- gen-filter:date-between [column] + ;; TODO: Swap the arguments to put the earlier one on the left. + (if (specify-time? column) + (lib/between (lib/with-temporal-bucket column :minute) + (gen-datetime:minute) (gen-datetime:minute)) + (lib/between column (gen-date) (gen-date)))) + +(defn- gen-filter:date [column] + ;; - 30% relative date ranges + ;; - 20% exclude + ;; - 50% before/after/on/between + (let [r (rand)] + (cond + (< r 0.20) (gen-filter:relative-date column) + (< r 0.30) (gen-filter:relative-date-offset column) + (< r 0.50) (gen-filter:exclude-date column) + (< r 0.60) (gen-filter:date-binary column lib/<) + (< r 0.70) (gen-filter:date-binary column lib/>) + (< r 0.85) (gen-filter:date-binary column lib/=) + :else (gen-filter:date-between column)))) + +(defn- gen-filter:datetime [column] + ;; TODO: There's actually no difference right now. Clean this up? + (gen-filter:date column)) + +(defn- gen-filter:generic [column] + (when-let [operator (some->> (:operators column) + (remove skipped-operator?) + rand-nth)] + (gen-filter-clause column operator))) + +(defn- gen-filter:inside [col1 col2] + (let [[lat lon] (if (lib.types.isa/latitude? col1) + [col1 col2] + [col2 col1]) + [lat-min lat-max] (sort (repeatedly 2 gen-latitude)) + [lon-min lon-max] (sort (repeatedly 2 gen-longitude))] + ;; Yes, this is really the argument order for an `:inside` clause. + (lib/inside lat lon lat-max lon-min lat-min lon-max))) + +(defn- gen-filter:coordinate [column] + (let [counterpart? (if (lib.types.isa/latitude? column) + lib.types.isa/longitude? + lib.types.isa/latitude?) + counterparts (filter counterpart? *filterable-columns*)] + (if (and (seq counterparts) + (< (rand) 0.5)) + ;; If we found a coordinate pair, generate an :inside filter 50% of the time. + (gen-filter:inside column (gen.u/choose counterparts)) + ;; Otherwise, generic filter on the original column. + (gen-filter:generic column)))) + +(defn- ^:private gen-filter* + ([] (gen-filter* 0)) + ([recursion-depth] + (let [column (rand-nth *filterable-columns*)] + (cond + (lib.types.isa/coordinate? column) (gen-filter:coordinate column) + (lib.types.isa/date-without-time? column) (gen-filter:date column) + (lib.types.isa/date-or-datetime? column) (gen-filter:datetime column) + :else + (let [result (gen-filter:generic column)] + ;; Sometimes we pick a column with no filter operators, and result is nil. Recur in that case to roll again. + (if (= result ::no-operators) + (if (>= recursion-depth 20) + (throw (ex-info "Deep recusion in gen-filter* - no filterable columns, or none with valid :operators?" + {:filterable-columns *filterable-columns*})) + (recur (inc recursion-depth))) + result)))))) + +(doseq [[op f] [[:and lib/and] + [:or lib/or]]] + (defmethod gen-filter-clause op [_column _op] + (->> (repeatedly gen-filter*) + (filter identity) + (take (+ 2 (rand-int 3))) + (apply f)))) + +(defn gen-filter + "Given the [[lib/filterable-columns]] for our query and stage, returns a random filter for that stage. + + That includes `:and` and `:or` compound filters, numbers, strings, categories, datetimes, and coordinates. + Coordinates include the specialized `:inside` expression. Datetimes generate relative, excluding, Before, After, + Between and On filters, just like the FE." + [filterable-columns] + (binding [*filterable-columns* filterable-columns] + (gen-filter*))) diff --git a/test/metabase/lib/test_util/generators/util.cljc b/test/metabase/lib/test_util/generators/util.cljc new file mode 100644 index 00000000000..db44c24467d --- /dev/null +++ b/test/metabase/lib/test_util/generators/util.cljc @@ -0,0 +1,9 @@ +(ns metabase.lib.test-util.generators.util) + +(defn choose + "Uniformly chooses among a seq of options. + + Returns nil if the list is empty! This is handy for choose-and-do vs. do-nothing while writing the next steps." + [xs] + (when-not (empty? xs) + (rand-nth xs))) -- GitLab