diff --git a/src/metabase/driver/generic_sql/query_processor.clj b/src/metabase/driver/generic_sql/query_processor.clj index 740f7ee32fc91503f4adf1db871eac3b6a125ee0..d4d823418cef9261090736abe695d559be990416 100644 --- a/src/metabase/driver/generic_sql/query_processor.clj +++ b/src/metabase/driver/generic_sql/query_processor.clj @@ -5,7 +5,6 @@ [clojure.tools.logging :as log] [honeysql [core :as hsql] - [format :as hformat] [helpers :as h]] [metabase [driver :as driver] @@ -32,12 +31,6 @@ Each nested query increments this counter by 1." 0) -;; register the function "distinct-count" with HoneySQL -;; (hsql/format :%distinct-count.x) -> "count(distinct x)" -(defmethod hformat/fn-handler "distinct-count" [_ field] - (str "count(distinct " (hformat/to-sql field) ")")) - - ;;; ## Formatting (defn- qualified-alias diff --git a/src/metabase/util/honeysql_extensions.clj b/src/metabase/util/honeysql_extensions.clj index bc636d97ca0145deab44bccd8ff7179602552e55..67c33ed4799daa706c6602e1dc218b192146f16c 100644 --- a/src/metabase/util/honeysql_extensions.clj +++ b/src/metabase/util/honeysql_extensions.clj @@ -9,7 +9,8 @@ (alter-meta! #'honeysql.core/format assoc :style/indent 1) (alter-meta! #'honeysql.core/call assoc :style/indent 1) -;; for some reason the metadata on these helper functions is wrong which causes Eastwood to fail, see https://github.com/jkk/honeysql/issues/123 +;; for some reason the metadata on these helper functions is wrong which causes Eastwood to fail, see +;; https://github.com/jkk/honeysql/issues/123 (alter-meta! #'honeysql.helpers/merge-left-join assoc :arglists '([m & clauses]) :style/indent 1) @@ -45,27 +46,35 @@ (defmethod hformat/fn-handler "extract" [_ unit expr] (str "extract(" (name unit) " from " (hformat/to-sql expr) ")")) +;; register the function "distinct-count" with HoneySQL +;; (hsql/format :%distinct-count.x) -> "count(distinct x)" +(defmethod hformat/fn-handler "distinct-count" [_ field] + (str "count(distinct " (hformat/to-sql field) ")")) -;; HoneySQL 0.7.0+ parameterizes numbers to fix issues with NaN and infinity -- see https://github.com/jkk/honeysql/pull/122. -;; However, this broke some of Metabase's behavior, specifically queries with calculated columns with numeric literals -- -;; some SQL databases can't recognize that a calculated field in a SELECT clause and a GROUP BY clause is the same thing if the calculation involves parameters. -;; Go ahead an use the old behavior so we can keep our HoneySQL dependency up to date. + +;; HoneySQL 0.7.0+ parameterizes numbers to fix issues with NaN and infinity -- see +;; https://github.com/jkk/honeysql/pull/122. However, this broke some of Metabase's behavior, specifically queries +;; with calculated columns with numeric literals -- some SQL databases can't recognize that a calculated field in a +;; SELECT clause and a GROUP BY clause is the same thing if the calculation involves parameters. Go ahead an use the +;; old behavior so we can keep our HoneySQL dependency up to date. (extend-protocol honeysql.format/ToSql java.lang.Number (to-sql [x] (str x))) -;; HoneySQL automatically assumes that dots within keywords are used to separate schema / table / field / etc. -;; To handle weird situations where people actually put dots *within* a single identifier we'll replace those dots with lozenges, -;; let HoneySQL do its thing, then switch them back at the last second +;; HoneySQL automatically assumes that dots within keywords are used to separate schema / table / field / etc. To +;; handle weird situations where people actually put dots *within* a single identifier we'll replace those dots with +;; lozenges, let HoneySQL do its thing, then switch them back at the last second ;; -;; TODO - Maybe instead of this lozengey hackiness it would make more sense just to add a new "identifier" record type that implements `ToSql` in a more intelligent way +;; TODO - Maybe instead of this lozengey hackiness it would make more sense just to add a new "identifier" record type +;; that implements `ToSql` in a more intelligent way (defn escape-dots "Replace dots in a string with WHITE MEDIUM LOZENGES (⬨)." ^String [s] (s/replace (name s) #"\." "⬨")) (defn qualify-and-escape-dots - "Combine several NAME-COMPONENTS into a single Keyword, and escape dots in each name by replacing them with WHITE MEDIUM LOZENGES (⬨). + "Combine several NAME-COMPONENTS into a single Keyword, and escape dots in each name by replacing them with WHITE + MEDIUM LOZENGES (⬨). (qualify-and-escape-dots :ab.c :d) -> :ab⬨c.d" ^clojure.lang.Keyword [& name-components] diff --git a/src/metabase/util/stats.clj b/src/metabase/util/stats.clj index 4edb73068ab9dc47521e2fa0220fb97e7f3a6d94..c41b2eb278d2bbcddb35290e1e46dc44c95c90e3 100644 --- a/src/metabase/util/stats.clj +++ b/src/metabase/util/stats.clj @@ -102,9 +102,19 @@ (frequencies (map k many-maps))) (defn- histogram - "Bin some frequencies using a passed in BINNING-FN." - [binning-fn many-maps k] - (frequencies (map binning-fn (vals (value-frequencies many-maps k))))) + "Bin some frequencies using a passed in `binning-fn`. + + ;; Generate histogram for values of :a; `1` appears 3 times and `2` and `3` both appear once + (histogram bin-micro-number [{:a 1} {:a 1} {:a 1} {:a 2} {:a 3}] :a) + ;; -> {\"3+\" 1, \"1\" 2} + + ;; (or if you already have the counts) + (histogram bin-micro-number [3 1 1]) + ;; -> {\"3+\" 1, \"1\" 2}" + ([binning-fn counts] + (frequencies (map binning-fn counts))) + ([binning-fn many-maps k] + (histogram binning-fn (vals (value-frequencies many-maps k))))) (def ^:private micro-histogram "Return a histogram for micro numbers." @@ -212,19 +222,78 @@ :with_locked_params (contains? embedding-params-vals "locked") :with_disabled_params (contains? embedding-params-vals "disabled")})))})) +(defn- db-frequencies + "Fetch the frequencies of a given `column` with a normal SQL `SELECT COUNT(*) ... GROUP BY` query. This is way more + efficient than fetching every single row and counting them in Clojure-land! + + (db-frequencies Database :engine) + ;; -> {\"h2\" 2, \"postgres\" 1, ...} + + ;; include `WHERE` conditions or other arbitrary HoneySQL + (db-frequencies Database :engine {:where [:= :is_sample false]}) + ;; -> {\"h2\" 1, \"postgres\" 1, ...} + + ;; Generate a histogram: + (micro-histogram (vals (db-frequencies Database :engine))) + ;; -> {\"2\" 1, \"1\" 1, ...} + + ;; Include `WHERE` clause that includes conditions for a Table related by an FK relationship: + ;; (Number of Tables per DB engine) + (db-frequencies Table (db/qualify Database :engine) + {:left-join [Database [:= (db/qualify Database :id) + (db/qualify Table :db_id)]]}) + ;; -> {\"googleanalytics\" 4, \"postgres\" 48, \"h2\" 9}" + {:style/indent 2} + [model column & [additonal-honeysql]] + (into {} (for [{:keys [k count]} (db/select [model [column :k] [:%count.* :count]] + (merge {:group-by [column]} + additonal-honeysql))] + [k count]))) + +(defn- num-notifications-with-xls-or-csv-cards + "Return the number of Notifications that satisfy `where-conditions` that have at least one PulseCard with `include_xls` or + `include_csv`. + + ;; Pulses only (filter out Alerts) + (num-notifications-with-xls-or-csv-cards [:= :alert_condition nil])" + [& where-conditions] + ;; :%distinct-count is a custom fn we registered in `metabase.util.honeysql-extensions`! + (-> (db/query {:select [[:%distinct-count.pulse.id :count]] + :from [:pulse] + :left-join [:pulse_card [:= :pulse.id :pulse_card.pulse_id]] + :where (cons + :and + (cons + [:or [:= :pulse_card.include_csv true] + [:= :pulse_card.include_xls true]] + where-conditions))}) + first + :count)) + (defn- pulse-metrics - "Get mes based on pulses + "Get metrics based on pulses TODO: characterize by non-user account emails, # emails" [] - (let [pulses (db/select [Pulse :creator_id]) - pulse-cards (db/select [PulseCard :card_id :pulse_id]) - pulse-channels (db/select [PulseChannel :channel_type :schedule_type])] - {:pulses (count pulses) - :pulse_types (frequencies (map :channel_type pulse-channels)) - :pulse_schedules (frequencies (map :schedule_type pulse-channels)) - :num_pulses_per_user (medium-histogram pulses :creator_id) - :num_pulses_per_card (medium-histogram pulse-cards :card_id) - :num_cards_per_pulses (medium-histogram pulse-cards :pulse_id)})) + (let [pulse-conditions {:left-join [Pulse [:= :pulse.id :pulse_id]], :where [:= :pulse.alert_condition nil]}] + {:pulses (db/count Pulse :alert_condition nil) + ;; "Table Cards" are Cards that include a Table you can download + :with_table_cards (num-notifications-with-xls-or-csv-cards [:= :alert_condition nil]) + :pulse_types (db-frequencies PulseChannel :channel_type pulse-conditions) + :pulse_schedules (db-frequencies PulseChannel :schedule_type pulse-conditions) + :num_pulses_per_user (medium-histogram (vals (db-frequencies Pulse :creator_id (dissoc pulse-conditions :left-join)))) + :num_pulses_per_card (medium-histogram (vals (db-frequencies PulseCard :card_id pulse-conditions))) + :num_cards_per_pulses (medium-histogram (vals (db-frequencies PulseCard :pulse_id pulse-conditions)))})) + +(defn- alert-metrics [] + (let [alert-conditions {:left-join [Pulse [:= :pulse.id :pulse_id]], :where [:not= (db/qualify Pulse :alert_condition) nil]}] + {:alerts (db/count Pulse :alert_condition [:not= nil]) + :with_table_cards (num-notifications-with-xls-or-csv-cards [:not= :alert_condition nil]) + :first_time_only (db/count Pulse :alert_condition [:not= nil], :alert_first_only true) + :above_goal (db/count Pulse :alert_condition [:not= nil], :alert_above_goal true) + :alert_types (db-frequencies PulseChannel :channel_type alert-conditions) + :num_alerts_per_user (medium-histogram (vals (db-frequencies Pulse :creator_id (dissoc alert-conditions :left-join)))) + :num_alerts_per_card (medium-histogram (vals (db-frequencies PulseCard :card_id alert-conditions))) + :num_cards_per_alerts (medium-histogram (vals (db-frequencies PulseCard :pulse_id alert-conditions)))})) (defn- label-metrics @@ -284,33 +353,10 @@ ;;; Execution Metrics -;; Because the QueryExecution table can number in the millions of rows, it isn't safe to pull the entire thing into memory; -;; instead, we'll fetch rows of QueryExecutions in chunks, building the summary as we go - -(def ^:private ^:const executions-chunk-size - "Number of QueryExecutions to fetch per chunk. This should be a good tradeoff between not being too large (which could - cause us to run out of memory) and not being too small (which would make calculating this summary excessively slow)." - 5000) - -;; fetch chunks by ID, e.g. 1-5000, 5001-10000, etc. - -(defn- executions-chunk - "Fetch the chunk of QueryExecutions whose ID is greater than STARTING-ID." - [starting-id] - (db/select [QueryExecution :id :executor_id :running_time :error] - :id [:> starting-id] - {:order-by [:id], :limit executions-chunk-size})) - -(defn- executions-lazy-seq - "Return a lazy seq of all QueryExecutions." - ([] - (executions-lazy-seq 0)) - ([starting-id] - (when-let [chunk (seq (executions-chunk starting-id))] - (lazy-cat chunk (executions-lazy-seq (:id (last chunk))))))) - (defn summarize-executions - "Summarize EXECUTIONS, by incrementing approriate counts in a summary map." + "Summarize `executions`, by incrementing approriate counts in a summary map." + ([] + (summarize-executions (db/select-reducible [QueryExecution :executor_id :running_time :error]))) ([executions] (reduce summarize-executions {:executions 0, :by_status {}, :num_per_user {}, :num_by_latency {}} executions)) ([summary execution] @@ -330,8 +376,7 @@ (defn- execution-metrics "Get metrics based on QueryExecutions." [] - (-> (executions-lazy-seq) - summarize-executions + (-> (summarize-executions) (update :num_per_user summarize-executions-per-user))) @@ -344,6 +389,7 @@ {:average_entry_size (int (or length 0)) :num_queries_cached (bin-small-number count)})) + ;;; System Metrics (defn- bytes->megabytes [b] @@ -381,6 +427,7 @@ :label (label-metrics) :metric (metric-metrics) :pulse (pulse-metrics) + :alert (alert-metrics) :question (question-metrics) :segment (segment-metrics) :system (system-metrics) diff --git a/test/metabase/util/stats_test.clj b/test/metabase/util/stats_test.clj index 486d9088a7dc972cd0844ead329cf5d97438f0ae..cd9cf4a095d54f5171e4b821f6e24747dd58b817 100644 --- a/test/metabase/util/stats_test.clj +++ b/test/metabase/util/stats_test.clj @@ -1,9 +1,15 @@ (ns metabase.util.stats-test (:require [expectations :refer :all] - [metabase.models.query-execution :refer [QueryExecution]] + [metabase.models [query-execution :refer [QueryExecution]] + [pulse :refer [Pulse]] + [pulse-channel :refer [PulseChannel]] + [card :refer [Card]] + [pulse-card :refer [PulseCard]]] [metabase.test.util :as tu] [metabase.util.stats :as stats-util :refer :all] - [toucan.db :as db])) + [toucan.db :as db] + [metabase.util :as u] + [toucan.util.test :as tt])) (expect "0" (#'stats-util/bin-micro-number 0)) (expect "1" (#'stats-util/bin-micro-number 1)) @@ -63,8 +69,8 @@ (expect false ((anonymous-usage-stats) :sso_configured)) (expect false ((anonymous-usage-stats) :has_sample_data)) -;;Spot checking a few system stats to ensure conversion from property -;;names and presence in the anonymous-usage-stats +;; Spot checking a few system stats to ensure conversion from property +;; names and presence in the anonymous-usage-stats (expect #{true} (let [system-stats (get-in (anonymous-usage-stats) [:stats :system])] @@ -87,3 +93,80 @@ (expect (old-execution-metrics) (#'stats-util/execution-metrics)) + + +;;; +----------------------------------------------------------------------------------------------------------------+ +;;; | Pulses & Alerts | +;;; +----------------------------------------------------------------------------------------------------------------+ + +;; make sure we get some reasonable Pulses & Alert metrics, and they filter each other out as appropriate + +;; alert_condition character varying(254), -- Condition (i.e. "rows" or "goal") used as a guard for alerts +;; alert_first_only boolean, -- True if the alert should be disabled after the first notification +;; alert_above_goal boolean, -- For a goal condition, alert when above the goal +(defn- x [] + {:pulses {:pulses 3 + :with_table_cards 2 + :pulse_types {"slack" 1, "email" 2} + :pulse_schedules {"daily" 2, "weekly" 1} + :num_pulses_per_user {"1-5" 1} + :num_pulses_per_card {"6-10" 1} + :num_cards_per_pulses {"1-5" 1, "6-10" 1}} + :alerts {:alerts 4 + :with_table_cards 2 + :first_time_only 1 + :above_goal 1 + :alert_types {"slack" 2, "email" 2} + :num_alerts_per_user {"1-5" 1} + :num_alerts_per_card {"11-25" 1} + :num_cards_per_alerts {"1-5" 1, "6-10" 1}}} + (tt/with-temp* [Card [c] + ;; ---------- Pulses ---------- + Pulse [p1] + Pulse [p2] + Pulse [p3] + PulseChannel [_ {:pulse_id (u/get-id p1), :schedule_type "daily", :channel_type "email"}] + PulseChannel [_ {:pulse_id (u/get-id p1), :schedule_type "weekly", :channel_type "email"}] + PulseChannel [_ {:pulse_id (u/get-id p2), :schedule_type "daily", :channel_type "slack"}] + ;; Pulse 1 gets 2 Cards (1 CSV) + PulseCard [_ {:pulse_id (u/get-id p1), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id p1), :card_id (u/get-id c), :include_csv true}] + ;; Pulse 2 gets 1 Card + PulseCard [_ {:pulse_id (u/get-id p1), :card_id (u/get-id c)}] + ;; Pulse 3 gets 7 Cards (1 CSV, 2 XLS, 2 BOTH) + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c), :include_csv true}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c), :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c), :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c), :include_csv true, :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id p3), :card_id (u/get-id c), :include_csv true, :include_xls true}] + ;; ---------- Alerts ---------- + Pulse [a1 {:alert_condition "rows", :alert_first_only false}] + Pulse [a2 {:alert_condition "rows", :alert_first_only true }] + Pulse [a3 {:alert_condition "goal", :alert_first_only false}] + Pulse [a4 {:alert_condition "goal", :alert_first_only false, :alert_above_goal true}] + ;; Alert 1 is Email, Alert 2 is Email & Slack, Alert 3 is Slack-only + PulseChannel [_ {:pulse_id (u/get-id a1), :channel_type "email"}] + PulseChannel [_ {:pulse_id (u/get-id a1), :channel_type "email"}] + PulseChannel [_ {:pulse_id (u/get-id a2), :channel_type "slack"}] + PulseChannel [_ {:pulse_id (u/get-id a3), :channel_type "slack"}] + ;; Alert 1 gets 2 Cards (1 CSV) + PulseCard [_ {:pulse_id (u/get-id a1), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id a1), :card_id (u/get-id c), :include_csv true}] + ;; Alert 2 gets 1 Card + PulseCard [_ {:pulse_id (u/get-id a1), :card_id (u/get-id c)}] + ;; Alert 3 gets 7 Cards (1 CSV, 2 XLS, 2 BOTH) + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c), :include_csv true}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c), :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c), :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c), :include_csv true, :include_xls true}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c), :include_csv true, :include_xls true}] + ;; Alert 4 gets 3 Cards + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c)}] + PulseCard [_ {:pulse_id (u/get-id a3), :card_id (u/get-id c)}]] + {:pulses (#'metabase.util.stats/pulse-metrics) + :alerts (#'metabase.util.stats/alert-metrics)}))