From 0bde1fc9ebadd706ca4a93f3d556422549e77a2b Mon Sep 17 00:00:00 2001
From: Braden Shepherdson <braden@metabase.com>
Date: Wed, 17 Apr 2024 08:59:00 -0400
Subject: [PATCH] [MBQL lib] Column Extract from URL, email requires regex support (#41484)

Not all drivers support regular expressions. Don't return the Column
Extractions for URL -> (sub)domain and email -> domain unless the
database can support the regular expression matches.

Part of the follow-up for Extract Column epic #38964.
---
Review notes (text in this section is not part of the commit message):

- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Hunk line counts were checked against the hunk headers,
  but exact leading/alignment whitespace is a best guess; if context fails to
  match, try `git apply --ignore-whitespace`.
- `regex-available?` uses `contains?` instead of calling the `:features` value
  as a function, so a database whose metadata has no `:features` yields false
  instead of throwing. A docstring was added; the second and third hunk
  headers of column_extract.cljc and the diffstat were adjusted for the two
  extra lines. The `index` blob ids below still describe the upstream version.

 .../lib/drill_thru/column_extract.cljc        | 20 ++++-
 .../lib/drill_thru/column_extract_test.cljc   | 84 ++++++++++++++++---
 2 files changed, 90 insertions(+), 14 deletions(-)

diff --git a/src/metabase/lib/drill_thru/column_extract.cljc b/src/metabase/lib/drill_thru/column_extract.cljc
index a129b0f8196..63f021eac42 100644
--- a/src/metabase/lib/drill_thru/column_extract.cljc
+++ b/src/metabase/lib/drill_thru/column_extract.cljc
@@ -7,13 +7,18 @@
 
   Query transformation:
 
-  - Add an expression that extracts the specified value from this column."
+  - Add an expression that extracts the specified value from this column.
+
+  Extra constraints:
+
+  - Database must support `:regex` feature for the URL and Email extractions to work."
   (:require
    [medley.core :as m]
    [metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter]
    [metabase.lib.drill-thru.common :as lib.drill-thru.common]
    [metabase.lib.expression :as lib.expression]
    [metabase.lib.filter :as lib.filter]
+   [metabase.lib.metadata :as lib.metadata]
    [metabase.lib.metadata.calculation :as lib.metadata.calculation]
    [metabase.lib.schema :as lib.schema]
    [metabase.lib.schema.drill-thru :as lib.schema.drill-thru]
@@ -82,10 +87,19 @@
   ;;12 34 5 6 7 8 9 10
   #"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
 
-(defn- column-extract-drill-for-column [column]
+(defn- regex-available?
+  "True when the database for `metadata-providerable` lists `:regex` in its `:features`; a missing `:features` gives false."
+  [metadata-providerable]
+  (contains? (:features (lib.metadata/database metadata-providerable)) :regex))
+
+(defn- column-extract-drill-for-column [query column]
   (cond
     (lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
                                       :extractions (column-extract-temporal-units column)}
+
+    ;; The URL and email extractions are powered by regular expressions, and not every database supports those.
+    ;; If the target database doesn't support :regex feature, return nil.
+    (not (regex-available? query)) nil
     (lib.types.isa/email? column) {:display-name (i18n/tru "Extract domain")
                                    :extractions [{:key :email-domain
                                                   :display-name (i18n/tru "Domain")}]}
@@ -103,7 +117,7 @@
    stage-number :- :int
    {:keys [column column-ref value]} :- ::lib.schema.drill-thru/context]
   (when (and column (nil? value))
-    (when-let [drill (column-extract-drill-for-column column)]
+    (when-let [drill (column-extract-drill-for-column query column)]
       (merge drill
              {:lib/type :metabase.lib.drill-thru/drill-thru
               :type :drill-thru/column-extract}
diff --git a/test/metabase/lib/drill_thru/column_extract_test.cljc b/test/metabase/lib/drill_thru/column_extract_test.cljc
index f08d02d283e..f72427543e9 100644
--- a/test/metabase/lib/drill_thru/column_extract_test.cljc
+++ b/test/metabase/lib/drill_thru/column_extract_test.cljc
@@ -281,19 +281,26 @@
   #?(:clj @#'lib.drill-thru.column-extract/email->domain-regex
      :cljs lib.drill-thru.column-extract/email->domain-regex))
 
+(def ^:private homepage
+  (assoc (meta/field-metadata :people :email)
+         :id 9999001
+         :name "HOMEPAGE"
+         :display-name "Homepage URL"
+         :base-type :type/Text
+         :effective-type :type/Text
+         :semantic-type :type/URL))
+
+(defn- homepage-provider
+  ([] (homepage-provider meta/metadata-provider))
+  ([base-provider]
+   (lib/composed-metadata-provider
+     (lib.tu/mock-metadata-provider {:fields [homepage]})
+     base-provider)))
+
 (deftest ^:parallel column-extract-url->domain-test
   ;; There's no URL columns in the same dataset, but let's pretend there's one called People.HOMEPAGE.
-  (let [homepage (assoc (meta/field-metadata :people :email)
-                        :id 9999001
-                        :name "HOMEPAGE"
-                        :display-name "Homepage URL"
-                        :base-type :type/Text
-                        :effective-type :type/Text
-                        :semantic-type :type/URL)
-        mp (lib/composed-metadata-provider
-             (lib.tu/mock-metadata-provider {:fields [homepage]})
-             meta/metadata-provider)
-        query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
+  (let [mp (homepage-provider)
+        query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
     (testing "Extracting Domain"
       (lib.drill-thru.tu/test-drill-application
         {:drill-type :drill-thru/column-extract
@@ -329,6 +336,61 @@
                                                       (u/regex->str url->host-regex)]
                                                      (u/regex->str host->subdomain-regex)]]}]}}))))
 
+(deftest ^:parallel column-extract-url-requires-regex-test
+  (let [query-regex (lib/query (homepage-provider) (meta/table-metadata :people))
+        no-regex (homepage-provider (meta/updated-metadata-provider update :features disj :regex))
+        query-no-regex (lib/query no-regex (meta/table-metadata :people))]
+    (testing "when the database supports :regex URL extraction is available"
+      (lib.drill-thru.tu/test-drill-application
+        {:drill-type :drill-thru/column-extract
+         :click-type :header
+         :query-type :unaggregated
+         :column-name "HOMEPAGE"
+         :custom-query query-regex
+         :expected {:type :drill-thru/column-extract
+                    :display-name "Extract domain, subdomain…"
+                    :extractions [{:key :domain, :display-name "Domain"}
+                                  {:key :subdomain, :display-name "Subdomain"}]}
+         :drill-args ["subdomain"]
+         :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Subdomain"}
+                                                   [:regex-match-first {}
+                                                    [:field {} 9999001]
+                                                    (u/regex->str url->host-regex)]
+                                                   (u/regex->str host->subdomain-regex)]]}]}}))
+    (testing "when the database does not support :regex URL extraction is not available"
+      (lib.drill-thru.tu/test-drill-not-returned
+        {:drill-type :drill-thru/column-extract
+         :click-type :header
+         :query-type :unaggregated
+         :column-name "HOMEPAGE"
+         :custom-query query-no-regex}))))
+
+(deftest ^:parallel column-extract-email-requires-regex-test
+  (let [query-regex (lib/query meta/metadata-provider (meta/table-metadata :people))
+        no-regex (meta/updated-metadata-provider update :features disj :regex)
+        query-no-regex (lib/query no-regex (meta/table-metadata :people))]
+    (testing "when the database supports :regex email extraction is available"
+      (lib.drill-thru.tu/test-drill-application
+        {:drill-type :drill-thru/column-extract
+         :click-type :header
+         :query-type :unaggregated
+         :column-name "EMAIL"
+         :custom-query query-regex
+         :expected {:type :drill-thru/column-extract
+                    :display-name "Extract domain"
+                    :extractions [{:key :email-domain, :display-name "Domain"}]}
+         :drill-args ["email-domain"]
+         :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Domain"}
+                                                   [:field {} (meta/id :people :email)]
+                                                   (u/regex->str email->domain-regex)]]}]}}))
+    (testing "when the database does not support :regex email extraction is not available"
+      (lib.drill-thru.tu/test-drill-not-returned
+        {:drill-type :drill-thru/column-extract
+         :click-type :header
+         :query-type :unaggregated
+         :column-name "EMAIL"
+         :custom-query query-no-regex}))))
+
 (deftest ^:parallel url->host-regex-test
   (are [host url] (= host (second (re-find url->host-regex url)))
        "cdbaby.com" "https://cdbaby.com/some.txt"
-- 
GitLab