Skip to content
Snippets Groups Projects
Unverified Commit 0bde1fc9 authored by Braden Shepherdson's avatar Braden Shepherdson Committed by GitHub
Browse files

[MBQL lib] Column Extract from URL, email requires regex support (#41484)

Not all drivers support regular expressions. Don't return the
Column Extractions for URL -> (sub)domain and email -> domain unless the
database can support the regular expression matches.

Part of the follow-up for Extract Column epic #38964.
parent 320307f7
No related branches found
No related tags found
No related merge requests found
......@@ -7,13 +7,18 @@
Query transformation:
- Add an expression that extracts the specified value from this column."
- Add an expression that extracts the specified value from this column.
Extra constraints:
- Database must support `:regex` feature for the URL and Email extractions to work."
(:require
[medley.core :as m]
[metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter]
[metabase.lib.drill-thru.common :as lib.drill-thru.common]
[metabase.lib.expression :as lib.expression]
[metabase.lib.filter :as lib.filter]
[metabase.lib.metadata :as lib.metadata]
[metabase.lib.metadata.calculation :as lib.metadata.calculation]
[metabase.lib.schema :as lib.schema]
[metabase.lib.schema.drill-thru :as lib.schema.drill-thru]
......@@ -82,10 +87,17 @@
;;12 34 5 6 7 8 9 10
#"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
(defn- column-extract-drill-for-column [column]
(defn- regex-available?
  "Returns true if the database for `metadata-providerable` lists `:regex` among its `:features`, false otherwise.

  The membership check uses `contains?` instead of invoking the `:features` value as a function, so database
  metadata with no `:features` at all is treated as lacking regex support rather than throwing.
  NOTE(review): assumes `:features` is a set when present -- confirm against the metadata schema."
  [metadata-providerable]
  (-> (lib.metadata/database metadata-providerable)
      :features
      (contains? :regex)))
(defn- column-extract-drill-for-column [query column]
(cond
(lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
:extractions (column-extract-temporal-units column)}
;; The URL and email extractions are powered by regular expressions, and not every database supports those.
;; If the target database doesn't support :regex feature, return nil.
(not (regex-available? query)) nil
(lib.types.isa/email? column) {:display-name (i18n/tru "Extract domain")
:extractions [{:key :email-domain
:display-name (i18n/tru "Domain")}]}
......@@ -103,7 +115,7 @@
stage-number :- :int
{:keys [column column-ref value]} :- ::lib.schema.drill-thru/context]
(when (and column (nil? value))
(when-let [drill (column-extract-drill-for-column column)]
(when-let [drill (column-extract-drill-for-column query column)]
(merge drill
{:lib/type :metabase.lib.drill-thru/drill-thru
:type :drill-thru/column-extract}
......
......@@ -281,19 +281,26 @@
#?(:clj @#'lib.drill-thru.column-extract/email->domain-regex
:cljs lib.drill-thru.column-extract/email->domain-regex))
(def ^:private homepage
  "Mock field metadata for a pretend HOMEPAGE column: the People.EMAIL field metadata with its id, names and types
  overridden so that it is a text column with semantic type `:type/URL`."
  (merge (meta/field-metadata :people :email)
         {:id             9999001
          :name           "HOMEPAGE"
          :display-name   "Homepage URL"
          :base-type      :type/Text
          :effective-type :type/Text
          :semantic-type  :type/URL}))
(defn- homepage-provider
  "Returns a composed metadata provider made of a mock provider holding the `homepage` field, followed by
  `base-provider`. With no arguments, `meta/metadata-provider` is used as the base."
  ([]
   (homepage-provider meta/metadata-provider))
  ([base-provider]
   (let [homepage-fields (lib.tu/mock-metadata-provider {:fields [homepage]})]
     (lib/composed-metadata-provider homepage-fields base-provider))))
(deftest ^:parallel column-extract-url->domain-test
;; There are no URL columns in the sample dataset, so let's pretend there's one called People.HOMEPAGE.
(let [homepage (assoc (meta/field-metadata :people :email)
:id 9999001
:name "HOMEPAGE"
:display-name "Homepage URL"
:base-type :type/Text
:effective-type :type/Text
:semantic-type :type/URL)
mp (lib/composed-metadata-provider
(lib.tu/mock-metadata-provider {:fields [homepage]})
meta/metadata-provider)
query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
(let [mp (homepage-provider)
query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
(testing "Extracting Domain"
(lib.drill-thru.tu/test-drill-application
{:drill-type :drill-thru/column-extract
......@@ -329,6 +336,61 @@
(u/regex->str url->host-regex)]
(u/regex->str host->subdomain-regex)]]}]}}))))
(deftest ^:parallel column-extract-url-requires-regex-test
  ;; The same header click on HOMEPAGE is checked against two queries: one on the stock metadata, and one whose
  ;; database has had `:regex` removed from its `:features`.
  (let [people         (meta/table-metadata :people)
        with-regex     (lib/query (homepage-provider) people)
        without-regex  (-> (meta/updated-metadata-provider update :features disj :regex)
                           homepage-provider
                           (lib/query people))
        click          {:drill-type  :drill-thru/column-extract
                        :click-type  :header
                        :query-type  :unaggregated
                        :column-name "HOMEPAGE"}
        ;; Subdomain extraction is a regex match applied to the result of the host regex match.
        host-expr      [:regex-match-first {}
                        [:field {} 9999001]
                        (u/regex->str url->host-regex)]
        subdomain-expr [:regex-match-first {:lib/expression-name "Subdomain"}
                        host-expr
                        (u/regex->str host->subdomain-regex)]]
    (testing "when the database supports :regex URL extraction is available"
      (lib.drill-thru.tu/test-drill-application
       (merge click
              {:custom-query   with-regex
               :expected       {:type         :drill-thru/column-extract
                                :display-name "Extract domain, subdomain…"
                                :extractions  [{:key :domain,    :display-name "Domain"}
                                               {:key :subdomain, :display-name "Subdomain"}]}
               :drill-args     ["subdomain"]
               :expected-query {:stages [{:expressions [subdomain-expr]}]}})))
    (testing "when the database does not support :regex URL extraction is not available"
      (lib.drill-thru.tu/test-drill-not-returned
       (assoc click :custom-query without-regex)))))
(deftest ^:parallel column-extract-email-requires-regex-test
  ;; The same header click on EMAIL is checked against two queries: one on the stock metadata, and one whose
  ;; database has had `:regex` removed from its `:features`.
  (let [people        (meta/table-metadata :people)
        with-regex    (lib/query meta/metadata-provider people)
        without-regex (lib/query (meta/updated-metadata-provider update :features disj :regex) people)
        click         {:drill-type  :drill-thru/column-extract
                       :click-type  :header
                       :query-type  :unaggregated
                       :column-name "EMAIL"}
        domain-expr   [:regex-match-first {:lib/expression-name "Domain"}
                       [:field {} (meta/id :people :email)]
                       (u/regex->str email->domain-regex)]]
    (testing "when the database supports :regex email extraction is available"
      (lib.drill-thru.tu/test-drill-application
       (merge click
              {:custom-query   with-regex
               :expected       {:type         :drill-thru/column-extract
                                :display-name "Extract domain"
                                :extractions  [{:key :email-domain, :display-name "Domain"}]}
               :drill-args     ["email-domain"]
               :expected-query {:stages [{:expressions [domain-expr]}]}})))
    (testing "when the database does not support :regex email extraction is not available"
      (lib.drill-thru.tu/test-drill-not-returned
       (assoc click :custom-query without-regex)))))
(deftest ^:parallel url->host-regex-test
(are [host url] (= host (second (re-find url->host-regex url)))
"cdbaby.com" "https://cdbaby.com/some.txt"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment