From 0bde1fc9ebadd706ca4a93f3d556422549e77a2b Mon Sep 17 00:00:00 2001
From: Braden Shepherdson <braden@metabase.com>
Date: Wed, 17 Apr 2024 08:59:00 -0400
Subject: [PATCH] [MBQL lib] Column Extract from URL, email requires regex
 support (#41484)

Not all drivers support regular expressions. Don't return the
Column Extractions for URL -> (sub)domain and email -> domain unless the
database can support the regular expression matches.

Part of the follow-up for Extract Column epic #38964.
---
 .../lib/drill_thru/column_extract.cljc        | 18 +++-
 .../lib/drill_thru/column_extract_test.cljc   | 84 ++++++++++++++++---
 2 files changed, 88 insertions(+), 14 deletions(-)

diff --git a/src/metabase/lib/drill_thru/column_extract.cljc b/src/metabase/lib/drill_thru/column_extract.cljc
index a129b0f8196..63f021eac42 100644
--- a/src/metabase/lib/drill_thru/column_extract.cljc
+++ b/src/metabase/lib/drill_thru/column_extract.cljc
@@ -7,13 +7,18 @@
 
   Query transformation:
 
-  - Add an expression that extracts the specified value from this column."
+  - Add an expression that extracts the specified value from this column.
+
+  Extra constraints:
+
+  - Database must support `:regex` feature for the URL and Email extractions to work."
   (:require
    [medley.core :as m]
    [metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter]
    [metabase.lib.drill-thru.common :as lib.drill-thru.common]
    [metabase.lib.expression :as lib.expression]
    [metabase.lib.filter :as lib.filter]
+   [metabase.lib.metadata :as lib.metadata]
    [metabase.lib.metadata.calculation :as lib.metadata.calculation]
    [metabase.lib.schema :as lib.schema]
    [metabase.lib.schema.drill-thru :as lib.schema.drill-thru]
@@ -82,10 +87,17 @@
   ;;12         34        5  6       7                8       9      10
   #"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
 
-(defn- column-extract-drill-for-column [column]
+(defn- regex-available? [metadata-providerable] ; true iff the database's `:features` include `:regex`; nil-safe
+  (contains? (:features (lib.metadata/database metadata-providerable)) :regex))
+
+(defn- column-extract-drill-for-column [query column]
   (cond
     (lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
                                       :extractions  (column-extract-temporal-units column)}
+
+    ;; The URL and email extractions are powered by regular expressions, and not every database supports those.
+    ;; If the target database doesn't support :regex feature, return nil.
+    (not (regex-available? query))   nil
     (lib.types.isa/email? column)    {:display-name (i18n/tru "Extract domain")
                                       :extractions  [{:key          :email-domain
                                                       :display-name (i18n/tru "Domain")}]}
@@ -103,7 +115,7 @@
    stage-number                :- :int
    {:keys [column column-ref value]} :- ::lib.schema.drill-thru/context]
   (when (and column (nil? value))
-    (when-let [drill (column-extract-drill-for-column column)]
+    (when-let [drill (column-extract-drill-for-column query column)]
       (merge drill
              {:lib/type :metabase.lib.drill-thru/drill-thru
               :type     :drill-thru/column-extract}
diff --git a/test/metabase/lib/drill_thru/column_extract_test.cljc b/test/metabase/lib/drill_thru/column_extract_test.cljc
index f08d02d283e..f72427543e9 100644
--- a/test/metabase/lib/drill_thru/column_extract_test.cljc
+++ b/test/metabase/lib/drill_thru/column_extract_test.cljc
@@ -281,19 +281,26 @@
   #?(:clj  @#'lib.drill-thru.column-extract/email->domain-regex
      :cljs lib.drill-thru.column-extract/email->domain-regex))
 
+(def ^:private homepage ; pretend URL-typed People.HOMEPAGE column, cloned from People.EMAIL's metadata
+  (merge (meta/field-metadata :people :email)
+         {:id             9999001
+          :name           "HOMEPAGE"
+          :display-name   "Homepage URL"
+          :base-type      :type/Text
+          :effective-type :type/Text
+          :semantic-type  :type/URL}))
+
+(defn- homepage-provider
+  "Composes a mock provider holding `homepage` with `base-provider` (default: `meta/metadata-provider`)."
+  ([] (homepage-provider meta/metadata-provider))
+  ([base-provider]
+   (let [homepage-mp (lib.tu/mock-metadata-provider {:fields [homepage]})]
+     (lib/composed-metadata-provider homepage-mp base-provider))))
+
 (deftest ^:parallel column-extract-url->domain-test
   ;; There's no URL columns in the same dataset, but let's pretend there's one called People.HOMEPAGE.
-  (let [homepage (assoc (meta/field-metadata :people :email)
-                        :id             9999001
-                        :name           "HOMEPAGE"
-                        :display-name   "Homepage URL"
-                        :base-type      :type/Text
-                        :effective-type :type/Text
-                        :semantic-type  :type/URL)
-        mp       (lib/composed-metadata-provider
-                   (lib.tu/mock-metadata-provider {:fields [homepage]})
-                   meta/metadata-provider)
-        query    (lib/query mp (lib.metadata/table mp (meta/id :people)))]
+  (let [mp    (homepage-provider)
+        query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
     (testing "Extracting Domain"
       (lib.drill-thru.tu/test-drill-application
         {:drill-type     :drill-thru/column-extract
@@ -329,6 +336,61 @@
                                                     (u/regex->str url->host-regex)]
                                                    (u/regex->str host->subdomain-regex)]]}]}}))))
 
+(deftest ^:parallel column-extract-url-requires-regex-test ; same HOMEPAGE header click, with and without :regex
+  (let [query-regex    (lib/query (homepage-provider) (meta/table-metadata :people))
+        no-regex       (homepage-provider (meta/updated-metadata-provider update :features disj :regex)) ; :regex disj'd out of :features
+        query-no-regex (lib/query no-regex (meta/table-metadata :people))]
+    (testing "when the database supports :regex URL extraction is available"
+      (lib.drill-thru.tu/test-drill-application
+        {:drill-type     :drill-thru/column-extract
+         :click-type     :header
+         :query-type     :unaggregated
+         :column-name    "HOMEPAGE"
+         :custom-query   query-regex
+         :expected       {:type         :drill-thru/column-extract
+                          :display-name "Extract domain, subdomain…"
+                          :extractions  [{:key :domain,    :display-name "Domain"}
+                                         {:key :subdomain, :display-name "Subdomain"}]}
+         :drill-args     ["subdomain"]
+         :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Subdomain"}
+                                                   [:regex-match-first {} ; inner match uses url->host-regex, outer uses host->subdomain-regex
+                                                    [:field {} 9999001] ; the :id assigned to the mocked HOMEPAGE field
+                                                    (u/regex->str url->host-regex)]
+                                                   (u/regex->str host->subdomain-regex)]]}]}}))
+    (testing "when the database does not support :regex URL extraction is not available"
+      (lib.drill-thru.tu/test-drill-not-returned
+        {:drill-type     :drill-thru/column-extract
+         :click-type     :header
+         :query-type     :unaggregated
+         :column-name    "HOMEPAGE"
+         :custom-query   query-no-regex}))))
+
+(deftest ^:parallel column-extract-email-requires-regex-test ; same EMAIL header click, with and without :regex
+  (let [query-regex    (lib/query meta/metadata-provider (meta/table-metadata :people))
+        no-regex       (meta/updated-metadata-provider update :features disj :regex) ; :regex disj'd out of :features
+        query-no-regex (lib/query no-regex (meta/table-metadata :people))]
+    (testing "when the database supports :regex email extraction is available"
+      (lib.drill-thru.tu/test-drill-application
+        {:drill-type     :drill-thru/column-extract
+         :click-type     :header
+         :query-type     :unaggregated
+         :column-name    "EMAIL"
+         :custom-query   query-regex
+         :expected       {:type         :drill-thru/column-extract
+                          :display-name "Extract domain"
+                          :extractions  [{:key :email-domain, :display-name "Domain"}]}
+         :drill-args     ["email-domain"]
+         :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Domain"}
+                                                   [:field {} (meta/id :people :email)]
+                                                   (u/regex->str email->domain-regex)]]}]}})) ; single match straight on the EMAIL field
+    (testing "when the database does not support :regex email extraction is not available"
+      (lib.drill-thru.tu/test-drill-not-returned
+        {:drill-type     :drill-thru/column-extract
+         :click-type     :header
+         :query-type     :unaggregated
+         :column-name    "EMAIL"
+         :custom-query   query-no-regex}))))
+
 (deftest ^:parallel url->host-regex-test
   (are [host url] (= host (second (re-find url->host-regex url)))
        "cdbaby.com"         "https://cdbaby.com/some.txt"
-- 
GitLab