From 7a4918d0621ee35fb0a1a334a51032c120868e4d Mon Sep 17 00:00:00 2001 From: Braden Shepherdson <braden@metabase.com> Date: Mon, 18 Mar 2024 14:02:49 -0400 Subject: [PATCH] [MLv2] Support `Extract` action for (sub)domains from URLs (#40188) Milestone 2 of #38964. --- src/metabase/lib/common.cljc | 5 + .../lib/drill_thru/column_extract.cljc | 82 ++++++++-- src/metabase/lib/expression.cljc | 1 + src/metabase/util.cljc | 9 ++ .../lib/drill_thru/column_extract_test.cljc | 148 +++++++++++++++++- .../lib/drill_thru/test_util/canned.cljc | 64 +++++++- 6 files changed, 286 insertions(+), 23 deletions(-) diff --git a/src/metabase/lib/common.cljc b/src/metabase/lib/common.cljc index a10b5dab347..dca195c583a 100644 --- a/src/metabase/lib/common.cljc +++ b/src/metabase/lib/common.cljc @@ -5,6 +5,7 @@ [metabase.lib.options :as lib.options] [metabase.lib.ref :as lib.ref] [metabase.lib.schema.common :as schema.common] + [metabase.util :as u] [metabase.util.malli :as mu]) #?(:cljs (:require-macros [metabase.lib.common]))) @@ -40,6 +41,10 @@ [xs] (mapv ->op-arg xs)) +(defmethod ->op-arg :dispatch-type/regex + [regex] + (u/regex->str regex)) + (defmethod ->op-arg :metadata/column [field-metadata] (lib.ref/ref field-metadata)) diff --git a/src/metabase/lib/drill_thru/column_extract.cljc b/src/metabase/lib/drill_thru/column_extract.cljc index cf97dc249fc..597ad65735f 100644 --- a/src/metabase/lib/drill_thru/column_extract.cljc +++ b/src/metabase/lib/drill_thru/column_extract.cljc @@ -9,6 +9,7 @@ - Add an expression that extracts the specified value from this column." (:require + [medley.core :as m] [metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter] [metabase.lib.drill-thru.common :as lib.drill-thru.common] [metabase.lib.expression :as lib.expression] @@ -33,6 +34,50 @@ {:key unit :display-name (lib.temporal-bucket/describe-temporal-unit unit)})))) +(def ^:private url->host-regex + ;; protocol host etc. + #"^(?:[^:/?#]*:?//)?([^/?#]*).*$") + +(def ^:private host->domain-regex + ;; Deliberately no ^ at the start; there might be several subdomains before this spot. + ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk. + ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops. + ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list + ;; from Mozilla or accept that this regex is a bit best-effort. + + ;; Skip www domain maybe short tail TLD + #"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$") + +(def ^:private host->subdomain-regex + ;; This grabs the first segment that isn't "www", AND excludes the main domain name. + ;; See [[host->domain-regex]] for more details about how those are matched. + ;; Referencing the indexes below: + ;; 1. Only at the start of the input + ;; 2. Consume "www." if present + ;; 3. Start capturing the subdomain we want + ;; 4. Negative lookahead: That subdomain can't be "www"; we don't want to backtrack and find "www". + ;; 5. Negative lookahead to make sure this isn't the proper domain: + ;; 6. Main domain name + ;; 7. Optional short tail (eg. co.uk) + ;; 8. Top-level domain, ending the input + ;; 9. Matching the actual subdomain + ;; 10. And its dot, which is outside the capture. + ;;12 34 5 6 7 8 9 10 + #"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.") + +;; Full size, I think we can get away with a simpler one - just the first match that isn't the main domain or www. +#_#"^(?:www\.)?((?!www\.)(?!(?:[^\.]+\.[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.(?:[^\.]+\.)+(?:[^\.]{1,3}\.)?[^\.]+$" + +(defn- column-extract-drill-for-column [column] + (cond + (lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…") + :extractions (column-extract-temporal-units column)} + (lib.types.isa/URL? column) {:display-name (i18n/tru "Extract domain, subdomain…") + :extractions [{:key :domain + :display-name (i18n/tru "Domain")} + {:key :subdomain + :display-name (i18n/tru "Subdomain")}]})) + (mu/defn column-extract-drill :- [:maybe ::lib.schema.drill-thru/drill-thru.column-extract] "Column clicks on temporal columns only. @@ -40,15 +85,13 @@ [query :- ::lib.schema/query stage-number :- :int {:keys [column column-ref value]} :- ::lib.schema.drill-thru/context] - (when (and column - (nil? value) - (lib.types.isa/temporal? column)) - (merge {:lib/type :metabase.lib.drill-thru/drill-thru - :type :drill-thru/column-extract - :display-name (i18n/tru "Extract day, month…") - :extractions (column-extract-temporal-units column)} - (lib.drill-thru.column-filter/prepare-query-for-drill-addition - query stage-number column column-ref :expression)))) + (when (and column (nil? value)) + (when-let [drill (column-extract-drill-for-column column)] + (merge drill + {:lib/type :metabase.lib.drill-thru/drill-thru + :type :drill-thru/column-extract} + (lib.drill-thru.column-filter/prepare-query-for-drill-addition + query stage-number column column-ref :expression))))) (defmethod lib.drill-thru.common/drill-thru-info-method :drill-thru/column-extract [_query _stage-number drill] @@ -64,17 +107,26 @@ (defn- extraction-expression [column tag] (case tag + ;; Temporal extractions :hour-of-day (lib.expression/get-hour column) :day-of-month (lib.expression/get-day column) :day-of-week (case-expression #(lib.expression/get-day-of-week column) tag 7) :month-of-year (case-expression #(lib.expression/get-month column) tag 12) :quarter-of-year (case-expression #(lib.expression/get-quarter column) tag 4) - :year (lib.expression/get-year column))) + :year (lib.expression/get-year column) + ;; URLs + :domain (-> column + (lib.expression/regex-match-first url->host-regex) + (lib.expression/regex-match-first host->domain-regex)) + :subdomain (-> column + (lib.expression/regex-match-first url->host-regex) + (lib.expression/regex-match-first host->subdomain-regex)))) (defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract - [_query _stage-number {:keys [query stage-number column]} & [tag]] - (let [unit (keyword tag) - unique-name-fn (lib.util/unique-name-generator)] + [_query _stage-number {:keys [query stage-number column extractions]} & [tag]] + (let [tag (keyword tag) + {:keys [display-name]} (m/find-first #(= (:key %) tag) extractions) + unique-name-fn (lib.util/unique-name-generator)] (doseq [col-name (->> (lib.util/query-stage query stage-number) (lib.metadata.calculation/returned-columns query stage-number) (map :name))] @@ -82,5 +134,5 @@ (lib.expression/expression query stage-number - (unique-name-fn (lib.temporal-bucket/describe-temporal-unit unit)) - (extraction-expression column unit)))) + (unique-name-fn display-name) + (extraction-expression column tag)))) diff --git a/src/metabase/lib/expression.cljc b/src/metabase/lib/expression.cljc index 33cf179a048..9c5c21945d5 100644 --- a/src/metabase/lib/expression.cljc +++ b/src/metabase/lib/expression.cljc @@ -279,6 +279,7 @@ (lib.common/defop substring [s start end]) (lib.common/defop replace [s search replacement]) (lib.common/defop regexextract [s regex]) +(lib.common/defop regex-match-first [s regex]) (lib.common/defop length [s]) (lib.common/defop trim [s]) (lib.common/defop ltrim [s]) diff --git a/src/metabase/util.cljc b/src/metabase/util.cljc index 93f3f0ce295..2da4c4ddca4 100644 --- a/src/metabase/util.cljc +++ b/src/metabase/util.cljc @@ -179,6 +179,15 @@ (str (upper-case-en (subs s 0 1)) (lower-case-en (subs s 1)))))) +(defn regex->str + "Returns the contents of a regex as a string. + + This is simply [[str]] in Clojure but needs to remove slashes (`\"/regex contents/\"`) in CLJS." + [regex] + #?(:clj (str regex) + :cljs (let [s (str regex)] + (subs s 1 (dec (count s)))))) + ;;; define custom CSK conversion functions so we don't run into problems if the system locale is Turkish ;; so Kondo doesn't complain diff --git a/test/metabase/lib/drill_thru/column_extract_test.cljc b/test/metabase/lib/drill_thru/column_extract_test.cljc index 86674f5a64a..8135aa6c5b9 100644 --- a/test/metabase/lib/drill_thru/column_extract_test.cljc +++ b/test/metabase/lib/drill_thru/column_extract_test.cljc @@ -1,16 +1,18 @@ (ns metabase.lib.drill-thru.column-extract-test "See also [[metabase.query-processor-test.drill-thru-e2e-test/quick-filter-on-bucketed-date-test]]" (:require - [clojure.test :refer [deftest testing]] + [clojure.test :refer [are deftest testing]] [medley.core :as m] [metabase.lib.core :as lib] + [metabase.lib.drill-thru.column-extract :as lib.drill-thru.column-extract] [metabase.lib.drill-thru.test-util :as lib.drill-thru.tu] [metabase.lib.drill-thru.test-util.canned :as canned] [metabase.lib.metadata :as lib.metadata] [metabase.lib.test-metadata :as meta] + [metabase.lib.test-util :as lib.tu] + [metabase.util :as u] #?@(:clj ([metabase.test :as mt]) - :cljs ([metabase.test-runner.assert-exprs.approximately-equal])) - [metabase.lib.test-util :as lib.tu])) + :cljs ([metabase.test-runner.assert-exprs.approximately-equal])))) #?(:cljs (comment metabase.test-runner.assert-exprs.approximately-equal/keep-me)) @@ -28,12 +30,13 @@ (concat time-extraction-units date-extraction-units)) (deftest ^:parallel column-extract-availability-test - (testing "column-extract is avaiable for column clicks on temporal columns" + (testing "column-extract is available for column clicks on temporal and URL columns" (canned/canned-test :drill-thru/column-extract - (fn [_test-case _context {:keys [click column-type]}] + (fn [_test-case {:keys [column] :as _context} {:keys [click column-type]}] (and (= click :header) - (= column-type :datetime)))))) + (or (= column-type :datetime) + (= (:semantic-type column) :type/URL))))))) (deftest ^:parallel returns-column-extract-test-1 (lib.drill-thru.tu/test-returns-drill @@ -261,3 +264,136 @@ :custom-query query :expected {:type :drill-thru/column-extract :extractions date-extraction-units}}))) + +(def ^:private url->host-regex + #?(:clj @#'lib.drill-thru.column-extract/url->host-regex + :cljs lib.drill-thru.column-extract/url->host-regex)) + +(def ^:private host->domain-regex + #?(:clj @#'lib.drill-thru.column-extract/host->domain-regex + :cljs lib.drill-thru.column-extract/host->domain-regex)) + +(def ^:private host->subdomain-regex + #?(:clj @#'lib.drill-thru.column-extract/host->subdomain-regex + :cljs lib.drill-thru.column-extract/host->subdomain-regex)) + +(deftest ^:parallel column-extract-url->domain-test + ;; There's no URL columns in the same dataset, but let's pretend there's one called People.HOMEPAGE. + (let [homepage (assoc (meta/field-metadata :people :email) + :id 9999001 + :name "HOMEPAGE" + :display-name "Homepage URL" + :base-type :type/Text + :effective-type :type/Text + :semantic-type :type/URL) + mp (lib/composed-metadata-provider + (lib.tu/mock-metadata-provider {:fields [homepage]}) + meta/metadata-provider) + query (lib/query mp (lib.metadata/table mp (meta/id :people)))] + (testing "Extracting Domain" + (lib.drill-thru.tu/test-drill-application + {:drill-type :drill-thru/column-extract + :click-type :header + :query-type :unaggregated + :column-name "HOMEPAGE" + :custom-query query + :expected {:type :drill-thru/column-extract + :display-name "Extract domain, subdomain…" + :extractions [{:key :domain, :display-name "Domain"} + {:key :subdomain, :display-name "Subdomain"}]} + :drill-args ["domain"] + :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Domain"} + [:regex-match-first {} + [:field {} 9999001] + (u/regex->str url->host-regex)] + (u/regex->str host->domain-regex)]]}]}})) + (testing "Extracting Subdomain" + (lib.drill-thru.tu/test-drill-application + {:drill-type :drill-thru/column-extract + :click-type :header + :query-type :unaggregated + :column-name "HOMEPAGE" + :custom-query query + :expected {:type :drill-thru/column-extract + :display-name "Extract domain, subdomain…" + :extractions [{:key :domain, :display-name "Domain"} + {:key :subdomain, :display-name "Subdomain"}]} + :drill-args ["subdomain"] + :expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Subdomain"} + [:regex-match-first {} + [:field {} 9999001] + (u/regex->str url->host-regex)] + (u/regex->str host->subdomain-regex)]]}]}})))) + +(deftest ^:parallel url->host-regex-test + (are [host url] (= host (second (re-find url->host-regex url))) + "cdbaby.com" "https://cdbaby.com/some.txt" + "fema.gov" "https://fema.gov/some/path/Vatini?search=foo" + "www.geocities.jp" "https://www.geocities.jp/some/path/Turbitt?search=foo" + "jalbum.net" "https://jalbum.net/some/path/Kirsz?search=foo" + "usa.gov" "https://usa.gov/some/path/Curdell?search=foo" + "taxes.va.gov" "http://taxes.va.gov/some/path/Marritt?search=foo" + "log.stuff.gmpg.org" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo" + "hatena.ne.jp" "http://hatena.ne.jp/" + "telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail" + "bbc.co.uk" "bbc.co.uk/some/path?search=foo")) + +(deftest ^:parallel host->domain-regex-test + (are [domain host] (= domain (second (re-find host->domain-regex host))) + ;; Easy cases: second-last part is the domain. + "cdbaby" "cdbaby.com" + "fema" "fema.gov" + "geocities" "www.geocities.jp" + "jalbum" "sub.jalbum.net" + "jalbum" "subdomains.go.here.jalbum.net" + "gmpg" "log.stuff.gmpg.org" + + ;; The second-last part is the domain even if it's short, sometimes. + "usa" "usa.gov" + "va" "va.gov" + + ;; Oops, we picked a subdomain! But see below. + "taxes" "taxes.va.gov" ; True domain is va + "hatena" "hatena.ne.jp" ; True domain is ne + + ;; Sometimes the second-last part is a short suffix. + ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database, + ;; we use a best-effort matcher that gets the domain right most of the time. + "telegraph" "telegraph.co.uk" + "bbc" "bbc.co.uk" + "dot" "dot.va.gov" + + ;; "www" is disregarded as a possible subdomain. + "usa" "www.usa.gov" + "va" "www.va.gov" + "dot" "www.dot.va.gov")) + +(deftest ^:parallel host->subdomain-regex-test + (are [subdomain host] (= subdomain (second (re-find host->subdomain-regex host))) + ;; Blanks. "www" doesn't count. + nil "cdbaby.com" + nil "fema.gov" + nil "www.geocities.jp" + nil "usa.gov" + nil "va.gov" + + ;; Basics - taking the first segment that isn't "www", IF it isn't the domain. + "sub" "sub.jalbum.net" + "subdomains" "subdomains.go.here.jalbum.net" + "log" "log.stuff.gmpg.org" + + ;; Oops, we missed those. This is the reverse of the problem when picking the domain. + nil "taxes.va.gov" ; True domain is va, subdomain is taxes. + nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena. + + ;; Sometimes the second-last part is a short suffix. + ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database, + ;; we use a best-effort matcher that gets the domain right most of the time. + nil "telegraph.co.uk" + "local" "local.news.telegraph.co.uk" + nil "bbc.co.uk" + "video" "video.bbc.co.uk" + ;; "www" is disregarded as a possible subdomain, so these are also incorrect. + nil "www.usa.gov" + nil "www.dot.va.gov" + "licensing" "www.licensing.dot.va.gov")) diff --git a/test/metabase/lib/drill_thru/test_util/canned.cljc b/test/metabase/lib/drill_thru/test_util/canned.cljc index 4ac499acb1b..737cf2dd464 100644 --- a/test/metabase/lib/drill_thru/test_util/canned.cljc +++ b/test/metabase/lib/drill_thru/test_util/canned.cljc @@ -174,7 +174,26 @@ "PRODUCT_ID" nil "CREATED_AT" "2024-09-08T22:03:20.239+03:00"} :aggregations 0 - :breakouts 0}})) + :breakouts 0} + + :test.query/people + {:query (lib/query metadata-provider (meta/table-metadata :people)) + :row {"ID" "222" + "NAME" "J. Some Guy" + "EMAIL" "someguy@isp.com" + "PASSWORD" "eafc45bf-cf8e-4c96-ab35-ce44d0021597" + "ADDRESS" "2112 Rush St" + "CITY" "Portland" + "STATE" "ME" + "ZIP" "66223" + "LATITUDE" 43.6307309 + "LONGITUDE" -70.8311294 + "SOURCE" "Facebook" + "BIRTH_DATE" "1987-06-14T00:00:00Z" + "CREATED_AT" "2024-09-08T22:03:20.239-04:00"} + :aggregations 0 + :breakouts 0} + })) (defn returned "Given a test case, a context, and a target drill type (eg. `:drill-thru/quick-filter`), calls [[lib/available-drill-thrus]] and looks for the specified drill. @@ -309,6 +328,37 @@ (click tc :header "PRODUCT_ID" :basic :fk) (click tc :header "CREATED_AT" :basic :datetime)]) + ;; Simple query against People. + ;; This one has a :type/Email (EMAIL) for Column Extract drills. + (let [tc (test-case metadata-provider :test.query/people)] + [(click tc :cell "ID" :basic :pk) + (click tc :cell "ADDRESS" :basic :string) + (click tc :cell "EMAIL" :basic :string) + (click tc :cell "PASSWORD" :basic :string) + (click tc :cell "NAME" :basic :string) + (click tc :cell "CITY" :basic :string) + (click tc :cell "STATE" :basic :string) + (click tc :cell "ZIP" :basic :string) + (click tc :cell "LATITUDE" :basic :number) + (click tc :cell "LONGITUDE" :basic :number) + (click tc :cell "SOURCE" :basic :string) + (click tc :cell "BIRTH_DATE" :basic :datetime) + (click tc :cell "CREATED_AT" :basic :datetime) + + (click tc :header "ID" :basic :pk) + (click tc :header "ADDRESS" :basic :string) + (click tc :header "EMAIL" :basic :string) + (click tc :header "PASSWORD" :basic :string) + (click tc :header "NAME" :basic :string) + (click tc :header "CITY" :basic :string) + (click tc :header "STATE" :basic :string) + (click tc :header "ZIP" :basic :string) + (click tc :header "LATITUDE" :basic :number) + (click tc :header "LONGITUDE" :basic :number) + (click tc :header "SOURCE" :basic :string) + (click tc :header "BIRTH_DATE" :basic :datetime) + (click tc :header "CREATED_AT" :basic :datetime)]) + ;; Simple query against Products, but it lies! ;; Claims VENDOR is :type/SerializedJSON (derives from :type/Structured). (let [tc (-> metadata-provider @@ -317,7 +367,17 @@ :semantic-type :type/SerializedJSON}]}) (test-case :test.query/products))] [(click tc :cell "VENDOR" :basic :string) - (click tc :header "VENDOR" :basic :string)])] + (click tc :header "VENDOR" :basic :string)]) + + ;; Simple query against People, but it lies! + ;; Claims EMAIL is :type/URL (relevant to Column Extract drills). + (let [tc (-> metadata-provider + (merged-mock/merged-mock-metadata-provider + {:fields [{:id (meta/id :people :email) + :semantic-type :type/URL}]}) + (test-case :test.query/people))] + [(click tc :cell "EMAIL" :basic :string) + (click tc :header "EMAIL" :basic :string)])] (apply concat)))) (defn canned-test -- GitLab