Skip to content
Snippets Groups Projects
Unverified Commit 7a4918d0 authored by Braden Shepherdson's avatar Braden Shepherdson Committed by GitHub
Browse files

[MLv2] Support `Extract` action for (sub)domains from URLs (#40188)

Milestone 2 of #38964.
parent b04e1a35
No related branches found
No related tags found
No related merge requests found
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
[metabase.lib.options :as lib.options] [metabase.lib.options :as lib.options]
[metabase.lib.ref :as lib.ref] [metabase.lib.ref :as lib.ref]
[metabase.lib.schema.common :as schema.common] [metabase.lib.schema.common :as schema.common]
[metabase.util :as u]
[metabase.util.malli :as mu]) [metabase.util.malli :as mu])
#?(:cljs (:require-macros [metabase.lib.common]))) #?(:cljs (:require-macros [metabase.lib.common])))
...@@ -40,6 +41,10 @@ ...@@ -40,6 +41,10 @@
[xs] [xs]
(mapv ->op-arg xs)) (mapv ->op-arg xs))
(defmethod ->op-arg :dispatch-type/regex
[regex]
(u/regex->str regex))
(defmethod ->op-arg :metadata/column (defmethod ->op-arg :metadata/column
[field-metadata] [field-metadata]
(lib.ref/ref field-metadata)) (lib.ref/ref field-metadata))
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
- Add an expression that extracts the specified value from this column." - Add an expression that extracts the specified value from this column."
(:require (:require
[medley.core :as m]
[metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter] [metabase.lib.drill-thru.column-filter :as lib.drill-thru.column-filter]
[metabase.lib.drill-thru.common :as lib.drill-thru.common] [metabase.lib.drill-thru.common :as lib.drill-thru.common]
[metabase.lib.expression :as lib.expression] [metabase.lib.expression :as lib.expression]
...@@ -33,6 +34,50 @@ ...@@ -33,6 +34,50 @@
{:key unit {:key unit
:display-name (lib.temporal-bucket/describe-temporal-unit unit)})))) :display-name (lib.temporal-bucket/describe-temporal-unit unit)}))))
(def ^:private url->host-regex
;; protocol host etc.
#"^(?:[^:/?#]*:?//)?([^/?#]*).*$")
(def ^:private host->domain-regex
;; Deliberately no ^ at the start; there might be several subdomains before this spot.
;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
;; from Mozilla or accept that this regex is a bit best-effort.
;; Skip www domain maybe short tail TLD
#"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$")
(def ^:private host->subdomain-regex
;; This grabs the first segment that isn't "www", AND excludes the main domain name.
;; See [[host->domain-regex]] for more details about how those are matched.
;; Referencing the indexes below:
;; 1. Only at the start of the input
;; 2. Consume "www." if present
;; 3. Start capturing the subdomain we want
;; 4. Negative lookahead: That subdomain can't be "www"; we don't want to backtrack and find "www".
;; 5. Negative lookahead to make sure this isn't the proper domain:
;; 6. Main domain name
;; 7. Optional short tail (eg. co.uk)
;; 8. Top-level domain, ending the input
;; 9. Matching the actual subdomain
;; 10. And its dot, which is outside the capture.
;;12 34 5 6 7 8 9 10
#"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
;; Full size, I think we can get away with a simpler one - just the first match that isn't the main domain or www.
#_#"^(?:www\.)?((?!www\.)(?!(?:[^\.]+\.[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.(?:[^\.]+\.)+(?:[^\.]{1,3}\.)?[^\.]+$"
(defn- column-extract-drill-for-column [column]
(cond
(lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
:extractions (column-extract-temporal-units column)}
(lib.types.isa/URL? column) {:display-name (i18n/tru "Extract domain, subdomain…")
:extractions [{:key :domain
:display-name (i18n/tru "Domain")}
{:key :subdomain
:display-name (i18n/tru "Subdomain")}]}))
(mu/defn column-extract-drill :- [:maybe ::lib.schema.drill-thru/drill-thru.column-extract] (mu/defn column-extract-drill :- [:maybe ::lib.schema.drill-thru/drill-thru.column-extract]
"Column clicks on temporal columns only. "Column clicks on temporal columns only.
...@@ -40,15 +85,13 @@ ...@@ -40,15 +85,13 @@
[query :- ::lib.schema/query [query :- ::lib.schema/query
stage-number :- :int stage-number :- :int
{:keys [column column-ref value]} :- ::lib.schema.drill-thru/context] {:keys [column column-ref value]} :- ::lib.schema.drill-thru/context]
(when (and column (when (and column (nil? value))
(nil? value) (when-let [drill (column-extract-drill-for-column column)]
(lib.types.isa/temporal? column)) (merge drill
(merge {:lib/type :metabase.lib.drill-thru/drill-thru {:lib/type :metabase.lib.drill-thru/drill-thru
:type :drill-thru/column-extract :type :drill-thru/column-extract}
:display-name (i18n/tru "Extract day, month…") (lib.drill-thru.column-filter/prepare-query-for-drill-addition
:extractions (column-extract-temporal-units column)} query stage-number column column-ref :expression)))))
(lib.drill-thru.column-filter/prepare-query-for-drill-addition
query stage-number column column-ref :expression))))
(defmethod lib.drill-thru.common/drill-thru-info-method :drill-thru/column-extract (defmethod lib.drill-thru.common/drill-thru-info-method :drill-thru/column-extract
[_query _stage-number drill] [_query _stage-number drill]
...@@ -64,17 +107,26 @@ ...@@ -64,17 +107,26 @@
(defn- extraction-expression [column tag] (defn- extraction-expression [column tag]
(case tag (case tag
;; Temporal extractions
:hour-of-day (lib.expression/get-hour column) :hour-of-day (lib.expression/get-hour column)
:day-of-month (lib.expression/get-day column) :day-of-month (lib.expression/get-day column)
:day-of-week (case-expression #(lib.expression/get-day-of-week column) tag 7) :day-of-week (case-expression #(lib.expression/get-day-of-week column) tag 7)
:month-of-year (case-expression #(lib.expression/get-month column) tag 12) :month-of-year (case-expression #(lib.expression/get-month column) tag 12)
:quarter-of-year (case-expression #(lib.expression/get-quarter column) tag 4) :quarter-of-year (case-expression #(lib.expression/get-quarter column) tag 4)
:year (lib.expression/get-year column))) :year (lib.expression/get-year column)
;; URLs
:domain (-> column
(lib.expression/regex-match-first url->host-regex)
(lib.expression/regex-match-first host->domain-regex))
:subdomain (-> column
(lib.expression/regex-match-first url->host-regex)
(lib.expression/regex-match-first host->subdomain-regex))))
(defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract (defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract
[_query _stage-number {:keys [query stage-number column]} & [tag]] [_query _stage-number {:keys [query stage-number column extractions]} & [tag]]
(let [unit (keyword tag) (let [tag (keyword tag)
unique-name-fn (lib.util/unique-name-generator)] {:keys [display-name]} (m/find-first #(= (:key %) tag) extractions)
unique-name-fn (lib.util/unique-name-generator)]
(doseq [col-name (->> (lib.util/query-stage query stage-number) (doseq [col-name (->> (lib.util/query-stage query stage-number)
(lib.metadata.calculation/returned-columns query stage-number) (lib.metadata.calculation/returned-columns query stage-number)
(map :name))] (map :name))]
...@@ -82,5 +134,5 @@ ...@@ -82,5 +134,5 @@
(lib.expression/expression (lib.expression/expression
query query
stage-number stage-number
(unique-name-fn (lib.temporal-bucket/describe-temporal-unit unit)) (unique-name-fn display-name)
(extraction-expression column unit)))) (extraction-expression column tag))))
...@@ -279,6 +279,7 @@ ...@@ -279,6 +279,7 @@
(lib.common/defop substring [s start end]) (lib.common/defop substring [s start end])
(lib.common/defop replace [s search replacement]) (lib.common/defop replace [s search replacement])
(lib.common/defop regexextract [s regex]) (lib.common/defop regexextract [s regex])
(lib.common/defop regex-match-first [s regex])
(lib.common/defop length [s]) (lib.common/defop length [s])
(lib.common/defop trim [s]) (lib.common/defop trim [s])
(lib.common/defop ltrim [s]) (lib.common/defop ltrim [s])
......
...@@ -179,6 +179,15 @@ ...@@ -179,6 +179,15 @@
(str (upper-case-en (subs s 0 1)) (str (upper-case-en (subs s 0 1))
(lower-case-en (subs s 1)))))) (lower-case-en (subs s 1))))))
(defn regex->str
"Returns the contents of a regex as a string.
This is simply [[str]] in Clojure but needs to remove slashes (`\"/regex contents/\"`) in CLJS."
[regex]
#?(:clj (str regex)
:cljs (let [s (str regex)]
(subs s 1 (dec (count s))))))
;;; define custom CSK conversion functions so we don't run into problems if the system locale is Turkish ;;; define custom CSK conversion functions so we don't run into problems if the system locale is Turkish
;; so Kondo doesn't complain ;; so Kondo doesn't complain
......
(ns metabase.lib.drill-thru.column-extract-test (ns metabase.lib.drill-thru.column-extract-test
"See also [[metabase.query-processor-test.drill-thru-e2e-test/quick-filter-on-bucketed-date-test]]" "See also [[metabase.query-processor-test.drill-thru-e2e-test/quick-filter-on-bucketed-date-test]]"
(:require (:require
[clojure.test :refer [deftest testing]] [clojure.test :refer [are deftest testing]]
[medley.core :as m] [medley.core :as m]
[metabase.lib.core :as lib] [metabase.lib.core :as lib]
[metabase.lib.drill-thru.column-extract :as lib.drill-thru.column-extract]
[metabase.lib.drill-thru.test-util :as lib.drill-thru.tu] [metabase.lib.drill-thru.test-util :as lib.drill-thru.tu]
[metabase.lib.drill-thru.test-util.canned :as canned] [metabase.lib.drill-thru.test-util.canned :as canned]
[metabase.lib.metadata :as lib.metadata] [metabase.lib.metadata :as lib.metadata]
[metabase.lib.test-metadata :as meta] [metabase.lib.test-metadata :as meta]
[metabase.lib.test-util :as lib.tu]
[metabase.util :as u]
#?@(:clj ([metabase.test :as mt]) #?@(:clj ([metabase.test :as mt])
:cljs ([metabase.test-runner.assert-exprs.approximately-equal])) :cljs ([metabase.test-runner.assert-exprs.approximately-equal]))))
[metabase.lib.test-util :as lib.tu]))
#?(:cljs (comment metabase.test-runner.assert-exprs.approximately-equal/keep-me)) #?(:cljs (comment metabase.test-runner.assert-exprs.approximately-equal/keep-me))
...@@ -28,12 +30,13 @@ ...@@ -28,12 +30,13 @@
(concat time-extraction-units date-extraction-units)) (concat time-extraction-units date-extraction-units))
(deftest ^:parallel column-extract-availability-test (deftest ^:parallel column-extract-availability-test
(testing "column-extract is avaiable for column clicks on temporal columns" (testing "column-extract is available for column clicks on temporal and URL columns"
(canned/canned-test (canned/canned-test
:drill-thru/column-extract :drill-thru/column-extract
(fn [_test-case _context {:keys [click column-type]}] (fn [_test-case {:keys [column] :as _context} {:keys [click column-type]}]
(and (= click :header) (and (= click :header)
(= column-type :datetime)))))) (or (= column-type :datetime)
(= (:semantic-type column) :type/URL)))))))
(deftest ^:parallel returns-column-extract-test-1 (deftest ^:parallel returns-column-extract-test-1
(lib.drill-thru.tu/test-returns-drill (lib.drill-thru.tu/test-returns-drill
...@@ -261,3 +264,136 @@ ...@@ -261,3 +264,136 @@
:custom-query query :custom-query query
:expected {:type :drill-thru/column-extract :expected {:type :drill-thru/column-extract
:extractions date-extraction-units}}))) :extractions date-extraction-units}})))
(def ^:private url->host-regex
#?(:clj @#'lib.drill-thru.column-extract/url->host-regex
:cljs lib.drill-thru.column-extract/url->host-regex))
(def ^:private host->domain-regex
#?(:clj @#'lib.drill-thru.column-extract/host->domain-regex
:cljs lib.drill-thru.column-extract/host->domain-regex))
(def ^:private host->subdomain-regex
#?(:clj @#'lib.drill-thru.column-extract/host->subdomain-regex
:cljs lib.drill-thru.column-extract/host->subdomain-regex))
(deftest ^:parallel column-extract-url->domain-test
;; There's no URL columns in the same dataset, but let's pretend there's one called People.HOMEPAGE.
(let [homepage (assoc (meta/field-metadata :people :email)
:id 9999001
:name "HOMEPAGE"
:display-name "Homepage URL"
:base-type :type/Text
:effective-type :type/Text
:semantic-type :type/URL)
mp (lib/composed-metadata-provider
(lib.tu/mock-metadata-provider {:fields [homepage]})
meta/metadata-provider)
query (lib/query mp (lib.metadata/table mp (meta/id :people)))]
(testing "Extracting Domain"
(lib.drill-thru.tu/test-drill-application
{:drill-type :drill-thru/column-extract
:click-type :header
:query-type :unaggregated
:column-name "HOMEPAGE"
:custom-query query
:expected {:type :drill-thru/column-extract
:display-name "Extract domain, subdomain…"
:extractions [{:key :domain, :display-name "Domain"}
{:key :subdomain, :display-name "Subdomain"}]}
:drill-args ["domain"]
:expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Domain"}
[:regex-match-first {}
[:field {} 9999001]
(u/regex->str url->host-regex)]
(u/regex->str host->domain-regex)]]}]}}))
(testing "Extracting Subdomain"
(lib.drill-thru.tu/test-drill-application
{:drill-type :drill-thru/column-extract
:click-type :header
:query-type :unaggregated
:column-name "HOMEPAGE"
:custom-query query
:expected {:type :drill-thru/column-extract
:display-name "Extract domain, subdomain…"
:extractions [{:key :domain, :display-name "Domain"}
{:key :subdomain, :display-name "Subdomain"}]}
:drill-args ["subdomain"]
:expected-query {:stages [{:expressions [[:regex-match-first {:lib/expression-name "Subdomain"}
[:regex-match-first {}
[:field {} 9999001]
(u/regex->str url->host-regex)]
(u/regex->str host->subdomain-regex)]]}]}}))))
(deftest ^:parallel url->host-regex-test
(are [host url] (= host (second (re-find url->host-regex url)))
"cdbaby.com" "https://cdbaby.com/some.txt"
"fema.gov" "https://fema.gov/some/path/Vatini?search=foo"
"www.geocities.jp" "https://www.geocities.jp/some/path/Turbitt?search=foo"
"jalbum.net" "https://jalbum.net/some/path/Kirsz?search=foo"
"usa.gov" "https://usa.gov/some/path/Curdell?search=foo"
"taxes.va.gov" "http://taxes.va.gov/some/path/Marritt?search=foo"
"log.stuff.gmpg.org" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo"
"hatena.ne.jp" "http://hatena.ne.jp/"
"telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail"
"bbc.co.uk" "bbc.co.uk/some/path?search=foo"))
(deftest ^:parallel host->domain-regex-test
(are [domain host] (= domain (second (re-find host->domain-regex host)))
;; Easy cases: second-last part is the domain.
"cdbaby" "cdbaby.com"
"fema" "fema.gov"
"geocities" "www.geocities.jp"
"jalbum" "sub.jalbum.net"
"jalbum" "subdomains.go.here.jalbum.net"
"gmpg" "log.stuff.gmpg.org"
;; The second-last part is the domain even if it's short, sometimes.
"usa" "usa.gov"
"va" "va.gov"
;; Oops, we picked a subdomain! But see below.
"taxes" "taxes.va.gov" ; True domain is va
"hatena" "hatena.ne.jp" ; True domain is ne
;; Sometimes the second-last part is a short suffix.
;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database,
;; we use a best-effort matcher that gets the domain right most of the time.
"telegraph" "telegraph.co.uk"
"bbc" "bbc.co.uk"
"dot" "dot.va.gov"
;; "www" is disregarded as a possible subdomain.
"usa" "www.usa.gov"
"va" "www.va.gov"
"dot" "www.dot.va.gov"))
(deftest ^:parallel host->subdomain-regex-test
(are [subdomain host] (= subdomain (second (re-find host->subdomain-regex host)))
;; Blanks. "www" doesn't count.
nil "cdbaby.com"
nil "fema.gov"
nil "www.geocities.jp"
nil "usa.gov"
nil "va.gov"
;; Basics - taking the first segment that isn't "www", IF it isn't the domain.
"sub" "sub.jalbum.net"
"subdomains" "subdomains.go.here.jalbum.net"
"log" "log.stuff.gmpg.org"
;; Oops, we missed those. This is the reverse of the problem when picking the domain.
nil "taxes.va.gov" ; True domain is va, subdomain is taxes.
nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena.
;; Sometimes the second-last part is a short suffix.
;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database,
;; we use a best-effort matcher that gets the domain right most of the time.
nil "telegraph.co.uk"
"local" "local.news.telegraph.co.uk"
nil "bbc.co.uk"
"video" "video.bbc.co.uk"
;; "www" is disregarded as a possible subdomain, so these are also incorrect.
nil "www.usa.gov"
nil "www.dot.va.gov"
"licensing" "www.licensing.dot.va.gov"))
...@@ -174,7 +174,26 @@ ...@@ -174,7 +174,26 @@
"PRODUCT_ID" nil "PRODUCT_ID" nil
"CREATED_AT" "2024-09-08T22:03:20.239+03:00"} "CREATED_AT" "2024-09-08T22:03:20.239+03:00"}
:aggregations 0 :aggregations 0
:breakouts 0}})) :breakouts 0}
:test.query/people
{:query (lib/query metadata-provider (meta/table-metadata :people))
:row {"ID" "222"
"NAME" "J. Some Guy"
"EMAIL" "someguy@isp.com"
"PASSWORD" "eafc45bf-cf8e-4c96-ab35-ce44d0021597"
"ADDRESS" "2112 Rush St"
"CITY" "Portland"
"STATE" "ME"
"ZIP" "66223"
"LATITUDE" 43.6307309
"LONGITUDE" -70.8311294
"SOURCE" "Facebook"
"BIRTH_DATE" "1987-06-14T00:00:00Z"
"CREATED_AT" "2024-09-08T22:03:20.239-04:00"}
:aggregations 0
:breakouts 0}
}))
(defn returned "Given a test case, a context, and a target drill type (eg. `:drill-thru/quick-filter`), calls (defn returned "Given a test case, a context, and a target drill type (eg. `:drill-thru/quick-filter`), calls
[[lib/available-drill-thrus]] and looks for the specified drill. [[lib/available-drill-thrus]] and looks for the specified drill.
...@@ -309,6 +328,37 @@ ...@@ -309,6 +328,37 @@
(click tc :header "PRODUCT_ID" :basic :fk) (click tc :header "PRODUCT_ID" :basic :fk)
(click tc :header "CREATED_AT" :basic :datetime)]) (click tc :header "CREATED_AT" :basic :datetime)])
;; Simple query against People.
;; This one has a :type/Email (EMAIL) for Column Extract drills.
(let [tc (test-case metadata-provider :test.query/people)]
[(click tc :cell "ID" :basic :pk)
(click tc :cell "ADDRESS" :basic :string)
(click tc :cell "EMAIL" :basic :string)
(click tc :cell "PASSWORD" :basic :string)
(click tc :cell "NAME" :basic :string)
(click tc :cell "CITY" :basic :string)
(click tc :cell "STATE" :basic :string)
(click tc :cell "ZIP" :basic :string)
(click tc :cell "LATITUDE" :basic :number)
(click tc :cell "LONGITUDE" :basic :number)
(click tc :cell "SOURCE" :basic :string)
(click tc :cell "BIRTH_DATE" :basic :datetime)
(click tc :cell "CREATED_AT" :basic :datetime)
(click tc :header "ID" :basic :pk)
(click tc :header "ADDRESS" :basic :string)
(click tc :header "EMAIL" :basic :string)
(click tc :header "PASSWORD" :basic :string)
(click tc :header "NAME" :basic :string)
(click tc :header "CITY" :basic :string)
(click tc :header "STATE" :basic :string)
(click tc :header "ZIP" :basic :string)
(click tc :header "LATITUDE" :basic :number)
(click tc :header "LONGITUDE" :basic :number)
(click tc :header "SOURCE" :basic :string)
(click tc :header "BIRTH_DATE" :basic :datetime)
(click tc :header "CREATED_AT" :basic :datetime)])
;; Simple query against Products, but it lies! ;; Simple query against Products, but it lies!
;; Claims VENDOR is :type/SerializedJSON (derives from :type/Structured). ;; Claims VENDOR is :type/SerializedJSON (derives from :type/Structured).
(let [tc (-> metadata-provider (let [tc (-> metadata-provider
...@@ -317,7 +367,17 @@ ...@@ -317,7 +367,17 @@
:semantic-type :type/SerializedJSON}]}) :semantic-type :type/SerializedJSON}]})
(test-case :test.query/products))] (test-case :test.query/products))]
[(click tc :cell "VENDOR" :basic :string) [(click tc :cell "VENDOR" :basic :string)
(click tc :header "VENDOR" :basic :string)])] (click tc :header "VENDOR" :basic :string)])
;; Simple query against People, but it lies!
;; Claims EMAIL is :type/URL (relevant to Column Extract drills).
(let [tc (-> metadata-provider
(merged-mock/merged-mock-metadata-provider
{:fields [{:id (meta/id :people :email)
:semantic-type :type/URL}]})
(test-case :test.query/people))]
[(click tc :cell "EMAIL" :basic :string)
(click tc :header "EMAIL" :basic :string)])]
(apply concat)))) (apply concat))))
(defn canned-test (defn canned-test
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment