Skip to content
Snippets Groups Projects
Unverified Commit 15a411c2 authored by Braden Shepherdson's avatar Braden Shepherdson Committed by GitHub
Browse files

[MLv2] Support `Extract` to get the domain from emails (#40200)

Milestone 3 of #38964.
parent 918878eb
No related branches found
No related tags found
No related merge requests found
......@@ -48,6 +48,23 @@
;; Skip www domain maybe short tail TLD
#"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$")
(def ^:private email->domain-regex
;; Matches the main domain segment of an email address, e.g. "metabase" in "braden@metabase.com" or "amazon" in
;; "trk@amazon.co.uk". The pattern uses only lookarounds (no capture groups), so the whole match IS the domain.
;; See [[host->domain-regex]] on the challenges of parsing domains with regexes.
;; NOTE: any 1-3 character second-to-last segment is treated as part of a compound TLD (like .co.uk), so an address
;; such as "user@mail.ibm.com" matches "mail" rather than "ibm".
;; Referencing the indexes below:
;; 1. Positive lookbehind: Starting after @ or .
;; 2. Negative lookahead: Don't capture www as the domain
;; 3. One domain segment
;; 4. Positive lookahead:
;;      Either:
;;    5. Short final segment (eg. .co.uk)
;;    6. Top-level domain
;;    7. Anchor to end
;;      Or:
;;    8. Top-level domain
;;    9. Anchor to end
;;1         2        3      (4 5            6        7| 8       9)
#"(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")
(def ^:private host->subdomain-regex
;; This grabs the first segment that isn't "www", AND excludes the main domain name.
;; See [[host->domain-regex]] for more details about how those are matched.
......@@ -65,13 +82,13 @@
;;12 34 5 6 7 8 9 10
#"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
;; Full-size version, kept for reference. I think we can get away with the simpler regex above, which just takes the
;; first segment that isn't the main domain or www.
#_#"^(?:www\.)?((?!www\.)(?!(?:[^\.]+\.[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.(?:[^\.]+\.)+(?:[^\.]{1,3}\.)?[^\.]+$"
(defn- column-extract-drill-for-column [column]
(cond
(lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
:extractions (column-extract-temporal-units column)}
(lib.types.isa/email? column) {:display-name (i18n/tru "Extract domain")
:extractions [{:key :email-domain
:display-name (i18n/tru "Domain")}]}
(lib.types.isa/URL? column) {:display-name (i18n/tru "Extract domain, subdomain…")
:extractions [{:key :domain
:display-name (i18n/tru "Domain")}
......@@ -120,7 +137,9 @@
(lib.expression/regex-match-first host->domain-regex))
:subdomain (-> column
(lib.expression/regex-match-first url->host-regex)
(lib.expression/regex-match-first host->subdomain-regex))))
(lib.expression/regex-match-first host->subdomain-regex))
;; Emails
:email-domain (lib.expression/regex-match-first column email->domain-regex)))
(defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract
[_query _stage-number {:keys [query stage-number column extractions]} & [tag]]
......
......@@ -36,7 +36,7 @@
(fn [_test-case {:keys [column] :as _context} {:keys [click column-type]}]
(and (= click :header)
(or (= column-type :datetime)
(= (:semantic-type column) :type/URL)))))))
(#{:type/URL :type/Email} (:semantic-type column))))))))
(deftest ^:parallel returns-column-extract-test-1
(lib.drill-thru.tu/test-returns-drill
......@@ -277,6 +277,10 @@
#?(:clj @#'lib.drill-thru.column-extract/host->subdomain-regex
:cljs lib.drill-thru.column-extract/host->subdomain-regex))
(def ^:private email->domain-regex
;; Test-local alias for the regex defined in `lib.drill-thru.column-extract`.
;; On the JVM the var is dereferenced via `@#'`, which bypasses the usual private-var access check;
;; the ClojureScript branch refers to the symbol directly.
#?(:clj @#'lib.drill-thru.column-extract/email->domain-regex
:cljs lib.drill-thru.column-extract/email->domain-regex))
(deftest ^:parallel column-extract-url->domain-test
;; There are no URL columns in the sample dataset, but let's pretend there's one called People.HOMEPAGE.
(let [homepage (assoc (meta/field-metadata :people :email)
......@@ -397,3 +401,12 @@
nil "www.usa.gov"
nil "www.dot.va.gov"
"licensing" "www.licensing.dot.va.gov"))
(deftest ^:parallel email->domain-regex-test
  ;; Each row pairs an email address with the domain segment `email->domain-regex` should find in it.
  (are [email expected] (= expected (re-find email->domain-regex email))
    ;; Plain two-segment domain.
    "braden@metabase.com"         "metabase"
    ;; Compound TLDs with a short second-to-last segment (.gov.uk, .co.uk, .ne.jp).
    "mholmes@homeoffice.gov.uk"   "homeoffice"
    "trk@amazon.co.uk"            "amazon"
    "takashi@hatena.ne.jp"        "hatena"
    ;; A dot in the local part and a leading subdomain are both skipped.
    "john.smith@mail.someisp.com" "someisp"
    ;; "www" is never returned as the domain, so the following segment is matched instead.
    "takashi@www.ne.jp"           "ne"))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment