diff --git a/src/metabase/lib/drill_thru/column_extract.cljc b/src/metabase/lib/drill_thru/column_extract.cljc
index 597ad65735fc5a909b14c2b7e091a97ca6ab440c..a129b0f8196b37bd96eb60476d54db12121a5300 100644
--- a/src/metabase/lib/drill_thru/column_extract.cljc
+++ b/src/metabase/lib/drill_thru/column_extract.cljc
@@ -48,6 +48,23 @@
   ;; Skip www domain maybe short tail TLD
   #"(?:www\.)?([^\.]+)\.(?:[^\.]{1,3}\.)?[^\.]+$")
 
+(def ^:private email->domain-regex
+  ;; See [[host->domain-regex]] on the challenges of parsing domains with regexes.
+  ;; The numbers below refer to the index markers on the comment line just above the regex:
+  ;; 1. Positive lookbehind: the match starts right after an @ or a .
+  ;; 2. Negative lookahead: never return www as the domain
+  ;; 3. One domain segment (no @ or . in it); this is the whole match, there is no capture group
+  ;; 4. Positive lookahead: what follows the domain must be
+  ;;      Either:
+  ;; 5.   Short (1-3 character) segment, e.g. the co in .co.uk
+  ;; 6.   Top-level domain
+  ;; 7.   Anchor to end
+  ;;      Or:
+  ;; 8.   Top-level domain
+  ;; 9.   Anchor to end
+  ;;1         2        3      (4 5            6        7|8        9)
+  #"(?<=[@\.])(?!www\.)[^@\.]+(?=\.[^@\.]{1,3}\.[^@\.]+$|\.[^@\.]+$)")
+
 (def ^:private host->subdomain-regex
   ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
   ;; See [[host->domain-regex]] for more details about how those are matched.
@@ -65,13 +82,13 @@
   ;;12 34 5 6 7 8 9 10
   #"^(?:www\.)?((?!www\.)(?![^\.]+\.(?:[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.")
 
-;; Full size, I think we can get away with a simpler one - just the first match that isn't the main domain or www.
-#_#"^(?:www\.)?((?!www\.)(?!(?:[^\.]+\.[^\.]{1,3}\.)?[^\.]+$)[^\.]+)\.(?:[^\.]+\.)+(?:[^\.]{1,3}\.)?[^\.]+$"
-
 (defn- column-extract-drill-for-column [column]
   (cond
     (lib.types.isa/temporal? column) {:display-name (i18n/tru "Extract day, month…")
                                       :extractions (column-extract-temporal-units column)}
+    (lib.types.isa/email? column) {:display-name (i18n/tru "Extract domain")
+                                   :extractions [{:key :email-domain
+                                                  :display-name (i18n/tru "Domain")}]}
     (lib.types.isa/URL? column) {:display-name (i18n/tru "Extract domain, subdomain…")
                                  :extractions [{:key :domain
                                                 :display-name (i18n/tru "Domain")}
@@ -120,7 +137,9 @@
                 (lib.expression/regex-match-first host->domain-regex))
     :subdomain (-> column
                    (lib.expression/regex-match-first url->host-regex)
-                   (lib.expression/regex-match-first host->subdomain-regex))))
+                   (lib.expression/regex-match-first host->subdomain-regex))
+    ;; Emails: matched directly against the column, with no url->host-regex step (unlike :subdomain above)
+    :email-domain (lib.expression/regex-match-first column email->domain-regex)))
 
 (defmethod lib.drill-thru.common/drill-thru-method :drill-thru/column-extract
   [_query _stage-number {:keys [query stage-number column extractions]} & [tag]]
diff --git a/test/metabase/lib/drill_thru/column_extract_test.cljc b/test/metabase/lib/drill_thru/column_extract_test.cljc
index 8135aa6c5b94d4eb672abd295e5ca84bbf34c0c5..f08d02d283e05c62b147f2465794e9f3f91be7b1 100644
--- a/test/metabase/lib/drill_thru/column_extract_test.cljc
+++ b/test/metabase/lib/drill_thru/column_extract_test.cljc
@@ -36,7 +36,7 @@
      (fn [_test-case {:keys [column] :as _context} {:keys [click column-type]}]
        (and (= click :header)
             (or (= column-type :datetime)
-                (= (:semantic-type column) :type/URL)))))))
+                (#{:type/URL :type/Email} (:semantic-type column))))))))
 
 (deftest ^:parallel returns-column-extract-test-1
   (lib.drill-thru.tu/test-returns-drill
@@ -277,6 +277,10 @@
   #?(:clj @#'lib.drill-thru.column-extract/host->subdomain-regex
      :cljs lib.drill-thru.column-extract/host->subdomain-regex))
 
+(def ^:private email->domain-regex
+  #?(:clj @#'lib.drill-thru.column-extract/email->domain-regex
+     :cljs lib.drill-thru.column-extract/email->domain-regex))
+
 (deftest ^:parallel column-extract-url->domain-test
   ;; There's no URL columns in the same dataset, but let's pretend there's one called People.HOMEPAGE.
   (let [homepage (assoc (meta/field-metadata :people :email)
@@ -397,3 +401,12 @@
     nil "www.usa.gov"
     nil "www.dot.va.gov"
     "licensing" "www.licensing.dot.va.gov"))
+
+(deftest ^:parallel email->domain-regex-test
+  (are [domain email] (= domain (re-find email->domain-regex email))
+    "metabase" "braden@metabase.com"
+    "homeoffice" "mholmes@homeoffice.gov.uk"
+    "someisp" "john.smith@mail.someisp.com" ; neither the dotted local part nor the mail. subdomain is returned
+    "amazon" "trk@amazon.co.uk"
+    "hatena" "takashi@hatena.ne.jp"
+    "ne" "takashi@www.ne.jp")) ; www is never returned, so the segment after it is