From f4a93cba9cae22eb2381f462634a2e43e605df95 Mon Sep 17 00:00:00 2001 From: lbrdnk <lbrdnk@users.noreply.github.com> Date: Fri, 19 Jul 2024 19:19:52 +0200 Subject: [PATCH] Move desugar-host-and-domain to jvm-util (#45808) --- src/metabase/legacy_mbql/jvm_util.clj | 95 ++++++++++++++++++ src/metabase/legacy_mbql/util.cljc | 102 +++----------------- test/metabase/legacy_mbql/jvm_util_test.clj | 100 +++++++++++++++++++ test/metabase/legacy_mbql/util_test.cljc | 95 ------------------ 4 files changed, 207 insertions(+), 185 deletions(-) create mode 100644 src/metabase/legacy_mbql/jvm_util.clj create mode 100644 test/metabase/legacy_mbql/jvm_util_test.clj diff --git a/src/metabase/legacy_mbql/jvm_util.clj b/src/metabase/legacy_mbql/jvm_util.clj new file mode 100644 index 00000000000..f94e9b61c9e --- /dev/null +++ b/src/metabase/legacy_mbql/jvm_util.clj @@ -0,0 +1,95 @@ +(ns metabase.legacy-mbql.jvm-util + "This namespace contains functionality that is not compatible with js, hence cannot be stored in the corresponding + cljc ns, i.e. [[metabase.legacy-mbql.util]]." + (:require + [metabase.lib.util.match :as lib.util.match])) + +;;;; The following regex definitions are incompatible with the Safari browser. This code is unused on the FE. + +(def ^:private host-regex + ;; Extracts the "host" from a URL or an email. + ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk. + ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number. + ;; + ;; For an email, this is generally the part after the @, but it will skip any subdomains: + ;; someone@email.mycompany.net -> mycompany.net + ;; + ;; Referencing the indexes below: + ;; 1. Positive lookbehind: + ;; Just past one of: + ;; 2. @ from an email or URL userinfo@ prefix + ;; 3. // from a URL scheme + ;; 4. . from a previous subdomain segment + ;; 5. Start of string + ;; 6. Negative lookahead: don't capture www as part of the domain + ;; 7.
Main domain segment + ;; 8. Ending in a dot + ;; 9. Optional short final segment (eg. co in .co.uk) + ;; 10. Top-level domain + ;; 11. Optional :port, /path, ?query or #hash + ;; 12. Anchor to the end + ;;1 2 3 4 5 6 7 8 9 10 11 12 + #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)") + +(def ^:private domain-regex + ;; Deliberately no ^ at the start; there might be several subdomains before this spot. + ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk. + ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops. + ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list + ;; from Mozilla or accept that this regex is a bit best-effort. + ;; Referencing the indexes below: + ;; 1. Positive lookbehind: + ;; Just past one of: + ;; 2. @ from an email or URL userinfo@ prefix + ;; 3. // from a URL scheme + ;; 4. . from a previous subdomain segment + ;; 5. Start of string + ;; 6. Negative lookahead: don't capture www as the domain + ;; 7. One domain segment + ;; 8. Positive lookahead: + ;; Either: + ;; 9. Short final segment (eg. .co.uk) + ;; 10. Top-level domain + ;; 11. Optional :port, /path, ?query or #hash + ;; 12. Anchor to end + ;; Or: + ;; 13. Top-level domain + ;; 14. Optional :port, /path, ?query or #hash + ;; 15. Anchor to end + ;;1 2 3 4 5 6 7 (8 9 10 11 12| 13 14 15) + #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)") + +(def ^:private subdomain-regex + ;; This grabs the first segment that isn't "www", AND excludes the main domain name. + ;; See [[domain-regex]] for more details about how those are matched. + ;; Referencing the indexes below: + ;; 1. Positive lookbehind: + ;; Just past one of: + ;; 2. @ from an email or URL userinfo@ prefix + ;; 3. // from a URL scheme + ;; 4. . 
from a previous subdomain segment + ;; 5. Start of string + ;; 6. Negative lookahead: don't capture www as the domain + ;; 7. Negative lookahead: don't capture the main domain name or part of the TLD + ;; That would look like: + ;; 8. The next segment we *would* capture as the subdomain + ;; 9. Optional short segment, like "co" in .co.uk + ;; 10. Top-level domain + ;; 11. Optionally more URL things: :port or /path or ?query or #fragment + ;; 12. End of string + ;; 13. Match the actual subdomain + ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture. + ;;1 2 3 4 5 6 7 8 9 10 11 12 13 14 + #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)") + +(defn desugar-host-and-domain + "Unwrap host and domain." + [expression] + (lib.util.match/replace + expression + [:host column] + (recur [:regex-match-first column (str host-regex)]) + [:domain column] + (recur [:regex-match-first column (str domain-regex)]) + [:subdomain column] + (recur [:regex-match-first column (str subdomain-regex)]))) diff --git a/src/metabase/legacy_mbql/util.cljc b/src/metabase/legacy_mbql/util.cljc index 0d01a7056ad..119aee88442 100644 --- a/src/metabase/legacy_mbql/util.cljc +++ b/src/metabase/legacy_mbql/util.cljc @@ -14,7 +14,8 @@ [metabase.util.log :as log] [metabase.util.malli :as mu] #?@(:clj - [[metabase.models.dispatch :as models.dispatch] + [[metabase.legacy-mbql.jvm-util :as mbql.jvm-u] + [metabase.models.dispatch :as models.dispatch] [metabase.util.i18n]]))) (mu/defn normalize-token :- :keyword @@ -320,91 +321,6 @@ [:/ x y z & more] (recur (into [:/ [:/ x y]] (cons z more))))) -(def ^:private host-regex - ;; Extracts the "host" from a URL or an email. - ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk. - ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number. 
- ;; - ;; For an email, this is generally the part after the @, but it will skip any subdomains: - ;; someone@email.mycompany.net -> mycompany.net - ;; - ;; Referencing the indexes below: - ;; 1. Positive lookbehind: - ;; Just past one of: - ;; 2. @ from an email or URL userinfo@ prefix - ;; 3. // from a URL scheme - ;; 4. . from a previous subdomain segment - ;; 5. Start of string - ;; 6. Negative lookahead: don't capture www as part of the domain - ;; 7. Main domain segment - ;; 8. Ending in a dot - ;; 9. Optional short final segment (eg. co in .co.uk) - ;; 10. Top-level domain - ;; 11. Optional :port, /path, ?query or #hash - ;; 12. Anchor to the end - ;;1 2 3 4 5 6 7 8 9 10 11 12 - #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)") - -(def ^:private domain-regex - ;; Deliberately no ^ at the start; there might be several subdomains before this spot. - ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk. - ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops. - ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list - ;; from Mozilla or accept that this regex is a bit best-effort. - ;; Referencing the indexes below: - ;; 1. Positive lookbehind: - ;; Just past one of: - ;; 2. @ from an email or URL userinfo@ prefix - ;; 3. // from a URL scheme - ;; 4. . from a previous subdomain segment - ;; 5. Start of string - ;; 6. Negative lookahead: don't capture www as the domain - ;; 7. One domain segment - ;; 8. Positive lookahead: - ;; Either: - ;; 9. Short final segment (eg. .co.uk) - ;; 10. Top-level domain - ;; 11. Optional :port, /path, ?query or #hash - ;; 12. Anchor to end - ;; Or: - ;; 13. Top-level domain - ;; 14. Optional :port, /path, ?query or #hash - ;; 15. 
Anchor to end - ;;1 2 3 4 5 6 7 (8 9 10 11 12| 13 14 15) - #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)") - -(def ^:private subdomain-regex - ;; This grabs the first segment that isn't "www", AND excludes the main domain name. - ;; See [[domain-regex]] for more details about how those are matched. - ;; Referencing the indexes below: - ;; 1. Positive lookbehind: - ;; Just past one of: - ;; 2. @ from an email or URL userinfo@ prefix - ;; 3. // from a URL scheme - ;; 4. . from a previous subdomain segment - ;; 5. Start of string - ;; 6. Negative lookahead: don't capture www as the domain - ;; 7. Negative lookahead: don't capture the main domain name or part of the TLD - ;; That would look like: - ;; 8. The next segment we *would* capture as the subdomain - ;; 9. Optional short segment, like "co" in .co.uk - ;; 10. Top-level domain - ;; 11. Optionally more URL things: :port or /path or ?query or #fragment - ;; 12. End of string - ;; 13. Match the actual subdomain - ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture. - ;;1 2 3 4 5 6 7 8 9 10 11 12 13 14 - #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)") - -(defn- desugar-host-and-domain [expression] - (lib.util.match/replace expression - [:host column] - (recur [:regex-match-first column (str host-regex)]) - [:domain column] - (recur [:regex-match-first column (str domain-regex)]) - [:subdomain column] - (recur [:regex-match-first column (str subdomain-regex)]))) - (defn- temporal-case-expression "Creates a `:case` expression with a condition for each value of the given unit." [column unit n] @@ -434,10 +350,16 @@ "Rewrite various 'syntactic sugar' expressions like `:/` with more than two args into something simpler for drivers to compile." 
[expression :- ::mbql.s/FieldOrExpressionDef] - (-> expression - desugar-divide-with-extra-args - desugar-host-and-domain - desugar-temporal-names)) + ;; The `mbql.jvm-u/desugar-host-and-domain` is implemented only for jvm because regexes are not compatible with + ;; Safari. + (let [desugar-host-and-domain* #?(:clj mbql.jvm-u/desugar-host-and-domain + :cljs (fn [x] + (log/warn "`desugar-host-and-domain` implemented only on JVM.") + x))] + (-> expression + desugar-divide-with-extra-args + desugar-host-and-domain* + desugar-temporal-names))) (defn- maybe-desugar-expression [clause] (cond-> clause diff --git a/test/metabase/legacy_mbql/jvm_util_test.clj b/test/metabase/legacy_mbql/jvm_util_test.clj new file mode 100644 index 00000000000..612fb9b5c3b --- /dev/null +++ b/test/metabase/legacy_mbql/jvm_util_test.clj @@ -0,0 +1,100 @@ +(ns metabase.legacy-mbql.jvm-util-test + (:require + [clojure.test :as t] + [metabase.legacy-mbql.jvm-util :as mbql.jvm-u] + [metabase.legacy-mbql.util :as mbql.u])) + +(t/deftest ^:parallel host-regex-on-urls-test + (t/are [host url] (= host (re-find @#'mbql.jvm-u/host-regex url)) + "cdbaby.com" "https://cdbaby.com/some.txt" + "fema.gov" "https://fema.gov/some/path/Vatini?search=foo" + "geocities.jp" "https://www.geocities.jp/some/path/Turbitt?search=foo" + "jalbum.net" "https://jalbum.net/some/path/Kirsz?search=foo" + "usa.gov" "https://usa.gov/some/path/Curdell?search=foo" + ;; Oops, this one captures a subdomain because it can't tell va.gov is supposed to be that short. 
+ "taxes.va.gov" "http://taxes.va.gov/some/path/Marritt?search=foo" + "gmpg.org" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo" + "hatena.ne.jp" "http://hatena.ne.jp/" + "telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail" + "bbc.co.uk" "bbc.co.uk/some/path?search=foo" + "bbc.co.uk" "news.bbc.co.uk:port")) + +(t/deftest ^:parallel host-regex-on-emails-test + (t/are [host email] (= host (re-find @#'mbql.jvm-u/host-regex email)) + "metabase.com" "braden@metabase.com" + "homeoffice.gov.uk" "mholmes@homeoffice.gov.uk" + "someisp.com" "john.smith@mail.someisp.com" + "amazon.co.uk" "trk@amazon.co.uk" + "hatena.ne.jp" "takashi@hatena.ne.jp" + "hatena.ne.jp" "takashi@mail.hatena.ne.jp" + "ne.jp" "takashi@www.ne.jp")) + +(t/deftest ^:parallel domain-regex-on-urls-test + (t/are [domain url] (= domain (re-find @#'mbql.jvm-u/domain-regex url)) + "cdbaby" "https://cdbaby.com/some.txt" + "fema" "https://fema.gov/some/path/Vatini?search=foo" + "geocities" "https://www.geocities.jp/some/path/Turbitt?search=foo" + "jalbum" "https://jalbum.net/some/path/Kirsz?search=foo" + "usa" "https://usa.gov/some/path/Curdell?search=foo" + "taxes" "http://taxes.va.gov/some/path/Marritt?search=foo" + "gmpg" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo" + "hatena" "http://hatena.ne.jp/" + "telegraph" "//telegraph.co.uk?foo=bar#tail" + "bbc" "bbc.co.uk/some/path?search=foo")) + +(t/deftest ^:parallel domain-regex-on-emails-test + (t/are [domain email] (= domain (re-find @#'mbql.jvm-u/domain-regex email)) + "metabase" "braden@metabase.com" + "homeoffice" "mholmes@homeoffice.gov.uk" + "someisp" "john.smith@mail.someisp.com" + "amazon" "trk@amazon.co.uk" + "hatena" "takashi@hatena.ne.jp" + "ne" "takashi@www.ne.jp")) + +(t/deftest ^:parallel subdomain-regex-on-urls-test + (t/are [subdomain url] (= subdomain (re-find @#'mbql.jvm-u/subdomain-regex url)) + ;; Blanks. "www" doesn't count. 
+ nil "cdbaby.com" + nil "https://fema.gov" + nil "http://www.geocities.jp" + nil "usa.gov/some/page.cgi.htm" + nil "va.gov" + + ;; Basics - taking the first segment that isn't "www", IF it isn't the domain. + "sub" "sub.jalbum.net" + "subdomains" "subdomains.go.here.jalbum.net" + "log" "log.stuff.gmpg.org" + "log" "https://log.stuff.gmpg.org" + "log" "log.stuff.gmpg.org/some/path" + "log" "log.stuff.gmpg.org?search=yes" + + ;; Oops, we miss these! This is the reverse of the problem when picking the domain. + ;; We can't tell without maintaining a huge list that va and ne are the real domains, and not the trailing + ;; fragments like .co.uk - see below. + nil "taxes.va.gov" ; True domain is va, subdomain is taxes. + nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena. + + ;; Sometimes the second-last part is a short suffix. + ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database, + ;; we use a best-effort matcher that gets the domain right most of the time. + nil "telegraph.co.uk" + nil "https://telegraph.co.uk" + nil "telegraph.co.uk/some/article.php" + "local" "local.news.telegraph.co.uk" + nil "bbc.co.uk#fragment" + "video" "video.bbc.co.uk" + ;; "www" is disregarded as a possible subdomain. 
+ nil "www.usa.gov" + nil "www.dot.va.gov" + "licensing" "www.licensing.dot.va.gov")) + +(t/deftest ^:parallel desugar-host-and-domain-test + (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/host-regex)] + (mbql.u/desugar-expression [:host [:field 1 nil]])) + "`host` should desugar to a `regex-match-first` clause with the host regex") + (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/domain-regex)] + (mbql.u/desugar-expression [:domain [:field 1 nil]])) + "`domain` should desugar to a `regex-match-first` clause with the domain regex") + (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/subdomain-regex)] + (mbql.u/desugar-expression [:subdomain [:field 1 nil]])) + "`subdomain` should desugar to a `regex-match-first` clause with the subdomain regex")) diff --git a/test/metabase/legacy_mbql/util_test.cljc b/test/metabase/legacy_mbql/util_test.cljc index 2d9b93069f1..b446a086d24 100644 --- a/test/metabase/legacy_mbql/util_test.cljc +++ b/test/metabase/legacy_mbql/util_test.cljc @@ -877,101 +877,6 @@ [:relative-datetime 0 :quarter]] (mbql.u/desugar-time-interval [:time-interval [:expression "Date"] :current :quarter])))) -(t/deftest ^:parallel host-regex-on-urls-test - (t/are [host url] (= host (re-find @#'mbql.u/host-regex url)) - "cdbaby.com" "https://cdbaby.com/some.txt" - "fema.gov" "https://fema.gov/some/path/Vatini?search=foo" - "geocities.jp" "https://www.geocities.jp/some/path/Turbitt?search=foo" - "jalbum.net" "https://jalbum.net/some/path/Kirsz?search=foo" - "usa.gov" "https://usa.gov/some/path/Curdell?search=foo" - ;; Oops, this one captures a subdomain because it can't tell va.gov is supposed to be that short. 
- "taxes.va.gov" "http://taxes.va.gov/some/path/Marritt?search=foo" - "gmpg.org" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo" - "hatena.ne.jp" "http://hatena.ne.jp/" - "telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail" - "bbc.co.uk" "bbc.co.uk/some/path?search=foo" - "bbc.co.uk" "news.bbc.co.uk:port")) - -(t/deftest ^:parallel host-regex-on-emails-test - (t/are [host email] (= host (re-find @#'mbql.u/host-regex email)) - "metabase.com" "braden@metabase.com" - "homeoffice.gov.uk" "mholmes@homeoffice.gov.uk" - "someisp.com" "john.smith@mail.someisp.com" - "amazon.co.uk" "trk@amazon.co.uk" - "hatena.ne.jp" "takashi@hatena.ne.jp" - "hatena.ne.jp" "takashi@mail.hatena.ne.jp" - "ne.jp" "takashi@www.ne.jp")) - -(t/deftest ^:parallel domain-regex-on-urls-test - (t/are [domain url] (= domain (re-find @#'mbql.u/domain-regex url)) - "cdbaby" "https://cdbaby.com/some.txt" - "fema" "https://fema.gov/some/path/Vatini?search=foo" - "geocities" "https://www.geocities.jp/some/path/Turbitt?search=foo" - "jalbum" "https://jalbum.net/some/path/Kirsz?search=foo" - "usa" "https://usa.gov/some/path/Curdell?search=foo" - "taxes" "http://taxes.va.gov/some/path/Marritt?search=foo" - "gmpg" "http://log.stuff.gmpg.org/some/path/Cambden?search=foo" - "hatena" "http://hatena.ne.jp/" - "telegraph" "//telegraph.co.uk?foo=bar#tail" - "bbc" "bbc.co.uk/some/path?search=foo")) - -(t/deftest ^:parallel domain-regex-on-emails-test - (t/are [domain email] (= domain (re-find @#'mbql.u/domain-regex email)) - "metabase" "braden@metabase.com" - "homeoffice" "mholmes@homeoffice.gov.uk" - "someisp" "john.smith@mail.someisp.com" - "amazon" "trk@amazon.co.uk" - "hatena" "takashi@hatena.ne.jp" - "ne" "takashi@www.ne.jp")) - -(t/deftest ^:parallel subdomain-regex-on-urls-test - (t/are [subdomain url] (= subdomain (re-find @#'mbql.u/subdomain-regex url)) - ;; Blanks. "www" doesn't count. 
- nil "cdbaby.com" - nil "https://fema.gov" - nil "http://www.geocities.jp" - nil "usa.gov/some/page.cgi.htm" - nil "va.gov" - - ;; Basics - taking the first segment that isn't "www", IF it isn't the domain. - "sub" "sub.jalbum.net" - "subdomains" "subdomains.go.here.jalbum.net" - "log" "log.stuff.gmpg.org" - "log" "https://log.stuff.gmpg.org" - "log" "log.stuff.gmpg.org/some/path" - "log" "log.stuff.gmpg.org?search=yes" - - ;; Oops, we miss these! This is the reverse of the problem when picking the domain. - ;; We can't tell without maintaining a huge list that va and ne are the real domains, and not the trailing - ;; fragments like .co.uk - see below. - nil "taxes.va.gov" ; True domain is va, subdomain is taxes. - nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena. - - ;; Sometimes the second-last part is a short suffix. - ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database, - ;; we use a best-effort matcher that gets the domain right most of the time. - nil "telegraph.co.uk" - nil "https://telegraph.co.uk" - nil "telegraph.co.uk/some/article.php" - "local" "local.news.telegraph.co.uk" - nil "bbc.co.uk#fragment" - "video" "video.bbc.co.uk" - ;; "www" is disregarded as a possible subdomain. 
- nil "www.usa.gov" - nil "www.dot.va.gov" - "licensing" "www.licensing.dot.va.gov")) - -(t/deftest ^:parallel desugar-host-and-domain-test - (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/host-regex)] - (mbql.u/desugar-expression [:host [:field 1 nil]])) - "`host` should desugar to a `regex-match-first` clause with the host regex") - (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/domain-regex)] - (mbql.u/desugar-expression [:domain [:field 1 nil]])) - "`domain` should desugar to a `regex-match-first` clause with the domain regex") - (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/subdomain-regex)] - (mbql.u/desugar-expression [:subdomain [:field 1 nil]])) - "`subdomain` should desugar to a `regex-match-first` clause with the subdomain regex")) - (t/deftest ^:parallel desugar-month-quarter-day-name-test (t/is (= [:case [[[:= [:field 1 nil] 1] "Jan"] [[:= [:field 1 nil] 2] "Feb"] -- GitLab