From f4a93cba9cae22eb2381f462634a2e43e605df95 Mon Sep 17 00:00:00 2001
From: lbrdnk <lbrdnk@users.noreply.github.com>
Date: Fri, 19 Jul 2024 19:19:52 +0200
Subject: [PATCH] Move desugar-host-and-domain to jvm-util (#45808)

---
 src/metabase/legacy_mbql/jvm_util.clj       |  95 ++++++++++++++++++
 src/metabase/legacy_mbql/util.cljc          | 102 +++-----------------
 test/metabase/legacy_mbql/jvm_util_test.clj | 100 +++++++++++++++++++
 test/metabase/legacy_mbql/util_test.cljc    |  95 ------------------
 4 files changed, 207 insertions(+), 185 deletions(-)
 create mode 100644 src/metabase/legacy_mbql/jvm_util.clj
 create mode 100644 test/metabase/legacy_mbql/jvm_util_test.clj

diff --git a/src/metabase/legacy_mbql/jvm_util.clj b/src/metabase/legacy_mbql/jvm_util.clj
new file mode 100644
index 00000000000..f94e9b61c9e
--- /dev/null
+++ b/src/metabase/legacy_mbql/jvm_util.clj
@@ -0,0 +1,95 @@
+(ns metabase.legacy-mbql.jvm-util
+  "This namespace contains functionality that is not compatible with JS and so cannot live in the corresponding
+  cljc ns, i.e. [[metabase.legacy-mbql.util]]."
+  (:require
+   [metabase.lib.util.match :as lib.util.match]))
+
+;;;; The following regex definitions are incompatible with the Safari browser. This code is unused on the FE.
+
+(def ^:private host-regex
+  ;; Extracts the "host" from a URL or an email.
+  ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk.
+  ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number.
+  ;;
+  ;; For an email, this is generally the part after the @, but it will skip any subdomains:
+  ;;   someone@email.mycompany.net -> mycompany.net
+  ;;
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as part of the domain
+  ;; 7.  Main domain segment
+  ;; 8.  Ending in a dot
+  ;; 9.  Optional short final segment (eg. co in .co.uk)
+  ;; 10. Top-level domain
+  ;; 11. Optional :port, /path, ?query or #hash
+  ;; 12. Anchor to the end
+  ;;1   2 3  4  5 6        7          8 9                     10         11           12
+  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)")
+
+(def ^:private domain-regex
+  ;; Deliberately no ^ at the start; there might be several subdomains before this spot.
+  ;; By "short final segment" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
+  ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
+  ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
+  ;; from Mozilla or accept that this regex is a bit best-effort.
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as the domain
+  ;; 7.  One domain segment
+  ;; 8.  Positive lookahead:
+  ;;       Either:
+  ;; 9.      Short final segment (eg. .co.uk)
+  ;; 10.     Top-level domain
+  ;; 11.     Optional :port, /path, ?query or #hash
+  ;; 12.     Anchor to end
+  ;;       Or:
+  ;; 13.     Top-level domain
+  ;; 14.     Optional :port, /path, ?query or #hash
+  ;; 15.     Anchor to end
+  ;;1   2 3  4  5 6        7          (8   9                10         11          12|  13         14           15)
+  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)")
+
+(def ^:private subdomain-regex
+  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
+  ;; See [[domain-regex]] for more details about how those are matched.
+  ;; Referencing the indexes below:
+  ;; 1.  Positive lookbehind:
+  ;;       Just past one of:
+  ;; 2.      @  from an email or URL userinfo@ prefix
+  ;; 3.      // from a URL scheme
+  ;; 4.      .  from a previous subdomain segment
+  ;; 5.      Start of string
+  ;; 6.  Negative lookahead: don't capture www as the domain
+  ;; 7.  Negative lookahead: don't capture the main domain name or part of the TLD
+  ;;       That would look like:
+  ;; 8.      The next segment we *would* capture as the subdomain
+  ;; 9.      Optional short segment, like "co" in .co.uk
+  ;; 10.     Top-level domain
+  ;; 11.     Optionally more URL things: :port or /path or ?query or #fragment
+  ;; 12.     End of string
+  ;; 13. Match the actual subdomain
+  ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture.
+  ;;1   2 3  4  5 6        7  8           9                    10        11           12 13       14
+  #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)")
+
+(defn desugar-host-and-domain
+  "Rewrite `:host`, `:domain` and `:subdomain` clauses in `expression` as `:regex-match-first` clauses."
+  [expression]
+  (lib.util.match/replace
+   expression
+   [:host column]
+   (recur [:regex-match-first column (str host-regex)])
+   [:domain column]
+   (recur [:regex-match-first column (str domain-regex)])
+   [:subdomain column]
+   (recur [:regex-match-first column (str subdomain-regex)])))
diff --git a/src/metabase/legacy_mbql/util.cljc b/src/metabase/legacy_mbql/util.cljc
index 0d01a7056ad..119aee88442 100644
--- a/src/metabase/legacy_mbql/util.cljc
+++ b/src/metabase/legacy_mbql/util.cljc
@@ -14,7 +14,8 @@
    [metabase.util.log :as log]
    [metabase.util.malli :as mu]
    #?@(:clj
-       [[metabase.models.dispatch :as models.dispatch]
+       [[metabase.legacy-mbql.jvm-util :as mbql.jvm-u]
+        [metabase.models.dispatch :as models.dispatch]
         [metabase.util.i18n]])))
 
 (mu/defn normalize-token :- :keyword
@@ -320,91 +321,6 @@
     [:/ x y z & more]
     (recur (into [:/ [:/ x y]] (cons z more)))))
 
-(def ^:private host-regex
-  ;; Extracts the "host" from a URL or an email.
-  ;; By host we mean the main domain name and the TLD, eg. metabase.com, amazon.co.jp, bbc.co.uk.
-  ;; For a URL, this is not the RFC3986 "host", which would include any subdomains and the optional `:3000` port number.
-  ;;
-  ;; For an email, this is generally the part after the @, but it will skip any subdomains:
-  ;;   someone@email.mycompany.net -> mycompany.net
-  ;;
-  ;; Referencing the indexes below:
-  ;; 1.  Positive lookbehind:
-  ;;       Just past one of:
-  ;; 2.      @  from an email or URL userinfo@ prefix
-  ;; 3.      // from a URL scheme
-  ;; 4.      .  from a previous subdomain segment
-  ;; 5.      Start of string
-  ;; 6.  Negative lookahead: don't capture www as part of the domain
-  ;; 7.  Main domain segment
-  ;; 8.  Ending in a dot
-  ;; 9.  Optional short final segment (eg. co in .co.uk)
-  ;; 10. Top-level domain
-  ;; 11. Optional :port, /path, ?query or #hash
-  ;; 12. Anchor to the end
-  ;;1   2 3  4  5 6        7          8 9                     10         11           12
-  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+\.(?:[^@\.:/?#]{1,3}\.)?[^@\.:/?#]+(?=[:/?#].*$|$)")
-
-(def ^:private domain-regex
-  ;; Deliberately no ^ at the start; there might be several subdomains before this spot.
-  ;; By "short tail" below, I mean a pseudo-TLD nested under a proper TLD. For example, mycompany.co.uk.
-  ;; This can accidentally capture a short domain name, eg. "subdomain.aol.com" -> "subdomain", oops.
-  ;; But there's a load of these, not a short list we can include here, so it's either preprocess the (huge) master list
-  ;; from Mozilla or accept that this regex is a bit best-effort.
-  ;; Referencing the indexes below:
-  ;; 1.  Positive lookbehind:
-  ;;       Just past one of:
-  ;; 2.      @  from an email or URL userinfo@ prefix
-  ;; 3.      // from a URL scheme
-  ;; 4.      .  from a previous subdomain segment
-  ;; 5.      Start of string
-  ;; 6.  Negative lookahead: don't capture www as the domain
-  ;; 7.  One domain segment
-  ;; 8.  Positive lookahead:
-  ;;       Either:
-  ;; 9.      Short final segment (eg. .co.uk)
-  ;; 10.     Top-level domain
-  ;; 11.     Optional :port, /path, ?query or #hash
-  ;; 12.     Anchor to end
-  ;;       Or:
-  ;; 13.     Top-level domain
-  ;; 14.     Optional :port, /path, ?query or #hash
-  ;; 15.     Anchor to end
-  ;;1   2 3  4  5 6        7          (8   9                10         11          12|  13         14           15)
-  #"(?<=@|//|\.|^)(?!www\.)[^@\.:/?#]+(?=\.[^@\.:/?#]{1,3}\.[^@\.:/?#]+(?:[:/?#].*)?$|\.[^@\.:/?#]+(?:[:/?#].*)?$)")
-
-(def ^:private subdomain-regex
-  ;; This grabs the first segment that isn't "www", AND excludes the main domain name.
-  ;; See [[domain-regex]] for more details about how those are matched.
-  ;; Referencing the indexes below:
-  ;; 1.  Positive lookbehind:
-  ;;       Just past one of:
-  ;; 2.      @  from an email or URL userinfo@ prefix
-  ;; 3.      // from a URL scheme
-  ;; 4.      .  from a previous subdomain segment
-  ;; 5.      Start of string
-  ;; 6.  Negative lookahead: don't capture www as the domain
-  ;; 7.  Negative lookahead: don't capture the main domain name or part of the TLD
-  ;;       That would look like:
-  ;; 8.      The next segment we *would* capture as the subdomain
-  ;; 9.      Optional short segment, like "co" in .co.uk
-  ;; 10.     Top-level domain
-  ;; 11.     Optionally more URL things: :port or /path or ?query or #fragment
-  ;; 12.     End of string
-  ;; 13. Match the actual subdomain
-  ;; 14. Positive lookahead: the . after the subdomain, which we want to detect but not capture.
-  ;;1   2 3  4  5 6        7  8           9                    10        11           12 13       14
-  #"(?<=@|//|\.|^)(?!www\.)(?![^\.:/?#]+\.(?:[^\.:/?#]{1,3}\.)?[^\.:/?#]+(?:[:/?#].*)?$)[^\.:/?#]+(?=\.)")
-
-(defn- desugar-host-and-domain [expression]
-  (lib.util.match/replace expression
-    [:host column]
-    (recur [:regex-match-first column (str host-regex)])
-    [:domain column]
-    (recur [:regex-match-first column (str domain-regex)])
-    [:subdomain column]
-    (recur [:regex-match-first column (str subdomain-regex)])))
-
 (defn- temporal-case-expression
   "Creates a `:case` expression with a condition for each value of the given unit."
   [column unit n]
@@ -434,10 +350,16 @@
   "Rewrite various 'syntactic sugar' expressions like `:/` with more than two args into something simpler for drivers
   to compile."
   [expression :- ::mbql.s/FieldOrExpressionDef]
-  (-> expression
-      desugar-divide-with-extra-args
-      desugar-host-and-domain
-      desugar-temporal-names))
+  ;; The `mbql.jvm-u/desugar-host-and-domain` is implemented only for jvm because regexes are not compatible with
+  ;; Safari.
+  (let [desugar-host-and-domain* #?(:clj  mbql.jvm-u/desugar-host-and-domain
+                                    :cljs (fn [x]
+                                            (log/warn "`desugar-host-and-domain` implemented only on JVM.")
+                                            x))]
+    (-> expression
+        desugar-divide-with-extra-args
+        desugar-host-and-domain*
+        desugar-temporal-names)))
 
 (defn- maybe-desugar-expression [clause]
   (cond-> clause
diff --git a/test/metabase/legacy_mbql/jvm_util_test.clj b/test/metabase/legacy_mbql/jvm_util_test.clj
new file mode 100644
index 00000000000..612fb9b5c3b
--- /dev/null
+++ b/test/metabase/legacy_mbql/jvm_util_test.clj
@@ -0,0 +1,100 @@
+(ns metabase.legacy-mbql.jvm-util-test
+  (:require
+   [clojure.test :as t]
+   [metabase.legacy-mbql.jvm-util :as mbql.jvm-u]
+   [metabase.legacy-mbql.util :as mbql.u]))
+
+(t/deftest ^:parallel host-regex-on-urls-test
+  (t/are [host url] (= host (re-find @#'mbql.jvm-u/host-regex url))
+    "cdbaby.com"      "https://cdbaby.com/some.txt"
+    "fema.gov"        "https://fema.gov/some/path/Vatini?search=foo"
+    "geocities.jp"    "https://www.geocities.jp/some/path/Turbitt?search=foo"
+    "jalbum.net"      "https://jalbum.net/some/path/Kirsz?search=foo"
+    "usa.gov"         "https://usa.gov/some/path/Curdell?search=foo"
+       ;; Oops, this one captures a subdomain because it can't tell va.gov is supposed to be that short.
+    "taxes.va.gov"    "http://taxes.va.gov/some/path/Marritt?search=foo"
+    "gmpg.org"        "http://log.stuff.gmpg.org/some/path/Cambden?search=foo"
+    "hatena.ne.jp"    "http://hatena.ne.jp/"
+    "telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail"
+    "bbc.co.uk"       "bbc.co.uk/some/path?search=foo"
+    "bbc.co.uk"       "news.bbc.co.uk:port"))
+
+(t/deftest ^:parallel host-regex-on-emails-test
+  (t/are [host email] (= host (re-find @#'mbql.jvm-u/host-regex email))
+    "metabase.com"      "braden@metabase.com"
+    "homeoffice.gov.uk" "mholmes@homeoffice.gov.uk"
+    "someisp.com"       "john.smith@mail.someisp.com"
+    "amazon.co.uk"      "trk@amazon.co.uk"
+    "hatena.ne.jp"      "takashi@hatena.ne.jp"
+    "hatena.ne.jp"      "takashi@mail.hatena.ne.jp"
+    "ne.jp"             "takashi@www.ne.jp"))
+
+(t/deftest ^:parallel domain-regex-on-urls-test
+  (t/are [domain url] (= domain (re-find @#'mbql.jvm-u/domain-regex url))
+    "cdbaby"    "https://cdbaby.com/some.txt"
+    "fema"      "https://fema.gov/some/path/Vatini?search=foo"
+    "geocities" "https://www.geocities.jp/some/path/Turbitt?search=foo"
+    "jalbum"    "https://jalbum.net/some/path/Kirsz?search=foo"
+    "usa"       "https://usa.gov/some/path/Curdell?search=foo"
+    "taxes"     "http://taxes.va.gov/some/path/Marritt?search=foo"
+    "gmpg"      "http://log.stuff.gmpg.org/some/path/Cambden?search=foo"
+    "hatena"    "http://hatena.ne.jp/"
+    "telegraph" "//telegraph.co.uk?foo=bar#tail"
+    "bbc"       "bbc.co.uk/some/path?search=foo"))
+
+(t/deftest ^:parallel domain-regex-on-emails-test
+  (t/are [domain email] (= domain (re-find @#'mbql.jvm-u/domain-regex email))
+    "metabase"   "braden@metabase.com"
+    "homeoffice" "mholmes@homeoffice.gov.uk"
+    "someisp"    "john.smith@mail.someisp.com"
+    "amazon"     "trk@amazon.co.uk"
+    "hatena"     "takashi@hatena.ne.jp"
+    "ne"         "takashi@www.ne.jp"))
+
+(t/deftest ^:parallel subdomain-regex-on-urls-test
+  (t/are [subdomain url] (= subdomain (re-find @#'mbql.jvm-u/subdomain-regex url))
+       ;; Blanks. "www" doesn't count.
+    nil "cdbaby.com"
+    nil "https://fema.gov"
+    nil "http://www.geocities.jp"
+    nil "usa.gov/some/page.cgi.htm"
+    nil "va.gov"
+
+       ;; Basics - taking the first segment that isn't "www", IF it isn't the domain.
+    "sub"        "sub.jalbum.net"
+    "subdomains" "subdomains.go.here.jalbum.net"
+    "log"        "log.stuff.gmpg.org"
+    "log"        "https://log.stuff.gmpg.org"
+    "log"        "log.stuff.gmpg.org/some/path"
+    "log"        "log.stuff.gmpg.org?search=yes"
+
+       ;; Oops, we miss these! This is the reverse of the problem when picking the domain.
+       ;; We can't tell without maintaining a huge list that va and ne are the real domains, and not the trailing
+       ;; fragments like .co.uk - see below.
+    nil "taxes.va.gov" ; True domain is va, subdomain is taxes.
+    nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena.
+
+       ;; Sometimes the second-last part is a short suffix.
+       ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database,
+       ;; we use a best-effort matcher that gets the domain right most of the time.
+    nil         "telegraph.co.uk"
+    nil         "https://telegraph.co.uk"
+    nil         "telegraph.co.uk/some/article.php"
+    "local"     "local.news.telegraph.co.uk"
+    nil         "bbc.co.uk#fragment"
+    "video"     "video.bbc.co.uk"
+       ;; "www" is disregarded as a possible subdomain.
+    nil         "www.usa.gov"
+    nil         "www.dot.va.gov"
+    "licensing" "www.licensing.dot.va.gov"))
+
+(t/deftest ^:parallel desugar-host-and-domain-test
+  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/host-regex)]
+           (mbql.u/desugar-expression [:host [:field 1 nil]]))
+        "`host` should desugar to a `regex-match-first` clause with the host regex")
+  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/domain-regex)]
+           (mbql.u/desugar-expression [:domain [:field 1 nil]]))
+        "`domain` should desugar to a `regex-match-first` clause with the domain regex")
+  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.jvm-u/subdomain-regex)]
+           (mbql.u/desugar-expression [:subdomain [:field 1 nil]]))
+        "`subdomain` should desugar to a `regex-match-first` clause with the subdomain regex"))
diff --git a/test/metabase/legacy_mbql/util_test.cljc b/test/metabase/legacy_mbql/util_test.cljc
index 2d9b93069f1..b446a086d24 100644
--- a/test/metabase/legacy_mbql/util_test.cljc
+++ b/test/metabase/legacy_mbql/util_test.cljc
@@ -877,101 +877,6 @@
             [:relative-datetime 0 :quarter]]
            (mbql.u/desugar-time-interval [:time-interval [:expression "Date"] :current :quarter]))))
 
-(t/deftest ^:parallel host-regex-on-urls-test
-  (t/are [host url] (= host (re-find @#'mbql.u/host-regex url))
-    "cdbaby.com"      "https://cdbaby.com/some.txt"
-    "fema.gov"        "https://fema.gov/some/path/Vatini?search=foo"
-    "geocities.jp"    "https://www.geocities.jp/some/path/Turbitt?search=foo"
-    "jalbum.net"      "https://jalbum.net/some/path/Kirsz?search=foo"
-    "usa.gov"         "https://usa.gov/some/path/Curdell?search=foo"
-    ;; Oops, this one captures a subdomain because it can't tell va.gov is supposed to be that short.
-    "taxes.va.gov"    "http://taxes.va.gov/some/path/Marritt?search=foo"
-    "gmpg.org"        "http://log.stuff.gmpg.org/some/path/Cambden?search=foo"
-    "hatena.ne.jp"    "http://hatena.ne.jp/"
-    "telegraph.co.uk" "//telegraph.co.uk?foo=bar#tail"
-    "bbc.co.uk"       "bbc.co.uk/some/path?search=foo"
-    "bbc.co.uk"       "news.bbc.co.uk:port"))
-
-(t/deftest ^:parallel host-regex-on-emails-test
-  (t/are [host email] (= host (re-find @#'mbql.u/host-regex email))
-    "metabase.com"      "braden@metabase.com"
-    "homeoffice.gov.uk" "mholmes@homeoffice.gov.uk"
-    "someisp.com"       "john.smith@mail.someisp.com"
-    "amazon.co.uk"      "trk@amazon.co.uk"
-    "hatena.ne.jp"      "takashi@hatena.ne.jp"
-    "hatena.ne.jp"      "takashi@mail.hatena.ne.jp"
-    "ne.jp"             "takashi@www.ne.jp"))
-
-(t/deftest ^:parallel domain-regex-on-urls-test
-  (t/are [domain url] (= domain (re-find @#'mbql.u/domain-regex url))
-    "cdbaby"    "https://cdbaby.com/some.txt"
-    "fema"      "https://fema.gov/some/path/Vatini?search=foo"
-    "geocities" "https://www.geocities.jp/some/path/Turbitt?search=foo"
-    "jalbum"    "https://jalbum.net/some/path/Kirsz?search=foo"
-    "usa"       "https://usa.gov/some/path/Curdell?search=foo"
-    "taxes"     "http://taxes.va.gov/some/path/Marritt?search=foo"
-    "gmpg"      "http://log.stuff.gmpg.org/some/path/Cambden?search=foo"
-    "hatena"    "http://hatena.ne.jp/"
-    "telegraph" "//telegraph.co.uk?foo=bar#tail"
-    "bbc"       "bbc.co.uk/some/path?search=foo"))
-
-(t/deftest ^:parallel domain-regex-on-emails-test
-  (t/are [domain email] (= domain (re-find @#'mbql.u/domain-regex email))
-    "metabase"   "braden@metabase.com"
-    "homeoffice" "mholmes@homeoffice.gov.uk"
-    "someisp"    "john.smith@mail.someisp.com"
-    "amazon"     "trk@amazon.co.uk"
-    "hatena"     "takashi@hatena.ne.jp"
-    "ne"         "takashi@www.ne.jp"))
-
-(t/deftest ^:parallel subdomain-regex-on-urls-test
-  (t/are [subdomain url] (= subdomain (re-find @#'mbql.u/subdomain-regex url))
-       ;; Blanks. "www" doesn't count.
-    nil "cdbaby.com"
-    nil "https://fema.gov"
-    nil "http://www.geocities.jp"
-    nil "usa.gov/some/page.cgi.htm"
-    nil "va.gov"
-
-       ;; Basics - taking the first segment that isn't "www", IF it isn't the domain.
-    "sub"        "sub.jalbum.net"
-    "subdomains" "subdomains.go.here.jalbum.net"
-    "log"        "log.stuff.gmpg.org"
-    "log"        "https://log.stuff.gmpg.org"
-    "log"        "log.stuff.gmpg.org/some/path"
-    "log"        "log.stuff.gmpg.org?search=yes"
-
-       ;; Oops, we miss these! This is the reverse of the problem when picking the domain.
-       ;; We can't tell without maintaining a huge list that va and ne are the real domains, and not the trailing
-       ;; fragments like .co.uk - see below.
-    nil "taxes.va.gov" ; True domain is va, subdomain is taxes.
-    nil "hatena.ne.jp" ; True domain is ne, subdomain is hatena.
-
-       ;; Sometimes the second-last part is a short suffix.
-       ;; Mozilla maintains a huge list of these, but since this has to go into a regex and get passed to the database,
-       ;; we use a best-effort matcher that gets the domain right most of the time.
-    nil         "telegraph.co.uk"
-    nil         "https://telegraph.co.uk"
-    nil         "telegraph.co.uk/some/article.php"
-    "local"     "local.news.telegraph.co.uk"
-    nil         "bbc.co.uk#fragment"
-    "video"     "video.bbc.co.uk"
-       ;; "www" is disregarded as a possible subdomain.
-    nil         "www.usa.gov"
-    nil         "www.dot.va.gov"
-    "licensing" "www.licensing.dot.va.gov"))
-
-(t/deftest ^:parallel desugar-host-and-domain-test
-  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/host-regex)]
-           (mbql.u/desugar-expression [:host [:field 1 nil]]))
-        "`host` should desugar to a `regex-match-first` clause with the host regex")
-  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/domain-regex)]
-           (mbql.u/desugar-expression [:domain [:field 1 nil]]))
-        "`domain` should desugar to a `regex-match-first` clause with the domain regex")
-  (t/is (= [:regex-match-first [:field 1 nil] (str @#'mbql.u/subdomain-regex)]
-           (mbql.u/desugar-expression [:subdomain [:field 1 nil]]))
-        "`subdomain` should desugar to a `regex-match-first` clause with the subdomain regex"))
-
 (t/deftest ^:parallel desugar-month-quarter-day-name-test
   (t/is (= [:case [[[:= [:field 1 nil] 1]  "Jan"]
                    [[:= [:field 1 nil] 2]  "Feb"]
-- 
GitLab