From 862daf338a63d23a9bba8eab3707e33bc4788881 Mon Sep 17 00:00:00 2001 From: Chris Truter <crisptrutski@users.noreply.github.com> Date: Thu, 4 Apr 2024 12:30:28 +0200 Subject: [PATCH] Break out upload.types namespace (#40741) --- src/metabase/upload.clj | 308 +--------------- src/metabase/upload/parsing.clj | 18 +- src/metabase/upload/types.cljc | 301 +++++++++++++++ test/metabase/upload/types_test.clj | 332 +++++++++++++++++ test/metabase/upload_test.clj | 347 +++--------------- test/metabase/util/ordered_hierarchy_test.clj | 142 +++---- 6 files changed, 770 insertions(+), 678 deletions(-) create mode 100644 src/metabase/upload/types.cljc create mode 100644 test/metabase/upload/types_test.clj diff --git a/src/metabase/upload.clj b/src/metabase/upload.clj index 36fc8b9e877..f1e96a9912a 100644 --- a/src/metabase/upload.clj +++ b/src/metabase/upload.clj @@ -1,5 +1,4 @@ (ns metabase.upload - (:refer-clojure :exclude [derive make-hierarchy parents]) (:require [clj-bom.core :as bom] [clojure.data :as data] @@ -29,265 +28,17 @@ [metabase.sync.sync-metadata.fields :as sync-fields] [metabase.sync.sync-metadata.tables :as sync-tables] [metabase.upload.parsing :as upload-parsing] + [metabase.upload.types :as upload-types] [metabase.util :as u] [metabase.util.i18n :refer [tru]] [metabase.util.malli :as mu] [metabase.util.malli.schema :as ms] - [metabase.util.ordered-hierarchy :as ordered-hierarchy :refer [make-hierarchy]] [toucan2.core :as t2]) (:import (java.io File))) (set! *warn-on-reflection* true) -;;;; +------------------+ -;;;; | Schema detection | -;;;; +------------------+ - -;; Upload value-types form a DAG (directed acyclic graph) where each type can be relaxed into any of its ancestors. -;; We parse each value in the CSV file to the most-specific possible type for each column. - -;; The most-specific possible type for a column is the closest common ancestor of the types for each value in the -;; column, found by walking through the graph in topological order, following edges from left to right. -;; Note that this type is not guaranteed to be one of the least common ancestors! -;; -;; See [[metabase.util.ordered-hierarchy/first-common-ancestor]] for more details. -;; -;; <pre><code> -;; -;; text -;; | -;; | -;; varchar-255────── -;; / / \ \ -;; / / \ \ -;; boolean float datetime offset-datetime -;; | │ │ -;; | │ │ -;; │ *float-or-int* │ -;; │ │ │ -;; │ │ │ -;; | int date -;; | / \ -;; | / \ -;; *boolean-int* auto-incrementing-int-pk -;; -;; </code></pre> -;; -;; We have a number of special "abstract" nodes in this graph: -;; -;; - `*boolean-int*` is an ambiguous node, that could either be parsed as a boolean or as an integer. -;; - `*float-or-int*` is any integer, whether it has an explicit decimal point or not. -;; -;; While a `*boolean-int*` is a genuinely ambiguous value, `*float-or-int*` exist to power our desired value-type -;; coercion and column-type promotion behaviour. -;; -;; - If we encounter a `*float-or-int*` inside an `int` column, then we can safely coerce it down to an integer. -;; - If we encounter a `float` (i.e. a non-zero fraction component), then we need to promote the column to a `float.` -;; -;; Columns can not have an abstract type, which has no meaning outside of inference and reconciliation. -;; If we are left with an abstract type after having processed all the values, we first check whether we can coerce -;; the type to the existing column type, and otherwise traverse further up the graph until we reach a concrete type. 
-;; -;; For ease of reference and explicitness these corresponding values are given in the `abstract->concrete` map. -;; One can figure out these mappings by simply looking up through the ancestors. For now, we require that it is always -;; a direct ancestor, and lay out or graph so that it is the left-most one. - -(def ^:private h - "This hierarchy defines a relationship between value types and their specializations. - We use an [[metabase.util.ordered-hierarchy]] for its topological sorting, which simplify writing efficient and - consistent implementations for of our type inference, parsing, and relaxation." - (make-hierarchy - [::text - [::varchar-255 - [::boolean ::*boolean-int*] - [::float - ;; A number value with a decimal separator, but a zero fractional component. - [::*float-or-int* - [::int - ;; A value that could be legally parsed as either a boolean OR an integer - ::*boolean-int* - ::auto-incrementing-int-pk]]] - [::datetime ::date] - ::offset-datetime]])) - -(def ^:private abstract->concrete - "Not all value types correspond to column types. We refer to these as \"abstract\" types, and give them *ear-muffs*. - This maps implicitly defines the abstract types, by mapping them each to a default concretion." - {::*boolean-int* ::boolean - ::*float-or-int* ::float}) - -(def ^:private allowed-promotions - "A mapping of which types a column can be implicitly relaxed to, based on the content of appended values. - If we require a relaxation which is not allow-listed here, we will reject the corresponding file." - {::int #{::float}}) - -(def ^:private column-type->coercible-value-types - "A mapping of which value types should be coerced to the given existing type, rather than triggering promotion." - {::int #{::*float-or-int*}}) - -(defn- coerce? - "Can values of the given type be coerced to the given existing column type, in a lossless fashion?" - [column-type value-type] - (contains? (column-type->coercible-value-types column-type) value-type)) - -(def ^:private value-types - "All type tags which values can be inferred as. An ordered set from most to least specialized." - (ordered-hierarchy/sorted-tags h)) - -(def ^:private column-types - "All type tags that correspond to concrete column types." - (into #{} (remove abstract->concrete) value-types)) - -(defn- column-type? - [value-type] - (contains? column-types value-type)) - -(defn ^:private concretize - "Determine the desired column-type given the existing column-type (nil if it's new) and the value-type of the data. - If there's a valid coercion to the existing type, we will preserve it, but otherwise we will relax abstract types - further to a concrete type." - [existing-type value-type] - (cond - ;; If the type is concrete, there is nothing to do. - (column-type? value-type) value-type - ;; If we know nothing about the value type, treat it as an arbitrary string. - (nil? value-type) ::text - ;; If configured, coerce the value to the existing type - (coerce? existing-type value-type) existing-type - ;; Otherwise, project it to its canonical concretion. - :else (abstract->concrete value-type))) - -;;;;;;;;;;;;;;;;;;;;;;;;;; -;; [[value->type]] helpers - -(defn- with-parens - "Returns a regex that matches the argument, with or without surrounding parentheses." 
- [number-regex] - (re-pattern (str "(" number-regex ")|(\\(" number-regex "\\))"))) - -(defn- with-currency - "Returns a regex that matches a positive or negative number, including currency symbols" - [number-regex] - ;; currency signs can be all over: $2, -$2, $-2, 2€ - (re-pattern (str upload-parsing/currency-regex "?\\s*-?" - upload-parsing/currency-regex "?" - number-regex - "\\s*" upload-parsing/currency-regex "?"))) - -(defn- int-regex - "Matches numbers which do not have a decimal separator." - [number-separators] - (with-parens - (with-currency - (case number-separators - ("." ".,") #"\d[\d,]*" - ",." #"\d[\d.]*" - ", " #"\d[\d \u00A0]*" - ".’" #"\d[\d’]*")))) - -(defn- float-or-int-regex - "Matches integral numbers, even if they have a decimal separator - e.g. 2 or 2.0" - [number-separators] - (with-parens - (with-currency - (case number-separators - ("." ".,") #"\d[\d,]*(\.0+)?" - ",." #"\d[\d.]*(\,[0]+)?" - ", " #"\d[\d \u00A0]*(\,[0.]+)?" - ".’" #"\d[\d’]*(\.[0.]+)?")))) - -(defn- float-regex - "Matches numbers, regardless of whether they have a decimal separator - e.g. 2, 2.0, or 2.2" - [number-separators] - (with-parens - (with-currency - (case number-separators - ("." ".,") #"\d[\d,]*(\.\d+)?" - ",." #"\d[\d.]*(\,[\d]+)?" - ", " #"\d[\d \u00A0]*(\,[\d.]+)?" - ".’" #"\d[\d’]*(\.[\d.]+)?")))) - -(defmacro does-not-throw? - "Returns true if the given body does not throw an exception." - [body] - `(try - ~body - true - (catch Throwable e# - false))) - -(defn- date-string? [s] - (does-not-throw? (upload-parsing/parse-local-date s))) - -(defn- datetime-string? [s] - (does-not-throw? (upload-parsing/parse-local-datetime s))) - -(defn- offset-datetime-string? [s] - (does-not-throw? (upload-parsing/parse-offset-datetime s))) - -(defn- boolean-string? [s] - (boolean (re-matches #"(?i)true|t|yes|y|1|false|f|no|n|0" s))) - -(defn- boolean-int-string? [s] - (contains? #{"0" "1"} s)) - -(defn- varchar-255? [s] - (<= (count s) 255)) - -(defn- regex-matcher [regex] - (fn [s] - (boolean (re-matches regex s)))) - -;; end [[value->type]] helpers -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -(def ^:private non-inferable-types - #{::auto-incrementing-int-pk}) - -(def ^:private type->check-schema - "Every inferable value-type needs to have a detection function registered." - (into [:map] (map #(vector % [:=> [:cat :string] :boolean]) - (remove non-inferable-types value-types)))) - -(mu/defn ^:private settings->type->check :- type->check-schema - [{:keys [number-separators] :as _settings}] - (let [int-string? (regex-matcher (int-regex number-separators)) - float-or-int? (regex-matcher (float-or-int-regex number-separators)) - float-string? (regex-matcher (float-regex number-separators))] - {::*boolean-int* boolean-int-string? - ::boolean boolean-string? - ::offset-datetime offset-datetime-string? - ::date date-string? - ::datetime datetime-string? - ::int int-string? - ::*float-or-int* float-or-int? - ::float float-string? - ::varchar-255 varchar-255? - ::text (constantly true)})) - -(defn- value->type - "Determine the most specific type that is compatible with the given value. - Numbers are assumed to use separators corresponding to the locale defined in the application settings" - [type->check value] - (when-not (str/blank? value) - (let [trimmed (str/trim value)] - (->> (remove non-inferable-types value-types) - (filter #((type->check %) trimmed)) - first)))) - -(defn- relax-type - "Given an existing column type, and a new value, relax the type until it includes the value." 
- [type->check current-type value] - (cond (nil? value) current-type - (nil? current-type) (value->type type->check value) - :else (let [trimmed (str/trim value)] - (if (str/blank? trimmed) - current-type - (->> (cons current-type (ancestors h current-type)) - (filter #((type->check %) trimmed)) - first))))) - (defn- normalize-column-name [raw-name] (if (str/blank? raw-name) @@ -307,20 +58,6 @@ (= (normalize-column-name (:name field)) auto-pk-column-name)) (t2/select :model/Field :table_id table-id :active true)))) -(defn- type-relaxer - "Given a map of {value-type -> predicate}, return a reducing fn which updates our inferred schema using the next row." - [settings] - (let [relax (partial relax-type (settings->type->check settings))] - (fn [value-types row] - ;; It's important to realize this lazy sequence, because otherwise we can build a huge stack and overflow. - (vec (u/map-all relax value-types row))))) - -(mu/defn column-types-from-rows :- [:sequential (into [:enum] column-types)] - "Given the types of the existing columns (if there are any), and rows to be added, infer the best supporting types." - [settings existing-types rows] - (->> (reduce (type-relaxer settings) existing-types rows) - (u/map-all concretize existing-types))) - (defn- detect-schema "Consumes the header and rows from a CSV file. @@ -336,11 +73,10 @@ unique-header (map keyword (mbql.u/uniquify-names normalized-header)) column-count (count normalized-header) initial-types (repeat column-count nil) - col-name+type-pairs (->> (column-types-from-rows settings initial-types rows) + col-name+type-pairs (->> (upload-types/column-types-from-rows settings initial-types rows) (map vector unique-header))] {:extant-columns (ordered-map/ordered-map col-name+type-pairs) - :generated-columns (ordered-map/ordered-map auto-pk-column-keyword ::auto-incrementing-int-pk)})) - + :generated-columns (ordered-map/ordered-map auto-pk-column-keyword ::upload-types/auto-incrementing-int-pk)})) ;;;; +------------------+ ;;;; | Parsing values | @@ -372,10 +108,16 @@ (str truncated-name-without-time (t/format time-format (strictly-monotonic-now))))) +(mu/defn ^:private database-type + [driver + column-type :- (into [:enum] upload-types/column-types)] + (let [external-type (keyword "metabase.upload" (name column-type))] + (driver/upload-type->database-type driver external-type))) + (defn- column-definitions "Returns a map of column-name -> column-definition from a map of column-name -> upload-type." [driver col->upload-type] - (update-vals col->upload-type (partial driver/upload-type->database-type driver))) + (update-vals col->upload-type (partial database-type driver))) (defn current-database "The database being used for uploads (as per the `uploads-database-id` setting)." @@ -685,20 +427,6 @@ ;;; | appending to uploaded table ;;; +----------------------------- -(defn- base-type->upload-type - "Returns the most specific upload type for the given base type." - [base-type] - (when base-type - (condp #(isa? %2 %1) base-type - :type/Float ::float - :type/BigInteger ::int - :type/Integer ::int - :type/Boolean ::boolean - :type/DateTimeWithTZ ::offset-datetime - :type/DateTime ::datetime - :type/Date ::date - :type/Text ::text))) - (defn- not-blank [s] (when-not (str/blank? s) s)) @@ -734,12 +462,6 @@ (when-let [error-message (extra-and-missing-error-markdown extra missing)] (throw (ex-info error-message {:status-code 422}))))) -(defn- matching-or-promotable? [current-type relaxed-type] - (or (nil? 
current-type) - (= current-type relaxed-type) - (when-let [f (allowed-promotions current-type)] - (f relaxed-type)))) - (defn- field-changes "Given existing and newly inferred types for the given `field-names`, calculate which fields need to be added or updated, along with their new types." [field-names existing-types new-types] @@ -756,7 +478,7 @@ (m/map-kv (fn [field-name col-type] [(keyword field-name) - (driver/upload-type->database-type driver col-type)]) + (database-type driver col-type)]) field->col-type)) (defn- add-columns! [driver database table field->type & args] @@ -786,12 +508,12 @@ (not (contains? normed-name->field auto-pk-column-name))) _ (check-schema (dissoc normed-name->field auto-pk-column-name) header) settings (upload-parsing/get-settings) - old-types (map (comp base-type->upload-type :base_type normed-name->field) normed-header) + old-types (map (comp upload-types/base-type->upload-type :base_type normed-name->field) normed-header) ;; in the happy, and most common, case all the values will match the existing types ;; for now we just plan for the worst and perform a fairly expensive operation to detect any type changes ;; we can come back and optimize this to an optimistic-with-fallback approach later. - detected-types (column-types-from-rows settings old-types rows) - new-types (map #(if (matching-or-promotable? %1 %2) %2 %1) old-types detected-types) + detected-types (upload-types/column-types-from-rows settings old-types rows) + new-types (map upload-types/new-type old-types detected-types) ;; avoid any schema modification unless all the promotions required by the file are supported, ;; choosing to not promote means that we will defer failure until we hit the first value that cannot ;; be parsed as its existing type - there is scope to improve these error messages in the future. @@ -818,7 +540,7 @@ (when create-auto-pk? (add-columns! driver database table - {auto-pk-column-keyword ::auto-incrementing-int-pk} + {auto-pk-column-keyword ::upload-types/auto-incrementing-int-pk} :primary-key [auto-pk-column-keyword])) (scan-and-sync-table! 
database table) diff --git a/src/metabase/upload/parsing.clj b/src/metabase/upload/parsing.clj index 1d96cb94b9b..475a9323dbb 100644 --- a/src/metabase/upload/parsing.clj +++ b/src/metabase/upload/parsing.clj @@ -183,45 +183,45 @@ (fn [upload-type _] upload-type)) -(defmethod upload-type->parser :metabase.upload/varchar-255 +(defmethod upload-type->parser :metabase.upload.types/varchar-255 [_ _] identity) -(defmethod upload-type->parser :metabase.upload/text +(defmethod upload-type->parser :metabase.upload.types/text [_ _] identity) -(defmethod upload-type->parser :metabase.upload/int +(defmethod upload-type->parser :metabase.upload.types/int [_ {:keys [number-separators]}] (partial parse-as-biginteger number-separators)) -(defmethod upload-type->parser :metabase.upload/float +(defmethod upload-type->parser :metabase.upload.types/float [_ {:keys [number-separators]}] (partial parse-number number-separators)) -(defmethod upload-type->parser :metabase.upload/auto-incrementing-int-pk +(defmethod upload-type->parser :metabase.upload.types/auto-incrementing-int-pk [_ {:keys [number-separators]}] (partial parse-as-biginteger number-separators)) -(defmethod upload-type->parser :metabase.upload/boolean +(defmethod upload-type->parser :metabase.upload.types/boolean [_ _] (comp parse-bool str/trim)) -(defmethod upload-type->parser :metabase.upload/date +(defmethod upload-type->parser :metabase.upload.types/date [_ _] (comp parse-local-date str/trim)) -(defmethod upload-type->parser :metabase.upload/datetime +(defmethod upload-type->parser :metabase.upload.types/datetime [_ _] (comp parse-as-datetime str/trim)) -(defmethod upload-type->parser :metabase.upload/offset-datetime +(defmethod upload-type->parser :metabase.upload.types/offset-datetime [_ _] (comp parse-offset-datetime diff --git a/src/metabase/upload/types.cljc b/src/metabase/upload/types.cljc new file mode 100644 index 00000000000..94a1f721309 --- /dev/null +++ b/src/metabase/upload/types.cljc @@ -0,0 +1,301 @@ +(ns metabase.upload.types + (:refer-clojure :exclude [make-hierarchy]) + (:require + [clojure.string :as str] + [metabase.upload.parsing :as upload-parsing] + [metabase.util :as u] + [metabase.util.malli :as mu] + [metabase.util.ordered-hierarchy :as ordered-hierarchy :refer [make-hierarchy]])) + +;; Upload value-types form a directed acyclic graph where each type can be relaxed into any of its ancestors. +;; We parse each value in the CSV file to the most-specific possible type for each column. + +;; The most-specific possible type for a column is the closest common ancestor of the types for each value in the +;; column, found by walking through the graph in topological order, following edges from left to right. +;; Note that this type is not guaranteed to be one of the least common ancestors! +;; +;; See [[metabase.util.ordered-hierarchy/first-common-ancestor]] for more details. +;; +;; <pre><code> +;; +;; text +;; | +;; | +;; varchar-255────── +;; / / \ \ +;; / / \ \ +;; boolean float datetime offset-datetime +;; | │ │ +;; | │ │ +;; │ *float-or-int* │ +;; │ │ │ +;; │ │ │ +;; | int date +;; | / \ +;; | / \ +;; *boolean-int* auto-incrementing-int-pk +;; +;; </code></pre> +;; +;; We have a number of special "abstract" nodes in this graph: +;; +;; - `*boolean-int*` is an ambiguous node, that could either be parsed as a boolean or as an integer. +;; - `*float-or-int*` is any integer, whether it has an explicit decimal point or not. 
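+;;
+;; For example (an illustrative sketch, assuming the default ".," number separators):
+;;
+;;   "1"   -> ::*boolean-int*   (could legally be parsed as a boolean or as an integer)
+;;   "2.0" -> ::*float-or-int*  (integral, despite carrying a decimal separator)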
+;; +;; While a `*boolean-int*` is a genuinely ambiguous value, `*float-or-int*` exist to power our desired value-type +;; coercion and column-type promotion behaviour. +;; +;; - If we encounter a `*float-or-int*` inside an `int` column, then we can safely coerce it down to an integer. +;; - If we encounter a `float` (i.e. a non-zero fraction component), then we need to promote the column to a `float.` +;; +;; Columns can not have an abstract type, which has no meaning outside of inference and reconciliation. +;; If we are left with an abstract type after having processed all the values, we first check whether we can coerce +;; the type to the existing column type, and otherwise traverse further up the graph until we reach a concrete type. +;; +;; For ease of reference and explicitness these corresponding values are given in the `abstract->concrete` map. +;; One can figure out these mappings by simply looking up through the ancestors. For now, we require that it is always +;; a direct ancestor, and lay out or graph so that it is the left-most one. + +(def h + "This hierarchy defines a relationship between value types and their specializations. + We use an [[metabase.util.ordered-hierarchy]] for its topological sorting, which simplify writing efficient and + consistent implementations for of our type inference, parsing, and relaxation." + (make-hierarchy + [::text + [::varchar-255 + [::boolean ::*boolean-int*] + [::float + ;; A number value with a decimal separator, but a zero fractional component. + [::*float-or-int* + [::int + ;; A value that could be legally parsed as either a boolean OR an integer + ::*boolean-int* + ::auto-incrementing-int-pk]]] + [::datetime ::date] + ::offset-datetime]])) + +(def ^:private abstract->concrete + "Not all value types correspond to column types. We refer to these as \"abstract\" types, and give them *ear-muffs*. + This maps implicitly defines the abstract types, by mapping them each to a default concretion." + {::*boolean-int* ::boolean + ::*float-or-int* ::float}) + +(def ^:private allowed-promotions + "A mapping of which types a column can be implicitly relaxed to, based on the content of appended values. + If we require a relaxation which is not allow-listed here, we will reject the corresponding file." + {::int #{::float}}) + +(def ^:private column-type->coercible-value-types + "A mapping of which value types should be coerced to the given existing type, rather than triggering promotion." + {::int #{::*float-or-int*}}) + +(defn- coerce? + "Can values of the given type be coerced to the given existing column type, in a lossless fashion?" + [column-type value-type] + (contains? (column-type->coercible-value-types column-type) value-type)) + +(def value-types + "All type tags which values can be inferred as. An ordered set from most to least specialized." + (ordered-hierarchy/sorted-tags h)) + +(def column-types + "All type tags that correspond to concrete column types." + (into #{} (remove abstract->concrete) value-types)) + +(defn- column-type? + [value-type] + (contains? column-types value-type)) + +(defn concretize + "Determine the desired column-type given the existing column-type (nil if it's new) and the value-type of the data. + If there's a valid coercion to the existing type, we will preserve it, but otherwise we will relax abstract types + further to a concrete type." + [existing-type value-type] + (cond + ;; If the type is concrete, there is nothing to do. + (column-type? 
value-type) value-type + ;; If we know nothing about the value type, treat it as an arbitrary string. + (nil? value-type) ::text + ;; If configured, coerce the value to the existing type + (coerce? existing-type value-type) existing-type + ;; Otherwise, project it to its canonical concretion. + :else (abstract->concrete value-type))) + +;;;;;;;;;;;;;;;;;;;;;;;;;; +;; [[value->type]] helpers + +(defn- with-parens + "Returns a regex that matches the argument, with or without surrounding parentheses." + [number-regex] + (re-pattern (str "(" number-regex ")|(\\(" number-regex "\\))"))) + +(defn- with-currency + "Returns a regex that matches a positive or negative number, including currency symbols" + [number-regex] + ;; currency signs can be all over: $2, -$2, $-2, 2€ + (re-pattern (str upload-parsing/currency-regex "?\\s*-?" + upload-parsing/currency-regex "?" + number-regex + "\\s*" upload-parsing/currency-regex "?"))) + +(defn- int-regex + "Matches numbers which do not have a decimal separator." + [number-separators] + (with-parens + (with-currency + (case number-separators + ("." ".,") #"\d[\d,]*" + ",." #"\d[\d.]*" + ", " #"\d[\d \u00A0]*" + ".’" #"\d[\d’]*")))) + +(defn- float-or-int-regex + "Matches integral numbers, even if they have a decimal separator - e.g. 2 or 2.0" + [number-separators] + (with-parens + (with-currency + (case number-separators + ("." ".,") #"\d[\d,]*(\.0+)?" + ",." #"\d[\d.]*(\,[0]+)?" + ", " #"\d[\d \u00A0]*(\,[0.]+)?" + ".’" #"\d[\d’]*(\.[0.]+)?")))) + +(defn- float-regex + "Matches numbers, regardless of whether they have a decimal separator - e.g. 2, 2.0, or 2.2" + [number-separators] + (with-parens + (with-currency + (case number-separators + ("." ".,") #"\d[\d,]*(\.\d+)?" + ",." #"\d[\d.]*(\,[\d]+)?" + ", " #"\d[\d \u00A0]*(\,[\d.]+)?" + ".’" #"\d[\d’]*(\.[\d.]+)?")))) + +(defmacro does-not-throw? + "Returns true if the given body does not throw an exception." + [body] + `(try + ~body + true + (catch Throwable _e# + false))) + +(defn- date-string? [s] + (does-not-throw? (upload-parsing/parse-local-date s))) + +(defn- datetime-string? [s] + (does-not-throw? (upload-parsing/parse-local-datetime s))) + +(defn- offset-datetime-string? [s] + (does-not-throw? (upload-parsing/parse-offset-datetime s))) + +(defn- boolean-string? [s] + (boolean (re-matches #"(?i)true|t|yes|y|1|false|f|no|n|0" s))) + +(defn- boolean-int-string? [s] + (contains? #{"0" "1"} s)) + +(defn- varchar-255? [s] + (<= (count s) 255)) + +(defn- regex-matcher [regex] + (fn [s] + (boolean (re-matches regex s)))) + +;; end [[value->type]] helpers +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(def ^:private non-inferable-types + #{::auto-incrementing-int-pk}) + +(def ^:private type->check-schema + "Every inferable value-type needs to have a detection function registered." + (into [:map] (map #(vector % [:=> [:cat :string] :boolean]) + (remove non-inferable-types value-types)))) + +(mu/defn ^:private settings->type->check :- type->check-schema + [{:keys [number-separators] :as _settings}] + (let [int-string? (regex-matcher (int-regex number-separators)) + float-or-int? (regex-matcher (float-or-int-regex number-separators)) + float-string? (regex-matcher (float-regex number-separators))] + {::*boolean-int* boolean-int-string? + ::boolean boolean-string? + ::offset-datetime offset-datetime-string? + ::date date-string? + ::datetime datetime-string? + ::int int-string? + ::*float-or-int* float-or-int? + ::float float-string? + ::varchar-255 varchar-255? 
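+     ;; ::text is the root of the hierarchy, so it accepts any value.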
+ ::text (constantly true)})) + +(defn- value->type + "Determine the most specific type that is compatible with the given value. + Numbers are assumed to use separators corresponding to the locale defined in the application settings" + [type->check value] + (when-not (str/blank? value) + (let [trimmed (str/trim value)] + (->> (remove non-inferable-types value-types) + (filter #((type->check %) trimmed)) + first)))) + +(defn- relax-type + "Given an existing column type, and a new value, relax the type until it includes the value." + [type->check current-type value] + (cond (nil? value) current-type + (nil? current-type) (value->type type->check value) + :else (let [trimmed (str/trim value)] + (if (str/blank? trimmed) + current-type + (->> (cons current-type (ancestors h current-type)) + (filter #((type->check %) trimmed)) + first))))) + +(defn type-relaxer + "Given a map of {value-type -> predicate}, return a reducing fn which updates our inferred schema using the next row." + [settings] + (let [relax (partial relax-type (settings->type->check settings))] + (fn [value-types row] + ;; It's important to realize this lazy sequence, because otherwise we can build a huge stack and overflow. + (vec (u/map-all relax value-types row))))) + +(mu/defn column-types-from-rows :- [:sequential (into [:enum] column-types)] + "Given the types of the existing columns (if there are any), and rows to be added, infer the best supporting types." + [settings existing-types rows] + (->> (reduce (type-relaxer settings) existing-types rows) + (u/map-all concretize existing-types))) + +(defn base-type->upload-type + "Returns the most specific upload type for the given base type." + [base-type] + (when base-type + (condp #(isa? %2 %1) base-type + :type/Float ::float + :type/BigInteger ::int + :type/Integer ::int + :type/Boolean ::boolean + :type/DateTimeWithTZ ::offset-datetime + :type/DateTime ::datetime + :type/Date ::date + :type/Text ::text))) + +(defn- promotable? + "Are we allowed to promote a column's schema from `current-type` to `inferred-type`?" + [current-type inferred-type] + (when-let [allowed? (allowed-promotions current-type)] + (allowed? inferred-type))) + +(defn new-type + "Given the `current-type` of a column, and an `inferred-type` for new values to be added, return its new type. + This assumes we have already coerced the new values down to the existing type, if possible." + [current-type inferred-type] + (cond + ;; No restriction on new columns + (nil? current-type) inferred-type + ;; No changes required if inferred type matches + (= current-type inferred-type) current-type + :else + ;; Keep the existing type unless a promotion is allowed. + (if (promotable? 
current-type inferred-type) + inferred-type + current-type))) diff --git a/test/metabase/upload/types_test.clj b/test/metabase/upload/types_test.clj new file mode 100644 index 00000000000..ae303424556 --- /dev/null +++ b/test/metabase/upload/types_test.clj @@ -0,0 +1,332 @@ +(ns ^:mb/once metabase.upload.types-test + (:require + [clojure.test :refer [deftest is testing]] + [clojure.walk :as walk] + [java-time.api :as t] + [metabase.upload.parsing :as upload-parsing] + [metabase.upload.types :as upload-types] + [metabase.util.ordered-hierarchy :as ordered-hierarchy])) + +(def ^:private bool-type ::upload-types/boolean) +(def ^:private int-type ::upload-types/int) +(def ^:private bool-int-type ::upload-types/*boolean-int*) +(def ^:private float-type ::upload-types/float) +(def ^:private float-or-int-type ::upload-types/*float-or-int*) +(def ^:private vchar-type ::upload-types/varchar-255) +(def ^:private date-type ::upload-types/date) +(def ^:private datetime-type ::upload-types/datetime) +(def ^:private offset-dt-type ::upload-types/offset-datetime) +(def ^:private text-type ::upload-types/text) + +(deftest ^:parallel type-detection-and-parse-test + (doseq [[string-value expected-value expected-type separators] + ;; Number-related + [["0.0" 0 float-or-int-type "."] + ["0.0" 0 float-or-int-type ".,"] + ["0,0" 0 float-or-int-type ",."] + ["0,0" 0 float-or-int-type ", "] + ["0.0" 0 float-or-int-type ".’"] + ["-0.0" -0.0 float-or-int-type "."] + ["-0.0" -0.0 float-or-int-type ".,"] + ["-0,0" -0.0 float-or-int-type ",."] + ["-0,0" -0.0 float-or-int-type ", "] + ["-0.0" -0.0 float-or-int-type ".’"] + ["(0.0)" -0.0 float-or-int-type "."] + ["(0.0)" -0.0 float-or-int-type ".,"] + ["(0,0)" -0.0 float-or-int-type ",."] + ["(0,0)" -0.0 float-or-int-type ", "] + ["(0.0)" -0.0 float-or-int-type ".’"] + ["-4300.00€" -4300 float-or-int-type ".,"] + ["£1,000.00" 1000 float-or-int-type] + ["£1,000.00" 1000 float-or-int-type "."] + ["£1,000.00" 1000 float-or-int-type ".,"] + ["£1.000,00" 1000 float-or-int-type ",."] + ["£1 000,00" 1000 float-or-int-type ", "] + ["£1’000.00" 1000 float-or-int-type ".’"] + ["$2" 2 int-type] + ["$ 3" 3 int-type] + ["-43€" -43 int-type] + ["(86)" -86 int-type] + ["($86)" -86 int-type] + ["£1000" 1000 int-type] + ["£1000" 1000 int-type "."] + ["£1000" 1000 int-type ".,"] + ["£1000" 1000 int-type ",."] + ["£1000" 1000 int-type ", "] + ["£1000" 1000 int-type ".’"] + ["-¥9" -9 int-type] + ["₹ -13" -13 int-type] + ["₪13" 13 int-type] + ["₩-13" -13 int-type] + ["₿42" 42 int-type] + ["-99¢" -99 int-type] + ["2" 2 int-type] + ["-86" -86 int-type] + ["9,986,000" 9986000 int-type] + ["9,986,000" 9986000 int-type "."] + ["9,986,000" 9986000 int-type ".,"] + ["9.986.000" 9986000 int-type ",."] + ["9’986’000" 9986000 int-type ".’"] + ["$0" 0 int-type] + ["-1" -1 int-type] + ["0" false bool-int-type] + ["1" true bool-int-type] + ["9.986.000" "9.986.000" vchar-type ".,"] + ["3.14" 3.14 float-type] + ["3.14" 3.14 float-type "."] + ["3.14" 3.14 float-type ".,"] + ["3,14" 3.14 float-type ",."] + ["3,14" 3.14 float-type ", "] + ["(3.14)" -3.14 float-type] + ["3.14" 3.14 float-type ".’"] + [".14" ".14" vchar-type ".,"] ;; TODO: this should be a float type + ["0.14" 0.14 float-type ".,"] + ["-9986.567" -9986.567 float-type ".,"] + ["$2.0" 2 float-or-int-type ".,"] + ["$ 3.50" 3.50 float-type ".,"] + ["-4300.23€" -4300.23 float-type ".,"] + ["£1,000.23" 1000.23 float-type] + ["£1,000.23" 1000.23 float-type "."] + ["£1,000.23" 1000.23 float-type ".,"] + ["£1.000,23" 1000.23 float-type ",."] + ["£1 
000,23" 1000.23 float-type ", "] + ["£1’000.23" 1000.23 float-type ".’"] + ["-¥9.99" -9.99 float-type ".,"] + ["₹ -13.23" -13.23 float-type ".,"] + ["₪13.01" 13.01 float-type ".,"] + ["₩13.33" 13.33 float-type ".,"] + ["₿42.243646" 42.243646 float-type ".,"] + ["-99.99¢" -99.99 float-type ".,"] + ["." "." vchar-type] + ;; String-related + [(apply str (repeat 255 "x")) (apply str (repeat 255 "x")) vchar-type] + [(apply str (repeat 256 "x")) (apply str (repeat 256 "x")) text-type] + ["86 is my favorite number" "86 is my favorite number" vchar-type] + ["My favorite number is 86" "My favorite number is 86" vchar-type] + ;; Date-related + [" 2022-01-01 " #t "2022-01-01" date-type] + [" 2022-02-30 " " 2022-02-30 " vchar-type] + [" -2022-01-01 " #t "-2022-01-01" date-type] + [" Jan 1 2018" #t "2018-01-01" date-type] + [" Jan 02 2018" #t "2018-01-02" date-type] + [" Jan 30 -2018" #t "-2018-01-30" date-type] + [" Jan 1, 2018" #t "2018-01-01" date-type] + [" Jan 02, 2018" #t "2018-01-02" date-type] + [" Feb 30, 2018" " Feb 30, 2018" vchar-type] + [" 1 Jan 2018" #t "2018-01-01" date-type] + [" 02 Jan 2018" #t "2018-01-02" date-type] + [" 1 Jan, 2018" #t "2018-01-01" date-type] + [" 02 Jan, 2018" #t "2018-01-02" date-type] + [" January 1 2018" #t "2018-01-01" date-type] + [" January 02 2018" #t "2018-01-02" date-type] + [" January 1, 2018" #t "2018-01-01" date-type] + [" January 02, 2018" #t "2018-01-02" date-type] + [" 1 January 2018" #t "2018-01-01" date-type] + [" 02 January 2018" #t "2018-01-02" date-type] + [" 1 January, 2018" #t "2018-01-01" date-type] + [" 02 January, 2018" #t "2018-01-02" date-type] + [" Saturday, January 1 2000" #t "2000-01-01" date-type] + [" Sunday, January 02 2000" #t "2000-01-02" date-type] + [" Saturday, January 1, 2000" #t "2000-01-01" date-type] + [" Sunday, January 02, 2000" #t "2000-01-02" date-type] + [" 2022-01-01T01:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01t01:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01 01:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01T01:00:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01t01:00:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01 01:00:00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01T01:00:00.00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01t01:00:00.00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01 01:00:00.00 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01T01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01t01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01 01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] + [" 2022-01-01T01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01t01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01 01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01T01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01t01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01 01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01T01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01t01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01 01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] + [" 2022-01-01T01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type] + [" 2022-01-01t01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type] + [" 
2022-01-01 01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type]]] + (let [settings {:number-separators (or separators ".,")} + type->check (#'upload-types/settings->type->check settings) + value-type (#'upload-types/value->type type->check string-value) + ;; get the type of the column, if we created it based on only that value + col-type (first (upload-types/column-types-from-rows settings nil [[string-value]])) + parser (upload-parsing/upload-type->parser col-type settings)] + (testing (format "\"%s\" is a %s" string-value value-type) + (is (= expected-type + value-type))) + (testing (format "\"%s\" is parsed into %s" string-value expected-value) + (is (= expected-value + (parser string-value))))))) + +(deftest ^:parallel type-coalescing-test + (doseq [[type-a type-b expected] + [[bool-type bool-type bool-type] + [bool-type int-type vchar-type] + [bool-type bool-int-type bool-type] + [bool-type date-type vchar-type] + [bool-type datetime-type vchar-type] + [bool-type vchar-type vchar-type] + [bool-type text-type text-type] + [int-type bool-type vchar-type] + [int-type float-type float-type] + [int-type date-type vchar-type] + [int-type datetime-type vchar-type] + [int-type vchar-type vchar-type] + [int-type text-type text-type] + [int-type bool-int-type int-type] + [bool-int-type bool-int-type bool-int-type] + [float-type vchar-type vchar-type] + [float-type text-type text-type] + [float-type date-type vchar-type] + [float-type datetime-type vchar-type] + [float-type text-type text-type] + [float-type date-type vchar-type] + [float-type datetime-type vchar-type] + [date-type datetime-type datetime-type] + [date-type vchar-type vchar-type] + [date-type text-type text-type] + [datetime-type vchar-type vchar-type] + [offset-dt-type vchar-type vchar-type] + [datetime-type text-type text-type] + [offset-dt-type text-type text-type] + [vchar-type text-type text-type]]] + (is (= expected (ordered-hierarchy/first-common-ancestor upload-types/h type-a type-b)) + (format "%s + %s = %s" (name type-a) (name type-b) (name expected))))) + +(deftest ^:parallel coercion-soundness-test + (testing "Every coercion maps to a stricter type that is a direct descendant" + (is (empty? + ;; Build a set of all type-pairs that violate this constraint + (into (sorted-set) + (comp (mapcat (fn [[column-type value-types]] + (for [value-type value-types] + [column-type value-type]))) + (remove (fn [[column-type value-type]] + ;; Strictly speaking we only require that there is a route which only traverses + ;; through abstract nodes, but it's much simpler to enforce this stronger condition: + ;; that `column-type` is a child of `value-type` + (contains? 
(ordered-hierarchy/children upload-types/h value-type) column-type)))) + @#'upload-types/column-type->coercible-value-types))))) + +(deftest ^:parallel initial-column-type-test + (let [column-type (partial upload-types/concretize nil)] + (testing "Unknown value types are treated as text" + (is (= ::upload-types/text (column-type nil)))) + (testing "Non-abstract value types resolve to themselves" + (let [ts upload-types/column-types] + (is (= (zipmap ts ts) + (zipmap (map column-type ts) ts))))) + (testing "Abstract values are resolved using an explicit map, which is consistent with the hierarchy" + (doseq [[value-type expected-column-type] @#'upload-types/abstract->concrete] + (is (= expected-column-type (column-type value-type))) + ;; Strictly speaking we only require that there is a route from each abstract node to its column type which only + ;; traverses through abstract nodes, but it's much simpler to enforce this stronger condition. + (is (contains? (parents upload-types/h value-type) expected-column-type)))))) + +(deftest ^:parallel append-column-type-test + (doseq [existing-type upload-types/value-types + value-type upload-types/value-types] + (case [existing-type value-type] + [::upload-types/int ::upload-types/*float-or-int*] + (testing "We coerce floats with fractional part to plan integers when appending into an existing integer column" + (is (= ::upload-types/int (upload-types/concretize existing-type value-type)))) + + ;; This is unsatisfying, would be good if this interface also covered promoting columns and rejecting values. + (testing (format "We append %s values to %s columns as if we were inserting them into new columns" existing-type value-type) + (is (= (upload-types/concretize nil value-type) + (upload-types/concretize existing-type value-type))))))) + +(defn- re-namespace + "Replace the namespace in any namespaced keyword with this test namespace, return all other values unchanged." + [x] + (if (and (keyword? x) (namespace x)) + (keyword (name (ns-name *ns*)) (name x)) + x)) + +;; Translate the hierarchy to talk about this namespace, so that we can omit an explicit namespace for brevity. +(def ^:private h (walk/postwalk re-namespace upload-types/h)) + +(deftest ^:parallel parents-test + (testing "Parents are listed according to the order that this tag was derived from each of them" + (is (nil? (parents h ::text))) + (is (= [::text] (vec (parents h ::varchar-255)))) + (is (= [::*float-or-int*] (vec (parents h ::int)))) + (is (= [::boolean ::int] (vec (parents h ::*boolean-int*)))))) + +(deftest ^:parallel children-test + (testing "Children are listed in reverse order to when they were each derived from this tag" + (is (nil? (ordered-hierarchy/children h ::*boolean-int*))) + (is (= [::varchar-255] (vec (ordered-hierarchy/children h ::text)))) + (is (= [::*float-or-int*] (vec (ordered-hierarchy/children h ::float)))) + (is (= [::auto-incrementing-int-pk ::*boolean-int*] (vec (ordered-hierarchy/children h ::int)))))) + +(deftest ^:parallel ancestors-test + (testing "Linear ancestors are listed in order" + (is (nil? (ancestors h ::text))) + (is (= [::text] (vec (ancestors h ::varchar-255)))) + (is (= [::varchar-255 ::text] (vec (ancestors h ::boolean)))) + (is (= [::*float-or-int* ::float ::varchar-255 ::text] (vec (ancestors h ::int))))) + + (testing "Non-linear ancestors are listed in topological order, following edges in the order they were defined." 
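+    ;; ::boolean precedes ::int here because ::*boolean-int* was derived from ::boolean first (the left-most edge).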
+ (is (= [::boolean + ::int + ::*float-or-int* + ::float + ::varchar-255 + ::text] + (vec (ancestors h ::*boolean-int*)))))) + +(deftest ^:parallel descendants-test + (testing "Linear descendants are listed in order" + (is (nil? (descendants h ::*boolean-int*))) + (is (nil? (descendants h ::date))) + (is (= [::date] (vec (descendants h ::datetime)))) + (is (= [::*boolean-int*] (vec (descendants h ::boolean))))) + + (testing "Non-linear descendants are listed in reverse topological order, following edges in reserve order." + (is (= [::*float-or-int* + ::int + ::auto-incrementing-int-pk + ::*boolean-int*] + (vec (descendants h ::float)))) + (is (= [::varchar-255 + ::offset-datetime + ::datetime + ::date + ::float + ::*float-or-int* + ::int + ::auto-incrementing-int-pk + ::boolean + ::*boolean-int*] + (vec (descendants h ::text)))))) + +(deftest ^:parallel sorted-tags-test + (testing "Tags are returned in a topological ordering that also preserves insertion order of the edges." + (is (= [::*boolean-int* + ::boolean + ::auto-incrementing-int-pk + ::int + ::*float-or-int* + ::float + ::date + ::datetime + ::offset-datetime + ::varchar-255 + ::text] + (vec (ordered-hierarchy/sorted-tags h)))))) + +(deftest ^:parallel first-common-ancestor-test + (testing "The first-common-ancestor is the first tag in the lineage of tag-a that is also in the lineage of tag-b" + (is (= ::*boolean-int* (ordered-hierarchy/first-common-ancestor h ::*boolean-int* nil))) + (is (= ::*boolean-int* (ordered-hierarchy/first-common-ancestor h ::*boolean-int* ::*boolean-int*))) + (is (= ::boolean (ordered-hierarchy/first-common-ancestor h ::*boolean-int* ::boolean))) + (is (= ::varchar-255 (ordered-hierarchy/first-common-ancestor h ::boolean ::int))))) diff --git a/test/metabase/upload_test.clj b/test/metabase/upload_test.clj index 2f65fe4e224..b94fd5a8031 100644 --- a/test/metabase/upload_test.clj +++ b/test/metabase/upload_test.clj @@ -7,7 +7,6 @@ [clojure.string :as str] [clojure.test :refer :all] [flatland.ordered.map :as ordered-map] - [java-time.api :as t] [metabase.analytics.snowplow-test :as snowplow-test] [metabase.driver :as driver] [metabase.driver.ddl.interface :as ddl.i] @@ -23,25 +22,23 @@ [metabase.test.data.sql :as sql.tx] [metabase.upload :as upload] [metabase.upload.parsing :as upload-parsing] + [metabase.upload.types :as upload-types] [metabase.util :as u] - [metabase.util.ordered-hierarchy :as ordered-hierarchy] [toucan2.core :as t2]) (:import (java.io File))) (set! 
*warn-on-reflection* true) -(def ^:private bool-type ::upload/boolean) -(def ^:private int-type ::upload/int) -(def ^:private bool-int-type ::upload/*boolean-int*) -(def ^:private float-type ::upload/float) -(def ^:private float-or-int-type ::upload/*float-or-int*) -(def ^:private vchar-type ::upload/varchar-255) -(def ^:private date-type ::upload/date) -(def ^:private datetime-type ::upload/datetime) -(def ^:private offset-dt-type ::upload/offset-datetime) -(def ^:private text-type ::upload/text) -(def ^:private auto-pk-type ::upload/auto-incrementing-int-pk) +(def ^:private bool-type ::upload-types/boolean) +(def ^:private int-type ::upload-types/int) +(def ^:private float-type ::upload-types/float) +(def ^:private vchar-type ::upload-types/varchar-255) +(def ^:private date-type ::upload-types/date) +(def ^:private datetime-type ::upload-types/datetime) +(def ^:private offset-dt-type ::upload-types/offset-datetime) +(def ^:private text-type ::upload-types/text) +(def ^:private auto-pk-type ::upload-types/auto-incrementing-int-pk) (defn- local-infile-on? [] (= "ON" (-> (sql-jdbc.conn/db->pooled-connection-spec (mt/db)) @@ -115,189 +112,6 @@ (#'upload/scan-and-sync-table! database table)) (t2/select-one :model/Table (:id table)))) -(deftest type-detection-and-parse-test - (doseq [[string-value expected-value expected-type seps] - ;; Number-related - [["0.0" 0 float-or-int-type "."] - ["0.0" 0 float-or-int-type ".,"] - ["0,0" 0 float-or-int-type ",."] - ["0,0" 0 float-or-int-type ", "] - ["0.0" 0 float-or-int-type ".’"] - ["-0.0" -0.0 float-or-int-type "."] - ["-0.0" -0.0 float-or-int-type ".,"] - ["-0,0" -0.0 float-or-int-type ",."] - ["-0,0" -0.0 float-or-int-type ", "] - ["-0.0" -0.0 float-or-int-type ".’"] - ["(0.0)" -0.0 float-or-int-type "."] - ["(0.0)" -0.0 float-or-int-type ".,"] - ["(0,0)" -0.0 float-or-int-type ",."] - ["(0,0)" -0.0 float-or-int-type ", "] - ["(0.0)" -0.0 float-or-int-type ".’"] - ["-4300.00€" -4300 float-or-int-type ".,"] - ["£1,000.00" 1000 float-or-int-type] - ["£1,000.00" 1000 float-or-int-type "."] - ["£1,000.00" 1000 float-or-int-type ".,"] - ["£1.000,00" 1000 float-or-int-type ",."] - ["£1 000,00" 1000 float-or-int-type ", "] - ["£1’000.00" 1000 float-or-int-type ".’"] - ["$2" 2 int-type] - ["$ 3" 3 int-type] - ["-43€" -43 int-type] - ["(86)" -86 int-type] - ["($86)" -86 int-type] - ["£1000" 1000 int-type] - ["£1000" 1000 int-type "."] - ["£1000" 1000 int-type ".,"] - ["£1000" 1000 int-type ",."] - ["£1000" 1000 int-type ", "] - ["£1000" 1000 int-type ".’"] - ["-¥9" -9 int-type] - ["₹ -13" -13 int-type] - ["₪13" 13 int-type] - ["₩-13" -13 int-type] - ["₿42" 42 int-type] - ["-99¢" -99 int-type] - ["2" 2 int-type] - ["-86" -86 int-type] - ["9,986,000" 9986000 int-type] - ["9,986,000" 9986000 int-type "."] - ["9,986,000" 9986000 int-type ".,"] - ["9.986.000" 9986000 int-type ",."] - ["9’986’000" 9986000 int-type ".’"] - ["$0" 0 int-type] - ["-1" -1 int-type] - ["0" false bool-int-type] - ["1" true bool-int-type] - ["9.986.000" "9.986.000" vchar-type ".,"] - ["3.14" 3.14 float-type] - ["3.14" 3.14 float-type "."] - ["3.14" 3.14 float-type ".,"] - ["3,14" 3.14 float-type ",."] - ["3,14" 3.14 float-type ", "] - ["(3.14)" -3.14 float-type] - ["3.14" 3.14 float-type ".’"] - [".14" ".14" vchar-type ".,"] ;; TODO: this should be a float type - ["0.14" 0.14 float-type ".,"] - ["-9986.567" -9986.567 float-type ".,"] - ["$2.0" 2 float-or-int-type ".,"] - ["$ 3.50" 3.50 float-type ".,"] - ["-4300.23€" -4300.23 float-type ".,"] - ["£1,000.23" 1000.23 float-type] - 
["£1,000.23" 1000.23 float-type "."] - ["£1,000.23" 1000.23 float-type ".,"] - ["£1.000,23" 1000.23 float-type ",."] - ["£1 000,23" 1000.23 float-type ", "] - ["£1’000.23" 1000.23 float-type ".’"] - ["-¥9.99" -9.99 float-type ".,"] - ["₹ -13.23" -13.23 float-type ".,"] - ["₪13.01" 13.01 float-type ".,"] - ["₩13.33" 13.33 float-type ".,"] - ["₿42.243646" 42.243646 float-type ".,"] - ["-99.99¢" -99.99 float-type ".,"] - ["." "." vchar-type] - ;; String-related - [(apply str (repeat 255 "x")) (apply str (repeat 255 "x")) vchar-type] - [(apply str (repeat 256 "x")) (apply str (repeat 256 "x")) text-type] - ["86 is my favorite number" "86 is my favorite number" vchar-type] - ["My favorite number is 86" "My favorite number is 86" vchar-type] - ;; Date-related - [" 2022-01-01 " #t "2022-01-01" date-type] - [" 2022-02-30 " " 2022-02-30 " vchar-type] - [" -2022-01-01 " #t "-2022-01-01" date-type] - [" Jan 1 2018" #t "2018-01-01" date-type] - [" Jan 02 2018" #t "2018-01-02" date-type] - [" Jan 30 -2018" #t "-2018-01-30" date-type] - [" Jan 1, 2018" #t "2018-01-01" date-type] - [" Jan 02, 2018" #t "2018-01-02" date-type] - [" Feb 30, 2018" " Feb 30, 2018" vchar-type] - [" 1 Jan 2018" #t "2018-01-01" date-type] - [" 02 Jan 2018" #t "2018-01-02" date-type] - [" 1 Jan, 2018" #t "2018-01-01" date-type] - [" 02 Jan, 2018" #t "2018-01-02" date-type] - [" January 1 2018" #t "2018-01-01" date-type] - [" January 02 2018" #t "2018-01-02" date-type] - [" January 1, 2018" #t "2018-01-01" date-type] - [" January 02, 2018" #t "2018-01-02" date-type] - [" 1 January 2018" #t "2018-01-01" date-type] - [" 02 January 2018" #t "2018-01-02" date-type] - [" 1 January, 2018" #t "2018-01-01" date-type] - [" 02 January, 2018" #t "2018-01-02" date-type] - [" Saturday, January 1 2000" #t "2000-01-01" date-type] - [" Sunday, January 02 2000" #t "2000-01-02" date-type] - [" Saturday, January 1, 2000" #t "2000-01-01" date-type] - [" Sunday, January 02, 2000" #t "2000-01-02" date-type] - [" 2022-01-01T01:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01t01:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01 01:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01T01:00:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01t01:00:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01 01:00:00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01T01:00:00.00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01t01:00:00.00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01 01:00:00.00 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01T01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01t01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01 01:00:00.000000000 " #t "2022-01-01T01:00" datetime-type] - [" 2022-01-01T01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01t01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01 01:00:00.00-07 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01T01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01t01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01 01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01T01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01t01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01 01:00:00.00-07:00 " #t "2022-01-01T01:00-07:00" offset-dt-type] - [" 2022-01-01T01:00:00.00Z " (t/offset-date-time 
"2022-01-01T01:00+00:00") offset-dt-type] - [" 2022-01-01t01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type] - [" 2022-01-01 01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type]]] - (let [settings {:number-separators (or seps ".,")} - type->check (#'upload/settings->type->check settings) - value-type (#'upload/value->type type->check string-value) - ;; get the type of the column, if we created it based on only that value - col-type (first (upload/column-types-from-rows settings nil [[string-value]])) - parser (upload-parsing/upload-type->parser col-type settings)] - (testing (format "\"%s\" is a %s" string-value value-type) - (is (= expected-type - value-type))) - (testing (format "\"%s\" is parsed into %s" string-value expected-value) - (is (= expected-value - (parser string-value))))))) - -(deftest ^:parallel type-coalescing-test - (doseq [[type-a type-b expected] - [[bool-type bool-type bool-type] - [bool-type int-type vchar-type] - [bool-type bool-int-type bool-type] - [bool-type date-type vchar-type] - [bool-type datetime-type vchar-type] - [bool-type vchar-type vchar-type] - [bool-type text-type text-type] - [int-type bool-type vchar-type] - [int-type float-type float-type] - [int-type date-type vchar-type] - [int-type datetime-type vchar-type] - [int-type vchar-type vchar-type] - [int-type text-type text-type] - [int-type bool-int-type int-type] - [bool-int-type bool-int-type bool-int-type] - [float-type vchar-type vchar-type] - [float-type text-type text-type] - [float-type date-type vchar-type] - [float-type datetime-type vchar-type] - [float-type text-type text-type] - [float-type date-type vchar-type] - [float-type datetime-type vchar-type] - [date-type datetime-type datetime-type] - [date-type vchar-type vchar-type] - [date-type text-type text-type] - [datetime-type vchar-type vchar-type] - [offset-dt-type vchar-type vchar-type] - [datetime-type text-type text-type] - [offset-dt-type text-type text-type] - [vchar-type text-type text-type]]] - (is (= expected (#'ordered-hierarchy/first-common-ancestor @#'upload/h type-a type-b)) - (format "%s + %s = %s" (name type-a) (name type-b) (name expected))))) - (defn csv-file-with "Create a temp csv file with the given content and return the file" (^File [rows] @@ -1279,8 +1093,8 @@ :or {table-name (mt/random-name) schema-name (sql.tx/session-schema driver/*driver*) col->upload-type (ordered-map/ordered-map - upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk - :name ::upload/varchar-255) + upload/auto-pk-column-keyword auto-pk-type + :name vchar-type) rows [["Obi-Wan Kenobi"]]}}] (let [driver driver/*driver* db-id (mt/id) @@ -1384,9 +1198,9 @@ ["name,id" "Luke Skywalker,20" "Darth Vader,30"]]] ;; different order (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - :_mb_row_id ::upload/auto-incrementing-int-pk - :id ::upload/int - :name ::upload/varchar-255) + :_mb_row_id auto-pk-type + :id int-type + :name vchar-type) :rows [[10 "Obi-Wan Kenobi"]]})] (let [file (csv-file-with csv-rows (mt/random-name))] (is (some? (append-csv! {:file file @@ -1435,8 +1249,8 @@ (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - :id ::upload/int - :name ::upload/varchar-255) + :id int-type + :name vchar-type) :rows [[1,"some_text"]]})] (let [file (csv-file-with csv-rows (mt/random-name))] @@ -1465,14 +1279,14 @@ (with-upload-table! [table (create-upload-table! 
{:col->upload-type (ordered-map/ordered-map - :_mb_row_id ::upload/auto-incrementing-int-pk - :biginteger ::upload/int - :float ::upload/float - :text ::upload/varchar-255 - :boolean ::upload/boolean - :date ::upload/date - :datetime ::upload/datetime - :offset_datetime ::upload/offset-datetime) + :_mb_row_id auto-pk-type + :biginteger int-type + :float float-type + :text vchar-type + :boolean bool-type + :date date-type + :datetime datetime-type + :offset_datetime offset-dt-type) :rows [[1000000,1.0,"some_text",false,#t "2020-01-01",#t "2020-01-01T00:00:00",#t "2020-01-01T00:00:00"]]})] (let [csv-rows ["biginteger,float,text,boolean,date,datetime,offset_datetime" "2000000,2.0,some_text,true,2020-02-02,2020-02-02T02:02:02,2020-02-02T02:02:02+02:00"] @@ -1506,7 +1320,7 @@ (testing "If the table doesn't have _mb_row_id but the CSV does, ignore the CSV _mb_row_id but create the column anyway" (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - :name ::upload/varchar-255) + :name vchar-type) :rows [["Obi-Wan Kenobi"]]})] (let [csv-rows ["_MB-row ID,name" "1000,Luke Skywalker"] file (csv-file-with csv-rows (mt/random-name))] @@ -1546,7 +1360,7 @@ (testing "If the table doesn't have _mb_row_id and a failure occurs, we shouldn't create a _mb_row_id column" (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - :bool_column ::upload/boolean) + :bool_column bool-type) :rows [[true]]})] (let [csv-rows ["bool_column" "not a bool"] file (csv-file-with csv-rows (mt/random-name)) @@ -1691,9 +1505,9 @@ (testing "Append should handle the columns in the CSV file being reordered" (with-upload-table! [table (create-upload-table! :col->upload-type (ordered-map/ordered-map - upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk - :name ::upload/varchar-255 - :shame ::upload/varchar-255) + upload/auto-pk-column-keyword auto-pk-type + :name vchar-type + :shame vchar-type) :rows [["Obi-Wan Kenobi" "No one really knows me"]])] (let [csv-rows ["shame,name" "Nothing - you can't prove it,Puke Nightstalker"] @@ -1731,27 +1545,27 @@ (doseq [auto-pk-column? [true false]] (testing (str "\nFor a table that has " (if auto-pk-column? "an" " no") " automatically generated PK already") (doseq [{:keys [upload-type valid invalid msg]} - [{:upload-type ::upload/int + [{:upload-type int-type :valid 1 :invalid "not an int" :msg "'not an int' is not a recognizable number"} - {:upload-type ::upload/float + {:upload-type float-type :valid 1.1 :invalid "not a float" :msg "'not a float' is not a recognizable number"} - {:upload-type ::upload/boolean + {:upload-type bool-type :valid true :invalid "correct" :msg "'correct' is not a recognizable boolean"} - {:upload-type ::upload/date + {:upload-type date-type :valid #t "2000-01-01" :invalid "2023-01-01T00:00:00" :msg "'2023-01-01T00:00:00' is not a recognizable date"} - {:upload-type ::upload/datetime + {:upload-type datetime-type :valid #t "2000-01-01T00:00:00" :invalid "2023-01-01T00:00:00+01" :msg "'2023-01-01T00:00:00+01' is not a recognizable datetime"} - {:upload-type ::upload/offset-datetime + {:upload-type offset-dt-type :valid #t "2000-01-01T00:00:00+01" :invalid "2023-01-01T00:00:00[Europe/Helsinki]" :msg "'2023-01-01T00:00:00[Europe/Helsinki]' is not a recognizable zoned datetime"}]] @@ -1760,9 +1574,9 @@ [table (create-upload-table! {:col->upload-type (cond-> (ordered-map/ordered-map :test_column upload-type - :name ::upload/varchar-255) + :name vchar-type) auto-pk-column? 
- (assoc upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk)) + (assoc upload/auto-pk-column-keyword auto-pk-type)) :rows [[valid "Obi-Wan Kenobi"]]})] (let [;; The CSV contains 50 valid rows and 1 invalid row csv-rows `["test_column,name" ~@(repeat 50 (str valid ",Darth Vadar")) ~(str invalid ",Luke Skywalker")] @@ -1785,7 +1599,9 @@ (mt/test-drivers (filter (fn [driver] ;; use of varchar(255) is not universal for all drivers, so only test drivers that ;; have different database types for varchar(255) and text - (apply not= (map (partial driver/upload-type->database-type driver) [::upload/varchar-255 ::upload/text]))) + (apply not= (->> [vchar-type text-type] + (map #(keyword "metabase.upload" (name %))) + (map (partial driver/upload-type->database-type driver))))) (mt/normal-drivers-with-feature :uploads)) (with-mysql-local-infile-off (testing "Append fails if the CSV file contains string values that are too long for the column" @@ -1794,8 +1610,8 @@ (binding [driver/*insert-chunk-rows* 1] (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk - :test_column ::upload/varchar-255) + upload/auto-pk-column-keyword auto-pk-type + :test_column vchar-type) :rows [["valid"]]})] (let [csv-rows `["test_column" ~@(repeat 50 "valid too") ~(apply str (repeat 256 "x"))] file (csv-file-with csv-rows (mt/random-name))] @@ -1816,14 +1632,14 @@ ;; for drivers that insert rows in chunks, we change the chunk size to 1 so that we can test that the ;; inserted rows are rolled back (binding [driver/*insert-chunk-rows* 1] - (let [upload-type ::upload/varchar-255, + (let [upload-type vchar-type, uncoerced (apply str (repeat 256 "x")) coerced (apply str (repeat 255 "x"))] (testing (format "\nUploading %s into a column of type %s should be coerced to %s" uncoerced (name upload-type) coerced) (with-upload-table! [table (create-upload-table! 
{:col->upload-type (ordered-map/ordered-map - upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk + upload/auto-pk-column-keyword auto-pk-type :test_column upload-type) :rows []})] (let [csv-rows ["test_column" uncoerced] @@ -1845,16 +1661,16 @@ ;; inserted rows are rolled back (binding [driver/*insert-chunk-rows* 1] (doseq [{:keys [upload-type uncoerced coerced fail-msg] :as args} - [{:upload-type ::upload/int, :uncoerced "2.0", :coerced 2} ;; value is coerced to int - {:upload-type ::upload/int, :uncoerced "2.1", :coerced 2.1} ;; column is promoted to float - {:upload-type ::upload/float, :uncoerced "2", :coerced 2.0} - {:upload-type ::upload/boolean, :uncoerced "0", :coerced false} - {:upload-type ::upload/boolean, :uncoerced "1.0", :fail-msg "'1.0' is not a recognizable boolean"} - {:upload-type ::upload/boolean, :uncoerced "0.0", :fail-msg "'0.0' is not a recognizable boolean"} - {:upload-type ::upload/int, :uncoerced "01/01/2012", :fail-msg "'01/01/2012' is not a recognizable number"}]] + [{:upload-type int-type, :uncoerced "2.0", :coerced 2} ;; value is coerced to int + {:upload-type int-type, :uncoerced "2.1", :coerced 2.1} ;; column is promoted to float + {:upload-type float-type, :uncoerced "2", :coerced 2.0} + {:upload-type bool-type, :uncoerced "0", :coerced false} + {:upload-type bool-type, :uncoerced "1.0", :fail-msg "'1.0' is not a recognizable boolean"} + {:upload-type bool-type, :uncoerced "0.0", :fail-msg "'0.0' is not a recognizable boolean"} + {:upload-type int-type, :uncoerced "01/01/2012", :fail-msg "'01/01/2012' is not a recognizable number"}]] (with-upload-table! [table (create-upload-table! {:col->upload-type (ordered-map/ordered-map - upload/auto-pk-column-keyword ::upload/auto-incrementing-int-pk + upload/auto-pk-column-keyword auto-pk-type :test_column upload-type) :rows []})] (let [csv-rows ["test_column" uncoerced] @@ -1878,36 +1694,6 @@ (append!))))) (io/delete-file file))))))))) -(def ^:private concretize @#'upload/concretize) - -(deftest initial-column-type-test - (let [column-type (partial concretize nil)] - (testing "Unknown value types are treated as text" - (is (= ::upload/text (column-type nil)))) - (testing "Non-abstract value types resolve to themselves" - (let [ts @#'upload/column-types] - (is (= (zipmap ts ts) - (zipmap (map column-type ts) ts))))) - (testing "Abstract values are resolved using an explicit map, which is consistent with the hierarchy" - (doseq [[value-type expected-column-type] @#'upload/abstract->concrete] - (is (= expected-column-type (column-type value-type))) - ;; Strictly speaking we only require that there is a route from each abstract node to its column type which only - ;; traverses through abstract nodes, but it's much simpler to enforce this stronger condition. - (is (contains? (parents @#'upload/h value-type) expected-column-type)))))) - -(deftest append-column-type-test - (doseq [existing-type @#'upload/value-types - value-type @#'upload/value-types] - (case [existing-type value-type] - [::upload/int ::upload/*float-or-int*] - (testing "We coerce floats with fractional part to plan integers when appending into an existing integer column" - (is (= ::upload/int (concretize existing-type value-type)))) - - ;; This is unsatisfying, would be good if this interface also covered promoting columns and rejecting values. 
- (testing (format "We append %s values to %s columns as if we were inserting them into new columns" existing-type value-type) - (is (= (concretize nil value-type) - (concretize existing-type value-type))))))) - (deftest create-from-csv-int-and-float-test (testing "Creation should handle a mix of int and float-or-int values in any order" (mt/test-drivers (mt/normal-drivers-with-feature :uploads) @@ -1951,9 +1737,9 @@ (testing "Append should handle a mix of int and float-or-int values being appended to an int column" (with-upload-table! [table (create-upload-table! :col->upload-type (ordered-map/ordered-map - :_mb_row_id ::upload/auto-incrementing-int-pk - :number_1 ::upload/int - :number_2 ::upload/int) + :_mb_row_id auto-pk-type + :number_1 int-type + :number_2 int-type) :rows [[1, 1]])] (let [csv-rows ["number-1, number-2" @@ -1973,9 +1759,9 @@ (testing "Append should handle a mix of int and float-or-int values being appended to an int column" (with-upload-table! [table (create-upload-table! :col->upload-type (ordered-map/ordered-map - :_mb_row_id ::upload/auto-incrementing-int-pk - :number_1 ::upload/int - :number_2 ::upload/int) + :_mb_row_id auto-pk-type + :number_1 int-type + :number_2 int-type) :rows [[1, 1]])] (let [csv-rows ["number-1, number-2" @@ -1991,18 +1777,3 @@ table-rows))) (io/delete-file file)))))) - -(deftest coercion-soundness-test - (testing "Every coercion maps to a stricter type that is a direct descendant" - (is (empty? - ;; Build a set of all type-pairs that violate this constraint - (into (sorted-set) - (comp (mapcat (fn [[column-type value-types]] - (for [value-type value-types] - [column-type value-type]))) - (remove (fn [[column-type value-type]] - ;; Strictly speaking we only require that there is a route which only traverses - ;; through abstract nodes, but it's much simpler to enforce this stronger condition: - ;; that `column-type` is a child of `value-type` - (contains? (ordered-hierarchy/children @#'upload/h value-type) column-type)))) - @#'upload/column-type->coercible-value-types))))) diff --git a/test/metabase/util/ordered_hierarchy_test.clj b/test/metabase/util/ordered_hierarchy_test.clj index 10e57672fdc..9fc79295b22 100644 --- a/test/metabase/util/ordered_hierarchy_test.clj +++ b/test/metabase/util/ordered_hierarchy_test.clj @@ -1,93 +1,59 @@ (ns metabase.util.ordered-hierarchy-test (:require [clojure.test :refer [deftest is testing]] - [clojure.walk :as walk] - [metabase.upload :as upload] [metabase.util.ordered-hierarchy :as ordered-hierarchy])) -;;; It would be nice to have property tests, to expose any subtle edge cases. -;;; For now, we use a translation of some real world usage in the app. -(def ^:private h - (walk/postwalk - (fn [x] - (if (and (keyword? x) (namespace x)) - (keyword (name (ns-name *ns*)) (name x)) - x)) - @#'upload/h)) - -(deftest ^:parallel parents-test - (testing "Parents are listed according to the order that this tag was derived from each of them" - (is (nil? (parents h ::text))) - (is (= [::text] (vec (parents h ::varchar-255)))) - (is (= [::*float-or-int*] (vec (parents h ::int)))) - (is (= [::boolean ::int] (vec (parents h ::*boolean-int*)))))) - -(deftest ^:parallel children-test - (testing "Children are listed in reverse order to when they were each derived from this tag" - (is (nil? 
(ordered-hierarchy/children h ::*boolean-int*)))
-    (is (= [::varchar-255] (vec (ordered-hierarchy/children h ::text))))
-    (is (= [::*float-or-int*] (vec (ordered-hierarchy/children h ::float))))
-    (is (= [::auto-incrementing-int-pk ::*boolean-int*] (vec (ordered-hierarchy/children h ::int))))))
-
-(deftest ^:parallel ancestors-test
-  (testing "Linear ancestors are listed in order"
-    (is (nil? (ancestors h ::text)))
-    (is (= [::text] (vec (ancestors h ::varchar-255))))
-    (is (= [::varchar-255 ::text] (vec (ancestors h ::boolean))))
-    (is (= [::*float-or-int* ::float ::varchar-255 ::text] (vec (ancestors h ::int)))))
-
-  (testing "Non-linear ancestors are listed in topological order, following edges in the order they were defined."
-    (is (= [::boolean
-            ::int
-            ::*float-or-int*
-            ::float
-            ::varchar-255
-            ::text]
-           (vec (ancestors h ::*boolean-int*))))))
-
-(deftest ^:parallel descendants-test
-  (testing "Linear descendants are listed in order"
-    (is (nil? (descendants h ::*boolean-int*)))
-    (is (nil? (descendants h ::date)))
-    (is (= [::date] (vec (descendants h ::datetime))))
-    (is (= [::*boolean-int*] (vec (descendants h ::boolean)))))
-
-  (testing "Non-linear descendants are listed in reverse topological order, following edges in reserve order."
-    (is (= [::*float-or-int*
-            ::int
-            ::auto-incrementing-int-pk
-            ::*boolean-int*]
-           (vec (descendants h ::float))))
-    (is (= [::varchar-255
-            ::offset-datetime
-            ::datetime
-            ::date
-            ::float
-            ::*float-or-int*
-            ::int
-            ::auto-incrementing-int-pk
-            ::boolean
-            ::*boolean-int*]
-           (vec (descendants h ::text))))))
-
-(deftest ^:parallel sorted-tags-test
-  (testing "Tags are returned in a topological ordering that also preserves insertion order of the edges."
-    (is (= [::*boolean-int*
-            ::boolean
-            ::auto-incrementing-int-pk
-            ::int
-            ::*float-or-int*
-            ::float
-            ::date
-            ::datetime
-            ::offset-datetime
-            ::varchar-255
-            ::text]
-           (vec (ordered-hierarchy/sorted-tags h))))))
-
-(deftest ^:parallel first-common-ancestor-test
-  (testing "The first-common-ancestor is the first tag in the lineage of tag-a that is also in the lineage of tag-b"
-    (is (= ::*boolean-int* (ordered-hierarchy/first-common-ancestor h ::*boolean-int* nil)))
-    (is (= ::*boolean-int* (ordered-hierarchy/first-common-ancestor h ::*boolean-int* ::*boolean-int*)))
-    (is (= ::boolean (ordered-hierarchy/first-common-ancestor h ::*boolean-int* ::boolean)))
-    (is (= ::varchar-255 (ordered-hierarchy/first-common-ancestor h ::boolean ::int)))))
+(def ^:private polygons
+  (ordered-hierarchy/make-hierarchy
+   [:quadrilateral
+    [:trapezoid :isosceles-trapezoid :right-trapezoid]
+    [:kite [:rhombus :square]]
+    [:parallelogram
+     :rhombus
+     [:rectangle :square]]]
+   [:triangle
+    :scalene-triangle
+    [:isosceles-triangle :equilateral-triangle]
+    [:acute-triangle :equilateral-triangle]
+    :right-angled-triangle
+    :obtuse-triangle]))
+
+(deftest make-hierarchy-test
+  (testing "Hiccup structures have the expected topological order"
+    (is (= [:isosceles-trapezoid
+            :right-trapezoid
+            :trapezoid
+            :square
+            :rhombus
+            :kite
+            :rectangle
+            :parallelogram
+            :quadrilateral
+            ;; it's unfortunate that we would exhaustively test all the quadrilateral types before checking
+            ;; whether something is a triangle, if (hypothetically) we were using this topological order to test a value.
+            ;; This is a case where a root-to-leaf traversal would make more sense.
+ :scalene-triangle + :equilateral-triangle + :isosceles-triangle + :acute-triangle + :right-angled-triangle + :obtuse-triangle + :triangle] + (vec (ordered-hierarchy/sorted-tags polygons))))) + + (testing "Hiccup structures are translated into the expected graph structure" + (is (= {:trapezoid [:quadrilateral] + :isosceles-trapezoid [:trapezoid] + :right-trapezoid [:trapezoid] + :kite [:quadrilateral] + :rhombus [:kite :parallelogram] + :square [:rhombus :rectangle] + :parallelogram [:quadrilateral] + :rectangle [:parallelogram] + :scalene-triangle [:triangle] + :isosceles-triangle [:triangle] + :equilateral-triangle [:isosceles-triangle :acute-triangle] + :acute-triangle [:triangle] + :right-angled-triangle [:triangle] + :obtuse-triangle [:triangle]} + (update-vals (:parents polygons) vec))))) -- GitLab
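Reviewer note, not part of the patch: the snippet below is a rough REPL sketch of how the new polygons fixture behaves under ordered-hierarchy/first-common-ancestor. The result values are inferred from the documented behaviour ("the first tag in the lineage of tag-a that is also in the lineage of tag-b") rather than asserted anywhere in this diff, so treat them as illustrative expectations.

;; Rough sketch against the `polygons` fixture defined in the test above.
;; Values are what one would expect from first-common-ancestor's documented
;; semantics; they are not asserted by this patch.
(require '[metabase.util.ordered-hierarchy :as ordered-hierarchy])

;; :square is derived from both :rhombus and :rectangle, so against a plain
;; :rectangle the first shared tag in its lineage is :rectangle itself.
(ordered-hierarchy/first-common-ancestor polygons :square :rectangle)
;; => :rectangle (expected)

;; :rhombus and :rectangle first meet at :parallelogram.
(ordered-hierarchy/first-common-ancestor polygons :rhombus :rectangle)
;; => :parallelogram (expected)

;; Tags from separate sub-trees only meet at their shared root.
(ordered-hierarchy/first-common-ancestor polygons :square :isosceles-trapezoid)
;; => :quadrilateral (expected)

This is the same relaxation step exercised by type-coalescing-test above, which folds pairs of upload value types through first-common-ancestor over the upload hierarchy.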