Skip to content
Snippets Groups Projects
Unverified commit d25c356a, authored by Cal Herries and committed by GitHub
Browse files

Tidy CSV uploads (#36082)

parent f3f5a93f
No related branches found
No related tags found
No related merge requests found
......@@ -895,7 +895,7 @@
:hierarchy #'hierarchy)
(defmulti insert-into!
"Insert `values` into a table named `table-name`. `values` is a sequence of rows, where each row's order matches
"Insert `values` into a table named `table-name`. `values` is a lazy sequence of rows, where each row's order matches
`column-names`."
{:added "0.47.0", :arglists '([driver db-id table-name column-names values])}
dispatch-on-initialized-driver
......
......@@ -3,7 +3,6 @@
[clj-bom.core :as bom]
[clojure.data.csv :as csv]
[clojure.java.io :as io]
[clojure.set :as set]
[clojure.string :as str]
[flatland.ordered.map :as ordered-map]
[flatland.ordered.set :as ordered-set]
......@@ -21,7 +20,6 @@
(set! *warn-on-reflection* true)
;;;; <pre><code>
;;;;
;;;; +------------------+
......@@ -52,7 +50,8 @@
(def ^:private type->parent
;; listed in depth-first order
{::varchar-255 ::text
{::text nil
::varchar-255 ::text
::float ::varchar-255
::int ::float
::int-pk ::int
......@@ -68,8 +67,7 @@
::int ::int-pk})
(def ^:private types
(set/union (set (keys type->parent))
(set (vals type->parent))))
(keys type->parent))
(def ^:private pk-base-types
(set (keys base-type->pk-type)))
......@@ -78,27 +76,12 @@
(into {} (for [type types]
[type (loop [ret (ordered-set/ordered-set)
type type]
(if-let [parent (type->parent type)]
(if-some [parent (type->parent type)]
(recur (conj ret parent) parent)
ret))])))
(defn- date-string? [s]
(try (t/local-date s)
true
(catch Exception _
false)))
(defn- datetime-string? [s]
(try (upload-parsing/parse-datetime s)
true
(catch Exception _
false)))
(defn- offset-datetime-string? [s]
(try (upload-parsing/parse-offset-datetime s)
true
(catch Exception _
false)))
;;;;;;;;;;;;;;;;;;;;;;;;;;
;; [[value->type]] helpers
(defn- with-parens
"Returns a regex that matches the argument, with or without surrounding parentheses."
......@@ -132,7 +115,31 @@
", " #"\d[\d \u00A0]*\,[\d.]+"
".’" #"\d[\d’]*\.[\d.]+"))))
(defn value->type
(defmacro does-not-throw?
  "Returns true if evaluating `body` completes without throwing an exception, false otherwise.

  `body` is a single form; its value is discarded. Only `Exception` and its subclasses count as a
  failed check. JVM `Error`s (e.g. `OutOfMemoryError`, `StackOverflowError`) are deliberately NOT
  caught, so a fatal condition propagates instead of being silently reported as `false`."
  [body]
  `(try
     ~body
     true
     ;; Exception rather than Throwable: a parse failure is an Exception, a dying JVM is not.
     (catch Exception _e#
       false)))
(defn- date-string?
  "True when `t/local-date` accepts `s` without throwing."
  [s]
  (does-not-throw?
   (t/local-date s)))
(defn- datetime-string?
  "True when `upload-parsing/parse-datetime` accepts `s` without throwing."
  [s]
  (does-not-throw?
   (upload-parsing/parse-datetime s)))
(defn- offset-datetime-string?
  "True when `upload-parsing/parse-offset-datetime` accepts `s` without throwing."
  [s]
  (does-not-throw?
   (upload-parsing/parse-offset-datetime s)))
(defn- boolean-string?
  "True when the whole of `s`, ignoring case, is one of the accepted boolean spellings:
  true/t/yes/y/1 or false/f/no/n/0. No trimming is done here."
  [s]
  (some? (re-matches #"(?i)true|t|yes|y|1|false|f|no|n|0" s)))
;; end [[value->type]] helpers
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn- value->type
"The most-specific possible type for a given value. Possibilities are:
- `::boolean`
......@@ -148,23 +155,22 @@
1. ints/floats are assumed to use the separators and decimal points corresponding to the locale defined in the
application settings
2. 0 and 1 are assumed to be booleans, not ints."
[value]
(let [number-separators (upload-parsing/get-number-separators)
trimmed-val (str/trim value)]
[value {:keys [number-separators] :as _settings}]
(let [trimmed (str/trim value)]
(cond
(str/blank? value) nil
(re-matches #"(?i)true|t|yes|y|1|false|f|no|n|0" trimmed-val) ::boolean
(offset-datetime-string? trimmed-val) ::offset-datetime
(datetime-string? trimmed-val) ::datetime
(date-string? trimmed-val) ::date
(re-matches (int-regex number-separators) trimmed-val) ::int
(re-matches (float-regex number-separators) trimmed-val) ::float
(re-matches #".{1,255}" value) ::varchar-255
:else ::text)))
(str/blank? value) nil
(boolean-string? trimmed) ::boolean
(offset-datetime-string? trimmed) ::offset-datetime
(datetime-string? trimmed) ::datetime
(date-string? trimmed) ::date
(re-matches (int-regex number-separators) trimmed) ::int
(re-matches (float-regex number-separators) trimmed) ::float
(<= (count trimmed) 255) ::varchar-255
:else ::text)))
(defn- row->types
[row]
(map value->type row))
[row settings]
(map #(value->type % settings) row))
(defn- lowest-common-member [[x & xs :as all-xs] ys]
(cond
......@@ -181,15 +187,21 @@
(contains? (type->ancestors type-b) type-a) type-a
:else (lowest-common-member (type->ancestors type-a) (type->ancestors type-b))))
(defn- coalesce-types
[types-so-far new-types]
(->> (map vector types-so-far new-types)
(mapv (partial apply lowest-common-ancestor))))
(defn- map-with-nils
  "Two-collection variant of `map` that keeps applying `f` until BOTH `c1` and `c2` are exhausted.
  Once the shorter collection runs out, `f` receives nil in its place. Returns a lazy seq."
  [f c1 c2]
  (lazy-seq
   (let [xs (seq c1)
         ys (seq c2)]
     (if-not (or xs ys)
       ;; both inputs are exhausted: end of the sequence
       nil
       (let [head (f (first xs) (first ys))]
         ;; `rest` (not `next`) so the following element is not realized early
         (cons head (map-with-nils f (rest xs) (rest ys))))))))
(defn- pad
"Lengthen `values` until it is of length `n` by filling it with nils."
[n values]
(first (partition n n (repeat nil) values)))
(defn- coalesce-types
"Compares `types-a` and `types-b` pairwise, replacing each pair with its `lowest-common-ancestor`.
`types-a` and `types-b` can be different lengths: `map-with-nils` keeps going until both are
exhausted and passes nil for positions missing from the shorter one. Returns a lazy seq."
[types-a types-b]
(map-with-nils lowest-common-ancestor types-a types-b))
(defn- normalize-column-name
[raw-name]
......@@ -214,35 +226,33 @@
:generated-columns (ordered-map/ordered-map :id ::auto-incrementing-int-pk)}))
(defn- rows->schema
"rows should be a lazy-seq"
[header rows]
(let [normalized-header (->> header
(map normalize-column-name)
(mbql.u/uniquify-names)
(map keyword))
column-count (count normalized-header)]
column-count (count normalized-header)
settings (upload-parsing/get-settings)]
(->> rows
(map row->types)
(map (partial pad column-count))
(map #(row->types % settings))
(reduce coalesce-types (repeat column-count nil))
(map #(or % ::text))
(map vector normalized-header)
(->ordered-maps-with-pk-column))))
;;;; +------------------+
;;;; | Parsing values |
;;;; +------------------+
(defn- parsed-rows
"Returns a lazy seq of parsed rows from the `reader`.
Replaces empty strings with nil."
[col->upload-type reader]
(let [[header & rows] (csv/read-csv reader)
column-count (count header)
parsers (map upload-parsing/upload-type->parser (vals col->upload-type))]
(defn- parse-rows
"Returns a lazy seq of parsed rows from the `reader`. Replaces empty strings with nil."
[col->upload-type rows]
(let [settings (upload-parsing/get-settings)
parsers (map #(upload-parsing/upload-type->parser % settings) (vals col->upload-type))]
(for [row rows]
(for [[value parser] (map vector (pad column-count row) parsers)]
(for [[value parser] (map-with-nils vector row parsers)]
(when (not (str/blank? value))
(parser value))))))
......@@ -298,13 +308,15 @@
[driver db-id table-name ^File csv-file]
(let [{col-to-insert->upload-type :extant-columns
gen-col->upload-type :generated-columns} (detect-schema csv-file)
col-to-create->col-spec (upload-type->col-specs driver
(merge gen-col->upload-type col-to-insert->upload-type))
csv-col-names (keys col-to-insert->upload-type)]
cols->upload-type (merge gen-col->upload-type col-to-insert->upload-type)
col-to-create->col-spec (upload-type->col-specs driver cols->upload-type)
csv-col-names (keys col-to-insert->upload-type)]
(driver/create-table! driver db-id table-name col-to-create->col-spec)
(try
(with-open [reader (io/reader csv-file)]
(let [rows (parsed-rows col-to-insert->upload-type reader)]
(let [rows (->> (csv/read-csv reader)
(drop 1) ; drop header
(parse-rows col-to-insert->upload-type))]
(driver/insert-into! driver db-id table-name csv-col-names rows)
{:num-rows (count rows)
:num-columns (count csv-col-names)
......
......@@ -12,10 +12,13 @@
(def currency-regex "Supported currency signs" #"[$€£¥₹₪₩₿¢\s]")
(defn get-number-separators
"Setting-dependent number separators. Defaults to `.` and `,`. Stored/returned as a string."
(defn get-settings
"Settings that determine how the CSV is parsed.
Includes:
- number-separators: Decimal delimiter defaults to `.` and group delimiter defaults to `,`. Stored/returned as a string."
[]
(get-in (public-settings/custom-formatting) [:type/Number :number_separators] ".,"))
{:number-separators (get-in (public-settings/custom-formatting) [:type/Number :number_separators] ".,")})
(defn parse-bool
"Parses a boolean value (true/t/yes/y/1 and false/f/no/n/0). Case-insensitive."
......@@ -119,57 +122,58 @@
(defmulti upload-type->parser
"Returns a function for the given `metabase.upload` type that will parse a string value (from a CSV) into a value
suitable for insertion."
{:arglists '([upload-type])}
identity)
{:arglists '([upload-type settings])}
(fn [upload-type _]
upload-type))
(defmethod upload-type->parser :metabase.upload/varchar-255
[_]
[_ _]
identity)
(defmethod upload-type->parser :metabase.upload/text
[_]
[_ _]
identity)
(defmethod upload-type->parser :metabase.upload/int
[_]
(partial parse-number (get-number-separators)))
[_ {:keys [number-separators]}]
(partial parse-number number-separators))
(defmethod upload-type->parser :metabase.upload/float
[_]
(partial parse-number (get-number-separators)))
[_ {:keys [number-separators]}]
(partial parse-number number-separators))
(defmethod upload-type->parser :metabase.upload/int-pk
[_]
(partial parse-number (get-number-separators)))
[_ {:keys [number-separators]}]
(partial parse-number number-separators))
(defmethod upload-type->parser :metabase.upload/auto-incrementing-int-pk
[_]
(partial parse-number (get-number-separators)))
[_ {:keys [number-separators]}]
(partial parse-number number-separators))
(defmethod upload-type->parser :metabase.upload/string-pk
[_]
[_ _]
identity)
(defmethod upload-type->parser :metabase.upload/boolean
[_]
[_ _]
(comp
parse-bool
str/trim))
(defmethod upload-type->parser :metabase.upload/date
[_]
[_ _]
(comp
parse-date
str/trim))
(defmethod upload-type->parser :metabase.upload/datetime
[_]
[_ _]
(comp
parse-as-datetime
str/trim))
(defmethod upload-type->parser :metabase.upload/offset-datetime
[_]
[_ _]
(comp
parse-offset-datetime
str/trim))
......@@ -149,15 +149,15 @@
[" 2022-01-01T01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type]
[" 2022-01-01t01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type]
[" 2022-01-01 01:00:00.00Z " (t/offset-date-time "2022-01-01T01:00+00:00") offset-dt-type]]]
(mt/with-temporary-setting-values [custom-formatting (when seps {:type/Number {:number_separators seps}})]
(let [type (upload/value->type string-value)
parser (#'upload-parsing/upload-type->parser type)]
(testing (format "\"%s\" is a %s" string-value type)
(is (= expected-type
type)))
(testing (format "\"%s\" is parsed into %s" string-value expected-value)
(is (= expected-value
(parser string-value))))))))
(let [settings {:number-separators (or seps ".,")}
type (#'upload/value->type string-value settings)
parser (upload-parsing/upload-type->parser type settings)]
(testing (format "\"%s\" is a %s" string-value type)
(is (= expected-type
type)))
(testing (format "\"%s\" is parsed into %s" string-value expected-value)
(is (= expected-value
(parser string-value)))))))
(deftest ^:parallel type-coalescing-test
(doseq [[type-a type-b expected]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment