diff --git a/src/metabase/driver/sql_jdbc/sync/describe_table.clj b/src/metabase/driver/sql_jdbc/sync/describe_table.clj index f3f44ca6fe64519bf646c376a28f3e48bb65d6b9..4851c4699d91e880b7a13cbb07d912bbc1b1b93b 100644 --- a/src/metabase/driver/sql_jdbc/sync/describe_table.clj +++ b/src/metabase/driver/sql_jdbc/sync/describe_table.clj @@ -2,7 +2,6 @@ "SQL JDBC impl for `describe-fields`, `describe-table`, `describe-fks`, `describe-table-fks`, and `describe-nested-field-columns`. `describe-table-fks` is deprecated and will be replaced by `describe-fks` in the future." (:require - [cheshire.core :as json] [clojure.java.jdbc :as jdbc] [clojure.set :as set] [clojure.string :as str] @@ -24,6 +23,7 @@ #_{:clj-kondo/ignore [:discouraged-namespace]} [toucan2.core :as t2]) (:import + (com.fasterxml.jackson.core JsonFactory JsonParser JsonToken JsonParser$NumberType) (java.sql Connection DatabaseMetaData ResultSet))) (set! *warn-on-reflection* true) @@ -349,22 +349,9 @@ :value index-name}))) set))))) -(def ^:dynamic *nested-field-column-max-row-length* - "Max string length for a row for nested field column before we just give up on parsing it. - Marked as mutable because we mutate it for tests." - 50000) - -(defn- flattened-row [field-name row] - (letfn [(flatten-row [row path] - (lazy-seq - (when-let [[[k v] & xs] (seq row)] - (cond (and (map? v) (not-empty v)) - (into (flatten-row v (conj path k)) - (flatten-row xs path)) - :else - (cons [(conj path k) v] - (flatten-row xs path))))))] - (into {} (flatten-row row [field-name])))) +(def ^:const max-nested-field-columns + "Maximum number of nested field columns." + 100) (def ^:private ^{:arglists '([s])} can-parse-datetime? "Returns whether a string can be parsed to an ISO 8601 datetime or not." @@ -372,33 +359,74 @@ (defn- type-by-parsing-string "Mostly just (type member) but with a bit to suss out strings which are ISO8601 and say that they are datetimes" - [member] - (let [member-type (type member)] - (if (and (instance? 
String member) - (can-parse-datetime? member)) - java.time.LocalDateTime - member-type))) - -(defn- row->types [row] - (into {} (for [[field-name field-val] row - ;; We put top-level array row type semantics on JSON roadmap but skip for now - :when (map? field-val)] - (let [flat-row (flattened-row field-name field-val)] - (into {} (map (fn [[k v]] [k (type-by-parsing-string v)]) flat-row)))))) - -(defn- describe-json-xform [member] - ((comp (map #(for [[k v] % - :when (< (count v) *nested-field-column-max-row-length*)] - [k (json/parse-string v)])) - (map #(into {} %)) - (map row->types)) member)) - -(def ^:const max-nested-field-columns - "Maximum number of nested field columns." - 100) + [value] + (if (and (string? value) + (can-parse-datetime? value)) + java.time.LocalDateTime + (type value))) + +(defn- json-parser ^JsonParser [v] + (let [f (JsonFactory.)] + (if (string? v) + (.createParser f ^String v) + (.createParser f ^java.io.Reader v)))) + +(defn- number-type [t] + (u/case-enum t + JsonParser$NumberType/INT Long + JsonParser$NumberType/LONG Long + JsonParser$NumberType/FLOAT Double + JsonParser$NumberType/DOUBLE Double + JsonParser$NumberType/BIG_INTEGER clojure.lang.BigInt + ;; there seem to be no way to encounter this, search in tests for `BigDecimal` + JsonParser$NumberType/BIG_DECIMAL BigDecimal)) + +(defn- json->types + "Parses given json (a string or a reader) into a map of paths to types, i.e. `{[\"bob\"] String}`. + + Uses Jackson Streaming API to skip allocating data structures, eschews allocating values when possible. + Respects `max-nested-field-columns`." + [v path] + (let [p (json-parser v)] + (loop [path (or path []) + key nil + res (transient {})] + (let [token (.nextToken p)] + (cond + (nil? token) + (persistent! 
res) + + ;; we could be more precise here and issue warning about nested fields (the one in `describe-json-fields`), + ;; but this limit could be hit by multiple json fields (fetched in `describe-json-fields`) rather than only + ;; by this one. So for the sake of issuing only a single warning in logs we'll spill over limit by a single + ;; entry (instead of doing `<=`). + (< max-nested-field-columns (count res)) + (persistent! res) + + :else + (u/case-enum token + JsonToken/VALUE_NUMBER_INT (recur path key (assoc! res (conj path key) (number-type (.getNumberType p)))) + JsonToken/VALUE_NUMBER_FLOAT (recur path key (assoc! res (conj path key) (number-type (.getNumberType p)))) + JsonToken/VALUE_TRUE (recur path key (assoc! res (conj path key) Boolean)) + JsonToken/VALUE_FALSE (recur path key (assoc! res (conj path key) Boolean)) + JsonToken/VALUE_NULL (recur path key (assoc! res (conj path key) nil)) + JsonToken/VALUE_STRING (recur path key (assoc! res (conj path key) + (type-by-parsing-string (.getText p)))) + JsonToken/FIELD_NAME (recur path (.getText p) res) + JsonToken/START_OBJECT (recur (cond-> path key (conj key)) key res) + JsonToken/END_OBJECT (recur (cond-> path (seq path) pop) key res) + ;; We put top-level array row type semantics on JSON roadmap but skip for now + JsonToken/START_ARRAY (do (.skipChildren p) + (if key + (recur path key (assoc! res (conj path key) clojure.lang.PersistentVector)) + (recur path key res))) + JsonToken/END_ARRAY (recur path key res))))))) + +(defn- json-map->types [json-map] + (apply merge (map #(json->types (second %) [(first %)]) json-map))) (defn- describe-json-rf - "Reducing function that takes a bunch of maps from row->types, + "Reducing function that takes a bunch of maps from json-map->types, and gets them to conform to the type hierarchy, going through and taking the lowest common denominator type at each pass, ignoring the nils." 
@@ -550,7 +578,7 @@ driver (sample-json-row-honey-sql table-identifier json-field-identifiers pk-identifiers)) query (jdbc/reducible-query jdbc-spec sql-args {:identifiers identity}) - field-types (transduce describe-json-xform describe-json-rf query) + field-types (transduce (map json-map->types) describe-json-rf query) fields (field-types->fields field-types)] (if (> (count fields) max-nested-field-columns) (do diff --git a/src/metabase/util.cljc b/src/metabase/util.cljc index ccdf6d9d32cc45da65af3e832b1b9bbdd9b8f5d4..ff88ef920b0a15502e648e5f830b8d4190b6341e 100644 --- a/src/metabase/util.cljc +++ b/src/metabase/util.cljc @@ -25,12 +25,15 @@ [potemkin :as p] [ring.util.codec :as codec]))) #?(:clj (:import + (clojure.lang Reflector) (java.text Normalizer Normalizer$Form) (java.util Locale) (org.apache.commons.validator.routines RegexValidator UrlValidator))) #?(:cljs (:require-macros [camel-snake-kebab.internals.macros :as csk.macros] [metabase.util]))) +#?(:clj (set! *warn-on-reflection* true)) + (u.ns/import-fns [u.format colorize format-bytes format-color format-milliseconds format-nanoseconds format-seconds]) @@ -945,3 +948,27 @@ (fn [acc x] (if (pred x) (reduced x) acc)) nil coll)) + +#?(:clj + (let [sym->enum (fn ^Enum [sym] + (Reflector/invokeStaticMethod ^Class (resolve (symbol (namespace sym))) + "valueOf" + (to-array [(name sym)]))) + ordinal (fn [^Enum e] (.ordinal e))] + (defmacro case-enum + "Like `case`, but explicitly dispatch on Java enum ordinals. + + Passing the same enum type as the ones you're checking in is on you, this is not checked." 
+ [value & clauses] + (let [types (map (comp type sym->enum first) (partition 2 clauses))] + ;; doesn't check for the value of `case`, but that's on user + (if-not (apply = types) + `(throw (ex-info (str "`case-enum` only works if all supplied enums are of a same type: " ~(vec types)) + {:types ~(vec types)})) + `(case (int (~ordinal ~value)) + ~@(concat + (mapcat (fn [[test result]] + [(ordinal (sym->enum test)) result]) + (partition 2 clauses)) + (when (odd? (count clauses)) + (list (last clauses)))))))))) diff --git a/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj b/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj index 6d511123311315e6d0aab96e3ea69c168396985f..4bb01a13f4a6f931aaa1a1d3af2f00c405c16e08 100644 --- a/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj +++ b/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj @@ -190,32 +190,24 @@ (deftest ^:parallel row->types-test (testing "array rows ignored properly in JSON row->types (#21752)" - (let [arr-row {:bob [:bob :cob :dob 123 "blob"]} - obj-row {:zlob {"blob" 1323}}] - (is (= {} (#'sql-jdbc.describe-table/row->types arr-row))) - (is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/row->types obj-row))))) - (testing "JSON row->types handles bigint OK (#21752)" - (let [int-row {:zlob {"blob" 123N}} - float-row {:zlob {"blob" 1234.02M}}] - (is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/row->types int-row))) - (is (= {[:zlob "blob"] java.math.BigDecimal} (#'sql-jdbc.describe-table/row->types float-row)))))) - -(deftest ^:parallel dont-parse-long-json-xform-test - (testing "obnoxiously long json should not even get parsed (#22636)" - ;; Generating an actually obnoxiously long json took too long, - ;; and actually copy-pasting an obnoxiously long string in there looks absolutely terrible, - ;; so this rebinding is what you get - (let [obnoxiously-long-json "{\"bob\": \"dobbs\"}" - json-map {:somekey obnoxiously-long-json}] - (binding 
[sql-jdbc.describe-table/*nested-field-column-max-row-length* 3] - (is (= {} - (transduce - #'sql-jdbc.describe-table/describe-json-xform - #'sql-jdbc.describe-table/describe-json-rf [json-map])))) - (is (= {[:somekey "bob"] java.lang.String} - (transduce - #'sql-jdbc.describe-table/describe-json-xform - #'sql-jdbc.describe-table/describe-json-rf [json-map])))))) + (let [arr-row {:bob (json/encode [:bob :cob :dob 123 "blob"])} + obj-row {:zlob (json/encode {:blob Long/MAX_VALUE})}] + (is (= {} (#'sql-jdbc.describe-table/json-map->types arr-row))) + (is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/json-map->types obj-row))))) + (testing "JSON json-map->types handles bigint OK (#22732)" + (let [int-row {:zlob (json/encode {"blob" (inc (bigint Long/MAX_VALUE))})} + float-row {:zlob "{\"blob\": 12345678901234567890.12345678901234567890}"}] + (is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/json-map->types int-row))) + ;; no idea how to force it to use BigDecimal here + (is (= {[:zlob "blob"] Double} (#'sql-jdbc.describe-table/json-map->types float-row)))))) + +(deftest ^:parallel key-limit-test + (testing "we don't read too many keys even from long jsons" + (let [data (into {} (for [i (range (* 2 @#'sql-jdbc.describe-table/max-nested-field-columns))] + [(str "key" i) i]))] + ;; inc the limit since we go 1 over the limit, see comment in `json->types` + (is (= (inc @#'sql-jdbc.describe-table/max-nested-field-columns) + (count (#'sql-jdbc.describe-table/json-map->types {:k (json/encode data)}))))))) (deftest ^:parallel get-table-pks-test ;; FIXME: this should works for all sql drivers @@ -249,18 +241,13 @@ (is (nil? 
(:visibility-type text-field)))))))))))) (deftest ^:parallel describe-nested-field-columns-test - (testing "flattened-row" - (let [row {:bob {:dobbs 123 :cobbs "boop"}} - flattened {[:mob :bob :dobbs] 123 - [:mob :bob :cobbs] "boop"}] - (is (= flattened (#'sql-jdbc.describe-table/flattened-row :mob row))))) - (testing "row->types" - (let [row {:bob {:dobbs {:robbs 123} :cobbs [1 2 3]}} - types {[:bob :cobbs] clojure.lang.PersistentVector - [:bob :dobbs :robbs] java.lang.Long}] - (is (= types (#'sql-jdbc.describe-table/row->types row))))) - (testing "JSON row->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)" - (let [int-row {:zlob {"blob" (java.math.BigInteger. "123124124312134235234235345344324352")}}] + (testing "json-map->types" + (let [row {:bob (json/encode {:dobbs {:robbs 123} :cobbs [1 2 3]})} + types {[:bob "cobbs"] clojure.lang.PersistentVector + [:bob "dobbs" "robbs"] java.lang.Long}] + (is (= types (#'sql-jdbc.describe-table/json-map->types row))))) + (testing "JSON json-map->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)" + (let [int-row {:zlob (json/encode {"blob" (java.math.BigInteger. 
"123124124312134235234235345344324352")})}] (is (= #{{:name "zlob → blob", :database-type "decimal", :base-type :type/BigInteger, @@ -269,7 +256,7 @@ :visibility-type :normal, :nfc-path [:zlob "blob"]}} (-> int-row - (#'sql-jdbc.describe-table/row->types) + (#'sql-jdbc.describe-table/json-map->types) (#'sql-jdbc.describe-table/field-types->fields))))))) (deftest ^:parallel nested-field-column-test diff --git a/test/metabase/util_test.cljc b/test/metabase/util_test.cljc index ceff431357059ffb47d8215e8ddce36061a77d94..58abdd584f0f60c60815853e751eab937f21ddc8 100644 --- a/test/metabase/util_test.cljc +++ b/test/metabase/util_test.cljc @@ -9,7 +9,10 @@ [flatland.ordered.map :refer [ordered-map]] #_:clj-kondo/ignore [malli.generator :as mg] - [metabase.util :as u])) + [metabase.util :as u]) + #?(:clj (:import [java.time Month DayOfWeek]))) + +#?(:clj (set! *warn-on-reflection* true)) (deftest ^:parallel add-period-test (is (= "This sentence needs a period." @@ -508,3 +511,21 @@ (let [expected (reduce + (u/map-all (fnil + 0 0 0) xs ys zs)) sum (+ (reduce + 0 xs) (reduce + 0 ys) (reduce + 0 zs))] (= expected sum))))) + +#?(:clj + (deftest ^:parallel case-enum-test + (testing "case does not work" + (is (= 3 (case Month/MAY + Month/APRIL 1 + Month/MAY 2 + 3)))) + (testing "case-enum works" + (is (= 2 (u/case-enum Month/MAY + Month/APRIL 1 + Month/MAY 2 + 3)))) + (testing "checks for type of cases" + (is (thrown-with-msg? Exception #"`case-enum` only works.*" + (u/case-enum Month/JANUARY + Month/JANUARY 1 + DayOfWeek/SUNDAY 2))))))