Skip to content
Snippets Groups Projects
Unverified Commit 5cfc079d authored by Alexander Solovyov's avatar Alexander Solovyov Committed by GitHub
Browse files

json unfolding: parse incrementally instead of using cheshire (#42638)

This lowers memory usage ~100x for 50 KB strings (our limit before this change) and speeds up parsing by an order of magnitude.

For example, results for parsing 42kb string (already in memory):

Before: Time per call: 6.32 ms    Alloc per call: 8,732,850b
After:  Time per call: 55.16 us   Alloc per call: 83,254b
parent 214a5989
No related branches found
No related tags found
No related merge requests found
......@@ -2,7 +2,6 @@
"SQL JDBC impl for `describe-fields`, `describe-table`, `describe-fks`, `describe-table-fks`, and `describe-nested-field-columns`.
`describe-table-fks` is deprecated and will be replaced by `describe-fks` in the future."
(:require
[cheshire.core :as json]
[clojure.java.jdbc :as jdbc]
[clojure.set :as set]
[clojure.string :as str]
......@@ -24,6 +23,7 @@
#_{:clj-kondo/ignore [:discouraged-namespace]}
[toucan2.core :as t2])
(:import
(com.fasterxml.jackson.core JsonFactory JsonParser JsonToken JsonParser$NumberType)
(java.sql Connection DatabaseMetaData ResultSet)))
(set! *warn-on-reflection* true)
......@@ -349,22 +349,9 @@
:value index-name})))
set)))))
(def ^:dynamic *nested-field-column-max-row-length*
  "Max string length for a nested-field-column row before we just give up on parsing it.
  Dynamic so tests can rebind it (via `binding`) to a smaller value."
  50000)
(defn- flattened-row
  "Flatten nested map `row` into a single map of path-vector -> leaf value, with every
  path rooted at `field-name`. Non-empty map values are descended into; everything else
  (including empty maps) is kept as a leaf."
  [field-name row]
  (letfn [(walk [acc m path]
            (reduce (fn [acc [k v]]
                      (if (and (map? v) (seq v))
                        (walk acc v (conj path k))
                        (assoc acc (conj path k) v)))
                    acc
                    m))]
    (walk {} row [field-name])))
(def ^:const max-nested-field-columns
  "Maximum number of nested field columns."
  ;; NOTE(review): used as an upper bound when describing JSON columns — confirm the exact
  ;; warn-vs-truncate behavior at the call sites before relying on it.
  100)
(def ^:private ^{:arglists '([s])} can-parse-datetime?
"Returns whether a string can be parsed to an ISO 8601 datetime or not."
......@@ -372,33 +359,74 @@
(defn- type-by-parsing-string
  "Mostly just `(type member)`, except strings that parse as ISO 8601 datetimes are
  reported as `java.time.LocalDateTime`."
  [member]
  (if (and (string? member)
           (can-parse-datetime? member))
    java.time.LocalDateTime
    (type member)))
(defn- row->types
  "Given a result `row` (map of field name -> parsed JSON value), return a map of
  flattened path -> inferred type for every map-valued field. Non-map values
  (e.g. top-level arrays) are skipped for now."
  [row]
  (reduce-kv (fn [acc field-name field-val]
               ;; We put top-level array row type semantics on JSON roadmap but skip for now
               (if (map? field-val)
                 (reduce-kv (fn [acc path v]
                              (assoc acc path (type-by-parsing-string v)))
                            acc
                            (flattened-row field-name field-val))
                 acc))
             {}
             row))
(defn- describe-json-xform
  "Transducer: for each result row, JSON-parse every value shorter than
  `*nested-field-column-max-row-length*` (longer values are dropped) and turn the
  parsed row into a map of flattened path -> type via `row->types`."
  [rf]
  ((comp (map (fn [row]
                (into {}
                      (for [[k v] row
                            :when (< (count v) *nested-field-column-max-row-length*)]
                        [k (json/parse-string v)]))))
         (map row->types))
   rf))
(def ^:const max-nested-field-columns
"Maximum number of nested field columns."
100)
[value]
(if (and (string? value)
(can-parse-datetime? value))
java.time.LocalDateTime
(type value)))
(defn- json-parser
  "Create a Jackson streaming parser over `v`, which may be a String or a `java.io.Reader`."
  ^JsonParser [v]
  (if (string? v)
    (.createParser (JsonFactory.) ^String v)
    (.createParser (JsonFactory.) ^java.io.Reader v)))
(defn- number-type
  "Map a Jackson `JsonParser$NumberType` to the Java/Clojure class we report for that numeric type."
  [t]
  (u/case-enum t
    JsonParser$NumberType/INT         Long
    JsonParser$NumberType/LONG        Long
    JsonParser$NumberType/FLOAT       Double
    JsonParser$NumberType/DOUBLE      Double
    JsonParser$NumberType/BIG_INTEGER clojure.lang.BigInt
    ;; there seems to be no way to encounter this in practice, search in tests for `BigDecimal`
    JsonParser$NumberType/BIG_DECIMAL BigDecimal))
(defn- json->types
  "Parses given json (a string or a reader) into a map of paths to types, i.e. `{[\"bob\"] String}`.
  Uses Jackson Streaming API to skip allocating data structures, eschews allocating values when possible.
  Respects `max-nested-field-columns` (may collect one entry past the limit, see comment below)."
  [v path]
  (let [p (json-parser v)]
    ;; `path` is the vector of keys leading to the current object, `key` the field name
    ;; most recently seen, `res` the accumulated path -> type map.
    (loop [path (or path [])
           key nil
           res (transient {})]
      (let [token (.nextToken p)]
        (cond
          ;; end of input
          (nil? token)
          (persistent! res)

          ;; we could be more precise here and issue warning about nested fields (the one in `describe-json-fields`),
          ;; but this limit could be hit by multiple json fields (fetched in `describe-json-fields`) rather than only
          ;; by this one. So for the sake of issuing only a single warning in logs we'll spill over limit by a single
          ;; entry (instead of doing `<=`).
          (< max-nested-field-columns (count res))
          (persistent! res)

          :else
          (u/case-enum token
            JsonToken/VALUE_NUMBER_INT   (recur path key (assoc! res (conj path key) (number-type (.getNumberType p))))
            JsonToken/VALUE_NUMBER_FLOAT (recur path key (assoc! res (conj path key) (number-type (.getNumberType p))))
            JsonToken/VALUE_TRUE         (recur path key (assoc! res (conj path key) Boolean))
            JsonToken/VALUE_FALSE        (recur path key (assoc! res (conj path key) Boolean))
            JsonToken/VALUE_NULL         (recur path key (assoc! res (conj path key) nil))
            ;; only strings force materializing the value, since we need to check for ISO 8601 datetimes
            JsonToken/VALUE_STRING       (recur path key (assoc! res (conj path key)
                                                                (type-by-parsing-string (.getText p))))
            JsonToken/FIELD_NAME         (recur path (.getText p) res)
            ;; entering a nested object pushes the current key onto the path (no-op for the top-level object)
            JsonToken/START_OBJECT       (recur (cond-> path key (conj key)) key res)
            JsonToken/END_OBJECT         (recur (cond-> path (seq path) pop) key res)
            ;; We put top-level array row type semantics on JSON roadmap but skip for now
            JsonToken/START_ARRAY        (do (.skipChildren p)
                                             (if key
                                               (recur path key (assoc! res (conj path key) clojure.lang.PersistentVector))
                                               (recur path key res)))
            JsonToken/END_ARRAY          (recur path key res)))))))
(defn- json-map->types
  "Merge the path -> type maps produced by `json->types` for every entry of `json-map`
  (a map of field name -> JSON string/reader); each field name roots its paths."
  [json-map]
  (reduce (fn [acc [field-name v]]
            (merge acc (json->types v [field-name])))
          nil
          json-map))
(defn- describe-json-rf
"Reducing function that takes a bunch of maps from row->types,
"Reducing function that takes a bunch of maps from json-map->types,
and gets them to conform to the type hierarchy,
going through and taking the lowest common denominator type at each pass,
ignoring the nils."
......@@ -550,7 +578,7 @@
driver
(sample-json-row-honey-sql table-identifier json-field-identifiers pk-identifiers))
query (jdbc/reducible-query jdbc-spec sql-args {:identifiers identity})
field-types (transduce describe-json-xform describe-json-rf query)
field-types (transduce (map json-map->types) describe-json-rf query)
fields (field-types->fields field-types)]
(if (> (count fields) max-nested-field-columns)
(do
......
......@@ -25,12 +25,15 @@
[potemkin :as p]
[ring.util.codec :as codec])))
#?(:clj (:import
(clojure.lang Reflector)
(java.text Normalizer Normalizer$Form)
(java.util Locale)
(org.apache.commons.validator.routines RegexValidator UrlValidator)))
#?(:cljs (:require-macros [camel-snake-kebab.internals.macros :as csk.macros]
[metabase.util])))
#?(:clj (set! *warn-on-reflection* true))
(u.ns/import-fns
[u.format colorize format-bytes format-color format-milliseconds format-nanoseconds format-seconds])
......@@ -945,3 +948,27 @@
(fn [acc x] (if (pred x) (reduced x) acc))
nil
coll))
#?(:clj
   (let [;; resolve a qualified symbol like `Month/MAY` to its Enum constant at macroexpansion time
         sym->enum (fn ^Enum [sym]
                     (Reflector/invokeStaticMethod ^Class (resolve (symbol (namespace sym)))
                                                   "valueOf"
                                                   (to-array [(name sym)])))
         ordinal (fn [^Enum e] (.ordinal e))]
     (defmacro case-enum
       "Like `case`, but explicitly dispatch on Java enum ordinals (plain `case` cannot match
       enum constants). All test clauses must be constants of the same enum type; this is
       checked at macroexpansion and expands to a `throw` otherwise. Passing a `value` of the
       same enum type as the test clauses is on you, that is not checked."
       [value & clauses]
       (let [types (map (comp type sym->enum first) (partition 2 clauses))]
         ;; doesn't check for the value of `case`, but that's on user
         (if-not (apply = types)
           `(throw (ex-info (str "`case-enum` only works if all supplied enums are of a same type: " ~(vec types))
                            {:types ~(vec types)}))
           ;; NOTE(review): `~ordinal` embeds the function object itself into the expansion;
           ;; works for in-memory compilation — confirm behavior under AOT if that matters.
           `(case (int (~ordinal ~value))
              ~@(concat
                 ;; each test symbol is replaced by its constant ordinal; `case` then dispatches on ints
                 (mapcat (fn [[test result]]
                           [(ordinal (sym->enum test)) result])
                         (partition 2 clauses))
                 ;; odd clause count means the final clause is the default, same as `case`
                 (when (odd? (count clauses))
                   (list (last clauses))))))))))
......@@ -190,32 +190,24 @@
(deftest ^:parallel row->types-test
(testing "array rows ignored properly in JSON row->types (#21752)"
(let [arr-row {:bob [:bob :cob :dob 123 "blob"]}
obj-row {:zlob {"blob" 1323}}]
(is (= {} (#'sql-jdbc.describe-table/row->types arr-row)))
(is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/row->types obj-row)))))
(testing "JSON row->types handles bigint OK (#21752)"
(let [int-row {:zlob {"blob" 123N}}
float-row {:zlob {"blob" 1234.02M}}]
(is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/row->types int-row)))
(is (= {[:zlob "blob"] java.math.BigDecimal} (#'sql-jdbc.describe-table/row->types float-row))))))
(deftest ^:parallel dont-parse-long-json-xform-test
(testing "obnoxiously long json should not even get parsed (#22636)"
;; Generating an actually obnoxiously long json took too long,
;; and actually copy-pasting an obnoxiously long string in there looks absolutely terrible,
;; so this rebinding is what you get
(let [obnoxiously-long-json "{\"bob\": \"dobbs\"}"
json-map {:somekey obnoxiously-long-json}]
(binding [sql-jdbc.describe-table/*nested-field-column-max-row-length* 3]
(is (= {}
(transduce
#'sql-jdbc.describe-table/describe-json-xform
#'sql-jdbc.describe-table/describe-json-rf [json-map]))))
(is (= {[:somekey "bob"] java.lang.String}
(transduce
#'sql-jdbc.describe-table/describe-json-xform
#'sql-jdbc.describe-table/describe-json-rf [json-map]))))))
(let [arr-row {:bob (json/encode [:bob :cob :dob 123 "blob"])}
obj-row {:zlob (json/encode {:blob Long/MAX_VALUE})}]
(is (= {} (#'sql-jdbc.describe-table/json-map->types arr-row)))
(is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/json-map->types obj-row)))))
(testing "JSON json-map->types handles bigint OK (#22732)"
(let [int-row {:zlob (json/encode {"blob" (inc (bigint Long/MAX_VALUE))})}
float-row {:zlob "{\"blob\": 12345678901234567890.12345678901234567890}"}]
(is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/json-map->types int-row)))
;; no idea how to force it to use BigDecimal here
(is (= {[:zlob "blob"] Double} (#'sql-jdbc.describe-table/json-map->types float-row))))))
(deftest ^:parallel key-limit-test
  (testing "we don't read too many keys even from long jsons"
    ;; build a JSON object with twice as many keys as the parser is allowed to collect
    (let [data (into {} (for [i (range (* 2 @#'sql-jdbc.describe-table/max-nested-field-columns))]
                          [(str "key" i) i]))]
      ;; inc the limit since we go 1 over the limit, see comment in `json->types`
      (is (= (inc @#'sql-jdbc.describe-table/max-nested-field-columns)
             (count (#'sql-jdbc.describe-table/json-map->types {:k (json/encode data)})))))))
(deftest ^:parallel get-table-pks-test
;; FIXME: this should work for all SQL drivers
......@@ -249,18 +241,13 @@
(is (nil? (:visibility-type text-field))))))))))))
(deftest ^:parallel describe-nested-field-columns-test
(testing "flattened-row"
(let [row {:bob {:dobbs 123 :cobbs "boop"}}
flattened {[:mob :bob :dobbs] 123
[:mob :bob :cobbs] "boop"}]
(is (= flattened (#'sql-jdbc.describe-table/flattened-row :mob row)))))
(testing "row->types"
(let [row {:bob {:dobbs {:robbs 123} :cobbs [1 2 3]}}
types {[:bob :cobbs] clojure.lang.PersistentVector
[:bob :dobbs :robbs] java.lang.Long}]
(is (= types (#'sql-jdbc.describe-table/row->types row)))))
(testing "JSON row->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)"
(let [int-row {:zlob {"blob" (java.math.BigInteger. "123124124312134235234235345344324352")}}]
(testing "json-map->types"
(let [row {:bob (json/encode {:dobbs {:robbs 123} :cobbs [1 2 3]})}
types {[:bob "cobbs"] clojure.lang.PersistentVector
[:bob "dobbs" "robbs"] java.lang.Long}]
(is (= types (#'sql-jdbc.describe-table/json-map->types row)))))
(testing "JSON json-map->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)"
(let [int-row {:zlob (json/encode {"blob" (java.math.BigInteger. "123124124312134235234235345344324352")})}]
(is (= #{{:name "zlob → blob",
:database-type "decimal",
:base-type :type/BigInteger,
......@@ -269,7 +256,7 @@
:visibility-type :normal,
:nfc-path [:zlob "blob"]}}
(-> int-row
(#'sql-jdbc.describe-table/row->types)
(#'sql-jdbc.describe-table/json-map->types)
(#'sql-jdbc.describe-table/field-types->fields)))))))
(deftest ^:parallel nested-field-column-test
......
......@@ -9,7 +9,10 @@
[flatland.ordered.map :refer [ordered-map]]
#_:clj-kondo/ignore
[malli.generator :as mg]
[metabase.util :as u]))
[metabase.util :as u])
#?(:clj (:import [java.time Month DayOfWeek])))
#?(:clj (set! *warn-on-reflection* true))
(deftest ^:parallel add-period-test
(is (= "This sentence needs a period."
......@@ -508,3 +511,21 @@
(let [expected (reduce + (u/map-all (fnil + 0 0 0) xs ys zs))
sum (+ (reduce + 0 xs) (reduce + 0 ys) (reduce + 0 zs))]
(= expected sum)))))
#?(:clj
   (deftest ^:parallel case-enum-test
     (testing "case does not work"
       ;; plain `case` compares against unevaluated symbols, so no enum clause ever matches
       (is (= 3 (case Month/MAY
                  Month/APRIL 1
                  Month/MAY   2
                  3))))
     (testing "case-enum works"
       (is (= 2 (u/case-enum Month/MAY
                  Month/APRIL 1
                  Month/MAY   2
                  3))))
     (testing "checks for type of cases"
       ;; `thrown?` only takes a class + body — a regex passed to it is silently ignored,
       ;; so use `thrown-with-msg?` to actually assert on the error message
       (is (thrown-with-msg? Exception #"`case-enum` only works.*"
             (u/case-enum Month/JANUARY
               Month/JANUARY    1
               DayOfWeek/SUNDAY 2))))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment