From 5cfc079d1db6e69bf42705f0eeba431a6e39c6b5 Mon Sep 17 00:00:00 2001 From: Alexander Solovyov <alexander@solovyov.net> Date: Wed, 15 May 2024 13:08:32 +0300 Subject: [PATCH] json unfolding: parse incrementally instead of using cheshire (#42638) this lowers memory usage 100x for 50kb strings (our limit before this change) and speeds up parsing by an order of magnitude. For example, results for parsing 42kb string (already in memory): Before: Time per call: 6.32 ms Alloc per call: 8,732,850b After: Time per call: 55.16 us Alloc per call: 83,254b --- .../driver/sql_jdbc/sync/describe_table.clj | 114 +++++++++++------- src/metabase/util.cljc | 27 +++++ .../sql_jdbc/sync/describe_table_test.clj | 65 ++++------ test/metabase/util_test.cljc | 23 +++- 4 files changed, 146 insertions(+), 83 deletions(-) diff --git a/src/metabase/driver/sql_jdbc/sync/describe_table.clj b/src/metabase/driver/sql_jdbc/sync/describe_table.clj index f3f44ca6fe6..4851c4699d9 100644 --- a/src/metabase/driver/sql_jdbc/sync/describe_table.clj +++ b/src/metabase/driver/sql_jdbc/sync/describe_table.clj @@ -2,7 +2,6 @@ "SQL JDBC impl for `describe-fields`, `describe-table`, `describe-fks`, `describe-table-fks`, and `describe-nested-field-columns`. `describe-table-fks` is deprecated and will be replaced by `describe-fks` in the future." (:require - [cheshire.core :as json] [clojure.java.jdbc :as jdbc] [clojure.set :as set] [clojure.string :as str] @@ -24,6 +23,7 @@ #_{:clj-kondo/ignore [:discouraged-namespace]} [toucan2.core :as t2]) (:import + (com.fasterxml.jackson.core JsonFactory JsonParser JsonToken JsonParser$NumberType) (java.sql Connection DatabaseMetaData ResultSet))) (set! *warn-on-reflection* true) @@ -349,22 +349,9 @@ :value index-name}))) set))))) -(def ^:dynamic *nested-field-column-max-row-length* - "Max string length for a row for nested field column before we just give up on parsing it. - Marked as mutable because we mutate it for tests." - 50000) - -(defn- flattened-row [field-name row] - (letfn [(flatten-row [row path] - (lazy-seq - (when-let [[[k v] & xs] (seq row)] - (cond (and (map? v) (not-empty v)) - (into (flatten-row v (conj path k)) - (flatten-row xs path)) - :else - (cons [(conj path k) v] - (flatten-row xs path))))))] - (into {} (flatten-row row [field-name])))) +(def ^:const max-nested-field-columns + "Maximum number of nested field columns." + 100) (def ^:private ^{:arglists '([s])} can-parse-datetime? "Returns whether a string can be parsed to an ISO 8601 datetime or not." @@ -372,33 +359,74 @@ (defn- type-by-parsing-string "Mostly just (type member) but with a bit to suss out strings which are ISO8601 and say that they are datetimes" - [member] - (let [member-type (type member)] - (if (and (instance? String member) - (can-parse-datetime? member)) - java.time.LocalDateTime - member-type))) - -(defn- row->types [row] - (into {} (for [[field-name field-val] row - ;; We put top-level array row type semantics on JSON roadmap but skip for now - :when (map? field-val)] - (let [flat-row (flattened-row field-name field-val)] - (into {} (map (fn [[k v]] [k (type-by-parsing-string v)]) flat-row)))))) - -(defn- describe-json-xform [member] - ((comp (map #(for [[k v] % - :when (< (count v) *nested-field-column-max-row-length*)] - [k (json/parse-string v)])) - (map #(into {} %)) - (map row->types)) member)) - -(def ^:const max-nested-field-columns - "Maximum number of nested field columns." - 100) + [value] + (if (and (string? value) + (can-parse-datetime? value)) + java.time.LocalDateTime + (type value))) + +(defn- json-parser ^JsonParser [v] + (let [f (JsonFactory.)] + (if (string? v) + (.createParser f ^String v) + (.createParser f ^java.io.Reader v)))) + +(defn- number-type [t] + (u/case-enum t + JsonParser$NumberType/INT Long + JsonParser$NumberType/LONG Long + JsonParser$NumberType/FLOAT Double + JsonParser$NumberType/DOUBLE Double + JsonParser$NumberType/BIG_INTEGER clojure.lang.BigInt + ;; there seem to be no way to encounter this, search in tests for `BigDecimal` + JsonParser$NumberType/BIG_DECIMAL BigDecimal)) + +(defn- json->types + "Parses given json (a string or a reader) into a map of paths to types, i.e. `{[\"bob\"} String}`. + + Uses Jackson Streaming API to skip allocating data structures, eschews allocating values when possible. + Respects *nested-field-column-max-row-length*." + [v path] + (let [p (json-parser v)] + (loop [path (or path []) + key nil + res (transient {})] + (let [token (.nextToken p)] + (cond + (nil? token) + (persistent! res) + + ;; we could be more precise here and issue warning about nested fields (the one in `describe-json-fields`), + ;; but this limit could be hit by multiple json fields (fetched in `describe-json-fields`) rather than only + ;; by this one. So for the sake of issuing only a single warning in logs we'll spill over limit by a single + ;; entry (instead of doing `<=`). + (< max-nested-field-columns (count res)) + (persistent! res) + + :else + (u/case-enum token + JsonToken/VALUE_NUMBER_INT (recur path key (assoc! res (conj path key) (number-type (.getNumberType p)))) + JsonToken/VALUE_NUMBER_FLOAT (recur path key (assoc! res (conj path key) (number-type (.getNumberType p)))) + JsonToken/VALUE_TRUE (recur path key (assoc! res (conj path key) Boolean)) + JsonToken/VALUE_FALSE (recur path key (assoc! res (conj path key) Boolean)) + JsonToken/VALUE_NULL (recur path key (assoc! res (conj path key) nil)) + JsonToken/VALUE_STRING (recur path key (assoc! res (conj path key) + (type-by-parsing-string (.getText p)))) + JsonToken/FIELD_NAME (recur path (.getText p) res) + JsonToken/START_OBJECT (recur (cond-> path key (conj key)) key res) + JsonToken/END_OBJECT (recur (cond-> path (seq path) pop) key res) + ;; We put top-level array row type semantics on JSON roadmap but skip for now + JsonToken/START_ARRAY (do (.skipChildren p) + (if key + (recur path key (assoc! res (conj path key) clojure.lang.PersistentVector)) + (recur path key res))) + JsonToken/END_ARRAY (recur path key res))))))) + +(defn- json-map->types [json-map] + (apply merge (map #(json->types (second %) [(first %)]) json-map))) (defn- describe-json-rf - "Reducing function that takes a bunch of maps from row->types, + "Reducing function that takes a bunch of maps from json-map->types, and gets them to conform to the type hierarchy, going through and taking the lowest common denominator type at each pass, ignoring the nils." @@ -550,7 +578,7 @@ driver (sample-json-row-honey-sql table-identifier json-field-identifiers pk-identifiers)) query (jdbc/reducible-query jdbc-spec sql-args {:identifiers identity}) - field-types (transduce describe-json-xform describe-json-rf query) + field-types (transduce (map json-map->types) describe-json-rf query) fields (field-types->fields field-types)] (if (> (count fields) max-nested-field-columns) (do diff --git a/src/metabase/util.cljc b/src/metabase/util.cljc index ccdf6d9d32c..ff88ef920b0 100644 --- a/src/metabase/util.cljc +++ b/src/metabase/util.cljc @@ -25,12 +25,15 @@ [potemkin :as p] [ring.util.codec :as codec]))) #?(:clj (:import + (clojure.lang Reflector) (java.text Normalizer Normalizer$Form) (java.util Locale) (org.apache.commons.validator.routines RegexValidator UrlValidator))) #?(:cljs (:require-macros [camel-snake-kebab.internals.macros :as csk.macros] [metabase.util]))) +#?(:clj (set! *warn-on-reflection* true)) + (u.ns/import-fns [u.format colorize format-bytes format-color format-milliseconds format-nanoseconds format-seconds]) @@ -945,3 +948,27 @@ (fn [acc x] (if (pred x) (reduced x) acc)) nil coll)) + +#?(:clj + (let [sym->enum (fn ^Enum [sym] + (Reflector/invokeStaticMethod ^Class (resolve (symbol (namespace sym))) + "valueOf" + (to-array [(name sym)]))) + ordinal (fn [^Enum e] (.ordinal e))] + (defmacro case-enum + "Like `case`, but explicitly dispatch on Java enum ordinals. + + Passing the same enum type as the ones you're checking in is on you, this is not checked." + [value & clauses] + (let [types (map (comp type sym->enum first) (partition 2 clauses))] + ;; doesn't check for the value of `case`, but that's on user + (if-not (apply = types) + `(throw (ex-info (str "`case-enum` only works if all supplied enums are of a same type: " ~(vec types)) + {:types ~(vec types)})) + `(case (int (~ordinal ~value)) + ~@(concat + (mapcat (fn [[test result]] + [(ordinal (sym->enum test)) result]) + (partition 2 clauses)) + (when (odd? (count clauses)) + (list (last clauses)))))))))) diff --git a/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj b/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj index 6d511123311..4bb01a13f4a 100644 --- a/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj +++ b/test/metabase/driver/sql_jdbc/sync/describe_table_test.clj @@ -190,32 +190,24 @@ (deftest ^:parallel row->types-test (testing "array rows ignored properly in JSON row->types (#21752)" - (let [arr-row {:bob [:bob :cob :dob 123 "blob"]} - obj-row {:zlob {"blob" 1323}}] - (is (= {} (#'sql-jdbc.describe-table/row->types arr-row))) - (is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/row->types obj-row))))) - (testing "JSON row->types handles bigint OK (#21752)" - (let [int-row {:zlob {"blob" 123N}} - float-row {:zlob {"blob" 1234.02M}}] - (is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/row->types int-row))) - (is (= {[:zlob "blob"] java.math.BigDecimal} (#'sql-jdbc.describe-table/row->types float-row)))))) - -(deftest ^:parallel dont-parse-long-json-xform-test - (testing "obnoxiously long json should not even get parsed (#22636)" - ;; Generating an actually obnoxiously long json took too long, - ;; and actually copy-pasting an obnoxiously long string in there looks absolutely terrible, - ;; so this rebinding is what you get - (let [obnoxiously-long-json "{\"bob\": \"dobbs\"}" - json-map {:somekey obnoxiously-long-json}] - (binding [sql-jdbc.describe-table/*nested-field-column-max-row-length* 3] - (is (= {} - (transduce - #'sql-jdbc.describe-table/describe-json-xform - #'sql-jdbc.describe-table/describe-json-rf [json-map])))) - (is (= {[:somekey "bob"] java.lang.String} - (transduce - #'sql-jdbc.describe-table/describe-json-xform - #'sql-jdbc.describe-table/describe-json-rf [json-map])))))) + (let [arr-row {:bob (json/encode [:bob :cob :dob 123 "blob"])} + obj-row {:zlob (json/encode {:blob Long/MAX_VALUE})}] + (is (= {} (#'sql-jdbc.describe-table/json-map->types arr-row))) + (is (= {[:zlob "blob"] java.lang.Long} (#'sql-jdbc.describe-table/json-map->types obj-row))))) + (testing "JSON json-map->types handles bigint OK (#22732)" + (let [int-row {:zlob (json/encode {"blob" (inc (bigint Long/MAX_VALUE))})} + float-row {:zlob "{\"blob\": 12345678901234567890.12345678901234567890}"}] + (is (= {[:zlob "blob"] clojure.lang.BigInt} (#'sql-jdbc.describe-table/json-map->types int-row))) + ;; no idea how to force it to use BigDecimal here + (is (= {[:zlob "blob"] Double} (#'sql-jdbc.describe-table/json-map->types float-row)))))) + +(deftest ^:parallel key-limit-test + (testing "we don't read too many keys even from long jsons" + (let [data (into {} (for [i (range (* 2 @#'sql-jdbc.describe-table/max-nested-field-columns))] + [(str "key" i) i]))] + ;; inc the limit since we go 1 over the limit, see comment in `json->types` + (is (= (inc @#'sql-jdbc.describe-table/max-nested-field-columns) + (count (#'sql-jdbc.describe-table/json-map->types {:k (json/encode data)}))))))) (deftest ^:parallel get-table-pks-test ;; FIXME: this should works for all sql drivers @@ -249,18 +241,13 @@ (is (nil? (:visibility-type text-field)))))))))))) (deftest ^:parallel describe-nested-field-columns-test - (testing "flattened-row" - (let [row {:bob {:dobbs 123 :cobbs "boop"}} - flattened {[:mob :bob :dobbs] 123 - [:mob :bob :cobbs] "boop"}] - (is (= flattened (#'sql-jdbc.describe-table/flattened-row :mob row))))) - (testing "row->types" - (let [row {:bob {:dobbs {:robbs 123} :cobbs [1 2 3]}} - types {[:bob :cobbs] clojure.lang.PersistentVector - [:bob :dobbs :robbs] java.lang.Long}] - (is (= types (#'sql-jdbc.describe-table/row->types row))))) - (testing "JSON row->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)" - (let [int-row {:zlob {"blob" (java.math.BigInteger. "123124124312134235234235345344324352")}}] + (testing "json-map->types" + (let [row {:bob (json/encode {:dobbs {:robbs 123} :cobbs [1 2 3]})} + types {[:bob "cobbs"] clojure.lang.PersistentVector + [:bob "dobbs" "robbs"] java.lang.Long}] + (is (= types (#'sql-jdbc.describe-table/json-map->types row))))) + (testing "JSON json-map->types handles bigint that comes in and gets interpreted as Java bigint OK (#22732)" + (let [int-row {:zlob (json/encode {"blob" (java.math.BigInteger. "123124124312134235234235345344324352")})}] (is (= #{{:name "zlob → blob", :database-type "decimal", :base-type :type/BigInteger, @@ -269,7 +256,7 @@ :visibility-type :normal, :nfc-path [:zlob "blob"]}} (-> int-row - (#'sql-jdbc.describe-table/row->types) + (#'sql-jdbc.describe-table/json-map->types) (#'sql-jdbc.describe-table/field-types->fields))))))) (deftest ^:parallel nested-field-column-test diff --git a/test/metabase/util_test.cljc b/test/metabase/util_test.cljc index ceff4313570..58abdd584f0 100644 --- a/test/metabase/util_test.cljc +++ b/test/metabase/util_test.cljc @@ -9,7 +9,10 @@ [flatland.ordered.map :refer [ordered-map]] #_:clj-kondo/ignore [malli.generator :as mg] - [metabase.util :as u])) + [metabase.util :as u]) + #?(:clj (:import [java.time Month DayOfWeek]))) + +#?(:clj (set! *warn-on-reflection* true)) (deftest ^:parallel add-period-test (is (= "This sentence needs a period." @@ -508,3 +511,21 @@ (let [expected (reduce + (u/map-all (fnil + 0 0 0) xs ys zs)) sum (+ (reduce + 0 xs) (reduce + 0 ys) (reduce + 0 zs))] (= expected sum))))) + +#?(:clj + (deftest ^:parallel case-enum-test + (testing "case does not work" + (is (= 3 (case Month/MAY + Month/APRIL 1 + Month/MAY 2 + 3)))) + (testing "case-enum works" + (is (= 2 (u/case-enum Month/MAY + Month/APRIL 1 + Month/MAY 2 + 3)))) + (testing "checks for type of cases" + (is (thrown? Exception #"`case-enum` only works.*" + (u/case-enum Month/JANUARY + Month/JANUARY 1 + DayOfWeek/SUNDAY 2)))))) -- GitLab