From 2c3a214f3160f9916f44b65f35bc46510c54736b Mon Sep 17 00:00:00 2001 From: Oleksandr Yakushev <alex@bytopia.org> Date: Fri, 16 Aug 2024 01:24:27 +0300 Subject: [PATCH] perf: Optimize CSV exporting (#45757) * perf: Use more efficient iteration and formatting in CSV export * perf: Replace clojure.data.csv functions with more efficient implementations * fix: Fix formatting for integer numbers in scientific notation --- src/metabase/formatter.clj | 71 ++++++++++++------- .../middleware/format_rows.clj | 5 +- .../query_processor/streaming/csv.clj | 49 ++++++++++--- src/metabase/util/performance.clj | 71 +++++++++++++++++++ test/metabase/formatter_test.clj | 11 ++- 5 files changed, 171 insertions(+), 36 deletions(-) create mode 100644 src/metabase/util/performance.clj diff --git a/src/metabase/formatter.clj b/src/metabase/formatter.clj index 6d2d1a69560..3653719f4c3 100644 --- a/src/metabase/formatter.clj +++ b/src/metabase/formatter.clj @@ -68,8 +68,10 @@ java.lang.Float (format "%.20f" value) (str value)) (strip-trailing-zeroes (str decimal))) - d (last (str/split val-string #"[^\d*]"))] - (count d))))) + decimal-idx (str/index-of val-string decimal)] + (if decimal-idx + (- (count val-string) decimal-idx 1) + 0))))) (defn- sig-figs-after-decimal [value decimal] @@ -123,6 +125,8 @@ (:type/Currency global-settings)) (:type/Number global-settings) column-settings) + currency (when currency? + (keyword (or currency "USD"))) integral? (isa? (or effective_type base_type) :type/Integer) relation? (isa? semantic_type :Relation/*) percent? (or (isa? semantic_type :type/Percentage) (= number-style "percent")) @@ -134,7 +138,10 @@ (cond-> decimal (.setDecimalSeparator decimal)) (cond-> grouping (.setGroupingSeparator grouping))) base (cond-> (if (or scientific? relation?) "0" "#,##0") - (not grouping) (str/replace #"," ""))] + (not grouping) (str/replace #"," "")) + ;; A small cache of decimal-digits->formatter to avoid creating new ones all the time. This cache is bound by + ;; the maximum number of decimal digits we can format which is 20 (constrained by `digits-after-decimal`). + fmtr-cache (volatile! {})] (fn [value] (if (number? value) (let [scaled-value (cond-> (* value (or scale 1)) @@ -144,7 +151,12 @@ decimal-digits (cond decimals decimals ;; if user ever specifies # of decimals, use that integral? 0 - currency? (get-in currency/currency [(keyword (or currency "USD")) :decimal_digits]) + scientific? (min 2 (max decimals-in-value + ;; Scientific representation can introduce its own decimal + ;; digits even in integer numbers. Count how many integer + ;; digits are in the number (but limit to 2). + (int (Math/log10 (abs scaled-value))))) + currency? (get-in currency/currency [currency :decimal_digits]) percent? (min 2 decimals-in-value) ;; 5.5432 -> %554.32 :else (if (>= (abs scaled-value) 1) (min 2 decimals-in-value) ;; values greater than 1 round to 2 decimal places @@ -152,27 +164,36 @@ (if (> n-figs 2) (max 2 (- decimals-in-value (- n-figs 2))) ;; values less than 1 round to 2 sig-dig decimals-in-value)))) - fmt-str (cond-> base - (not (zero? decimal-digits)) (str "." (apply str (repeat decimal-digits "0"))) - scientific? (str "E0")) - fmtr (doto (DecimalFormat. fmt-str symbols) (.setRoundingMode RoundingMode/HALF_UP))] - (map->NumericWrapper - {:num-value value - :num-str (let [inline-currency? (and currency? - (false? (::mb.viz/currency-in-header column-settings)))] - (str (when prefix prefix) - (when (and inline-currency? (or (nil? currency-style) - (= currency-style "symbol"))) - (get-in currency/currency [(keyword (or currency "USD")) :symbol])) - (when (and inline-currency? (= currency-style "code")) - (str (get-in currency/currency [(keyword (or currency "USD")) :code]) \space)) - (cond-> (.format fmtr scaled-value) - (and (not currency?) (not decimals)) - (strip-trailing-zeroes decimal) - percent? (str "%")) - (when (and inline-currency? (= currency-style "name")) - (str \space (get-in currency/currency [(keyword (or currency "USD")) :name_plural]))) - (when suffix suffix)))})) + fmtr (or (@fmtr-cache decimal-digits) + (let [fmt-str (cond-> base + (not (zero? decimal-digits)) (str "." (apply str (repeat decimal-digits "0"))) + scientific? (str "E0")) + fmtr (doto (DecimalFormat. fmt-str symbols) (.setRoundingMode RoundingMode/HALF_UP))] + (vswap! fmtr-cache assoc decimal-digits fmtr) + fmtr))] + (->NumericWrapper + (let [inline-currency? (and currency? + (false? (::mb.viz/currency-in-header column-settings))) + sb (StringBuilder.)] + ;; Using explicit StringBuilder to avoid touching the slow `clojure.core/str` multi-arity. + (when prefix (.append sb prefix)) + (when (and inline-currency? (or (nil? currency-style) + (= currency-style "symbol"))) + (.append sb (get-in currency/currency [currency :symbol]))) + (when (and inline-currency? (= currency-style "code")) + (.append sb (get-in currency/currency [currency :code])) + (.append sb \space)) + (.append sb (cond-> (.format ^DecimalFormat fmtr scaled-value) + (and (not currency?) (not decimals)) + (strip-trailing-zeroes decimal))) + (when percent? + (.append sb "%")) + (when (and inline-currency? (= currency-style "name")) + (.append sb \space) + (.append sb (get-in currency/currency [currency :name_plural]))) + (when suffix (.append sb suffix)) + (str sb)) + value)) value)))) (mu/defn format-number :- (ms/InstanceOfClass NumericWrapper) diff --git a/src/metabase/query_processor/middleware/format_rows.clj b/src/metabase/query_processor/middleware/format_rows.clj index a2aec654f49..86b983c21f7 100644 --- a/src/metabase/query_processor/middleware/format_rows.clj +++ b/src/metabase/query_processor/middleware/format_rows.clj @@ -6,6 +6,7 @@ [metabase.query-processor.timezone :as qp.timezone] [metabase.util.date-2 :as u.date] [metabase.util.log :as log] + [metabase.util.performance :as perf] [potemkin.types :as p.types]) (:import (java.time Instant LocalDate LocalDateTime LocalTime OffsetDateTime OffsetTime ZonedDateTime ZoneId))) @@ -61,7 +62,7 @@ ;; a column will have `converted_timezone` metadata if it is the result of `convert-timezone` expression ;; in that case, we'll format the results with the target timezone. ;; Otherwise format it with results-timezone - cols-zone-id (map #(t/zone-id (get % :converted_timezone timezone-id)) (:cols metadata))] + cols-zone-id (perf/mapv #(t/zone-id (get % :converted_timezone timezone-id)) (:cols metadata))] (fn ([] (rf)) @@ -70,7 +71,7 @@ (rf result)) ([result row] - (rf result (mapv format-value row cols-zone-id)))))) + (rf result (perf/mapv format-value row cols-zone-id)))))) (defn format-rows "Format individual query result values as needed. Ex: format temporal values as ISO-8601 strings w/ timezone offset." diff --git a/src/metabase/query_processor/streaming/csv.clj b/src/metabase/query_processor/streaming/csv.clj index 383e747f997..9dd0588d45d 100644 --- a/src/metabase/query_processor/streaming/csv.clj +++ b/src/metabase/query_processor/streaming/csv.clj @@ -1,13 +1,14 @@ (ns metabase.query-processor.streaming.csv (:require - [clojure.data.csv :as csv] + [clojure.data.csv] [java-time.api :as t] [metabase.formatter :as formatter] [metabase.query-processor.pivot.postprocess :as qp.pivot.postprocess] [metabase.query-processor.streaming.common :as common] [metabase.query-processor.streaming.interface :as qp.si] [metabase.shared.models.visualization-settings :as mb.viz] - [metabase.util.date-2 :as u.date]) + [metabase.util.date-2 :as u.date] + [metabase.util.performance :as perf]) (:import (java.io BufferedWriter OutputStream OutputStreamWriter) (java.nio.charset StandardCharsets))) @@ -35,6 +36,38 @@ Disabled by default and should remain disabled until Issue #44556 is resolved and a clear plan is made." false) +(defn- write-csv + "Custom implementation of `clojure.data.csv/write-csv` with a more efficient quote? predicate and no support for + options (we don't use them)." + [writer data] + (let [separator \, + quote \" + quote? (fn [^String s] + (let [n (.length s)] + (loop [i 0] + (if (>= i n) false + (let [ch (.charAt s (unchecked-int i))] + (if (or (= ch \,) ;; separator + (= ch \") ;; quote + (= ch \return) + (= ch \newline)) + true + (recur (unchecked-inc i)))))))) + newline "\n"] + (#'clojure.data.csv/write-csv* writer data separator quote quote? newline))) + +;; Rebind write-cell to avoid using clojure.core/escape. Instead, use String.replace with known arguments (we never +;; change quote symbol anyway). +(.bindRoot #'clojure.data.csv/write-cell + (fn [^java.io.Writer writer obj _ _ quote?] + (let [^String string (str obj) + must-quote (quote? string)] + (when must-quote (.write writer "\"")) + (.write writer (if must-quote + (.replace string "\"" "\"\"") + string)) + (when must-quote (.write writer "\""))))) + (defmethod qp.si/streaming-results-writer :csv [_ ^OutputStream os] (let [writer (BufferedWriter. (OutputStreamWriter. os StandardCharsets/UTF_8)) @@ -58,7 +91,7 @@ (vec (repeat (count ordered-cols) identity)))) ;; write the column names for non-pivot tables (when col-names - (csv/write-csv writer [col-names]) + (write-csv writer [col-names]) (.flush writer)))) (write-row! [_ row _row-num _ {:keys [output-order]}] @@ -66,15 +99,15 @@ (let [row-v (into [] row)] (for [i output-order] (row-v i))) row) - xf-row (mapv (fn [formatter r] - (formatter (common/format-value r))) - @ordered-formatters ordered-row)] + xf-row (perf/mapv (fn [formatter r] + (formatter (common/format-value r))) + @ordered-formatters ordered-row)] (if @pivot-options ;; if we're processing a pivot result, we don't write it out yet, just store it ;; so that we can post process the full set of results in finish! (swap! rows! conj xf-row) (do - (csv/write-csv writer [xf-row]) + (write-csv writer [xf-row]) (.flush writer))))) (finish! [_ _] @@ -82,7 +115,7 @@ (when @pivot-options (let [pivot-table-rows (qp.pivot.postprocess/pivot-builder @rows! @pivot-options)] (doseq [xf-row pivot-table-rows] - (csv/write-csv writer [xf-row])))) + (write-csv writer [xf-row])))) (.flush writer) (.flush os) (.close writer))))) diff --git a/src/metabase/util/performance.clj b/src/metabase/util/performance.clj new file mode 100644 index 00000000000..58786746ac1 --- /dev/null +++ b/src/metabase/util/performance.clj @@ -0,0 +1,71 @@ +(ns metabase.util.performance + "Functions and utilities for faster processing." + (:refer-clojure :exclude [reduce mapv])) + +(set! *warn-on-reflection* true) + +(defn reduce + "Like `clojure.core/reduce`, but uses iterators under the hood to walk the collections and can iterate several + collections at once. The function `f` accepts the number of arguments that is the number of iterated collections + + 1 (accumulator)." + ([f init coll1] + (if (nil? coll1) + init + (let [it1 (.iterator ^Iterable coll1)] + (loop [res init] + (if (.hasNext it1) + (let [res (f res (.next it1))] + (if (reduced? res) + @res + (recur res))) + res))))) + ([f init coll1 coll2] + (if (or (nil? coll1) (nil? coll2)) + init + (let [it1 (.iterator ^Iterable coll1) + it2 (.iterator ^Iterable coll2)] + (loop [res init] + (if (and (.hasNext it1) (.hasNext it2)) + (let [res (f res (.next it1) (.next it2))] + (if (reduced? res) + @res + (recur res))) + res))))) + ([f init coll1 coll2 coll3] + (if (or (nil? coll1) (nil? coll2) (nil? coll3)) + init + (let [it1 (.iterator ^Iterable coll1) + it2 (.iterator ^Iterable coll2) + it3 (.iterator ^Iterable coll3)] + (loop [res init] + (if (and (.hasNext it1) (.hasNext it2) (.hasNext it3)) + (let [res (f res (.next it1) (.next it2) (.next it3))] + (if (reduced? res) + @res + (recur res))) + res))))) + ([f init coll1 coll2 coll3 coll4] + (if (or (nil? coll1) (nil? coll2) (nil? coll3) (nil? coll4)) + init + (let [it1 (.iterator ^Iterable coll1) + it2 (.iterator ^Iterable coll2) + it3 (.iterator ^Iterable coll3) + it4 (.iterator ^Iterable coll4)] + (loop [res init] + (if (and (.hasNext it1) (.hasNext it2) (.hasNext it3) (.hasNext it4)) + (let [res (f res (.next it1) (.next it2) (.next it3) (.next it4))] + (if (reduced? res) + @res + (recur res))) + res)))))) + +(defn mapv + "Like `clojure.core/mapv`, but iterates multiple collections more effectively and uses Java iterators under the hood." + ([f coll1] + (persistent! (reduce #(conj! %1 (f %2)) (transient []) coll1))) + ([f coll1 coll2] + (persistent! (reduce #(conj! %1 (f %2 %3)) (transient []) coll1 coll2))) + ([f coll1 coll2 coll3] + (persistent! (reduce #(conj! %1 (f %2 %3 %4)) (transient []) coll1 coll2 coll3))) + ([f coll1 coll2 coll3 coll4] + (persistent! (reduce #(conj! %1 (f %2 %3 %4 %5)) (transient []) coll1 coll2 coll3 coll4)))) diff --git a/test/metabase/formatter_test.clj b/test/metabase/formatter_test.clj index 19a0e308aa8..3ac8f46dbda 100644 --- a/test/metabase/formatter_test.clj +++ b/test/metabase/formatter_test.clj @@ -65,7 +65,16 @@ ::mb.viz/decimals 2}))) (is (= "1.2346E4" (fmt {::mb.viz/number-style "scientific" ::mb.viz/decimals 4}))) - (is (= "-1.23E4" (format -12345 {::mb.viz/number-style "scientific"})))) + (is (= "1E0" (format 1 {::mb.viz/number-style "scientific"}))) + (is (= "1.2E1" (format 12 {::mb.viz/number-style "scientific"}))) + (is (= "1.23E2" (format 123 {::mb.viz/number-style "scientific"}))) + (is (= "1.23E3" (format 1234 {::mb.viz/number-style "scientific"}))) + (is (= "1.23E4" (format 12345 {::mb.viz/number-style "scientific"}))) + (is (= "-1E0" (format -1 {::mb.viz/number-style "scientific"}))) + (is (= "-1.2E1" (format -12 {::mb.viz/number-style "scientific"}))) + (is (= "-1.23E2" (format -123 {::mb.viz/number-style "scientific"}))) + (is (= "-1.23E3" (format -1234 {::mb.viz/number-style "scientific"}))) + (is (= "-1.23E4" (format -12345 {::mb.viz/number-style "scientific"})))) (testing "Percentage" (is (= "1,234,554.32%" (fmt {::mb.viz/number-style "percent"}))) (is (= "1.234.554,3200%" -- GitLab