From 5c7d398b0e1837cec9bf1b9f26a1fe0f93e81337 Mon Sep 17 00:00:00 2001 From: github-automation-metabase <166700802+github-automation-metabase@users.noreply.github.com> Date: Fri, 15 Nov 2024 18:23:58 -0500 Subject: [PATCH] XLSX Pivot Export - Initialize Pivot Table with Small Area Ref (#50060) (#50111) * XLSX Pivot Export - Initialize Pivot Table with Small Area Ref This PR changes the native pivot export so that we first initialize the pivot table with an area ref that's only the first 2 rows. Prior, I used `AreaReference/GetWholecolumn` which had the side effect of using lots of memory. I think this is related to how the Pivot Table's Cache is built up as you add rows/cols to the pivot definition. So, if you keep the area ref only as wide as the number of columns in the raw data, and just 2 rows, then the cache can stay small. After adding row/column data to the pivot table, you can then modify the area ref to be all rows in the relevant columns (`A:E` for example). This keeps the pivot-table object much smaller in size. * need the extra xml schemas to get this to work I'd like to try find a way around including this extra dep, it's a 13mb jar (ish), and I think that's a little bigger than I'd like * oops, didnt mean to have this in * Add test for pivot table initialization being fast and lean on memory * remove time based assertion, add comments in impl * cljfmt * fix test * Pesky little whitespace --------- Co-authored-by: adam-james <21064735+adam-james-v@users.noreply.github.com> Co-authored-by: Oleksandr Yakushev <alex@bytopia.org> --- .../query_processor/streaming/xlsx.clj | 15 +++- .../query_processor/streaming/xlsx_test.clj | 72 +++++++++++++------ 2 files changed, 61 insertions(+), 26 deletions(-) diff --git a/src/metabase/query_processor/streaming/xlsx.clj b/src/metabase/query_processor/streaming/xlsx.clj index 8e81cd6b385..664b91fd8aa 100644 --- a/src/metabase/query_processor/streaming/xlsx.clj +++ b/src/metabase/query_processor/streaming/xlsx.clj @@ -600,9 +600,11 @@ pivot-sheet (spreadsheet/select-sheet "pivot" wb) col-names (common/column-titles ordered-cols col-settings format-rows?) _ (add-row! data-sheet col-names ordered-cols col-settings cell-styles typed-cell-styles) - area-ref (AreaReference/getWholeColumn SpreadsheetVersion/EXCEL2007 - "A" - (CellReference/convertNumToColString (dec (count ordered-cols)))) + ;; keep the initial area-ref small (only 2 rows) so that adding row and column labels keeps the pivot table + ;; object small. + area-ref (AreaReference. + (format "A1:%s2" (CellReference/convertNumToColString (dec (count ordered-cols)))) + SpreadsheetVersion/EXCEL2007) ^XSSFPivotTable pivot-table (.createPivotTable ^XSSFSheet pivot-sheet ^AreaReference area-ref (CellReference. 0 0) @@ -623,6 +625,13 @@ .getPivotFields (.getPivotFieldArray idx) (.setSortType setting))))) + ;; now that the Pivot Table Rows and Cols are set, we can update the area-ref + (-> pivot-table + .getPivotCacheDefinition + .getCTPivotCacheDefinition + .getCacheSource + .getWorksheetSource + (.setRef (format "A:%s" (CellReference/convertNumToColString (dec (count ordered-cols)))))) (let [swb (-> (SXSSFWorkbook. ^XSSFWorkbook wb) (doto (.setCompressTempFiles true))) sheet (spreadsheet/select-sheet "data" swb)] diff --git a/test/metabase/query_processor/streaming/xlsx_test.clj b/test/metabase/query_processor/streaming/xlsx_test.clj index effb7850012..1fbf58de10e 100644 --- a/test/metabase/query_processor/streaming/xlsx_test.clj +++ b/test/metabase/query_processor/streaming/xlsx_test.clj @@ -12,7 +12,9 @@ [metabase.test :as mt]) (:import (com.fasterxml.jackson.core JsonGenerator) - (java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream))) + com.sun.management.ThreadMXBean + (java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream) + java.lang.management.ManagementFactory)) (set! *warn-on-reflection* true) @@ -390,7 +392,7 @@ [sheet] (mapv (fn [row] (mapv spreadsheet/read-cell row)) - (spreadsheet/into-seq sheet))) + (some-> sheet spreadsheet/into-seq))) (defn parse-xlsx-results "Given a byte array representing an XLSX document, parses the query result sheet using the provided `parse-fn`" @@ -404,20 +406,20 @@ (parse-fn sheet))))) (defn- xlsx-export - ([ordered-cols viz-settings rows] - (xlsx-export ordered-cols viz-settings rows parse-cell-content)) - - ([ordered-cols viz-settings rows parse-fn] - (with-open [bos (ByteArrayOutputStream.) - os (BufferedOutputStream. bos)] - (let [results-writer (qp.si/streaming-results-writer :xlsx os)] - (qp.si/begin! results-writer {:data {:ordered-cols ordered-cols}} viz-settings) - (doall (map-indexed - (fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings)) - rows)) - (qp.si/finish! results-writer {:row_count (count rows)})) - (let [bytea (.toByteArray bos)] - (parse-xlsx-results bytea parse-fn))))) + [ordered-cols viz-settings rows & {:keys [parse-fn pivot-export-options]}] + (with-open [bos (ByteArrayOutputStream.) + os (BufferedOutputStream. bos)] + (let [results-writer (qp.si/streaming-results-writer :xlsx os)] + (qp.si/begin! results-writer {:data {:ordered-cols ordered-cols + :pivot? (some? pivot-export-options) + :pivot-export-options pivot-export-options}} + viz-settings) + (doall (map-indexed + (fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings)) + rows)) + (qp.si/finish! results-writer {:row_count (count rows)})) + (let [bytea (.toByteArray bos)] + (parse-xlsx-results bytea (or parse-fn parse-cell-content))))) (defn- parse-format-strings [sheet] @@ -426,6 +428,9 @@ (.. cell getCellStyle getDataFormatString)) row))) +(defn- get-allocated-bytes [] + (.getCurrentThreadAllocatedBytes ^ThreadMXBean (ManagementFactory/getThreadMXBean))) + (deftest export-format-test (mt/with-temporary-setting-values [custom-formatting {}] (testing "Different format strings are used for ints and numbers that round to ints (with 2 decimal places)" @@ -433,7 +438,7 @@ (rest (xlsx-export [{:field_ref [:field 0] :name "Col" :semantic_type :type/Cost}] {} [[1] [1.23] [1.004] [1.005] [10000000000] [10000000000.123]] - parse-format-strings))))) + :parse-fn parse-format-strings))))) (testing "Misc format strings are included correctly in exports" (is (= ["[$€]#,##0.00"] @@ -442,7 +447,7 @@ {::mb.viz/currency "EUR" ::mb.viz/currency-in-header false}}} [[1.23]] - parse-format-strings)))) + :parse-fn parse-format-strings)))) (is (= ["yyyy.m.d, h:mm:ss am/pm"] (second (xlsx-export [{:field_ref [:field 0] :name "Col" :effective_type :type/Temporal}] {::mb.viz/column-settings {{::mb.viz/field-id 0} @@ -451,7 +456,7 @@ ::mb.viz/time-style "h:mm A", ::mb.viz/time-enabled "seconds"}}} [[#t "2020-03-28T10:12:06.681"]] - parse-format-strings))))))) + :parse-fn parse-format-strings))))))) (deftest column-order-test (testing "Column titles are ordered correctly in the output" @@ -675,22 +680,22 @@ (xlsx-export [{:id 0, :name "Col1"} {:id 1, :name "Col2"}] {} [["a" "abcdefghijklmnopqrstuvwxyz"]] - assert-non-default-widths)) + :parse-fn assert-non-default-widths)) (testing "Auto-sizing works when the number of rows is at or above the auto-sizing threshold" (binding [qp.xlsx/*auto-sizing-threshold* 2] (xlsx-export [{:id 0, :name "Col1"}] {} [["abcdef"] ["abcedf"]] - assert-non-default-widths) + :parse-fn assert-non-default-widths) (xlsx-export [{:id 0, :name "Col1"}] {} [["abcdef"] ["abcedf"] ["abcdef"]] - assert-non-default-widths))) + :parse-fn assert-non-default-widths))) (testing "An auto-sized column does not exceed max-column-width (the width of 255 characters)" (let [[col-width] (xlsx-export [{:id 0, :name "Col1"}] {} [[(apply str (repeat 256 "0"))]] - parse-column-widths)] + :parse-fn parse-column-widths)] (is (= 65280 col-width))))) (deftest poi-tempfiles-test @@ -748,3 +753,24 @@ {} [[1] [2]])))))) + +(deftest pivot-table-resource-usage-test + (testing "pivot table initialization should complete in reasonable time and memory" + ;; We test XLSX export of an empty table (0 rows) with pivoting enabled. This should test the initialization of + ;; pivot machinery that used to allocate and retain a lot of memory (and hence was slow on smaller heaps). + (let [do-export #(xlsx-export [{:display_name "A"} + {:display_name "B"} + {:display_name "C"} + {:display_name "D"} + {:display_name "pivot-grouping"} + {:display_name "E"} + {:display_name "F"}] + {} + [] + :pivot-export-options {:pivot-rows [0 1], :pivot-cols [2 3], :pivot-measures [5 4]}) + ;; Run once before measuring to warm-up and thus reduce flakiness. + _ (do-export) + start-bytes (get-allocated-bytes)] + (do-export) + ;; Should always allocate less than 100Mb. + (is (< (- (get-allocated-bytes) start-bytes) (* 100 1024 1024)))))) -- GitLab