Skip to content
Snippets Groups Projects
Unverified Commit 520db041 authored by adam-james's avatar adam-james Committed by GitHub
Browse files

XLSX Pivot Export - Initialize Pivot Table with Small Area Ref (#50060)


* XLSX Pivot Export - Initialize Pivot Table with Small Area Ref

This PR changes the native pivot export so that we first initialize the pivot table with an area ref that's only the
first 2 rows. Previously, I used `AreaReference/getWholeColumn`, which had the side effect of using lots of memory.

I think this is related to how the Pivot Table's Cache is built up as you add rows/cols to the pivot definition. So,
if you keep the area ref only as wide as the number of columns in the raw data, and just 2 rows, then the cache can
stay small.

After adding row/column data to the pivot table, you can then modify the area ref to be all rows in the relevant
columns (`A:E` for example).

This keeps the pivot-table object much smaller in size.

* need the extra xml schemas to get this to work

I'd like to try to find a way around including this extra dep; it's a 13 MB jar (ish), and I think that's a little bigger
than I'd like.

* oops, didn't mean to have this in

* Add test for pivot table initialization being fast and lean on memory

* remove time based assertion, add comments in impl

* cljfmt

* fix test

* Pesky little whitespace

---------

Co-authored-by: Oleksandr Yakushev <alex@bytopia.org>
parent 09483a66
No related branches found
No related tags found
No related merge requests found
...@@ -600,9 +600,11 @@ ...@@ -600,9 +600,11 @@
pivot-sheet (spreadsheet/select-sheet "pivot" wb) pivot-sheet (spreadsheet/select-sheet "pivot" wb)
col-names (common/column-titles ordered-cols col-settings format-rows?) col-names (common/column-titles ordered-cols col-settings format-rows?)
_ (add-row! data-sheet col-names ordered-cols col-settings cell-styles typed-cell-styles) _ (add-row! data-sheet col-names ordered-cols col-settings cell-styles typed-cell-styles)
area-ref (AreaReference/getWholeColumn SpreadsheetVersion/EXCEL2007 ;; keep the initial area-ref small (only 2 rows) so that adding row and column labels keeps the pivot table
"A" ;; object small.
(CellReference/convertNumToColString (dec (count ordered-cols)))) area-ref (AreaReference.
(format "A1:%s2" (CellReference/convertNumToColString (dec (count ordered-cols))))
SpreadsheetVersion/EXCEL2007)
^XSSFPivotTable pivot-table (.createPivotTable ^XSSFSheet pivot-sheet ^XSSFPivotTable pivot-table (.createPivotTable ^XSSFSheet pivot-sheet
^AreaReference area-ref ^AreaReference area-ref
(CellReference. 0 0) (CellReference. 0 0)
...@@ -623,6 +625,13 @@ ...@@ -623,6 +625,13 @@
.getPivotFields .getPivotFields
(.getPivotFieldArray idx) (.getPivotFieldArray idx)
(.setSortType setting))))) (.setSortType setting)))))
;; now that the Pivot Table Rows and Cols are set, we can update the area-ref
(-> pivot-table
.getPivotCacheDefinition
.getCTPivotCacheDefinition
.getCacheSource
.getWorksheetSource
(.setRef (format "A:%s" (CellReference/convertNumToColString (dec (count ordered-cols))))))
(let [swb (-> (SXSSFWorkbook. ^XSSFWorkbook wb) (let [swb (-> (SXSSFWorkbook. ^XSSFWorkbook wb)
(doto (.setCompressTempFiles true))) (doto (.setCompressTempFiles true)))
sheet (spreadsheet/select-sheet "data" swb)] sheet (spreadsheet/select-sheet "data" swb)]
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
[metabase.test :as mt]) [metabase.test :as mt])
(:import (:import
(com.fasterxml.jackson.core JsonGenerator) (com.fasterxml.jackson.core JsonGenerator)
(java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream))) com.sun.management.ThreadMXBean
(java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream)
java.lang.management.ManagementFactory))
(set! *warn-on-reflection* true) (set! *warn-on-reflection* true)
...@@ -390,7 +392,7 @@ ...@@ -390,7 +392,7 @@
[sheet] [sheet]
(mapv (fn [row] (mapv (fn [row]
(mapv spreadsheet/read-cell row)) (mapv spreadsheet/read-cell row))
(spreadsheet/into-seq sheet))) (some-> sheet spreadsheet/into-seq)))
(defn parse-xlsx-results (defn parse-xlsx-results
"Given a byte array representing an XLSX document, parses the query result sheet using the provided `parse-fn`" "Given a byte array representing an XLSX document, parses the query result sheet using the provided `parse-fn`"
...@@ -404,20 +406,20 @@ ...@@ -404,20 +406,20 @@
(parse-fn sheet))))) (parse-fn sheet)))))
(defn- xlsx-export (defn- xlsx-export
([ordered-cols viz-settings rows] [ordered-cols viz-settings rows & {:keys [parse-fn pivot-export-options]}]
(xlsx-export ordered-cols viz-settings rows parse-cell-content)) (with-open [bos (ByteArrayOutputStream.)
os (BufferedOutputStream. bos)]
([ordered-cols viz-settings rows parse-fn] (let [results-writer (qp.si/streaming-results-writer :xlsx os)]
(with-open [bos (ByteArrayOutputStream.) (qp.si/begin! results-writer {:data {:ordered-cols ordered-cols
os (BufferedOutputStream. bos)] :pivot? (some? pivot-export-options)
(let [results-writer (qp.si/streaming-results-writer :xlsx os)] :pivot-export-options pivot-export-options}}
(qp.si/begin! results-writer {:data {:ordered-cols ordered-cols}} viz-settings) viz-settings)
(doall (map-indexed (doall (map-indexed
(fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings)) (fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings))
rows)) rows))
(qp.si/finish! results-writer {:row_count (count rows)})) (qp.si/finish! results-writer {:row_count (count rows)}))
(let [bytea (.toByteArray bos)] (let [bytea (.toByteArray bos)]
(parse-xlsx-results bytea parse-fn))))) (parse-xlsx-results bytea (or parse-fn parse-cell-content)))))
(defn- parse-format-strings (defn- parse-format-strings
[sheet] [sheet]
...@@ -426,6 +428,9 @@ ...@@ -426,6 +428,9 @@
(.. cell getCellStyle getDataFormatString)) (.. cell getCellStyle getDataFormatString))
row))) row)))
(defn- get-allocated-bytes
  "Returns the result of `getCurrentThreadAllocatedBytes` on the platform thread MX bean, treated as a
  `com.sun.management.ThreadMXBean`. Per the JDK documentation this is an approximation of the total number of
  bytes allocated on the heap by the calling thread so far."
  []
  (let [^ThreadMXBean thread-bean (ManagementFactory/getThreadMXBean)]
    (.getCurrentThreadAllocatedBytes thread-bean)))
(deftest export-format-test (deftest export-format-test
(mt/with-temporary-setting-values [custom-formatting {}] (mt/with-temporary-setting-values [custom-formatting {}]
(testing "Different format strings are used for ints and numbers that round to ints (with 2 decimal places)" (testing "Different format strings are used for ints and numbers that round to ints (with 2 decimal places)"
...@@ -433,7 +438,7 @@ ...@@ -433,7 +438,7 @@
(rest (xlsx-export [{:field_ref [:field 0] :name "Col" :semantic_type :type/Cost}] (rest (xlsx-export [{:field_ref [:field 0] :name "Col" :semantic_type :type/Cost}]
{} {}
[[1] [1.23] [1.004] [1.005] [10000000000] [10000000000.123]] [[1] [1.23] [1.004] [1.005] [10000000000] [10000000000.123]]
parse-format-strings))))) :parse-fn parse-format-strings)))))
(testing "Misc format strings are included correctly in exports" (testing "Misc format strings are included correctly in exports"
(is (= ["[$€]#,##0.00"] (is (= ["[$€]#,##0.00"]
...@@ -442,7 +447,7 @@ ...@@ -442,7 +447,7 @@
{::mb.viz/currency "EUR" {::mb.viz/currency "EUR"
::mb.viz/currency-in-header false}}} ::mb.viz/currency-in-header false}}}
[[1.23]] [[1.23]]
parse-format-strings)))) :parse-fn parse-format-strings))))
(is (= ["yyyy.m.d, h:mm:ss am/pm"] (is (= ["yyyy.m.d, h:mm:ss am/pm"]
(second (xlsx-export [{:field_ref [:field 0] :name "Col" :effective_type :type/Temporal}] (second (xlsx-export [{:field_ref [:field 0] :name "Col" :effective_type :type/Temporal}]
{::mb.viz/column-settings {{::mb.viz/field-id 0} {::mb.viz/column-settings {{::mb.viz/field-id 0}
...@@ -451,7 +456,7 @@ ...@@ -451,7 +456,7 @@
::mb.viz/time-style "h:mm A", ::mb.viz/time-style "h:mm A",
::mb.viz/time-enabled "seconds"}}} ::mb.viz/time-enabled "seconds"}}}
[[#t "2020-03-28T10:12:06.681"]] [[#t "2020-03-28T10:12:06.681"]]
parse-format-strings))))))) :parse-fn parse-format-strings)))))))
(deftest column-order-test (deftest column-order-test
(testing "Column titles are ordered correctly in the output" (testing "Column titles are ordered correctly in the output"
...@@ -675,22 +680,22 @@ ...@@ -675,22 +680,22 @@
(xlsx-export [{:id 0, :name "Col1"} {:id 1, :name "Col2"}] (xlsx-export [{:id 0, :name "Col1"} {:id 1, :name "Col2"}]
{} {}
[["a" "abcdefghijklmnopqrstuvwxyz"]] [["a" "abcdefghijklmnopqrstuvwxyz"]]
assert-non-default-widths)) :parse-fn assert-non-default-widths))
(testing "Auto-sizing works when the number of rows is at or above the auto-sizing threshold" (testing "Auto-sizing works when the number of rows is at or above the auto-sizing threshold"
(binding [qp.xlsx/*auto-sizing-threshold* 2] (binding [qp.xlsx/*auto-sizing-threshold* 2]
(xlsx-export [{:id 0, :name "Col1"}] (xlsx-export [{:id 0, :name "Col1"}]
{} {}
[["abcdef"] ["abcedf"]] [["abcdef"] ["abcedf"]]
assert-non-default-widths) :parse-fn assert-non-default-widths)
(xlsx-export [{:id 0, :name "Col1"}] (xlsx-export [{:id 0, :name "Col1"}]
{} {}
[["abcdef"] ["abcedf"] ["abcdef"]] [["abcdef"] ["abcedf"] ["abcdef"]]
assert-non-default-widths))) :parse-fn assert-non-default-widths)))
(testing "An auto-sized column does not exceed max-column-width (the width of 255 characters)" (testing "An auto-sized column does not exceed max-column-width (the width of 255 characters)"
(let [[col-width] (xlsx-export [{:id 0, :name "Col1"}] (let [[col-width] (xlsx-export [{:id 0, :name "Col1"}]
{} {}
[[(apply str (repeat 256 "0"))]] [[(apply str (repeat 256 "0"))]]
parse-column-widths)] :parse-fn parse-column-widths)]
(is (= 65280 col-width))))) (is (= 65280 col-width)))))
(deftest poi-tempfiles-test (deftest poi-tempfiles-test
...@@ -748,3 +753,24 @@ ...@@ -748,3 +753,24 @@
{} {}
[[1] [[1]
[2]])))))) [2]]))))))
(deftest pivot-table-resource-usage-test
  (testing "pivot table initialization should complete in reasonable time and memory"
    ;; The export below has zero data rows but does pass pivot options, so what it exercises is the setup of the
    ;; pivot machinery. That setup used to allocate and retain a lot of memory (and hence was slow on smaller
    ;; heaps).
    (let [run-export       (fn []
                             (xlsx-export [{:display_name "A"}
                                           {:display_name "B"}
                                           {:display_name "C"}
                                           {:display_name "D"}
                                           {:display_name "pivot-grouping"}
                                           {:display_name "E"}
                                           {:display_name "F"}]
                                          {}
                                          []
                                          :pivot-export-options {:pivot-rows [0 1], :pivot-cols [2 3], :pivot-measures [5 4]}))
          ;; Allocation budget for a single export: 100 MiB.
          allocation-limit (* 100 1024 1024)]
      ;; The first export is a warm-up only; its allocations are not measured. This reduces flakiness.
      (run-export)
      ;; Take the baseline immediately before the measured export. The counter is read on the current thread, so
      ;; the assertion compares allocations made by this thread between the two readings.
      (let [bytes-before (get-allocated-bytes)]
        (run-export)
        (is (< (- (get-allocated-bytes) bytes-before) allocation-limit))))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment