Skip to content
Snippets Groups Projects
Unverified Commit 520db041 authored by adam-james, committed by GitHub
Browse files

XLSX Pivot Export - Initialize Pivot Table with Small Area Ref (#50060)


* XLSX Pivot Export - Initialize Pivot Table with Small Area Ref

This PR changes the native pivot export so that we first initialize the pivot table with an area ref that's only the
first 2 rows. Previously, I used `AreaReference/getWholeColumn`, which had the side effect of using lots of memory.

I think this is related to how the Pivot Table's Cache is built up as you add rows/cols to the pivot definition. So,
if you keep the area ref only as wide as the number of columns in the raw data, and just 2 rows, then the cache can
stay small.

After adding row/column data to the pivot table, you can then modify the area ref to be all rows in the relevant
columns (`A:E` for example).

This keeps the pivot-table object much smaller in size.

* need the extra xml schemas to get this to work

I'd like to try to find a way around including this extra dep; it's a 13 MB jar (ish), and I think that's a little bigger
than I'd like.

* oops, didn't mean to have this in

* Add test for pivot table initialization being fast and lean on memory

* remove time based assertion, add comments in impl

* cljfmt

* fix test

* Pesky little whitespace

---------

Co-authored-by: Oleksandr Yakushev <alex@bytopia.org>
parent 09483a66
No related branches found
No related tags found
No related merge requests found
......@@ -600,9 +600,11 @@
pivot-sheet (spreadsheet/select-sheet "pivot" wb)
col-names (common/column-titles ordered-cols col-settings format-rows?)
_ (add-row! data-sheet col-names ordered-cols col-settings cell-styles typed-cell-styles)
area-ref (AreaReference/getWholeColumn SpreadsheetVersion/EXCEL2007
"A"
(CellReference/convertNumToColString (dec (count ordered-cols))))
;; keep the initial area-ref small (only 2 rows) so that adding row and column labels keeps the pivot table
;; object small.
area-ref (AreaReference.
(format "A1:%s2" (CellReference/convertNumToColString (dec (count ordered-cols))))
SpreadsheetVersion/EXCEL2007)
^XSSFPivotTable pivot-table (.createPivotTable ^XSSFSheet pivot-sheet
^AreaReference area-ref
(CellReference. 0 0)
......@@ -623,6 +625,13 @@
.getPivotFields
(.getPivotFieldArray idx)
(.setSortType setting)))))
;; now that the Pivot Table Rows and Cols are set, we can update the area-ref
(-> pivot-table
.getPivotCacheDefinition
.getCTPivotCacheDefinition
.getCacheSource
.getWorksheetSource
(.setRef (format "A:%s" (CellReference/convertNumToColString (dec (count ordered-cols))))))
(let [swb (-> (SXSSFWorkbook. ^XSSFWorkbook wb)
(doto (.setCompressTempFiles true)))
sheet (spreadsheet/select-sheet "data" swb)]
......
......@@ -12,7 +12,9 @@
[metabase.test :as mt])
(:import
(com.fasterxml.jackson.core JsonGenerator)
(java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream)))
com.sun.management.ThreadMXBean
(java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream)
java.lang.management.ManagementFactory))
(set! *warn-on-reflection* true)
......@@ -390,7 +392,7 @@
[sheet]
(mapv (fn [row]
(mapv spreadsheet/read-cell row))
(spreadsheet/into-seq sheet)))
(some-> sheet spreadsheet/into-seq)))
(defn parse-xlsx-results
"Given a byte array representing an XLSX document, parses the query result sheet using the provided `parse-fn`"
......@@ -404,20 +406,20 @@
(parse-fn sheet)))))
;; Test helper: streams `rows` through the `:xlsx` streaming results writer into an in-memory
;; `ByteArrayOutputStream`, then hands the resulting byte array to `parse-xlsx-results`.
;; NOTE(review): this span appears to be a rendered diff with its +/- markers stripped. The two-arity body directly
;; below looks like the version being removed, and the `& {:keys ...}` body after it like its replacement; both hang
;; off the single `(defn- xlsx-export` line. Confirm against the real file before relying on either.
(defn- xlsx-export
;; --- earlier form: optional positional `parse-fn`; the 3-arg arity delegates with `parse-cell-content` ---
([ordered-cols viz-settings rows]
(xlsx-export ordered-cols viz-settings rows parse-cell-content))
([ordered-cols viz-settings rows parse-fn]
(with-open [bos (ByteArrayOutputStream.)
os (BufferedOutputStream. bos)]
(let [results-writer (qp.si/streaming-results-writer :xlsx os)]
(qp.si/begin! results-writer {:data {:ordered-cols ordered-cols}} viz-settings)
(doall (map-indexed
(fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings))
rows))
(qp.si/finish! results-writer {:row_count (count rows)}))
(let [bytea (.toByteArray bos)]
(parse-xlsx-results bytea parse-fn)))))
;; --- later form: trailing keyword options instead of a positional `parse-fn` ---
;;   :parse-fn              passed to `parse-xlsx-results`; `parse-cell-content` is used when it is nil
;;   :pivot-export-options  forwarded to `begin!` under `:data`; `:pivot?` is true exactly when this is non-nil
[ordered-cols viz-settings rows & {:keys [parse-fn pivot-export-options]}]
(with-open [bos (ByteArrayOutputStream.)
os (BufferedOutputStream. bos)]
(let [results-writer (qp.si/streaming-results-writer :xlsx os)]
(qp.si/begin! results-writer {:data {:ordered-cols ordered-cols
:pivot? (some? pivot-export-options)
:pivot-export-options pivot-export-options}}
viz-settings)
;; `doall` forces the lazy `map-indexed` so every row is actually written before `finish!` runs.
(doall (map-indexed
(fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings))
rows))
(qp.si/finish! results-writer {:row_count (count rows)}))
(let [bytea (.toByteArray bos)]
(parse-xlsx-results bytea (or parse-fn parse-cell-content)))))
(defn- parse-format-strings
[sheet]
......@@ -426,6 +428,9 @@
(.. cell getCellStyle getDataFormatString))
row)))
(defn- get-allocated-bytes
  "Returns the allocated-bytes counter that the JVM's thread MXBean reports for the calling thread."
  []
  ;; The platform bean is hinted as `com.sun.management.ThreadMXBean` so the call below resolves without reflection.
  (let [^ThreadMXBean thread-bean (ManagementFactory/getThreadMXBean)]
    (.getCurrentThreadAllocatedBytes thread-bean)))
(deftest export-format-test
(mt/with-temporary-setting-values [custom-formatting {}]
(testing "Different format strings are used for ints and numbers that round to ints (with 2 decimal places)"
......@@ -433,7 +438,7 @@
(rest (xlsx-export [{:field_ref [:field 0] :name "Col" :semantic_type :type/Cost}]
{}
[[1] [1.23] [1.004] [1.005] [10000000000] [10000000000.123]]
parse-format-strings)))))
:parse-fn parse-format-strings)))))
(testing "Misc format strings are included correctly in exports"
(is (= ["[$€]#,##0.00"]
......@@ -442,7 +447,7 @@
{::mb.viz/currency "EUR"
::mb.viz/currency-in-header false}}}
[[1.23]]
parse-format-strings))))
:parse-fn parse-format-strings))))
(is (= ["yyyy.m.d, h:mm:ss am/pm"]
(second (xlsx-export [{:field_ref [:field 0] :name "Col" :effective_type :type/Temporal}]
{::mb.viz/column-settings {{::mb.viz/field-id 0}
......@@ -451,7 +456,7 @@
::mb.viz/time-style "h:mm A",
::mb.viz/time-enabled "seconds"}}}
[[#t "2020-03-28T10:12:06.681"]]
parse-format-strings)))))))
:parse-fn parse-format-strings)))))))
(deftest column-order-test
(testing "Column titles are ordered correctly in the output"
......@@ -675,22 +680,22 @@
(xlsx-export [{:id 0, :name "Col1"} {:id 1, :name "Col2"}]
{}
[["a" "abcdefghijklmnopqrstuvwxyz"]]
assert-non-default-widths))
:parse-fn assert-non-default-widths))
(testing "Auto-sizing works when the number of rows is at or above the auto-sizing threshold"
(binding [qp.xlsx/*auto-sizing-threshold* 2]
(xlsx-export [{:id 0, :name "Col1"}]
{}
[["abcdef"] ["abcedf"]]
assert-non-default-widths)
:parse-fn assert-non-default-widths)
(xlsx-export [{:id 0, :name "Col1"}]
{}
[["abcdef"] ["abcedf"] ["abcdef"]]
assert-non-default-widths)))
:parse-fn assert-non-default-widths)))
(testing "An auto-sized column does not exceed max-column-width (the width of 255 characters)"
(let [[col-width] (xlsx-export [{:id 0, :name "Col1"}]
{}
[[(apply str (repeat 256 "0"))]]
parse-column-widths)]
:parse-fn parse-column-widths)]
(is (= 65280 col-width)))))
(deftest poi-tempfiles-test
......@@ -748,3 +753,24 @@
{}
[[1]
[2]]))))))
(deftest pivot-table-resource-usage-test
  (testing "pivot table initialization should complete in reasonable time and memory"
    ;; Exporting zero rows with pivoting enabled exercises only the pivot-table setup, which used to allocate and
    ;; retain a lot of memory (and was therefore slow on smaller heaps).
    (let [cols        [{:display_name "A"}
                       {:display_name "B"}
                       {:display_name "C"}
                       {:display_name "D"}
                       {:display_name "pivot-grouping"}
                       {:display_name "E"}
                       {:display_name "F"}]
          pivot-opts  {:pivot-rows [0 1], :pivot-cols [2 3], :pivot-measures [5 4]}
          do-export   (fn []
                        (xlsx-export cols {} [] :pivot-export-options pivot-opts))
          ;; One unmeasured run first, so one-time warm-up allocations do not make the measurement flaky.
          _           (do-export)
          start-bytes (get-allocated-bytes)]
      (do-export)
      ;; A single export must allocate less than 100 MiB (100 * 1024 * 1024 bytes) on this thread.
      (is (< (- (get-allocated-bytes) start-bytes) (* 100 1024 1024))))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment