From 5c7d398b0e1837cec9bf1b9f26a1fe0f93e81337 Mon Sep 17 00:00:00 2001
From: github-automation-metabase
 <166700802+github-automation-metabase@users.noreply.github.com>
Date: Fri, 15 Nov 2024 18:23:58 -0500
Subject: [PATCH] XLSX Pivot Export - Initialize Pivot Table with Small Area
 Ref (#50060) (#50111)

* XLSX Pivot Export - Initialize Pivot Table with Small Area Ref

This PR changes the native pivot export so that we first initialize the pivot table with an area ref that's only the
first 2 rows. Prior, I used `AreaReference/GetWholecolumn` which had the side effect of using lots of memory.

I think this is related to how the Pivot Table's Cache is built up as you add rows/cols to the pivot definition. So,
if you keep the area ref only as wide as the number of columns in the raw data, and just 2 rows, then the cache can
stay small.

After adding row/column data to the pivot table, you can then modify the area ref to be all rows in the relevant
columns (`A:E` for example).

This keeps the pivot-table object much smaller in size.

* need the extra xml schemas to get this to work

I'd like to try find a way around including this extra dep, it's a 13mb jar (ish), and I think that's a little bigger
than I'd like

* oops, didnt mean to have this in

* Add test for pivot table initialization being fast and lean on memory

* remove time based assertion, add comments in impl

* cljfmt

* fix test

* Pesky little whitespace

---------

Co-authored-by: adam-james <21064735+adam-james-v@users.noreply.github.com>
Co-authored-by: Oleksandr Yakushev <alex@bytopia.org>
---
 .../query_processor/streaming/xlsx.clj        | 15 +++-
 .../query_processor/streaming/xlsx_test.clj   | 72 +++++++++++++------
 2 files changed, 61 insertions(+), 26 deletions(-)

diff --git a/src/metabase/query_processor/streaming/xlsx.clj b/src/metabase/query_processor/streaming/xlsx.clj
index 8e81cd6b385..664b91fd8aa 100644
--- a/src/metabase/query_processor/streaming/xlsx.clj
+++ b/src/metabase/query_processor/streaming/xlsx.clj
@@ -600,9 +600,11 @@
         pivot-sheet                 (spreadsheet/select-sheet "pivot" wb)
         col-names                   (common/column-titles ordered-cols col-settings format-rows?)
         _                           (add-row! data-sheet col-names ordered-cols col-settings cell-styles typed-cell-styles)
-        area-ref                    (AreaReference/getWholeColumn SpreadsheetVersion/EXCEL2007
-                                                                  "A"
-                                                                  (CellReference/convertNumToColString (dec (count ordered-cols))))
+        ;; keep the initial area-ref small (only 2 rows) so that adding row and column labels keeps the pivot table
+        ;; object small.
+        area-ref                    (AreaReference.
+                                     (format "A1:%s2" (CellReference/convertNumToColString (dec (count ordered-cols))))
+                                     SpreadsheetVersion/EXCEL2007)
         ^XSSFPivotTable pivot-table (.createPivotTable ^XSSFSheet pivot-sheet
                                                        ^AreaReference area-ref
                                                        (CellReference. 0 0)
@@ -623,6 +625,13 @@
               .getPivotFields
               (.getPivotFieldArray idx)
               (.setSortType setting)))))
+    ;; now that the Pivot Table Rows and Cols are set, we can update the area-ref
+    (-> pivot-table
+        .getPivotCacheDefinition
+        .getCTPivotCacheDefinition
+        .getCacheSource
+        .getWorksheetSource
+        (.setRef (format "A:%s" (CellReference/convertNumToColString (dec (count ordered-cols))))))
     (let [swb   (-> (SXSSFWorkbook. ^XSSFWorkbook wb)
                     (doto (.setCompressTempFiles true)))
           sheet (spreadsheet/select-sheet "data" swb)]
diff --git a/test/metabase/query_processor/streaming/xlsx_test.clj b/test/metabase/query_processor/streaming/xlsx_test.clj
index effb7850012..1fbf58de10e 100644
--- a/test/metabase/query_processor/streaming/xlsx_test.clj
+++ b/test/metabase/query_processor/streaming/xlsx_test.clj
@@ -12,7 +12,9 @@
    [metabase.test :as mt])
   (:import
    (com.fasterxml.jackson.core JsonGenerator)
-   (java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream)))
+   com.sun.management.ThreadMXBean
+   (java.io BufferedInputStream BufferedOutputStream ByteArrayInputStream ByteArrayOutputStream)
+   java.lang.management.ManagementFactory))
 
 (set! *warn-on-reflection* true)
 
@@ -390,7 +392,7 @@
   [sheet]
   (mapv (fn [row]
           (mapv spreadsheet/read-cell row))
-        (spreadsheet/into-seq sheet)))
+        (some-> sheet spreadsheet/into-seq)))
 
 (defn parse-xlsx-results
   "Given a byte array representing an XLSX document, parses the query result sheet using the provided `parse-fn`"
@@ -404,20 +406,20 @@
        (parse-fn sheet)))))
 
 (defn- xlsx-export
-  ([ordered-cols viz-settings rows]
-   (xlsx-export ordered-cols viz-settings rows parse-cell-content))
-
-  ([ordered-cols viz-settings rows parse-fn]
-   (with-open [bos (ByteArrayOutputStream.)
-               os  (BufferedOutputStream. bos)]
-     (let [results-writer (qp.si/streaming-results-writer :xlsx os)]
-       (qp.si/begin! results-writer {:data {:ordered-cols ordered-cols}} viz-settings)
-       (doall (map-indexed
-               (fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings))
-               rows))
-       (qp.si/finish! results-writer {:row_count (count rows)}))
-     (let [bytea (.toByteArray bos)]
-       (parse-xlsx-results bytea parse-fn)))))
+  [ordered-cols viz-settings rows & {:keys [parse-fn pivot-export-options]}]
+  (with-open [bos (ByteArrayOutputStream.)
+              os  (BufferedOutputStream. bos)]
+    (let [results-writer (qp.si/streaming-results-writer :xlsx os)]
+      (qp.si/begin! results-writer {:data {:ordered-cols ordered-cols
+                                           :pivot? (some? pivot-export-options)
+                                           :pivot-export-options pivot-export-options}}
+                    viz-settings)
+      (doall (map-indexed
+              (fn [i row] (qp.si/write-row! results-writer row i ordered-cols viz-settings))
+              rows))
+      (qp.si/finish! results-writer {:row_count (count rows)}))
+    (let [bytea (.toByteArray bos)]
+      (parse-xlsx-results bytea (or parse-fn parse-cell-content)))))
 
 (defn- parse-format-strings
   [sheet]
@@ -426,6 +428,9 @@
            (.. cell getCellStyle getDataFormatString))
          row)))
 
+(defn- get-allocated-bytes []
+  (.getCurrentThreadAllocatedBytes ^ThreadMXBean (ManagementFactory/getThreadMXBean)))
+
 (deftest export-format-test
   (mt/with-temporary-setting-values [custom-formatting {}]
     (testing "Different format strings are used for ints and numbers that round to ints (with 2 decimal places)"
@@ -433,7 +438,7 @@
              (rest (xlsx-export [{:field_ref [:field 0] :name "Col" :semantic_type :type/Cost}]
                                 {}
                                 [[1] [1.23] [1.004] [1.005] [10000000000] [10000000000.123]]
-                                parse-format-strings)))))
+                                :parse-fn parse-format-strings)))))
 
     (testing "Misc format strings are included correctly in exports"
       (is (= ["[$€]#,##0.00"]
@@ -442,7 +447,7 @@
                                                              {::mb.viz/currency           "EUR"
                                                               ::mb.viz/currency-in-header false}}}
                                   [[1.23]]
-                                  parse-format-strings))))
+                                  :parse-fn parse-format-strings))))
       (is (= ["yyyy.m.d, h:mm:ss am/pm"]
              (second (xlsx-export [{:field_ref [:field 0] :name "Col" :effective_type :type/Temporal}]
                                   {::mb.viz/column-settings {{::mb.viz/field-id 0}
@@ -451,7 +456,7 @@
                                                               ::mb.viz/time-style     "h:mm A",
                                                               ::mb.viz/time-enabled   "seconds"}}}
                                   [[#t "2020-03-28T10:12:06.681"]]
-                                  parse-format-strings)))))))
+                                  :parse-fn parse-format-strings)))))))
 
 (deftest column-order-test
   (testing "Column titles are ordered correctly in the output"
@@ -675,22 +680,22 @@
     (xlsx-export [{:id 0, :name "Col1"} {:id 1, :name "Col2"}]
                  {}
                  [["a" "abcdefghijklmnopqrstuvwxyz"]]
-                 assert-non-default-widths))
+                 :parse-fn assert-non-default-widths))
   (testing "Auto-sizing works when the number of rows is at or above the auto-sizing threshold"
     (binding [qp.xlsx/*auto-sizing-threshold* 2]
       (xlsx-export [{:id 0, :name "Col1"}]
                    {}
                    [["abcdef"] ["abcedf"]]
-                   assert-non-default-widths)
+                   :parse-fn assert-non-default-widths)
       (xlsx-export [{:id 0, :name "Col1"}]
                    {}
                    [["abcdef"] ["abcedf"] ["abcdef"]]
-                   assert-non-default-widths)))
+                   :parse-fn assert-non-default-widths)))
   (testing "An auto-sized column does not exceed max-column-width (the width of 255 characters)"
     (let [[col-width] (xlsx-export [{:id 0, :name "Col1"}]
                                    {}
                                    [[(apply str (repeat 256 "0"))]]
-                                   parse-column-widths)]
+                                   :parse-fn parse-column-widths)]
       (is (= 65280 col-width)))))
 
 (deftest poi-tempfiles-test
@@ -748,3 +753,24 @@
                           {}
                           [[1]
                            [2]]))))))
+
+(deftest pivot-table-resource-usage-test
+  (testing "pivot table initialization should complete in reasonable time and memory"
+    ;; We test XLSX export of an empty table (0 rows) with pivoting enabled. This should test the initialization of
+    ;; pivot machinery that used to allocate and retain a lot of memory (and hence was slow on smaller heaps).
+    (let [do-export #(xlsx-export [{:display_name "A"}
+                                   {:display_name "B"}
+                                   {:display_name "C"}
+                                   {:display_name "D"}
+                                   {:display_name "pivot-grouping"}
+                                   {:display_name "E"}
+                                   {:display_name "F"}]
+                                  {}
+                                  []
+                                  :pivot-export-options {:pivot-rows [0 1], :pivot-cols [2 3], :pivot-measures [5 4]})
+          ;; Run once before measuring to warm-up and thus reduce flakiness.
+          _ (do-export)
+          start-bytes (get-allocated-bytes)]
+      (do-export)
+      ;; Should always allocate less than 100Mb.
+      (is (< (- (get-allocated-bytes) start-bytes) (* 100 1024 1024))))))
-- 
GitLab