Skip to content
Snippets Groups Projects
Unverified Commit 6e64a3e0 authored by Braden Shepherdson's avatar Braden Shepherdson Committed by GitHub
Browse files

[perf] Better CLJC memoization; bounded strategy, fast versions (#46199)

Expands and refactors `metabase.util.memoize` to support better cross-platform memoization.

**Better basic memoization**

Use `clojure.core.memoize/memo` (and its clone in CLJS) rather than `clojure.core/memoize` for
the "everything forever" strategy.

**Bounded memoization**

There are a few places (eg. `u/kebab-case-en`) where we want zero-overhead *hits* and expect the input space
to be fixed. To guard against runaway memory usage, this **bounded** strategy dumps the entire cache when it
overflows. It logs an INFO level note when that happens - the intent here is that the threshold should never
actually get hit!

**Fast JVM versions**

In the special case where the function to memoize takes exactly 1 argument which is a valid map key, we can
use `ConcurrentHashMap.computeIfAbsent` as the basis of the cache and have much less overhead.

`fast-memo` and `fast-bounded` implement this in CLJ; in CLJS they just call `memo` and `bounded`.
parent 029a6c72
No related branches found
No related tags found
No related merge requests found
......@@ -611,7 +611,9 @@
toucan.models models}}}
:lint-as
{clojure.core.logic/defne clj-kondo.lint-as/def-catch-all
{cljs.cache/defcache clojure.core/deftype
clojure.core.cache/defcache clojure.core/deftype
clojure.core.logic/defne clj-kondo.lint-as/def-catch-all
clojure.test.check.clojure-test/defspec clojure.test/deftest
clojurewerkz.quartzite.jobs/defjob clojure.core/defn
honeysql.helpers/defhelper clj-kondo.lint-as/def-catch-all
......
......@@ -25,12 +25,6 @@
;;;
;;; where keys are a map of String ID => metadata
;; TODO: This is always wrapped with `keyword` in its usage so that may as well be memoized too.
(def ^:private ^{:arglists '([k])} memoized-kebab-key
"Even tho [[u/->kebab-case-en]] has LRU memoization, plain memoization is significantly faster, and since the keys
we're parsing here are bounded it's fine to memoize this stuff forever."
(memoize u/->kebab-case-en))
(defn- object-get [obj k]
(when (and obj (js-in k obj))
(gobject/get obj k)))
......@@ -108,7 +102,7 @@
(comp
;; convert keys to kebab-case keywords
(map (fn [[k v]]
[(cond-> (keyword (memoized-kebab-key k))
[(cond-> (keyword (u/->kebab-case-en k))
rename-key (#(or (rename-key %) %)))
v]))
;; remove [[excluded-keys]]
......@@ -244,7 +238,7 @@
[m]
(obj->clj
(map (fn [[k v]]
(let [k (keyword (memoized-kebab-key k))
(let [k (keyword (u/->kebab-case-en k))
k (if (= k :binning-strategy)
:strategy
k)
......
......@@ -219,24 +219,27 @@
(def ^{:arglists '([x])} ->kebab-case-en
"Like [[camel-snake-kebab.core/->kebab-case]], but always uses English for lower-casing, supports keywords with
namespaces, and returns `nil` when passed `nil` (rather than throwing an exception)."
(memoize/lru (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->kebab-case-en*) :lru/threshold 256))
(memoize/fast-bounded (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->kebab-case-en*)
:bounded/threshold 10000))
(def ^{:arglists '([x])} ->snake_case_en
"Like [[camel-snake-kebab.core/->snake_case]], but always uses English for lower-casing, supports keywords with
namespaces, and returns `nil` when passed `nil` (rather than throwing an exception)."
(memoize/lru (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->snake_case_en*) :lru/threshold 256))
(memoize/fast-bounded (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->snake_case_en*)
:bounded/threshold 10000))
(def ^{:arglists '([x])} ->camelCaseEn
"Like [[camel-snake-kebab.core/->camelCase]], but always uses English for upper- and lower-casing, supports keywords
with namespaces, and returns `nil` when passed `nil` (rather than throwing an exception)."
(memoize/lru (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->camelCaseEn*) :lru/threshold 256))
(memoize/fast-bounded (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->camelCaseEn*)
:bounded/threshold 10000))
(def ^{:arglists '([x])} ->SCREAMING_SNAKE_CASE_EN
"Like [[camel-snake-kebab.core/->SCREAMING_SNAKE_CASE]], but always uses English for upper- and lower-casing, supports
keywords with namespaces, and returns `nil` when passed `nil` (rather than throwing an exception)."
(memoize/lru (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->SCREAMING_SNAKE_CASE_EN*)
:lru/threshold 256))
(memoize/fast-bounded (wrap-csk-conversion-fn-to-handle-nil-and-namespaced-keywords ->SCREAMING_SNAKE_CASE_EN*)
:bounded/threshold 10000))
(defn capitalize-first-char
"Like string/capitalize, only it ignores the rest of the string
......
(ns metabase.util.memoize
(:require
[clojure.core.memoize :as memoize]
[metabase.shared.util.namespaces :as shared.ns]))
(comment
memoize/keep-me)
(shared.ns/import-fns
[memoize
lru
memoizer])
(ns metabase.util.memoize
"Cross-platform version of `clojure.core.memoize`.
In CLJ this re-exports some of the public functions from `clojure.core.memoize`.
In CLJS it implements the same design in CLJS.
There are also some extra memoization tools here, like [[fast-memo]] and [[fast-bounded]]."
(:require
[metabase.shared.util.namespaces :as shared.ns]
[metabase.util.memoize.impl.bounded :as bounded]
#?@(:clj ([clojure.core.memoize :as memoize])
:cljs ([metabase.util.memoize.impl.js :as memoize]))))
(shared.ns/import-fns
[memoize
lru
memoizer
memo])
(defn bounded
"Memoizes a function with zero overhead on a *hit*, but keeping to a bounded size.
This is intended for functions that have many calls and nearly always *hit*. We want to use [[memo]] but are concerned
about leaking memory if the set of inputs isn't bounded.
It works by checking the cache size only on a *miss*: if it has reached the (configurable) threshold, then **discard
the cache** and start over with an empty map. There's no bookkeeping overhead on a *hit*, which keeps the frequent
hits very fast. The cost is that it will occasionally recompute even the most frequently hit values.
The threshold is intended to be big enough to hold everything forever! It's just there as a safety valve in case
the input set is larger than expected.
This is not quite a drop-in replacement for [[memo]] (or `clojure.core/memoize`) because those will *never* call the
inner function twice for the same arguments, but this will.
Default `:bounded/threshold` is 1024."
([f] (bounded f :bounded/threshold 1024))
([f tkey threshold]
(assert (= tkey :bounded/threshold) (str "wrong parameter tkey " tkey))
(memoize/memoizer f (bounded/bounded-cache-factory {} threshold) {})))
(defn fast-memo
"Variant of [[memo]] that optimizes a common special case: a function with only a single argument, where that
argument makes a good map key.
In CLJ, this uses `ConcurrentHashMap.computeIfAbsent` in Clojure for a significant speedup.
Note that this also doesn't support the memoization API like `memo-swap!`, `memoized?` etc.
The key should have a good `Object.hashCode`; Clojure values like strings and keywords have this built in.
See this thread for an analysis of the performance https://metaboat.slack.com/archives/C04CYTEL9N2/p1702671632956539
In CLJS, this is identical to [[memo]], but it's defined in both dialects for convenience."
[f]
#?(:clj (let [cache (java.util.concurrent.ConcurrentHashMap.)
mapping-fn (reify java.util.function.Function
(apply [_this k]
(f k)))]
(fn [k]
(.computeIfAbsent cache k mapping-fn)))
:cljs (memoize/memo f)))
(defn fast-bounded
"Variant of [[bounded]] that optimizes a common special case: a function with only a single argument, where that
argument makes a good map key.
In CLJ, this uses `ConcurrentHashMap.computeIfAbsent` in Clojure for a significant speedup.
Note that this also doesn't support the memoization API like `memo-swap!`, `memoized?` etc.
The key should have a good `Object.hashCode`; Clojure values like strings and keywords have this built in.
In CLJS, this is identical to [[bounded]], but it's defined in both dialects for convenience."
([f] (fast-bounded f :bounded/threshold 1024))
([f tkey threshold]
(assert (= tkey :bounded/threshold) (str "wrong parameter tkey " tkey))
#?(:clj (let [cache (java.util.concurrent.ConcurrentHashMap.)
mapping-fn (reify java.util.function.Function
(apply [_this k]
(f k)))]
(fn [k]
(when (>= (.size cache) threshold)
;; If the cache gets too large, empty it in place.
(.clear cache))
(.computeIfAbsent cache k mapping-fn)))
:cljs (bounded f tkey threshold))))
(ns metabase.util.memoize.impl.bounded
(:require
[metabase.util.log :as log]
#?(:clj [clojure.core.cache :as cache]
:cljs [cljs.cache :as cache])))
(cache/defcache BoundedCache [cache threshold]
cache/CacheProtocol
(lookup [_ item] (get cache item))
(lookup [_ item not-found] (get cache item not-found))
(has? [_ item] (contains? cache item))
(hit [this _item] this)
(miss [_ item result]
(if (< (count cache) threshold)
(BoundedCache. (assoc cache item result) threshold)
;; It's too big! Throw away the original cache and start over!
(do
(log/warnf "BoundedCache threshold (%d) exceeded - runaway cache? threshold too small?" threshold)
(BoundedCache. {item result} threshold))))
(evict [_ item] (BoundedCache. (dissoc cache item) threshold))
(seed [_ base]
(assert (< (count base) threshold) "cache seed is larger than the threshold")
(BoundedCache. base threshold)))
(defn bounded-cache-factory
"Create a bounded [[clojure.core.cache]]-compatible cache. This is a basic map with no bookkeeping on `hit` but
that will completely discard the cache if it exceeds the `threshold` size.
Intended to be used with [[metabase.util.memoize/bounded]], rather than called directly."
[base threshold]
{:pre [(map? base)]}
(BoundedCache. base threshold))
(ns metabase.util.memoize
"Copied from clojure.core.memoize."
(:require [cljs.cache :as cache]))
(ns metabase.util.memoize.impl.js
"CLJS-specific implementation code for [[metabase.util.memoize]].
Not to be referenced from other code!"
(:require
[cljs.cache :as cache]))
;; Similar to clojure.lang.Delay, but will not memoize an exception and will
;; instead retry.
......@@ -128,3 +131,39 @@
([f base key threshold]
(assert (= key :lru/threshold) (str "wrong parameter key " key))
(memoizer f (cache/lru-cache-factory {} :threshold threshold) base)))
(defn memo
"Used as a more flexible alternative to Clojure's core `memoization`
function. Memoized functions built using `memo` will respond to
the core.memo manipulable memoization utilities. As a nice bonus,
you can use `memo` in place of `memoize` without any additional
changes, with the added guarantee that the memoized function will
only be called once for a given sequence of arguments (`memoize`
can call the function multiple times when concurrent calls are
made with the same sequence of arguments).
The default way to use this function is to simply supply a function
that will be memoized. Additionally, you may also supply a map
of the form `'{[42] 42, [108] 108}` where keys are a vector
mapping expected argument values to arity positions. The map values
are the return values of the memoized function.
If the supplied function has metadata containing an
`:clojure.core.memoize/args-fn` key, the value is assumed to be a
function that should be applied to the arguments to produce a
subset or transformed sequence of arguments that are used for the
key in the cache (the full, original arguments will still be used
to call the function). This allows you to memoize functions where
one or more arguments are irrelevant for memoization, such as the
`clojure.java.jdbc` functions, whose first argument may include
a (mutable) JDBC `Connection` object:
(memo/memo (with-meta jdbc/execute! {::memo/args-fn rest}))
You can access the memoization cache directly via the `:clojure.core.memoize/cache` key
on the memoized function's metadata. However, it is advised to
use the core.memo primitives instead as implementation details may
change over time."
([f] (memo f {}))
([f seed]
(memoizer f (cache/basic-cache-factory {}) seed)))
(ns metabase.util.memoize-test
(:require
[clojure.string :as str]
[clojure.test :refer [deftest is testing]]
[clojure.test.check.generators :as gen]
[metabase.util.memoize :as memo]))
(defn- call-count-harness [memoizer inner-fn]
(let [misses (atom 0)
calls (atom 0)
wrapped-inner (fn [x]
(swap! misses inc)
(inner-fn x))
memoized (memoizer wrapped-inner)
wrapped-memo (fn [x]
(swap! calls inc)
(memoized x))]
{:misses misses
:calls calls
:f wrapped-memo
:reset (fn []
(reset! misses 0)
(reset! calls 0))}))
(defn- round-robin [times keyspace f exp-fn]
(let [exp-map (into {} (map (juxt identity exp-fn)) keyspace)]
(dotimes [_ times]
(doseq [k keyspace]
(is (= (get exp-map k ::not-found)
(f k)))))))
(defn- gen-keyspace [n]
(-> (gen/set (gen/such-that seq gen/string) {:num-elements n})
(gen/sample 1)
first
vec))
(defn- never-evicts [memoizer]
(let [keyspace (gen-keyspace 100)
{:keys [f calls misses]} (call-count-harness memoizer str/reverse)]
;; Now we deliberately call f for each entry in keyspace, ten times around.
(round-robin 10 keyspace f str/reverse)
(is (= 100 @misses)
"should be as many misses as unique inputs")
(is (= 1000 @calls)
"and 10 calls for each unique input")))
(deftest ^:parallel memoize-never-evicts-test
(never-evicts memo/memo))
(deftest ^:parallel fast-memoize-never-evicts-test
(never-evicts memo/fast-memo))
(deftest ^:parallel bounded-never-evicts-if-large-enough-test
(never-evicts #(memo/bounded % :bounded/threshold 100)))
(deftest ^:parallel fast-bounded-never-evicts-if-large-enough-test
(never-evicts #(memo/fast-bounded % :bounded/threshold 101)))
(deftest ^:parallel bounded-evicts-when-keyspace-overflows-test
(let [keyspace (gen-keyspace 100)
{:keys [f calls misses reset]} (call-count-harness #(memo/bounded % :bounded/threshold 50) str/reverse)]
;; 10 round-robin calls will expect to evict twice per lap, and never hit!
(testing "10 round-robin calls will never hit"
(round-robin 10 keyspace f str/reverse)
(is (= 1000 @misses))
(is (= 1000 @calls)))
(testing "10000 randomly sampled calls will hit sometimes"
;; This hits 25-30% of the time, empirically.
(reset)
(doseq [k (gen/sample (gen/elements keyspace) 10000)]
(is (= (str/reverse k)
(f k))))
(is (= @calls 10000))
(is (< @misses 10000)))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment