Skip to content
Snippets Groups Projects
Unverified Commit ea3e8b6a authored by Howon Lee's avatar Howon Lee Committed by GitHub
Browse files

Describe the JSON within tables for postgres (#20547)

Step 1 towards the long path towards whacking #708. We need an interface for describing the weird thing every RDBMS vendor did where they have their own special way of doing JSON columns, every single one of them
parent e9178bb4
No related merge requests found
......@@ -10,6 +10,15 @@
function rather than accessing `:expressions` directly, as doing so can make your driver compatible with both 0.42.0
and with 0.43.0 and newer.
- There is now a `describe-nested-field-columns` method in the `sql-jdbc.sync` namespace which returns an instance of
NestedFCMetadata. It exists to support JSON columns in Postgres and, eventually, in other databases that are
ordinarily plain RDBMSes but sometimes have a denormalized column with JSON or similar semantics. Given a table
with denormalized columns that have nested field semantics (that is, typed sub-fields that are still denormalized but
stable in type between rows), the return value should be a NestedFCMetadata: a map of flattened key paths to the
detected sub-field. Field detection during sync will then be enriched with those nested types. This is materially
different from the way we do it for Mongo, because every kind of JSON column is different; and since it runs on
every sync, it can't be too slow, even on enormous tables with enormous denormalized columns.
## Metabase 0.42.0
Changes in Metabase 0.42.0 affect drivers that derive from `:sql` (including `:sql-jdbc`).
......
......@@ -358,9 +358,12 @@
;; Does this database support foreign key relationships?
:foreign-keys
;; Does this database support nested fields (e.g. Mongo)?
;; Does this database support nested fields for any and every field except primary key (e.g. Mongo)?
:nested-fields
;; Does this database support nested fields but only for certain field types (e.g. Postgres and JSON / JSONB columns)?
:nested-field-columns
;; Does this driver support setting a timezone for the query?
:set-timezone
......
(ns metabase.driver.postgres
"Database driver for PostgreSQL databases. Builds on top of the SQL JDBC driver, which implements most functionality
for JDBC-based drivers."
(:require [clojure.java.jdbc :as jdbc]
(:require [cheshire.core :as json]
[clojure.java.jdbc :as jdbc]
[clojure.set :as set]
[clojure.string :as str]
[clojure.tools.logging :as log]
......@@ -165,6 +166,67 @@
(binding [*enum-types* (enum-types driver database)]
(sql-jdbc.sync/describe-table driver database table)))
(def ^:const nested-field-sample-limit
  "Upper bound on the number of rows fetched from a table when sampling its JSON columns for
  describe-nested-field-columns."
  10000)
(defn- flatten-row
  "Flatten a (possibly nested) map `row` into a single-level map whose keys are vectors holding the full key path to
  each leaf value, e.g. `{:a {:b 1}}` becomes `{[:a :b] 1}`.

  Empty maps and every non-map value (vectors included) found inside `row` are treated as leaves.

  When `row` itself is not a map -- `nil`, or a parsed JSON document whose top level is a scalar, a string or an
  array -- there are no key paths to report, so an empty map is returned rather than throwing."
  [row]
  (letfn [(path-entries [entries path]
            (lazy-seq
             (when-let [[[k v] & more] (seq entries)]
               (if (and (map? v) (not-empty v))
                 ;; non-empty nested map: descend with the extended path, then carry on with the siblings
                 (concat (path-entries v (conj path k))
                         (path-entries more path))
                 ;; leaf: emit one [key-path value] pair
                 (cons [(conj path k) v]
                       (path-entries more path))))))]
    (if (map? row)
      (into {} (path-entries row []))
      ;; Only maps have keys to walk. Seq-ing a number throws, and destructuring the elements of a string or a
      ;; vector of scalars as [k v] pairs throws as well, so bail out before either can happen.
      {})))
(defn- row->types
  "Given `row`, a collection of [field-name field-value] pairs (normally a map), return a map of
  field-name -> {key-path -> class of the leaf value}. The key paths are those produced by running `flatten-row`
  over each field value."
  [row]
  (let [leaf-types (fn [field-val]
                     (into {}
                           (map (fn [[path leaf]] [path (type leaf)]))
                           (flatten-row field-val)))]
    (into {}
          (map (fn [[field-name field-val]]
                 [field-name (leaf-types field-val)]))
          row)))
(defn- describe-json-xform
  "Transducer used when sampling JSON columns. `member` is the downstream reducing function; the function returned
  takes each incoming row (column -> raw JSON string), parses every value with `json/parse-string`, and hands the
  result of `row->types` on the parsed row to `member`."
  [member]
  (let [parse-row (fn [row]
                    (into {}
                          (map (fn [[column raw-json]]
                                 [column (json/parse-string raw-json)]))
                          row))
        xform     (map (fn [row] (row->types (parse-row row))))]
    (xform member)))
(defn- describe-json-rf
  "Reducing function that folds per-row type maps (column -> {key-path -> class}) into a single map of
  column -> types.

  * 0-arity: the initial accumulator, `nil` (no rows seen yet).
  * 1-arity: completion; returns the accumulator unchanged.
  * 2-arity: `fst` is the accumulator and `snd` the type map of the next row. A column keeps its type map only when
    the accumulator holds an equal one (or when there is no accumulator yet); otherwise the column maps to `nil`.
    Only the columns present in `snd` appear in the result."
  ([] nil)
  ([fst] fst)
  ([fst snd]
   (into {}
         (map (fn [[json-column types]]
                ;; Compare the type maps themselves rather than their hashes: distinct values can share a hash
                ;; (the strings "Aa" and "BB" do, and so do type maps keyed by them), which would let two rows
                ;; with different shapes pass as coherent.
                [json-column (when (or (nil? fst)
                                       (= (get fst json-column) types))
                               types)]))
         snd)))
(defn- describe-nested-field-columns*
  "Sample up to `nested-field-sample-limit` rows of `table` and describe the JSON columns found in it.

  `spec` is a JDBC connection spec: one connection is opened from it to list the table's fields, and the sampling
  query is run against the same spec. Columns are picked by having the `:type/SerializedJSON` semantic type.

  Returns `{:types m}`, where `m` is whatever reducing the sampled rows with `describe-json-xform` and
  `describe-json-rf` yields. When the table has no JSON columns no query is issued and `{:types nil}` is returned,
  which is also what the reduction produces when the sample contains no rows."
  [driver spec table]
  (with-open [conn (jdbc/get-connection spec)]
    (let [table-fields     (sql-jdbc.sync/describe-table-fields driver conn table)
          json-fields      (filter #(= (:semantic-type %) :type/SerializedJSON) table-fields)
          json-field-names (mapv (comp keyword :name) json-fields)]
      (if (empty? json-field-names)
        ;; An empty select list would produce invalid SQL and fail for every table without JSON columns.
        {:types nil}
        (let [sql-args (hsql/format {:select json-field-names
                                     :from   [(keyword (:name table))]
                                     :limit  nested-field-sample-limit} {:quoting :ansi})
              query    (jdbc/reducible-query spec sql-args)]
          {:types (transduce describe-json-xform describe-json-rf query)})))))
;; Describe the nested fields present in a table (currently, and maybe forever, just JSON columns),
;; including whether their keys and value types are stable from row to row.
;; Not to be confused with the existing nested-field functionality for Mongo:
;; this one applies only to JSON columns, whereas in Mongo every field is BSON (basically JSON).
;; Every major database is fiddly and different about JSON, so `sql-jdbc.sync` only provides a trivial default impl.
(defmethod sql-jdbc.sync/describe-nested-field-columns :postgres
  [driver database table]
  ;; Resolve the pooled connection spec for `database` and delegate to the private implementation.
  (describe-nested-field-columns* driver
                                  (sql-jdbc.conn/db->pooled-connection-spec database)
                                  table))
;;; +----------------------------------------------------------------------------------------------------------------+
;;; | metabase.driver.sql impls |
......
......@@ -13,6 +13,7 @@
column->semantic-type
database-type->base-type
db-default-timezone
describe-nested-field-columns
excluded-schemas
fallback-metadata-query
filtered-syncable-schemas
......
......@@ -104,3 +104,13 @@
;; `:sql-jdbc` implementation: both arguments are ignored and the result is always nil.
(defmethod db-default-timezone :sql-jdbc [_ _] nil)
(defmulti describe-nested-field-columns
  "Describe the nestable columns of `table`. Drivers that support the `:nested-field-columns` feature are required
  to implement this method, and what they return should match the
  [[metabase.sync.interface/NestedFCMetadata]] schema."
  {:added "0.43.0", :arglists '([driver database table])}
  driver/dispatch-on-initialized-driver
  :hierarchy #'driver/hierarchy)
;; `:sql-jdbc` implementation: all three arguments are ignored and nil is returned.
(defmethod describe-nested-field-columns :sql-jdbc
  [_driver _database _table]
  nil)
......@@ -9,6 +9,7 @@
[metabase.driver.postgres :as postgres]
[metabase.driver.sql-jdbc.connection :as sql-jdbc.conn]
[metabase.driver.sql-jdbc.execute :as sql-jdbc.execute]
[metabase.driver.sql-jdbc.sync :as sql-jdbc.sync]
[metabase.driver.sql.query-processor :as sql.qp]
[metabase.driver.sql.query-processor-test-util :as sql.qp-test-util]
[metabase.models.database :refer [Database]]
......@@ -283,6 +284,29 @@
(is (= :type/SerializedJSON
(db/select-one-field :semantic_type Field, :id (mt/id :venues :address))))))))
(deftest describe-nested-field-columns-test
  (mt/test-driver :postgres
    (testing "flatten-row"
      ;; a nested map collapses to a single level keyed by the full path to each leaf
      (let [nested   {:bob {:dobbs 123 :cobbs "boop"}}
            expected {[:bob :dobbs] 123
                      [:bob :cobbs] "boop"}
            actual   (#'postgres/flatten-row nested)]
        (is (= actual expected))))
    (testing "row->types"
      ;; leaves are replaced by their classes; a vector counts as a leaf
      (let [parsed-row {:bob {:dobbs {:robbs 123} :cobbs [1 2 3]}}
            expected   {:bob {[:cobbs]        clojure.lang.PersistentVector
                              [:dobbs :robbs] java.lang.Long}}
            actual     (#'postgres/row->types parsed-row)]
        (is (= actual expected))))
    (testing "describes json columns and gives types for ones with coherent schemas only"
      (drop-if-exists-and-create-db! "describe-json-test")
      (let [details   (mt/dbdef->connection-details :postgres :db {:database-name "describe-json-test"})
            spec      (sql-jdbc.conn/connection-details->spec :postgres details)
            ;; two rows: `coherent_json_val` has the same shape in both, `incoherent_json_val` does not
            setup-sql (str "CREATE TABLE describe_json_table (coherent_json_val JSON NOT NULL, incoherent_json_val JSON NOT NULL);"
                           "INSERT INTO describe_json_table (coherent_json_val, incoherent_json_val) VALUES ('{\"a\": 1, \"b\": 2}', '{\"a\": 1, \"b\": 2}');"
                           "INSERT INTO describe_json_table (coherent_json_val, incoherent_json_val) VALUES ('{\"a\": 2, \"b\": 3}', '{\"a\": [1, 2], \"b\": 2}');")
            expected  {:types {:coherent_json_val   {["a"] java.lang.Integer, ["b"] java.lang.Integer}
                               :incoherent_json_val nil}}]
        (jdbc/execute! spec [setup-sql])
        (mt/with-temp Database [database {:engine :postgres, :details details}]
          (let [described (sql-jdbc.sync/describe-nested-field-columns :postgres database {:name "describe_json_table"})]
            (is (= (into (sorted-map) described)
                   (into (sorted-map) expected)))))))))
(mt/defdataset with-uuid
[["users"
[{:field-name "user_id", :base-type :type/UUID}]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment