Skip to content
Snippets Groups Projects
Unverified Commit 510a792c authored by Mark Bastian's avatar Mark Bastian Committed by GitHub
Browse files

Fixing native model aliasing and reducing enum threshold (#30055)

Models based on native queries were not producing any SELECT clauses previously. This fixes that and puts both native and mbql query aliases under test. Additionally, I'm reducing the enum count threshold for pseudo-ddls to 10. This seems low, but models on stats are quite large so we need a lower number. In upcoming work we should make this value adaptive.
parent aaf6b5eb
No related branches found
No related tags found
No related merge requests found
......@@ -108,7 +108,9 @@
question)
(let [{:as database} (api/check-404 (t2/select-one Database :id database-id))
_ (check-database-support (:id database))
context {:database (metabot-util/denormalize-database database)
context {:database (->> database
metabot-util/denormalize-database
metabot-util/add-pseudo-database-ddl)
:user_prompt question
:prompt_task :infer_native_sql}]
(metabot/infer-native-sql-query context)))
......
......@@ -7,6 +7,7 @@
[clojure.string :as str]
[honey.sql :as sql]
[metabase.db.query :as mdb.query]
[metabase.mbql.util :as mbql.u]
[metabase.metabot.settings :as metabot-settings]
[metabase.models :refer [Card Field FieldValues Table]]
[metabase.query-processor :as qp]
......@@ -41,47 +42,68 @@
str/trim
(str/replace #" " "_")))
(defn- aliases
"Create a seq of aliases to be used to make model columns more human friendly"
[{:keys [dataset_query result_metadata]}]
(let [alias-map (update-vals
(into {} (map (juxt :id :display_name) result_metadata))
(fn [v] (cond-> v
(str/includes? v " ")
normalize-name)))
{:keys [query]} (let [qp (qp.reducible/combine-middleware
(vec qp/around-middleware)
(fn [query _rff _context]
(add/add-alias-info
(#'qp/preprocess* query))))]
(qp dataset_query nil nil))
{:keys [fields]} query]
(for [field fields
:let [[_ id m] field
alias (::add/desired-alias m)]]
[alias (alias-map id)])))
(defn- add-qp-column-aliases
"Add the aliases generated by the query processor to each results metadata field."
[{:keys [dataset_query] :as model}]
(let [fields (let [qp (qp.reducible/combine-middleware
(vec qp/around-middleware)
(fn [query _rff _context]
(add/add-alias-info
(#'qp/preprocess* query))))]
(get-in (qp dataset_query nil nil) [:query :fields]))
field-ref->alias (reduce
(fn [acc [_f _id-or-name m :as field-ref]]
(if-let [alias (::add/desired-alias m)]
(assoc acc (mbql.u/remove-namespaced-options field-ref) alias)
acc))
{}
fields)]
(update model :result_metadata
(fn [result_metadata]
(map
(fn [{:keys [field_ref] :as rsmd}]
(assoc rsmd :qp_column_name (field-ref->alias field_ref)))
result_metadata)))))
(defn inner-query
(defn- add-inner-query
"Produce a SELECT * over the parameterized model with columns aliased to normalized display names.
Add this result to the input model along with the generated column aliases.
This can be used in a CTE such that an outer query can be called on this query."
[{:keys [id] :as model}]
(let [column-aliases (->> (aliases model)
(map (partial apply format "\"%s\" AS %s"))
(str/join ","))]
(mdb.query/format-sql
(format "SELECT %s FROM {{#%s}} AS INNER_QUERY" column-aliases id))))
[{:keys [id result_metadata] :as model}]
(let [column-aliases (or
(some->> result_metadata
(map (comp
(fn [[column_name column_alias]]
(cond
(and column_name column_alias) (format "\"%s\" AS %s" column_name column_alias)
column_alias column_alias
:else nil))
(juxt :qp_column_name :sql_name)))
(filter identity)
seq
(str/join ", "))
"*")]
(assoc model
:column_aliases column-aliases
:inner_query
(mdb.query/format-sql
(format "SELECT %s FROM {{#%s}} AS INNER_QUERY" column-aliases id)))))
(defn denormalize-field
(defn- denormalize-field
"Create a 'denormalized' version of the field which is optimized for querying
and prompt engineering. Add in enumerated values (if a low-cardinality field),
a sql-friendly name, and remove fields unused in prompt engineering."
[{:keys [display_name id base_type] :as field}]
(let [field-vals (when (not= :type/Boolean base_type)
and remove fields unused in prompt engineering."
[{:keys [id base_type] :as field}]
(let [field-vals (when
(and
(not= :type/Boolean base_type)
(< 0
(get-in field [:fingerprint :global :distinct-count] 0)
(inc (metabot-settings/enum-cardinality-threshold))))
(t2/select-one-fn :values FieldValues :field_id id))]
(-> (cond-> field
(seq field-vals)
(assoc :possible_values (vec field-vals)))
(assoc :sql_name (normalize-name display_name))
(dissoc :field_ref :id))))
(defn- create-enum-ddl
......@@ -89,9 +111,7 @@
[{:keys [result_metadata]}]
(into {}
(for [{:keys [display_name sql_name possible_values]} result_metadata
:when (and (seq possible_values)
(<= (count possible_values)
(metabot-settings/enum-cardinality-threshold)))
:when (seq possible_values)
:let [ddl-str (format "create type %s_t as enum %s;"
sql_name
(str/join ", " (map (partial format "'%s'") possible_values)))
......@@ -128,6 +148,30 @@
(defn- add-create-table-ddl [model]
(assoc model :create_table_ddl (create-table-ddl model)))
(defn- disambiguate
"Given a seq of names that are potentially the same, provide a seq of tuples of
original name to a non-ambiguous version of the name."
[names]
(let [uniquifier (metabase.mbql.util/unique-name-generator)
[_ new-names] (reduce
(fn [[taken acc] n]
(let [candidate (uniquifier n)]
(if (taken candidate)
(recur [(conj taken candidate) acc] n)
[(conj taken candidate) (conj acc candidate)])))
[#{} []] names)]
(map vector names new-names)))
(defn- add-sql-names
"Add a distinct SCREAMING_SNAKE_CASE sql name to each field in the result_metadata."
[{:keys [result_metadata] :as model}]
(update model :result_metadata
#(->> %
(map (comp normalize-name :display_name))
disambiguate
(map (fn [rsmd [_ disambiguated-name]]
(assoc rsmd :sql_name disambiguated-name)) result_metadata))))
(defn denormalize-model
"Create a 'denormalized' version of the model which is optimized for querying.
All foreign keys are resolved as data, sql-friendly names are added, and
......@@ -135,9 +179,11 @@
(with sql friendly column names) that can be used to query this model."
[{model-name :name :as model}]
(-> model
add-qp-column-aliases
add-sql-names
add-inner-query
(update :result_metadata #(mapv denormalize-field %))
(assoc :sql_name (normalize-name model-name))
(assoc :inner_query (inner-query model))
add-create-table-ddl
(dissoc :creator_id :dataset_query :table_id :collection_position)))
......@@ -167,8 +213,8 @@
[{table-name :name} {field-name :name field-id :id :keys [base_type]}]
(when-let [values (and
(not= :type/Boolean base_type)
(:values (t2/select-one FieldValues :field_id field-id)))]
(when (< (count values) (metabot-settings/enum-cardinality-threshold))
(t2/select-one-fn :values FieldValues :field_id field-id))]
(when (<= (count values) (metabot-settings/enum-cardinality-threshold))
(let [ddl-str (format "create type %s_%s_t as enum %s;"
table-name
field-name
......@@ -252,7 +298,9 @@
(quot nchars 4))
ddl-str))
(defn- add-pseudo-database-ddl [database]
(defn add-pseudo-database-ddl
"Add a create_database_ddl entry to the denormalized database suitable for raw sql inference input."
[database]
(assoc database :create_database_ddl (database->pseudo-ddl database)))
(defn denormalize-database
......@@ -264,8 +312,7 @@
(-> database
(assoc :sql_name (normalize-name database-name))
(assoc :models (mapv denormalize-model models))
add-model-json-summary
add-pseudo-database-ddl)))
add-model-json-summary)))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Prompt Input ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
......
......@@ -10,8 +10,10 @@
[metabase.models :refer [Card Database]]
[metabase.query-processor :as qp]
[metabase.test :as mt]
[metabase.test.util :as tu]
[metabase.util :as u]
[toucan2.core :as t2]))
[toucan2.core :as t2]
[toucan2.tools.with-temp :as t2.with-temp]))
(deftest normalize-name-test
(testing "Testing basic examples of how normalize-name should work"
......@@ -42,11 +44,8 @@
{:display_name "Sizes"
:base_type :type/Integer
:possible_values [1 2 3]}
;; Despite being enumerated, when the cardinality is too high,
;; we will skip the enumeration here to prevent using too many tokens.
{:display_name "BigCardinality"
:base_type :type/Integer
:possible_values (range (inc (metabot-settings/enum-cardinality-threshold)))}])}]
:base_type :type/Integer}])}]
(is (= (mdb.query/format-sql
(str
"create type SIZES_t as enum '1', '2', '3';"
......@@ -54,6 +53,25 @@
(mdb.query/format-sql
(#'metabot-util/create-table-ddl model)))))))
(deftest denormalize-field-cardinality-test
(testing "Ensure enum-cardinality-threshold is respected in model denormalization"
(mt/dataset sample-dataset
(mt/with-temp* [Card [model
{:dataset_query
{:database (mt/id)
:type :query
:query {:source-table (mt/id :people)}}
:dataset true}]]
(tu/with-temporary-setting-values [metabot-settings/enum-cardinality-threshold 0]
(let [{:keys [result_metadata]} (metabot-util/denormalize-model model)]
(zero? (count (filter :possible_values result_metadata)))))
(tu/with-temporary-setting-values [metabot-settings/enum-cardinality-threshold 10]
(let [{:keys [result_metadata]} (metabot-util/denormalize-model model)]
(= 1 (count (filter :possible_values result_metadata)))))
(tu/with-temporary-setting-values [metabot-settings/enum-cardinality-threshold 50]
(let [{:keys [result_metadata]} (metabot-util/denormalize-model model)]
(= 2 (count (filter :possible_values result_metadata)))))))))
(deftest denormalize-model-test
(testing "Basic denormalized model test"
(mt/dataset sample-dataset
......@@ -76,8 +94,7 @@
(->> result_metadata
(some (fn [{:keys [sql_name] :as rsmd}] (when (= "SOURCE" sql_name) rsmd)))
:possible_values
set))))
(:result_metadata model)))))
set))))))))
(deftest denormalize-database-test
(testing "Basic denormalized database test"
......@@ -125,12 +142,12 @@
(is (= (mdb.query/format-sql sql)
(metabot-util/extract-sql sql)))))
(testing "Test that we detect SQL embedded in markdown"
(let [sql "SELECT * FROM SOMETHING"
(let [sql "SELECT * FROM SOMETHING"
bot-str (format "kmfeasf fasel;f fasefes; fasef;o ```%s```feafs feass" sql)]
(is (= (mdb.query/format-sql sql)
(metabot-util/extract-sql bot-str)))))
(testing "Test that we detect SQL embedded in markdown with language hint"
(let [sql "SELECT * FROM SOMETHING"
(let [sql "SELECT * FROM SOMETHING"
bot-str (format "kmfeasf fasel;f fasefes; fasef;o ```sql%s```feafs feass" sql)]
(is (= (mdb.query/format-sql sql)
(metabot-util/extract-sql bot-str))))))
......@@ -169,12 +186,246 @@
(deftest create-database-ddl-test
(testing "Ensure the generated pseudo-ddl contains the expected tables and enums."
(mt/dataset sample-dataset
(let [{:keys [create_database_ddl]} (metabot-util/denormalize-database {:id (mt/id)})]
(is (str/includes? create_database_ddl "create type PRODUCTS_CATEGORY_t as enum 'Doohickey', 'Gadget', 'Gizmo', 'Widget';"))
(is (str/includes? create_database_ddl "create type PEOPLE_STATE_t as enum 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT',"))
(is (str/includes? create_database_ddl "create type PEOPLE_SOURCE_t as enum 'Affiliate', 'Facebook', 'Google', 'Organic', 'Twitter';"))
(is (str/includes? create_database_ddl "create type REVIEWS_RATING_t as enum '1', '2', '3', '4', '5';"))
(is (str/includes? create_database_ddl "CREATE TABLE \"PRODUCTS\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"ORDERS\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"PEOPLE\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"REVIEWS\" ("))))))
(tu/with-temporary-setting-values [metabot-settings/enum-cardinality-threshold 50]
(let [{:keys [create_database_ddl]} (->> (metabot-util/denormalize-database {:id (mt/id)})
metabot-util/add-pseudo-database-ddl)]
(is (str/includes? create_database_ddl "create type PRODUCTS_CATEGORY_t as enum 'Doohickey', 'Gadget', 'Gizmo', 'Widget';"))
(is (str/includes? create_database_ddl "create type PEOPLE_STATE_t as enum 'AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT',"))
(is (str/includes? create_database_ddl "create type PEOPLE_SOURCE_t as enum 'Affiliate', 'Facebook', 'Google', 'Organic', 'Twitter';"))
(is (str/includes? create_database_ddl "create type REVIEWS_RATING_t as enum '1', '2', '3', '4', '5';"))
(is (str/includes? create_database_ddl "CREATE TABLE \"PRODUCTS\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"ORDERS\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"PEOPLE\" ("))
(is (str/includes? create_database_ddl "CREATE TABLE \"REVIEWS\" (")))))))
(deftest inner-query-test
(testing "Ensure that a dataset-based query contains expected AS aliases"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card orders-model {:name "Orders Model"
:dataset_query
{:database (mt/id)
:type :query
:query {:source-table (mt/id :orders)}}
:dataset true}]
(let [{:keys [column_aliases inner_query create_table_ddl sql_name]} (metabot-util/denormalize-model orders-model)]
(is (= 9 (count (re-seq #"\s+AS\s+" column_aliases))))
(is (= 10 (count (re-seq #"\s+AS\s+" inner_query))))
(is (= (mdb.query/format-sql
(str/join
[(format "CREATE TABLE \"%s\" (" sql_name)
"'ID' BIGINTEGER,"
"'USER_ID' INTEGER,"
"'PRODUCT_ID' INTEGER,"
"'SUBTOTAL' FLOAT,"
"'TAX' FLOAT,"
"'TOTAL' FLOAT,"
"'DISCOUNT' FLOAT,"
"'CREATED_AT' DATETIMEWITHLOCALTZ,"
"'QUANTITY' INTEGER)"]))
create_table_ddl)))))))
(deftest native-inner-query-test
(testing "A SELECT * will produce column all column names in th resulting DDLs"
(mt/dataset sample-dataset
(let [q (mt/native-query {:query "SELECT * FROM ORDERS;"})
result-metadata (get-in (qp/process-query q) [:data :results_metadata :columns])]
(t2.with-temp/with-temp
[Card orders-model {:name "Orders Model"
:dataset_query q
:result_metadata result-metadata
:dataset true}]
(let [{:keys [column_aliases inner_query create_table_ddl sql_name]} (metabot-util/denormalize-model orders-model)]
(is (= (mdb.query/format-sql
(format "SELECT %s FROM {{#%s}} AS INNER_QUERY" column_aliases (:id orders-model)))
inner_query))
(is (= (mdb.query/format-sql
(str/join
[(format "CREATE TABLE \"%s\" (" sql_name)
"'ID' BIGINTEGER,"
"'USER_ID' INTEGER,"
"'PRODUCT_ID' INTEGER,"
"'SUBTOTAL' FLOAT,"
"'TAX' FLOAT,"
"'TOTAL' FLOAT,"
"'DISCOUNT' FLOAT,"
"'CREATED_AT' DATETIMEWITHLOCALTZ,"
"'QUANTITY' INTEGER)"]))
(mdb.query/format-sql create_table_ddl)))
create_table_ddl)))))
(testing "A SELECT of columns will produce those column names in th resulting DDLs"
(mt/dataset sample-dataset
(let [q (mt/native-query {:query "SELECT TOTAL, QUANTITY, TAX, CREATED_AT FROM ORDERS;"})
result-metadata (get-in (qp/process-query q) [:data :results_metadata :columns])]
(t2.with-temp/with-temp
[Card orders-model {:name "Orders Model"
:dataset_query q
:result_metadata result-metadata
:dataset true}]
(let [{:keys [column_aliases inner_query create_table_ddl sql_name]} (metabot-util/denormalize-model orders-model)]
(is (= (mdb.query/format-sql
(format "SELECT %s FROM {{#%s}} AS INNER_QUERY" column_aliases (:id orders-model)))
inner_query))
(is (= (mdb.query/format-sql
(str/join
[(format "CREATE TABLE \"%s\" (" sql_name)
"'TOTAL' FLOAT,"
"'QUANTITY' INTEGER,"
"'TAX' FLOAT,"
"'CREATED_AT' DATETIMEWITHLOCALTZ)"]))
(mdb.query/format-sql create_table_ddl)))
create_table_ddl)))))
(testing "Duplicate native column aliases will be deduplicated"
(mt/dataset sample-dataset
(let [q (mt/native-query {:query "SELECT TOTAL AS X, QUANTITY AS X FROM ORDERS;"})
result-metadata (get-in (qp/process-query q) [:data :results_metadata :columns])]
(t2.with-temp/with-temp
[Card orders-model {:name "Orders Model"
:dataset_query q
:result_metadata result-metadata
:dataset true}]
(let [{:keys [column_aliases inner_query create_table_ddl sql_name]} (metabot-util/denormalize-model orders-model)]
(is (= (mdb.query/format-sql
(format "SELECT %s FROM {{#%s}} AS INNER_QUERY" column_aliases (:id orders-model)))
inner_query))
(is (= (mdb.query/format-sql
(str/join
[(format "CREATE TABLE \"%s\" (" sql_name)
"'X' FLOAT,"
"'X_2' INTEGER)"]))
(mdb.query/format-sql create_table_ddl)))))))))
(deftest inner-query-with-joins-test
(testing "Models with joins work"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card joined-model {:dataset true
:database_id (mt/id)
:query_type :query
:dataset_query
(mt/mbql-query orders
{:fields [$total &products.products.category]
:joins [{:source-table $$products
:condition [:= $product_id &products.products.id]
:strategy :left-join
:alias "products"}]})}]
(let [{:keys [column_aliases create_table_ddl sql_name]} (metabot-util/denormalize-model joined-model)]
(is (= "\"TOTAL\" AS TOTAL, \"products__CATEGORY\" AS PRODUCTS_CATEGORY"
column_aliases))
(is (= (mdb.query/format-sql
(str/join
["create type PRODUCTS_CATEGORY_t as enum 'Doohickey', 'Gadget', 'Gizmo', 'Widget';"
(format "CREATE TABLE \"%s\" (" sql_name)
"'TOTAL' FLOAT,"
"'PRODUCTS_CATEGORY' 'PRODUCTS_CATEGORY_t')"]))
(mdb.query/format-sql create_table_ddl)))))))
(testing "A model with joins on the same table will produce distinct aliases"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card joined-model {:dataset true
:database_id (mt/id)
:query_type :query
:dataset_query
(mt/mbql-query products
{:fields [$id $category &self.products.category]
:joins [{:source-table $$products
:condition [:= $id &self.products.id]
:strategy :left-join
:alias "self"}]})}]
(let [{:keys [column_aliases create_table_ddl sql_name]} (metabot-util/denormalize-model joined-model)]
(is (= "\"ID\" AS ID, \"CATEGORY\" AS CATEGORY, \"self__CATEGORY\" AS SELF_CATEGORY"
column_aliases))
(is (= (mdb.query/format-sql
(str/join
["create type CATEGORY_t as enum 'Doohickey', 'Gadget', 'Gizmo', 'Widget';"
"create type SELF_CATEGORY_t as enum 'Doohickey', 'Gadget', 'Gizmo', 'Widget';"
(format "CREATE TABLE \"%s\" (" sql_name)
"'ID' BIGINTEGER,"
"'CATEGORY' 'CATEGORY_t',"
"'SELF_CATEGORY' 'SELF_CATEGORY_t')"]))
(mdb.query/format-sql create_table_ddl))))))))
(deftest inner-query-with-aggregations-test
(testing "A model with aggregations will produce column names only (no AS aliases)"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card aggregated-model {:dataset true
:database_id (mt/id)
:query_type :query
:dataset_query
(mt/mbql-query orders
{:aggregation [[:sum $total]]
:breakout [$user_id]})}]
(let [{:keys [column_aliases inner_query create_table_ddl sql_name]} (metabot-util/denormalize-model aggregated-model)]
(is (= (mdb.query/format-sql
(format "SELECT USER_ID, SUM_OF_TOTAL FROM {{#%s}} AS INNER_QUERY" (:id aggregated-model)))
inner_query))
(is (= "USER_ID, SUM_OF_TOTAL" column_aliases))
(is (= (format "CREATE TABLE \"%s\" ('USER_ID' INTEGER, 'SUM_OF_TOTAL' FLOAT)" sql_name)
create_table_ddl))
create_table_ddl)))))
(deftest inner-query-name-collisions-test
(testing "When column names collide, each conflict is disambiguated with an _X postfix"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card orders-model {:name "Orders Model"
:dataset_query
{:database (mt/id)
:type :query
:query {:source-table (mt/id :orders)}}
:dataset true}]
(let [orders-model (update orders-model :result_metadata
(fn [v]
(map #(assoc % :display_name "ABC") v)))
{:keys [column_aliases create_table_ddl]} (metabot-util/denormalize-model orders-model)]
(is (= 9 (count (re-seq #"ABC(?:_\d+)?" column_aliases))))
;; Ensure that the same aliases are used in the create table ddl
(is (= 9 (count (re-seq #"ABC" create_table_ddl))))))))
(testing "Models with name collisions across joins are also correctly disambiguated"
(mt/dataset sample-dataset
(t2.with-temp/with-temp
[Card model {:dataset true
:database_id (mt/id)
:query_type :query
:dataset_query
(mt/mbql-query orders
{:fields [$total &products.products.category &self.products.category]
:joins [{:source-table $$products
:condition [:= $product_id &products.products.id]
:strategy :left-join
:alias "products"}
{:source-table $$products
:condition [:= $id &self.products.id]
:strategy :left-join
:alias "self"}]})}]
(let [model (update model :result_metadata
(fn [v]
(map #(assoc % :display_name "FOO") v)))
{:keys [column_aliases create_table_ddl]} (metabot-util/denormalize-model model)]
(is (= "\"TOTAL\" AS FOO, \"products__CATEGORY\" AS FOO_2, \"self__CATEGORY\" AS FOO_3"
column_aliases))
;; Ensure that the same aliases are used in the create table ddl
;; 7 = 3 for the column names + 2 for the type creation + 2 for the type references
(is (= 7 (count (re-seq #"FOO" create_table_ddl)))))))))
(deftest deconflicting-aliases-test
(testing "Test sql_name generation deconfliction:
- Potentially conflicting names are retained
- As conflicts occur, _X is appended to each alias in increasing order, skipping existing aliases"
(is
(= [{:display_name "ABC", :sql_name "ABC"}
{:display_name "AB", :sql_name "AB"}
{:display_name "A B C", :sql_name "A_B_C"}
{:display_name "ABC", :sql_name "ABC_2"}
{:display_name "ABC_1", :sql_name "ABC_1"}
{:display_name "ABC", :sql_name "ABC_3"}]
(:result_metadata
(#'metabot-util/add-sql-names
{:result_metadata
[{:display_name "ABC"}
{:display_name "AB"}
{:display_name "A B C"}
{:display_name "ABC"}
{:display_name "ABC_1"}
{:display_name "ABC"}]}))))))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment