From 9fef23dc6be04a0b83cd11f9090ee0c71d51b343 Mon Sep 17 00:00:00 2001
From: lbrdnk <lbrdnk@users.noreply.github.com>
Date: Thu, 10 Aug 2023 14:09:30 +0200
Subject: [PATCH] Unify semicolon and comment handling in nested native queries
 (#30677)

* Unify semicolon and comment handling in nested native queries

* Use different comment and semicolon removal approach

* Apply suggestions from code review

Co-authored-by: metamben <103100869+metamben@users.noreply.github.com>

* Update base and effective type verification for oracle

* Avoid running test with snowflake and bigquery

* Update `breakout-results` and test

1. In `breakout-results` case driver statement was redundant.
Original code had field name and types hardcoded. During further
examination, I've realized name and types are available thanks
to the use of `qp.test/col`, so are just "unhardcoded" :)

2. `card-id-native-source-queries-test` is altered, so instead of
hardcoding native sub-queries, roughly equivalent* mbql query is
compiled and its native form is used. This way I can hopefully avoid
problems with extraction of cloud databases' table names for testing
purposes.

* By roughly equivalent I mean instead of using * for selecting
fields, mbql is compiled selecting fields explicitly. Also limit is added
in some cases, but local testing showed no problems with that.

---------

Co-authored-by: metamben <103100869+metamben@users.noreply.github.com>
---
 src/metabase/driver/sql/query_processor.clj   | 29 +++++++-
 .../driver/sql/query_processor/deprecated.clj |  4 +-
 .../middleware/fetch_source_query.clj         | 30 ++------
 .../driver/sql/parameters/substitute_test.clj |  9 ++-
 .../driver/sql/query_processor_test.clj       | 41 +++++++++++
 .../middleware/fetch_source_query_test.clj    | 10 ---
 .../nested_queries_test.clj                   | 72 ++++++++++---------
 7 files changed, 118 insertions(+), 77 deletions(-)

diff --git a/src/metabase/driver/sql/query_processor.clj b/src/metabase/driver/sql/query_processor.clj
index f99deafa2cf..7aef5ff1767 100644
--- a/src/metabase/driver/sql/query_processor.clj
+++ b/src/metabase/driver/sql/query_processor.clj
@@ -46,9 +46,32 @@
   nil)
 
 (defn make-nestable-sql
-  "For embedding native sql queries, wraps [sql] in parens and removes any semicolons"
+  "Do best effort edit to the `sql`, to make it nestable in subselect.
+
+  That requires:
+
+  - Removal of traling comments (after the semicolon).
+  - Removing the semicolon(s).
+  - Squashing whitespace at the end of the string and replacinig it with newline. This is required in case some
+    comments were preceding semicolon.
+  - Wrapping the result in parens.
+
+  This implementation does not handle few cases cases properly. 100% correct comment and semicolon removal would
+  probably require _parsing_ sql string and not just a regular expression replacement. Link to the discussion:
+  https://github.com/metabase/metabase/pull/30677
+
+  For the limitations see the [[metabase.driver.sql.query-processor-test/make-nestable-sql-test]]"
   [sql]
-  (str "(" (str/replace sql #";[\s;]*$" "") ")"))
+  (str "("
+       (-> sql
+           (str/replace #";([\s;]*(--.*\n?)*)*$" "")
+           str/trimr
+           (as-> trimmed
+                 ;; Query could potentially end with a comment.
+                 (if (re-find #"--.*$" trimmed)
+                   (str trimmed "\n")
+                   trimmed)))
+       ")"))
 
 (defn- format-sql-source-query [_fn [sql params]]
   (into [(make-nestable-sql sql)] params))
@@ -71,7 +94,7 @@
   (case (long hx/*honey-sql-version*)
     1
     #_{:clj-kondo/ignore [:deprecated-var]}
-    (sql.qp.deprecated/->SQLSourceQuery sql params)
+    (sql.qp.deprecated/->SQLSourceQuery (make-nestable-sql sql) params)
 
     2
     [::sql-source-query sql params]))
diff --git a/src/metabase/driver/sql/query_processor/deprecated.clj b/src/metabase/driver/sql/query_processor/deprecated.clj
index 6b4f2a3dfb6..1111bd721ff 100644
--- a/src/metabase/driver/sql/query_processor/deprecated.clj
+++ b/src/metabase/driver/sql/query_processor/deprecated.clj
@@ -6,7 +6,6 @@
   Deprecated method impls should call [[log-deprecation-warning]] to gently nudge driver authors to stop using this
   method."
   (:require
-   [clojure.string :as str]
    [honeysql.core :as hsql]
    [honeysql.format :as hformat]
    [metabase.query-processor.store :as qp.store]
@@ -45,8 +44,7 @@
   hformat/ToSql
   (to-sql [_]
     (dorun (map hformat/add-anon-param params))
-    ;; strip off any trailing semicolons
-    (str "(" (str/replace sql #";+\s*$" "") ")"))
+    sql)
 
   pretty/PrettyPrintable
   (pretty [_]
diff --git a/src/metabase/query_processor/middleware/fetch_source_query.clj b/src/metabase/query_processor/middleware/fetch_source_query.clj
index cd937b5b9f1..0eba9d1b57d 100644
--- a/src/metabase/query_processor/middleware/fetch_source_query.clj
+++ b/src/metabase/query_processor/middleware/fetch_source_query.clj
@@ -23,7 +23,6 @@
   TODO - consider renaming this namespace to `metabase.query-processor.middleware.resolve-card-id-source-tables`"
   (:require
    [clojure.set :as set]
-   [clojure.string :as str]
    [medley.core :as m]
    [metabase.driver.ddl.interface :as ddl.i]
    [metabase.driver.util :as driver.u]
@@ -96,19 +95,6 @@
 ;;; |                                       Resolving card__id -> source query                                       |
 ;;; +----------------------------------------------------------------------------------------------------------------+
 
-(mu/defn ^:private trim-sql-query :- ms/NonBlankString
-  "Native queries can have trailing SQL comments. This works when executed directly, but when we use the query in a
-  nested query, we wrap it in another query, which can cause the last part of the query to be unintentionally
-  commented out, causing it to fail. This function removes any trailing SQL comment."
-  [card-id   :- ms/PositiveInt
-   query-str :- ms/NonBlankString]
-  (let [trimmed-string (str/replace query-str #"--.*(\n|$)" "")]
-    (if (= query-str trimmed-string)
-      query-str
-      (do
-        (log/info (trs "Trimming trailing comment from card with id {0}" card-id))
-        trimmed-string))))
-
 (defn- source-query
   "Get the query to be run from the card"
   [{dataset-query :dataset_query card-id :id :as card}]
@@ -121,18 +107,10 @@
      (when-some [native-query (set/rename-keys native-query {:query :native})]
        (let [mongo? (= (driver.u/database->driver db-id) :mongo)]
          (cond-> native-query
-                 ;; MongoDB native  queries consist of a collection and a pipelne (query)
-                 mongo?
-                 (update :native (fn [pipeline] {:collection (:collection native-query)
-                                                 :query      pipeline}))
-
-                 ;; trim trailing comments from SQL, but not other types of native queries
-                 (and (not mongo?)
-                      (string? (:native native-query)))
-                 (update :native (partial trim-sql-query card-id))
-
-                 (empty? template-tags)
-                 (dissoc :template-tags))))
+           ;; MongoDB native  queries consist of a collection and a pipelne (query)
+           mongo? (update :native (fn [pipeline] {:collection (:collection native-query)
+                                                  :query      pipeline}))
+           (empty? template-tags) (dissoc :template-tags))))
      (throw (ex-info (tru "Missing source query in Card {0}" card-id)
                      {:card card})))))
 
diff --git a/test/metabase/driver/sql/parameters/substitute_test.clj b/test/metabase/driver/sql/parameters/substitute_test.clj
index c80fdc04828..8221cd33abe 100644
--- a/test/metabase/driver/sql/parameters/substitute_test.clj
+++ b/test/metabase/driver/sql/parameters/substitute_test.clj
@@ -302,10 +302,13 @@
     (let [query ["SELECT * FROM " (param "#123")]]
       (is (= ["SELECT * FROM (SELECT 1 `x`)" []]
              (substitute query {"#123" (params/map->ReferencedCardQuery {:card-id 123, :query "SELECT 1 `x`"})})))))
-  (testing "Referenced card query substitution removes trailing semicolons and whitespace #28218"
+  (testing "Referenced card query substitution removes comments (#29168), trailing semicolons (#28218) and whitespace"
     (let [query ["SELECT * FROM " (param "#123")]]
-      (is (= ["SELECT * FROM (SELECT ';' `x`)" []]
-             (substitute query {"#123" (params/map->ReferencedCardQuery {:card-id 123, :query "SELECT ';' `x`; ; "})}))))))
+      (are [nested expected] (= [(str "SELECT * FROM (" expected ")") []]
+                                (substitute query {"#123" (params/map->ReferencedCardQuery
+                                                           {:card-id 123, :query nested})}))
+        "SELECT ';' `x`; ; "             "SELECT ';' `x`"
+        "SELECT * FROM table\n-- remark" "SELECT * FROM table\n-- remark\n"))))
 
 ;;; --------------------------------------------- Native Query Snippets ----------------------------------------------
 
diff --git a/test/metabase/driver/sql/query_processor_test.clj b/test/metabase/driver/sql/query_processor_test.clj
index aac3385bbe5..00c32a42b64 100644
--- a/test/metabase/driver/sql/query_processor_test.clj
+++ b/test/metabase/driver/sql/query_processor_test.clj
@@ -1112,3 +1112,44 @@
                                     :breakout    [:binning-strategy $quantity :num-bins 10]}))
                    (mdb.query/format-sql :h2)
                    str/split-lines)))))))
+
+(deftest ^:parallel make-nestable-sql-test
+  (testing "Native sql query should be modified to be usable in subselect"
+    (are [raw nestable] (= nestable (sql.qp/make-nestable-sql raw))
+      "SELECT ';' `x`; ; "
+      "(SELECT ';' `x`)"
+
+      "SELECT * FROM table\n-- remark"
+      "(SELECT * FROM table\n-- remark\n)"
+
+      ;; Comment, semicolon, comment, comment.
+      "SELECT * from people -- people -- cool table\n ; -- cool query\n -- some notes on cool query"
+      "(SELECT * from people -- people -- cool table\n)"
+
+      ;; String containing semicolon, double dash and newline followed by NO _comment or semicolon or end of input_.
+      "SELECT 'string with \n ; -- ends \n on new line';"
+      "(SELECT 'string with \n ; -- ends \n on new line')"
+
+      ;; String containing semicolon followed by double dash followed by THE _comment or semicolon or end of input_.
+      ;; TODO: Enable when better sql parsing solution is found in the [[sql.qp/make-nestable-sql]]].
+      #_#_
+      "SELECT 'string with \n ; -- ending on the same line';"
+      "(SELECT 'string with \n ; -- ending on the same line')"
+      #_#_
+      "SELECT 'string with \n ; -- ending on the same line';\n-- comment"
+      "(SELECT 'string with \n ; -- ending on the same line')"
+
+      ;; String containing just `--` without `;` works
+      "SELECT 'string with \n -- ending on the same line';"
+      "(SELECT 'string with \n -- ending on the same line'\n)"
+
+      ;; String with just `;`
+      "SELECT 'string with ; ending on the same line';"
+      "(SELECT 'string with ; ending on the same line')"
+
+      ;; Semicolon after comment after semicolon
+      "SELECT ';';\n
+      --c1\n
+      ; --c2\n
+      -- c3"
+      "(SELECT ';')")))
diff --git a/test/metabase/query_processor/middleware/fetch_source_query_test.clj b/test/metabase/query_processor/middleware/fetch_source_query_test.clj
index 5974e234647..75f88ef3a95 100644
--- a/test/metabase/query_processor/middleware/fetch_source_query_test.clj
+++ b/test/metabase/query_processor/middleware/fetch_source_query_test.clj
@@ -326,16 +326,6 @@
                (resolve-card-id-source-tables query)))))))
 
 (deftest card-id->source-query-and-metadata-test
-  (testing "card-id->source-query-and-metadata-test should trim SQL queries"
-    (let [query {:type     :native
-                 :native   {:query "SELECT * FROM table\n-- remark"}
-                 :database (mt/id)}]
-      (t2.with-temp/with-temp [Card {card-id :id} {:dataset_query query}]
-        (is (= {:source-metadata nil
-                :source-query    {:native "SELECT * FROM table\n"}
-                :database        (mt/id)}
-               (#'fetch-source-query/card-id->source-query-and-metadata card-id))))))
-
   (testing "card-id->source-query-and-metadata-test should preserve non-SQL native queries"
     (let [query {:type     :native
                  :native   {:projections ["_id" "user_id" "venue_id"],
diff --git a/test/metabase/query_processor_test/nested_queries_test.clj b/test/metabase/query_processor_test/nested_queries_test.clj
index e2f2bbaaad1..d142ee9aa73 100644
--- a/test/metabase/query_processor_test/nested_queries_test.clj
+++ b/test/metabase/query_processor_test/nested_queries_test.clj
@@ -1,6 +1,7 @@
 (ns metabase.query-processor-test.nested-queries-test
   "Tests for handling queries with nested expressions."
   (:require
+   [clojure.set :as set]
    [clojure.string :as str]
    [clojure.test :refer :all]
    [honey.sql :as sql]
@@ -75,19 +76,21 @@
 (defn breakout-results [& {:keys [has-source-metadata? native-source?]
                             :or   {has-source-metadata? true
                                    native-source?       false}}]
-  {:rows [[1 22]
-          [2 59]
-          [3 13]
-          [4  6]]
-   :cols [(cond-> (qp.test/breakout-col (qp.test/col :venues :price))
-            native-source?
-            (-> (assoc :field_ref [:field "PRICE" {:base-type :type/Integer}]
-                       :effective_type :type/Integer)
-                (dissoc :description :parent_id :nfc_path :visibility_type))
-
-            (not has-source-metadata?)
-            (dissoc :id :semantic_type :settings :fingerprint :table_id :coercion_strategy))
-          (qp.test/aggregate-col :count)]})
+  (let [{base-type :base_type effective-type :effective_type :keys [name] :as breakout-col}
+        (qp.test/breakout-col (qp.test/col :venues :price))]
+    {:rows [[1 22]
+            [2 59]
+            [3 13]
+            [4  6]]
+     :cols [(cond-> breakout-col
+              native-source?
+              (-> (assoc :field_ref [:field name {:base-type base-type}]
+                         :effective_type effective-type)
+                  (dissoc :description :parent_id :nfc_path :visibility_type))
+
+              (not has-source-metadata?)
+              (dissoc :id :semantic_type :settings :fingerprint :table_id :coercion_strategy))
+            (qp.test/aggregate-col :count)]}))
 
 (deftest mbql-source-query-breakout-aggregation-test
   (mt/test-drivers (mt/normal-drivers-with-feature :nested-queries)
@@ -312,25 +315,30 @@
                  (query-with-source-card card)))))))))
 
 (deftest card-id-native-source-queries-test
-  (let [run-native-query
-        (fn [sql]
-          (t2.with-temp/with-temp [Card card {:dataset_query {:database (mt/id), :type :native, :native {:query sql}}}]
-            (qp.test/rows-and-cols
-             (mt/format-rows-by [int int]
-               (qp/process-query
-                (query-with-source-card card
-                                        (mt/$ids venues
-                                          {:aggregation [:count]
-                                           :breakout    [*price]})))))))]
-    (is (= (breakout-results :has-source-metadata? false :native-source? true)
-           (run-native-query "SELECT * FROM VENUES"))
-        "make sure `card__id`-style queries work with native source queries as well")
-    (is (= (breakout-results :has-source-metadata? false :native-source? true)
-           (run-native-query "SELECT * FROM VENUES -- small comment here"))
-        "Ensure trailing comments are trimmed and don't cause a wrapping SQL query to fail")
-    (is (= (breakout-results :has-source-metadata? false :native-source? true)
-           (run-native-query "SELECT * FROM VENUES -- small comment here\n"))
-        "Ensure trailing comments followed by a newline are trimmed and don't cause a wrapping SQL query to fail")))
+  (mt/test-drivers (set/intersection (mt/normal-drivers-with-feature :nested-queries)
+                                     (descendants driver/hierarchy :sql))
+    (let [native-sub-query (-> (mt/mbql-query venues {:source-table $$venues}) qp/compile :query)
+          run-native-query
+          (fn [sql]
+            (t2.with-temp/with-temp [Card card {:dataset_query {:database (mt/id)
+                                                                :type :native
+                                                                :native {:query sql}}}]
+              (qp.test/rows-and-cols
+               (mt/format-rows-by [int int]
+                 (qp/process-query
+                  (query-with-source-card card
+                                          (mt/$ids venues
+                                                   {:aggregation [:count]
+                                                    :breakout    [*price]})))))))]
+      (is (= (breakout-results :has-source-metadata? false :native-source? true)
+             (run-native-query native-sub-query))
+          "make sure `card__id`-style queries work with native source queries as well")
+      (is (= (breakout-results :has-source-metadata? false :native-source? true)
+             (run-native-query (str native-sub-query " -- small comment here")))
+          "Ensure trailing comments are trimmed and don't cause a wrapping SQL query to fail")
+      (is (= (breakout-results :has-source-metadata? false :native-source? true)
+             (run-native-query (str native-sub-query " -- small comment here\n")))
+          "Ensure trailing comments followed by a newline are trimmed and don't cause a wrapping SQL query to fail"))))
 
 (deftest filter-by-field-literal-test
   (testing "make sure we can filter by a field literal"
-- 
GitLab