Skip to content
Snippets Groups Projects
Unverified Commit 365f2c3b authored by Cam Saul's avatar Cam Saul Committed by GitHub
Browse files

Make Athena test data loading more robust (#27538)

parent aa68beea
No related branches found
No related tags found
No related merge requests found
......@@ -142,9 +142,14 @@
;;; ------------------------------------------------- date functions -------------------------------------------------
(def ^:dynamic *loading-data*
"HACK! Whether we're loading data (e.g. in [[metabase.test.data.athena]]). We can't use `timestamp with time zone`
literals when loading data because Athena doesn't let you use a `timestamp with time zone` value for a `timestamp`
column, and you can only have `timestamp` columns when actually creating them."
false)
(defmethod unprepare/unprepare-value [:athena OffsetDateTime]
[_driver t]
#_(format "timestamp '%s %s %s'" (t/local-date t) (t/local-time t) (t/zone-offset t))
;; Timestamp literals do not support offsets, or at least they don't in `INSERT INTO ...` statements. I'm not 100%
;; sure what the correct thing to do here is then. The options are either:
;;
......@@ -154,11 +159,14 @@
;;
;; For now I went with option (1) because it SEEMS like that's what Athena is doing. Not sure about this tho. We can
;; do something better when we figure out what's actually going on. -- Cam
(let [t (u.date/with-time-zone-same-instant t (t/zone-id "UTC"))]
(format "timestamp '%s %s'" (t/local-date t) (t/local-time t))))
(if *loading-data*
(let [t (u.date/with-time-zone-same-instant t (t/zone-id "UTC"))]
(format "timestamp '%s %s'" (t/local-date t) (t/local-time t)))
;; when not loading data we can actually use timestamp with offset info.
(format "timestamp '%s %s %s'" (t/local-date t) (t/local-time t) (t/zone-offset t))))
(defmethod unprepare/unprepare-value [:athena ZonedDateTime]
[_driver t]
[driver t]
;; This format works completely fine for literals e.g.
;;
;; SELECT timestamp '2022-11-16 04:21:00 US/Pacific'
......@@ -169,8 +177,9 @@
;; that have already been created for performance reasons. If you add a new dataset and it should work for
;; Athena (despite Athena only partially supporting TIMESTAMP WITH TIME ZONE) then you can use the commented out impl
;; to do it. That should work ok because it converts it to a UTC then to a LocalDateTime. -- Cam
#_(unprepare/unprepare-value driver (t/offset-date-time t))
(format "timestamp '%s %s %s'" (t/local-date t) (t/local-time t) (t/zone-id t)))
(if *loading-data*
(unprepare/unprepare-value driver (t/offset-date-time t))
(format "timestamp '%s %s %s'" (t/local-date t) (t/local-time t) (t/zone-id t))))
;;; for some evil reason Athena expects `OFFSET` *before* `LIMIT`, unlike every other database in the known universe; so
;;; we'll have to have a custom implementation of `:page` here and do our own version of `:offset` that comes before
......
......@@ -5,6 +5,7 @@
[clojure.tools.logging :as log]
[metabase.config :as config]
[metabase.driver :as driver]
[metabase.driver.athena :as athena]
[metabase.driver.ddl.interface :as ddl.i]
[metabase.driver.sql-jdbc.connection :as sql-jdbc.conn]
[metabase.driver.sql-jdbc.sync :as sql-jdbc.sync]
......@@ -28,8 +29,8 @@
;; Athena doesn't support dashes in Table names... we'll just go ahead and convert them all to underscores, even for DB
;; names.
(defmethod ddl.i/format-name :athena
[driver table-or-field-name]
((get-method ddl.i/format-name :sql-jdbc) driver (str/replace table-or-field-name #"-" "_")))
[driver database-or-table-or-field-name]
((get-method ddl.i/format-name :sql-jdbc) driver (str/replace database-or-table-or-field-name #"-" "_")))
(defmethod tx/dbdef->connection-details :athena
[driver _context {:keys [database-name], :as _dbdef}]
......@@ -58,11 +59,46 @@
([_ db-name table-name] [db-name table-name])
([_ db-name table-name field-name] [db-name table-name field-name]))
;;; INSTRUCTIONS FOR MANUALLY DROPPING AND RECREATING A DATABASE
;;;
;;; 1. Install the AWS CLI if you haven't done so already
;;;
;;; 2. Create a profile using the `MB_ATHENA_TEST_ACCESS_KEY`, `MB_ATHENA_TEST_SECRET_KEY`, and `MB_ATHENA_TEST_REGION`
;;; you're using to run tests
;;;
;;; ````
;;; aws configure --profile athena-ci
;;;
;;; 3. Delete the data from the `MB_ATHENA_TEST_S3_STAGING_DIR` S3 bucket. The data directory is the same as the dataset
;;; name you want to delete with hyphens replaced with underscores e.g. `sample-dataset` becomes `sample_dataset`
;;;
;;; ```
;;; aws s3 --profile athena-ci rm s3://metabase-ci-athena-results/sample_dataset --recursive
;;; ```
;;;
;;; 4. Delete the database from the Glue Console.
;;;
;;; ```
;;; aws glue --profile athena-ci delete-database --name sample_dataset
;;; ```
;;;
;;; 5. After this you can recreate the database normally using the test loading code. Note that you must
;;; enable [[*allow-database-creation*]] for this to work:
;;;
;;; ```
;;; (db/delete! 'Database :engine "athena", :name "sample-dataset")
;;; (binding [metabase.test.data.athena/*allow-database-creation* true]
;;; (metabase.driver/with-driver :athena
;;; (metabase.test/dataset sample-dataset
;;; (metabase.test/db))))
;;; ```
;;; Athena requires backtick-escaped database name for some queries
(defmethod sql.tx/drop-db-if-exists-sql :athena
[driver {:keys [database-name]}]
(log/warn "Dropping an Athena database does not delete existing data. You may have to delete in manually from S3 and Glue Console.")
(format "DROP DATABASE IF EXISTS `%s` CASCADE;" (ddl.i/format-name driver database-name)))
[_driver _dbdef]
(log/warn (str "You cannot delete a [non-Iceberg] Athena database using DDL statements. It has to be deleted "
"manually from S3 and the Glue Console. See documentation in [[metabase.test.data.athena]] for "
"instructions for doing this.")))
(defmethod sql.tx/drop-table-if-exists-sql :athena
[driver {:keys [database-name]} {:keys [table-name]}]
......@@ -160,7 +196,10 @@
;;; 200 is super slow, and the query ends up being too large around 500 rows... for some reason the same dataset
;;; orders table (about 17k rows) stalls out at row 10,000 when loading them 200 at a time. It works when you do 400
;;; at a time tho. This is just going to have to be ok for now.
(binding [load-data/*chunk-size* 400]
(binding [load-data/*chunk-size* 400
;; This tells Athena to convert `timestamp with time zone` literals to `timestamp` because otherwise it gets
;; very fussy! See [[athena/*loading-data*]] for more info.
athena/*loading-data* true]
(apply load-data/load-data-add-ids-chunked! args)))
(defn- server-connection-details []
......@@ -177,11 +216,26 @@
(log/infof "The following Athena databases have already been created: %s" (pr-str (sort dbs)))
dbs)))
(def ^:private ^:dynamic *allow-database-creation*
"Whether to allow database creation. This is normally disabled to prevent people from accidentally loading duplicate
data into Athena or somehow stomping over existing databases and breaking CI. If you want to create a new dataset,
change this flag to true and run your code again so the data will be loaded normally. Set it back to false when
you're done."
false)
(defmethod tx/create-db! :athena
[driver {:keys [database-name], :as db-def} & options]
(let [database-name (ddl.i/format-name driver database-name)]
(if (contains? (existing-databases) database-name)
(cond
(contains? (existing-databases) database-name)
(log/infof "Athena database %s already exists, skipping creation" (pr-str database-name))
(not *allow-database-creation*)
(log/fatalf (str "Athena database creation is disabled: not creating database %s. Tests will likely fail.\n"
"See metabase.test.data.athena/*allow-database-creation* for more info.")
(pr-str database-name))
:else
(do
(log/infof "Creating Athena database %s" (pr-str database-name))
;; call the default impl for SQL JDBC drivers
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment