Commit ba66cf32 (Unverified)
Authored 7 months ago by Chris Truter, committed by GitHub 7 months ago

Consider the first 10 lines when inferring upload separator (#46321)

Parent: b9a7566f
Showing 2 changed files with 94 additions and 42 deletions:

  src/metabase/upload.clj        38 additions, 30 deletions
  test/metabase/upload_test.clj  56 additions, 12 deletions
src/metabase/upload.clj  (+38 −30)
@@ -171,8 +171,16 @@
   (let [parsers (map #(upload-parsing/upload-type->parser % settings) col-upload-types)]
     (for [row rows]
       (for [[value parser] (u/map-all vector row parsers)]
-        (when-not (str/blank? value)
-          (parser value))))))
+        (do
+          (when-not parser
+            (throw (ex-info (format "Column count in data (%s) exceeds the number of in the header (%s)"
+                                    (count rows)
+                                    (count parsers))
+                            {:settings         settings
+                             :col-upload-types rows
+                             :row              row})))
+          (when-not (str/blank? value)
+            (parser value)))))))
 
 (defn- remove-indices
   "Removes the elements at the given indices from the collection. Indices is a set."
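The guard added above only fires because u/map-all keeps pairing values with parsers past the end of the shorter collection, so a data row wider than the header yields pairs whose parser is nil. A minimal sketch of that behaviour, with map-all* as a simplified stand-in for metabase.util/map-all (its nil-padding behaviour is an assumption here, not a copy of the real helper):

;; Sketch only: map-all* stands in for metabase.util/map-all, assumed to behave
;; like map but to continue to the end of the longest collection, padding the
;; shorter ones with nil.
(defn map-all* [f & colls]
  (let [n   (apply max (map count colls))
        pad (fn [coll] (concat coll (repeat (- n (count coll)) nil)))]
    (apply map f (map pad colls))))

;; Two parsers derived from the header, but a three-cell data row: the third
;; pair carries a nil parser, which the new guard turns into an ex-info.
(map-all* vector ["1" "2" "3"] [:int-parser :string-parser])
;; => (["1" :int-parser] ["2" :string-parser] ["3" nil])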
@@ -210,44 +218,44 @@
 (def ^:private separators ",;\t")
 
-(defn- assert-inferred-separator [maybe-s]
-  (or maybe-s
-      (throw (ex-info (tru "Unable to recognise file separator")
-                      {:status-code 422}))))
+;; This number was chosen arbitrarily. There is robustness / performance trade-off.
+(def ^:private max-inferred-lines 10)
+
+(defn- separator-priority
+  "Prefer separators according to the follow criteria, in order:
+  - Splitting the header at least once.
+  - Giving a consistent column split for all the lines.
+  - Not having more columns in any row than in the header.
+  - The maximum number of column splits.
+  - The number of fields in the header.
+  - The precedence order in how we define them, e.g. a bias towards comma.
+    This last preference is implicit in the order of [[separators]]"
+  [[header-column-count & data-row-column-counts]]
+  [(when header-column-count
+     (> header-column-count 1))
+   (apply = header-column-count data-row-column-counts)
+   (not (some #(> % header-column-count) data-row-column-counts))
+   (reduce max 0 data-row-column-counts)
+   header-column-count])
 
 (defn- infer-separator
   "Guess at what symbol is being used as a separator in the given CSV-like file.
   Our heuristic is to use the separator that gives us the most number of columns.
   Exclude separators which give incompatible column counts between the header and the first row."
-  [^File file]
+  [readable]
   (let [count-columns (fn [s]
-                        ;; Create a separate reader per separator, as the line-breaking behaviour depends on the parser.
-                        (with-open [reader (bom/bom-reader file)]
+                        ;; Create a separate reader per separator, as the line-breaking behavior depends on the parser.
+                        (with-open [reader (bom/bom-reader readable)]
                           (try
                             (into []
-                                  ;; take first two rows and count the number of columns in each to compare headers
-                                  ;; vs data rows.
-                                  (comp (take 2) (map count))
+                                  (comp (take max-inferred-lines) (map count))
                                   (csv/read-csv reader :separator s))
                             (catch Exception _e
                               nil))))]
     (->> (map (juxt identity count-columns) separators)
-         ;; We cannot have more data columns than header columns
-         ;; We currently support files without any data rows, and these get a free pass.
-         (remove (fn [[_s [header-column-count data-column-count]]]
-                   (when data-column-count
-                     (> data-column-count header-column-count))))
-         ;; Prefer separators according to the follow criteria, in order:
-         ;; - Splitting the header at least once
-         ;; - Giving a consistent column split for the first two lines of the file
-         ;; - The number of fields in the header
-         ;; - The precedence order in how we define them, e.g.. bias towards comma
-         (sort-by (fn [[_ [header-column-count data-column-count]]]
-                    [(when header-column-count
-                       (> header-column-count 1))
-                     (= header-column-count data-column-count)
-                     header-column-count])
-                  u/reverse-compare)
-         ffirst
-         assert-inferred-separator)))
+         (sort-by (comp separator-priority second)
+                  u/reverse-compare)
+         ffirst)))
 
 (defn- infer-parser
   "Currently this only infers the separator, but in future it may also handle different quoting options."
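The heart of the change is in the hunk above: count-columns now samples up to max-inferred-lines (10) rows per candidate separator, and the old remove-then-sort pipeline collapses into a single ranking by separator-priority under u/reverse-compare. A rough sketch of how that ranking plays out for the "Consistent column counts" case from the test file below, with reverse-compare* as a local stand-in for metabase.util/reverse-compare:

;; Sketch only: mirrors the separator-priority logic above so it can run on its
;; own; reverse-compare* is a local stand-in for metabase.util/reverse-compare.
(defn- reverse-compare* [a b] (compare b a))

(defn- separator-priority*
  [[header-column-count & data-row-column-counts]]
  [(when header-column-count (> header-column-count 1))
   (apply = header-column-count data-row-column-counts)
   (not (some #(> % header-column-count) data-row-column-counts))
   (reduce max 0 data-row-column-counts)
   header-column-count])

;; Column counts that count-columns would report for the (up to 10) lines of
;; ["a;b,c\td" "1;2,3,4\t5\t6"], per candidate separator:
(def column-counts {\, [2 3], \; [2 2], \tab [2 3]})

(->> (map (juxt identity column-counts) [\, \; \tab])
     (sort-by (comp separator-priority* second) reverse-compare*)
     ffirst)
;; => \;  (the only separator that splits every sampled line consistently)

Semicolon wins on the consistency criterion even though comma and tab both split the data row into more columns; the remaining tie between comma and tab never matters because only the top-ranked separator is taken.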
test/metabase/upload_test.clj  (+56 −12)
@@ -34,7 +34,7 @@
    [metabase.util.malli :as mu]
    [toucan2.core :as t2])
   (:import
-   (java.io File)))
+   (java.io ByteArrayInputStream File)))
 
 (set! *warn-on-reflection* true)
@@ -366,7 +366,7 @@
       (finally
         ;; I'm experimenting with disabling this, it seems preposterous that this would actually cause test flakes --
         ;; Cam
-        (do #_when #_(not= driver/*driver* :redshift) ; redshift tests flake when tables are dropped
+        (when true #_(not= driver/*driver* :redshift) ; redshift tests flake when tables are dropped
           (driver/drop-table! driver/*driver*
                               (:db_id table)
                               (#'upload/table-identifier table))))))
@@ -565,24 +565,68 @@
           (is (= (rows-with-auto-pk [["a,b,c" "d"]])
                  (rows-for-table table)))))))))
 
+(defn reusable-string-reader
+  "Because life is too short for zillions of temp files."
+  [^String s]
+  (let [bytes (.getBytes s "UTF-8")]
+    (reify
+      io/IOFactory
+      (make-input-stream [_ _opts]
+        (ByteArrayInputStream. bytes))
+      (make-reader [this opts]
+        (io/reader (io/make-input-stream this opts))))))
+
+(defn- infer-separator [rows]
+  (#'upload/infer-separator (reusable-string-reader (str/join "\n" rows))))
+
 (deftest infer-separator-test
   (testing "doesn't error when checking alternative separators (#44034)"
-    (let [file (csv-file-with ["\"c1\",\"c2\""
-                               "\"a,b,c\",\"d\""])]
-      (is (= \, (#'upload/infer-separator file)))))
+    (let [rows ["\"c1\",\"c2\""
+                "\"a,b,c\",\"d\""]]
+      (is (= \, (infer-separator rows)))))
   (doseq [[separator lines] example-files]
     (testing (str "inferring " separator)
-      (let [f (csv-file-with lines)
-            s ({:tab \tab :semi-colon \; :comma \,} separator)]
-        (is (= s (#'upload/infer-separator f))))))
+      (let [s ({:tab \tab :semi-colon \; :comma \,} separator)]
+        (is (= s (infer-separator lines))))))
   ;; it's actually decently hard to make it not stumble on comma or semicolon. The strategy here is that the data
   ;; column count is greater than the header column count regardless of the separators we choose
   (let [lines ["," ",,,;;;\t\t"]]
-    (testing "throws an error if there's no clear winner"
-      (let [f (csv-file-with lines)]
-        (is (thrown-with-msg? clojure.lang.ExceptionInfo #"Unable to recognise file separator"
-                              (#'upload/infer-separator f)))))))
+    (testing "will defer data width errors to insertion time if other separators are degenerate"
+      (is (= \, (infer-separator lines))))))
+
+(deftest infer-separator-priority-test
+  (testing "Multiple header columns"
+    ;; Despite inconsistent counts, we pick \;
+    (is (= \; (infer-separator ["a;b"
+                                "1"]))))
+  (testing "Consistent column counts"
+    ;; despite more data columns for the other separators, we pick \;
+    (is (= \; (infer-separator ["a;b,c\td"
+                                "1;2,3,4\t5\t6"]))))
+  (testing "Headers for every column"
+    ;; despite more data columns for other separators, we pick \;
+    (is (= \; (infer-separator ["a,b;c\td"
+                                "1,2,3;4\t5"]))))
+  (testing "Greatest number of data columns"
+    ;; despite more headers for \, we pick \;
+    (is (= \; (infer-separator ["a;b;c;d,e,f,g,h\ti\tj"
+                                "1;2;3,4\t5"]))))
+  (testing "Greatest number of header columns"
+    (is (= \; (infer-separator ["a,b;c;d\te"]))))
+  (testing "Precedence"
+    (is (= \, (infer-separator [])))
+    (is (= \; (infer-separator ["a\tb;c"
+                                "1\t2;3"])))))
+
+(deftest infer-separator-multiline-test
+  (testing "it picks the only viable separator forced by a quote"
+    (is (= \; (infer-separator ["name, first;surname"
+                                "bond, james;bond"
+                                "\"semi;\";colon"]))))
+  (testing "it considers consistency across the split count"
+    (is (= \; (infer-separator ["product name; amount, in dollars"
+                                "blunderbuss; 1,000"
+                                "cyberwagon; 1,000,000"])))))
+
 (deftest create-from-csv-date-test
   (testing "Upload a CSV file with a datetime column"
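The reusable-string-reader helper added above works because infer-separator opens a fresh reader per candidate separator (bom/bom-reader inside count-columns), so the test fixture must be something clojure.java.io/reader can open repeatedly; a plain java.io.StringReader would be consumed after the first pass. A small usage sketch of the same reify, self-contained outside the test namespace:

;; Sketch only: shows why the reified io/IOFactory can be read more than once.
(require '[clojure.java.io :as io])
(import '(java.io ByteArrayInputStream))

(defn reusable-string-reader [^String s]
  (let [bytes (.getBytes s "UTF-8")]
    (reify io/IOFactory
      (make-input-stream [_ _opts] (ByteArrayInputStream. bytes))
      (make-reader [this opts] (io/reader (io/make-input-stream this opts))))))

(let [readable (reusable-string-reader "a,b\n1,2")]
  ;; Each with-open gets a fresh stream, so the same content can be re-read
  ;; once per candidate separator.
  [(with-open [r (io/reader readable)] (slurp r))
   (with-open [r (io/reader readable)] (slurp r))])
;; => ["a,b\n1,2" "a,b\n1,2"]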