Skip to content
Snippets Groups Projects
Commit c843a8f3 authored by Cam Saül's avatar Cam Saül Committed by GitHub
Browse files

Merge pull request #4258 from metabase/better-support-for-non-english-chars-in-slugs

Better support for non-English characters in slugs
parents 5f618860 5ca1ec0f
No related branches found
No related tags found
No related merge requests found
......@@ -12,6 +12,7 @@
[coerce :as coerce]
[format :as time])
colorize.core
[ring.util.codec :as codec]
[metabase.config :as config]
metabase.logger) ; make sure this is loaded since we use clojure.tools.logging here
(:import clojure.lang.Keyword
......@@ -19,6 +20,7 @@
InetSocketAddress
InetAddress)
(java.sql SQLException Timestamp)
(java.text Normalizer Normalizer$Form)
(java.util Calendar Date TimeZone)
javax.xml.bind.DatatypeConverter
org.joda.time.DateTime
......@@ -663,21 +665,44 @@
(when (seq more)
(apply strict-extend atype more)))
(defn remove-diacritical-marks
  "Strip combining diacritical marks (accents, umlauts, tildes, ...) from S, leaving the base letters.
  Returns `nil` when S is `nil` or empty."
  ^String [^String s]
  (when-not (empty? s)
    ;; Step 1: canonical decomposition (NFD). A precomposed character such as 'LATIN CAPITAL LETTER A WITH ACUTE'
    ;; becomes the pair 'LATIN CAPITAL LETTER A' + 'COMBINING ACUTE ACCENT'.
    ;; See http://docs.oracle.com/javase/8/docs/api/java/text/Normalizer.html
    (let [decomposed (Normalizer/normalize s Normalizer$Form/NFD)]
      ;; Step 2: delete every run of characters from the Combining Diacritical Marks Unicode block, which is where
      ;; the decomposed accents now live. Background: http://stackoverflow.com/a/5697575/1198455 and
      ;; http://docs.oracle.com/javase/7/docs/api/java/lang/Character.UnicodeBlock.html#COMBINING_DIACRITICAL_MARKS
      (s/replace decomposed #"\p{Block=CombiningDiacriticalMarks}+" ""))))
(def ^:private ^:const slugify-valid-chars
  "The set of *ASCII* characters that `slugify` keeps as-is in generated URL slugs:
  lowercase letters a-z, digits 0-9, and the underscore."
  (set "abcdefghijklmnopqrstuvwxyz0123456789_"))
;; unfortunately it seems that this doesn't fully-support Emoji :(, they get encoded as "??"
;; NOTE(review): presumably because characters outside the Basic Multilingual Plane arrive here as two separate
;; surrogate `char`s, neither of which can be encoded on its own -- confirm before relying on this explanation.
(defn- slugify-char
  "Convert a single (already lower-cased) character C into its slug representation:

   *  non-ASCII characters (code unit 128 and above) are URL-encoded, returning a `String`
   *  ASCII characters in `slugify-valid-chars` are returned unchanged
   *  every other ASCII character is replaced with an underscore"
  [^Character c]
  (cond
    ;; ASCII covers 0-127 inclusive, so 128 (U+0080) is already non-ASCII and must be encoded, not replaced.
    ;; `url-encode` is documented as taking a string, so coerce the char explicitly.
    (> (int c) 127)                   (codec/url-encode (str c))
    (contains? slugify-valid-chars c) c
    :else                             \_))
(defn slugify
  "Return a version of `String` S appropriate for use as a URL slug.
   Downcase the name, remove diacritical marks, and replace non-alphanumeric *ASCII* characters with underscores;
   URL-encode non-ASCII characters. (Non-ASCII characters are encoded rather than replaced with underscores in order
   to support languages that don't use the Latin alphabet; see issue #3818).

   Optionally specify MAX-LENGTH which will truncate the slug after that many characters.

   The single-arity form returns `nil` for a `nil` or empty S; the MAX-LENGTH form returns an empty string in that
   case, because it joins the (empty) truncated character sequence."
  (^String [^String s]
   (when (seq s)
     ;; lower-case first, then strip accents, then map every remaining character to its slug form
     (s/join (for [c (remove-diacritical-marks (s/lower-case s))]
               (slugify-char c)))))
  (^String [s max-length]
   ;; NOTE(review): truncation is applied to the already URL-encoded slug, so a cut can land in the middle of a
   ;; `%XX` escape sequence -- confirm callers tolerate that.
   (s/join (take max-length (slugify s)))))
......
......@@ -134,3 +134,20 @@
2 {:id 2, :name "Lucky"}}
(key-by :id [{:id 1, :name "Rasta"}
{:id 2, :name "Lucky"}]))
;;; Tests for remove-diacritical-marks
;; accented Latin letters should come back as their plain base letters; unaccented ones are left alone
(expect "uuuu" (remove-diacritical-marks "üuuü"))
(expect "aeiu" (remove-diacritical-marks "åéîü"))
(expect "acnx" (remove-diacritical-marks "åçñx"))
;; empty and nil input both yield nil
(expect nil (remove-diacritical-marks ""))
(expect nil (remove-diacritical-marks nil))
;;; Tests for slugify
;; input is lower-cased, and spaces/punctuation become underscores
(expect "toucanfest_2017" (slugify "ToucanFest 2017"))
(expect "cam_s_awesome_toucan_emporium" (slugify "Cam's awesome toucan emporium"))
(expect "frequently_used_cards" (slugify "Frequently-Used Cards"))
;; check that diacritics get removed
(expect "cam_saul_s_toucannery" (slugify "Cam Saül's Toucannery"))
(expect "toucans_dislike_pinatas___" (slugify "toucans dislike piñatas :("))
;; check that non-ASCII characters get URL-encoded (so we can support non-Latin alphabet languages; see #3818)
(expect "%E5%8B%87%E5%A3%AB" (slugify "勇士")) ; go dubs
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment