summaryrefslogtreecommitdiff
path: root/vendor/icu_normalizer
diff options
context:
space:
mode:
authormo khan <mo@mokhan.ca>2025-07-10 13:11:11 -0600
committermo khan <mo@mokhan.ca>2025-07-10 13:11:11 -0600
commit01959b16a21b22b5df5f16569c2a8e8f92beecef (patch)
tree32afa5d747c5466345c59ec52161a7cba3d6d755 /vendor/icu_normalizer
parentff30574117a996df332e23d1fb6f65259b316b5b (diff)
chore: vendor dependencies
Diffstat (limited to 'vendor/icu_normalizer')
-rw-r--r--vendor/icu_normalizer/.cargo-checksum.json1
-rw-r--r--vendor/icu_normalizer/Cargo.lock970
-rw-r--r--vendor/icu_normalizer/Cargo.toml201
-rw-r--r--vendor/icu_normalizer/LICENSE46
-rw-r--r--vendor/icu_normalizer/README.md48
-rw-r--r--vendor/icu_normalizer/benches/bench.rs24
-rw-r--r--vendor/icu_normalizer/benches/canonical_composition.rs188
-rw-r--r--vendor/icu_normalizer/benches/canonical_decomposition.rs162
-rw-r--r--vendor/icu_normalizer/benches/composing_normalizer_nfc.rs230
-rw-r--r--vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs211
-rw-r--r--vendor/icu_normalizer/benches/data/README.md25
-rw-r--r--vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestNames_Korean.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestNames_Latin.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestNames_Thai.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt54
-rw-r--r--vendor/icu_normalizer/benches/data/wotw.txt58
-rw-r--r--vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs213
-rw-r--r--vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs211
-rw-r--r--vendor/icu_normalizer/src/lib.rs2854
-rw-r--r--vendor/icu_normalizer/src/properties.rs663
-rw-r--r--vendor/icu_normalizer/src/provider.rs216
-rw-r--r--vendor/icu_normalizer/src/uts46.rs177
-rw-r--r--vendor/icu_normalizer/tests/data/NormalizationTest.txt4
-rw-r--r--vendor/icu_normalizer/tests/data/README.md2
-rw-r--r--vendor/icu_normalizer/tests/tests.rs2083
36 files changed, 9397 insertions, 0 deletions
diff --git a/vendor/icu_normalizer/.cargo-checksum.json b/vendor/icu_normalizer/.cargo-checksum.json
new file mode 100644
index 00000000..93f500cf
--- /dev/null
+++ b/vendor/icu_normalizer/.cargo-checksum.json
@@ -0,0 +1 @@
+{"files":{"Cargo.lock":"1c8fe1c6e24d42329df5fb51aa5c07741ff411e78bcbef0f15cfc0cf400b4335","Cargo.toml":"b3ebc0d4deaf34153984d80c71ecfde9fe30d5621081322a00ff87c73348e57e","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"aec56e279d7e40a901b47a2eccb52197fde6c9499011b349c5ef509363bee6a9","benches/bench.rs":"9cd781e3d0e8d772860cd332b4f403910f3ca52fd69a459f5ac95d28f0e25ac2","benches/canonical_composition.rs":"0aa91d5d400f58da61865f5fabe878c8506e60466c78503f77041ef7257e6dbe","benches/canonical_decomposition.rs":"3b44b8f832e426e8c82e449743117182ab7b138288001b621ccc9325b4c27b6c","benches/composing_normalizer_nfc.rs":"9a7aaae94e0096ccac9f3d1a83585c3f449af87f9f0f8b05615d2a010078e3e8","benches/composing_normalizer_nfkc.rs":"ad92d562a1e9aad3611521526882e1896aa436d2ac59493c8c00686c57bdf31e","benches/data/README.md":"fa79b84815a228c3fbfa5d4c6d12885036994ca8ad61e683b2113cf2b428bb85","benches/data/TestNames_Japanese_h.txt":"6522f8ed794ad348c904079082ec3aa303ae7acf3f68bbc49fa0ee90eebf31e0","benches/data/TestNames_Japanese_k.txt":"e4e18804fe742ecd27ae48bc3564c6bc653180a3c649d43a2ab4d8b7f2607627","benches/data/TestNames_Korean.txt":"9cbf54d5ee16726c0fc9477366e273ba1b82e651c9e88e6c7532df5344f03920","benches/data/TestNames_Latin.txt":"3a30d450d259a6be4a6aee8eeef08d3767d11fcc047b8f58060c542efe1182d1","benches/data/TestNames_Thai.txt":"28d76ddb62d6f47646232860fce7440544f402158443889393fd7e8bf10e9c3d","benches/data/TestRandomWordsUDHR_ar.txt":"02a775153e9746ae938a9db0b60244f2c00d911bb72b611a3593b0991fd95723","benches/data/TestRandomWordsUDHR_de.txt":"100b9502e7ddcb2fcbd055cb7ec9113245105bd1c606cace5e5bc147cc18727b","benches/data/TestRandomWordsUDHR_el.txt":"d1a2f0f9efc9ce663026ca7c285177391937c90008479a8c5b909c300dc86972","benches/data/TestRandomWordsUDHR_es.txt":"deeebda09e0ce0f80dd805317e96d1a630908601ff2a4d1ccb0021b00b55814b","benches/data/TestRandomWordsUDHR_fr.txt":"5931edc9f1af2c27a0b35c9624732e70b87b0fd72ab486710f3aa6367c7ad35f","benches/data
/TestRandomWordsUDHR_he.txt":"dc77a89ffb9803e5c574d87f4789cb17624df73e40a8a92961df8ea8be103425","benches/data/TestRandomWordsUDHR_pl.txt":"26c378295ee2ef75ccacea691df0456394184a9a5c9ce48b2bada169b2402bbb","benches/data/TestRandomWordsUDHR_ru.txt":"a1c339f6d7b69cf9154e855c290ab09eeaf167ebcdf6d4bcb917de039fba10ee","benches/data/TestRandomWordsUDHR_th.txt":"3ba518be9863c85c3ac80cbb12299e3594e6f5afed3406d910d948007adaaf4e","benches/data/TestRandomWordsUDHR_tr.txt":"815c7babbc7228ef89b56f29638aeb63013aeca0003a49e58994e26b41cba01c","benches/data/wotw.txt":"8f28e68041ce75bbf75e72e186a6145e4c2de9e7e62b9b86ce0621c527a23669","benches/decomposing_normalizer_nfd.rs":"28f3d54c9af813af7ac9d0fbc9d45a7a6d27a25266bd593453eb35c1894280b5","benches/decomposing_normalizer_nfkd.rs":"cbaa2755878ee1cc90170210fddb7c79836457f89eb84f4f32fb51348f350bd5","src/lib.rs":"49621ffe84e82515aecf3c660234355561520ee11066d30d49ef1189181b4ef4","src/properties.rs":"3940f55f1e608fe9a70cb943e71cfd37894339af6b7d13697ae1776d7c1a2cc0","src/provider.rs":"5850afc7ae842c7af74ce029be256944c64f5d0b51d95725a8366f5af22163e9","src/uts46.rs":"a54b6191cbb0538da16d8ef0b6dfb3adfa2ca30e4161aaf37bcaae3e6537de80","tests/data/NormalizationTest.txt":"1b04c22b82064adf871e76fd2148cd749129163f7d05bd7ace923516a65afe02","tests/data/README.md":"521fcd44a1f10f21629df88113fa29ca9f4e1dfbeea79fda19a7dc8ba435e24b","tests/tests.rs":"01db1c9dc1c7c71f80aed528e4309f416349af9eec887d2e438a3a11f2ee7f7c"},"package":"436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979"} \ No newline at end of file
diff --git a/vendor/icu_normalizer/Cargo.lock b/vendor/icu_normalizer/Cargo.lock
new file mode 100644
index 00000000..0c89b8b3
--- /dev/null
+++ b/vendor/icu_normalizer/Cargo.lock
@@ -0,0 +1,970 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "aho-corasick"
+version = "1.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
+dependencies = [
+ "memchr",
+]
+
+[[package]]
+name = "anes"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
+
+[[package]]
+name = "anstyle"
+version = "1.0.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
+
+[[package]]
+name = "arraystring"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185"
+dependencies = [
+ "typenum",
+]
+
+[[package]]
+name = "arrayvec"
+version = "0.7.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50"
+
+[[package]]
+name = "atoi"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528"
+dependencies = [
+ "num-traits",
+]
+
+[[package]]
+name = "autocfg"
+version = "1.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
+
+[[package]]
+name = "bumpalo"
+version = "3.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
+
+[[package]]
+name = "cast"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "ciborium"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
+dependencies = [
+ "ciborium-io",
+ "ciborium-ll",
+ "serde",
+]
+
+[[package]]
+name = "ciborium-io"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
+
+[[package]]
+name = "ciborium-ll"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
+dependencies = [
+ "ciborium-io",
+ "half",
+]
+
+[[package]]
+name = "clap"
+version = "4.4.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c"
+dependencies = [
+ "clap_builder",
+]
+
+[[package]]
+name = "clap_builder"
+version = "4.4.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7"
+dependencies = [
+ "anstyle",
+ "clap_lex",
+]
+
+[[package]]
+name = "clap_lex"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1"
+
+[[package]]
+name = "cobs"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15"
+
+[[package]]
+name = "criterion"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+dependencies = [
+ "anes",
+ "cast",
+ "ciborium",
+ "clap",
+ "criterion-plot",
+ "is-terminal",
+ "itertools",
+ "num-traits",
+ "once_cell",
+ "oorandom",
+ "plotters",
+ "rayon",
+ "regex",
+ "serde",
+ "serde_derive",
+ "serde_json",
+ "tinytemplate",
+ "walkdir",
+]
+
+[[package]]
+name = "criterion-plot"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+dependencies = [
+ "cast",
+ "itertools",
+]
+
+[[package]]
+name = "crossbeam-deque"
+version = "0.8.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
+dependencies = [
+ "crossbeam-epoch",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-epoch"
+version = "0.9.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
+dependencies = [
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "crossbeam-utils"
+version = "0.8.21"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
+
+[[package]]
+name = "crunchy"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
+
+[[package]]
+name = "databake"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef"
+dependencies = [
+ "databake-derive",
+ "proc-macro2",
+ "quote",
+]
+
+[[package]]
+name = "databake-derive"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "detone"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09"
+
+[[package]]
+name = "displaydoc"
+version = "0.2.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "either"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
+
+[[package]]
+name = "erased-serde"
+version = "0.4.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e004d887f51fcb9fef17317a2f3525c887d8aa3f4f50fed920816a688284a5b7"
+dependencies = [
+ "serde",
+ "typeid",
+]
+
+[[package]]
+name = "half"
+version = "2.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
+dependencies = [
+ "cfg-if",
+ "crunchy",
+]
+
+[[package]]
+name = "hermit-abi"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08"
+
+[[package]]
+name = "icu_collections"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47"
+dependencies = [
+ "databake",
+ "displaydoc",
+ "potential_utf",
+ "serde",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_locale_core"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a"
+dependencies = [
+ "databake",
+ "displaydoc",
+ "litemap",
+ "serde",
+ "tinystr",
+ "writeable",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer"
+version = "2.0.0"
+dependencies = [
+ "arraystring",
+ "arrayvec",
+ "atoi",
+ "criterion",
+ "databake",
+ "detone",
+ "displaydoc",
+ "icu_collections",
+ "icu_normalizer_data",
+ "icu_properties",
+ "icu_provider",
+ "serde",
+ "smallvec",
+ "utf16_iter",
+ "utf8_iter",
+ "write16",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_normalizer_data"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3"
+
+[[package]]
+name = "icu_properties"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2549ca8c7241c82f59c80ba2a6f415d931c5b58d24fb8412caa1a1f02c49139a"
+dependencies = [
+ "databake",
+ "displaydoc",
+ "icu_collections",
+ "icu_locale_core",
+ "icu_properties_data",
+ "icu_provider",
+ "potential_utf",
+ "serde",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "icu_properties_data"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8197e866e47b68f8f7d95249e172903bec06004b18b2937f1095d40a0c57de04"
+
+[[package]]
+name = "icu_provider"
+version = "2.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af"
+dependencies = [
+ "databake",
+ "displaydoc",
+ "erased-serde",
+ "icu_locale_core",
+ "postcard",
+ "serde",
+ "stable_deref_trait",
+ "tinystr",
+ "writeable",
+ "yoke",
+ "zerofrom",
+ "zerotrie",
+ "zerovec",
+]
+
+[[package]]
+name = "is-terminal"
+version = "0.4.16"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
+dependencies = [
+ "hermit-abi",
+ "libc",
+ "windows-sys",
+]
+
+[[package]]
+name = "itertools"
+version = "0.10.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+dependencies = [
+ "either",
+]
+
+[[package]]
+name = "itoa"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
+
+[[package]]
+name = "js-sys"
+version = "0.3.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
+dependencies = [
+ "once_cell",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.172"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
+
+[[package]]
+name = "litemap"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956"
+dependencies = [
+ "serde",
+]
+
+[[package]]
+name = "log"
+version = "0.4.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
+
+[[package]]
+name = "memchr"
+version = "2.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+]
+
+[[package]]
+name = "once_cell"
+version = "1.21.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
+
+[[package]]
+name = "oorandom"
+version = "11.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
+
+[[package]]
+name = "plotters"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
+dependencies = [
+ "num-traits",
+ "plotters-backend",
+ "plotters-svg",
+ "wasm-bindgen",
+ "web-sys",
+]
+
+[[package]]
+name = "plotters-backend"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
+
+[[package]]
+name = "plotters-svg"
+version = "0.3.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
+dependencies = [
+ "plotters-backend",
+]
+
+[[package]]
+name = "postcard"
+version = "1.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "170a2601f67cc9dba8edd8c4870b15f71a6a2dc196daec8c83f72b59dff628a8"
+dependencies = [
+ "cobs",
+ "serde",
+]
+
+[[package]]
+name = "potential_utf"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585"
+dependencies = [
+ "databake",
+ "serde",
+ "zerovec",
+]
+
+[[package]]
+name = "proc-macro2"
+version = "1.0.95"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "quote"
+version = "1.0.40"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
+dependencies = [
+ "proc-macro2",
+]
+
+[[package]]
+name = "rayon"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
+dependencies = [
+ "either",
+ "rayon-core",
+]
+
+[[package]]
+name = "rayon-core"
+version = "1.12.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
+dependencies = [
+ "crossbeam-deque",
+ "crossbeam-utils",
+]
+
+[[package]]
+name = "regex"
+version = "1.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-automata",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.4.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
+dependencies = [
+ "aho-corasick",
+ "memchr",
+ "regex-syntax",
+]
+
+[[package]]
+name = "regex-syntax"
+version = "0.8.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
+
+[[package]]
+name = "rustversion"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2"
+
+[[package]]
+name = "ryu"
+version = "1.0.20"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
+
+[[package]]
+name = "same-file"
+version = "1.0.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
+dependencies = [
+ "winapi-util",
+]
+
+[[package]]
+name = "serde"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
+dependencies = [
+ "serde_derive",
+]
+
+[[package]]
+name = "serde_derive"
+version = "1.0.219"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "serde_json"
+version = "1.0.140"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
+dependencies = [
+ "itoa",
+ "memchr",
+ "ryu",
+ "serde",
+]
+
+[[package]]
+name = "smallvec"
+version = "1.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9"
+
+[[package]]
+name = "stable_deref_trait"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
+
+[[package]]
+name = "syn"
+version = "2.0.101"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "unicode-ident",
+]
+
+[[package]]
+name = "synstructure"
+version = "0.13.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
+[[package]]
+name = "tinystr"
+version = "0.8.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b"
+dependencies = [
+ "displaydoc",
+ "serde",
+ "zerovec",
+]
+
+[[package]]
+name = "tinytemplate"
+version = "1.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
+dependencies = [
+ "serde",
+ "serde_json",
+]
+
+[[package]]
+name = "typeid"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c"
+
+[[package]]
+name = "typenum"
+version = "1.18.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
+
+[[package]]
+name = "unicode-ident"
+version = "1.0.18"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
+
+[[package]]
+name = "utf16_iter"
+version = "1.0.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
+
+[[package]]
+name = "utf8_iter"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
+
+[[package]]
+name = "walkdir"
+version = "2.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
+dependencies = [
+ "same-file",
+ "winapi-util",
+]
+
+[[package]]
+name = "wasm-bindgen"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+ "rustversion",
+ "wasm-bindgen-macro",
+]
+
+[[package]]
+name = "wasm-bindgen-backend"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
+dependencies = [
+ "bumpalo",
+ "log",
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-macro"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
+dependencies = [
+ "quote",
+ "wasm-bindgen-macro-support",
+]
+
+[[package]]
+name = "wasm-bindgen-macro-support"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "wasm-bindgen-backend",
+ "wasm-bindgen-shared",
+]
+
+[[package]]
+name = "wasm-bindgen-shared"
+version = "0.2.100"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
+dependencies = [
+ "unicode-ident",
+]
+
+[[package]]
+name = "web-sys"
+version = "0.3.77"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
+[[package]]
+name = "winapi-util"
+version = "0.1.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
+dependencies = [
+ "windows-sys",
+]
+
+[[package]]
+name = "windows-sys"
+version = "0.59.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
+dependencies = [
+ "windows-targets",
+]
+
+[[package]]
+name = "windows-targets"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
+dependencies = [
+ "windows_aarch64_gnullvm",
+ "windows_aarch64_msvc",
+ "windows_i686_gnu",
+ "windows_i686_gnullvm",
+ "windows_i686_msvc",
+ "windows_x86_64_gnu",
+ "windows_x86_64_gnullvm",
+ "windows_x86_64_msvc",
+]
+
+[[package]]
+name = "windows_aarch64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
+
+[[package]]
+name = "windows_aarch64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
+
+[[package]]
+name = "windows_i686_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
+
+[[package]]
+name = "windows_i686_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
+
+[[package]]
+name = "windows_i686_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
+
+[[package]]
+name = "windows_x86_64_gnu"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
+
+[[package]]
+name = "windows_x86_64_gnullvm"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
+
+[[package]]
+name = "windows_x86_64_msvc"
+version = "0.52.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
+
+[[package]]
+name = "write16"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
+dependencies = [
+ "arrayvec",
+]
+
+[[package]]
+name = "writeable"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb"
+
+[[package]]
+name = "yoke"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc"
+dependencies = [
+ "serde",
+ "stable_deref_trait",
+ "yoke-derive",
+ "zerofrom",
+]
+
+[[package]]
+name = "yoke-derive"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zerofrom"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
+dependencies = [
+ "zerofrom-derive",
+]
+
+[[package]]
+name = "zerofrom-derive"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+ "synstructure",
+]
+
+[[package]]
+name = "zerotrie"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595"
+dependencies = [
+ "databake",
+ "displaydoc",
+ "litemap",
+ "serde",
+ "yoke",
+ "zerofrom",
+ "zerovec",
+]
+
+[[package]]
+name = "zerovec"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428"
+dependencies = [
+ "databake",
+ "serde",
+ "yoke",
+ "zerofrom",
+ "zerovec-derive",
+]
+
+[[package]]
+name = "zerovec-derive"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
diff --git a/vendor/icu_normalizer/Cargo.toml b/vendor/icu_normalizer/Cargo.toml
new file mode 100644
index 00000000..f9eb1163
--- /dev/null
+++ b/vendor/icu_normalizer/Cargo.toml
@@ -0,0 +1,201 @@
+# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
+#
+# When uploading crates to the registry Cargo will automatically
+# "normalize" Cargo.toml files for maximal compatibility
+# with all versions of Cargo and also rewrite `path` dependencies
+# to registry (e.g., crates.io) dependencies.
+#
+# If you are reading this file be aware that the original Cargo.toml
+# will likely look very different (and much more reasonable).
+# See Cargo.toml.orig for the original contents.
+
+[package]
+edition = "2021"
+rust-version = "1.82"
+name = "icu_normalizer"
+version = "2.0.0"
+authors = ["The ICU4X Project Developers"]
+build = false
+include = [
+ "data/**/*",
+ "src/**/*",
+ "examples/**/*",
+ "benches/**/*",
+ "tests/**/*",
+ "Cargo.toml",
+ "LICENSE",
+ "README.md",
+ "build.rs",
+]
+autolib = false
+autobins = false
+autoexamples = false
+autotests = false
+autobenches = false
+description = "API for normalizing text into Unicode Normalization Forms"
+homepage = "https://icu4x.unicode.org"
+readme = "README.md"
+categories = ["internationalization"]
+license = "Unicode-3.0"
+repository = "https://github.com/unicode-org/icu4x"
+
+[package.metadata.docs.rs]
+all-features = true
+
+[features]
+compiled_data = [
+ "dep:icu_normalizer_data",
+ "icu_properties?/compiled_data",
+ "icu_provider/baked",
+]
+datagen = [
+ "serde",
+ "dep:databake",
+ "icu_properties",
+ "icu_collections/databake",
+ "zerovec/databake",
+ "icu_properties?/datagen",
+ "icu_provider/export",
+]
+default = [
+ "compiled_data",
+ "utf8_iter",
+ "utf16_iter",
+]
+experimental = []
+icu_properties = ["dep:icu_properties"]
+serde = [
+ "dep:serde",
+ "icu_collections/serde",
+ "zerovec/serde",
+ "icu_properties?/serde",
+ "icu_provider/serde",
+]
+utf16_iter = [
+ "dep:utf16_iter",
+ "write16",
+]
+utf8_iter = ["dep:utf8_iter"]
+
+[lib]
+name = "icu_normalizer"
+path = "src/lib.rs"
+
+[[test]]
+name = "tests"
+path = "tests/tests.rs"
+
+[[bench]]
+name = "bench"
+path = "benches/bench.rs"
+harness = false
+required-features = [
+ "utf16_iter",
+ "utf8_iter",
+]
+
+[[bench]]
+name = "canonical_composition"
+path = "benches/canonical_composition.rs"
+
+[[bench]]
+name = "canonical_decomposition"
+path = "benches/canonical_decomposition.rs"
+
+[[bench]]
+name = "composing_normalizer_nfc"
+path = "benches/composing_normalizer_nfc.rs"
+
+[[bench]]
+name = "composing_normalizer_nfkc"
+path = "benches/composing_normalizer_nfkc.rs"
+
+[[bench]]
+name = "decomposing_normalizer_nfd"
+path = "benches/decomposing_normalizer_nfd.rs"
+
+[[bench]]
+name = "decomposing_normalizer_nfkd"
+path = "benches/decomposing_normalizer_nfkd.rs"
+
+[dependencies.databake]
+version = "0.2.0"
+features = ["derive"]
+optional = true
+default-features = false
+
+[dependencies.displaydoc]
+version = "0.2.3"
+default-features = false
+
+[dependencies.icu_collections]
+version = "~2.0.0"
+default-features = false
+
+[dependencies.icu_normalizer_data]
+version = "~2.0.0"
+optional = true
+default-features = false
+
+[dependencies.icu_properties]
+version = "~2.0.0"
+optional = true
+default-features = false
+
+[dependencies.icu_provider]
+version = "2.0.0"
+default-features = false
+
+[dependencies.serde]
+version = "1.0.110"
+features = [
+ "derive",
+ "alloc",
+]
+optional = true
+default-features = false
+
+[dependencies.smallvec]
+version = "1.10.0"
+default-features = false
+
+[dependencies.utf16_iter]
+version = "1.0.2"
+optional = true
+default-features = false
+
+[dependencies.utf8_iter]
+version = "1.0.2"
+optional = true
+default-features = false
+
+[dependencies.write16]
+version = "1.0.0"
+features = ["alloc"]
+optional = true
+default-features = false
+
+[dependencies.zerovec]
+version = "0.11.1"
+default-features = false
+
+[dev-dependencies.arraystring]
+version = "0.3.0"
+
+[dev-dependencies.arrayvec]
+version = "0.7.2"
+default-features = false
+
+[dev-dependencies.atoi]
+version = "2.0.0"
+
+[dev-dependencies.detone]
+version = "1.0.0"
+
+[dev-dependencies.write16]
+version = "1.0.0"
+features = ["arrayvec"]
+default-features = false
+
+[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion]
+version = "0.5.0"
diff --git a/vendor/icu_normalizer/LICENSE b/vendor/icu_normalizer/LICENSE
new file mode 100644
index 00000000..c9be6012
--- /dev/null
+++ b/vendor/icu_normalizer/LICENSE
@@ -0,0 +1,46 @@
+UNICODE LICENSE V3
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright © 2020-2024 Unicode, Inc.
+
+NOTICE TO USER: Carefully read the following legal agreement. BY
+DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR
+SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
+TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT
+DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of data files and any associated documentation (the "Data Files") or
+software and any associated documentation (the "Software") to deal in the
+Data Files or Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, and/or sell
+copies of the Data Files or Software, and to permit persons to whom the
+Data Files or Software are furnished to do so, provided that either (a)
+this copyright and permission notice appear with all copies of the Data
+Files or Software, or (b) this copyright and permission notice appear in
+associated Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
+KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
+THIRD PARTY RIGHTS.
+
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
+BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
+OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
+ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA
+FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder shall
+not be used in advertising or otherwise to promote the sale, use or other
+dealings in these Data Files or Software without prior written
+authorization of the copyright holder.
+
+SPDX-License-Identifier: Unicode-3.0
+
+—
+
+Portions of ICU4X may have been adapted from ICU4C and/or ICU4J.
+ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others.
diff --git a/vendor/icu_normalizer/README.md b/vendor/icu_normalizer/README.md
new file mode 100644
index 00000000..5c9e7409
--- /dev/null
+++ b/vendor/icu_normalizer/README.md
@@ -0,0 +1,48 @@
+# icu_normalizer [![crates.io](https://img.shields.io/crates/v/icu_normalizer)](https://crates.io/crates/icu_normalizer)
+
+<!-- cargo-rdme start -->
+
+Normalizing text into Unicode Normalization Forms.
+
+This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
+and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
+
+## Functionality
+
+The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
+Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
+
+Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
+and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
+
+The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
+Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
+applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
+[`idna`](https://docs.rs/idna/latest/idna/) crate.
+
+The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
+the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
+property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
+[`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
+
+Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
+addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
+non-“maybe” answer.
+
+## Examples
+
+```rust
+let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
+assert_eq!(nfc.normalize("a\u{0308}"), "ä");
+assert!(nfc.is_normalized("ä"));
+
+let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
+assert_eq!(nfd.normalize("ä"), "a\u{0308}");
+assert!(!nfd.is_normalized("ä"));
+```
+
+<!-- cargo-rdme end -->
+
+## More Information
+
+For more information on development, authorship, contributing, etc., please visit the [`ICU4X home page`](https://github.com/unicode-org/icu4x).
diff --git a/vendor/icu_normalizer/benches/bench.rs b/vendor/icu_normalizer/benches/bench.rs
new file mode 100644
index 00000000..011478af
--- /dev/null
+++ b/vendor/icu_normalizer/benches/bench.rs
@@ -0,0 +1,24 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{criterion_group, criterion_main};
+
+mod canonical_composition;
+mod canonical_decomposition;
+mod composing_normalizer_nfc;
+mod composing_normalizer_nfkc;
+mod decomposing_normalizer_nfd;
+mod decomposing_normalizer_nfkd;
+
+criterion_group!(
+ benches,
+ canonical_composition::criterion_benchmark,
+ canonical_decomposition::criterion_benchmark,
+ composing_normalizer_nfc::criterion_benchmark,
+ composing_normalizer_nfkc::criterion_benchmark,
+ decomposing_normalizer_nfd::criterion_benchmark,
+ decomposing_normalizer_nfkd::criterion_benchmark,
+);
+
+criterion_main!(benches);
diff --git a/vendor/icu_normalizer/benches/canonical_composition.rs b/vendor/icu_normalizer/benches/canonical_composition.rs
new file mode 100644
index 00000000..134c08d8
--- /dev/null
+++ b/vendor/icu_normalizer/benches/canonical_composition.rs
@@ -0,0 +1,188 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+use detone::IterDecomposeVietnamese;
+
+use icu_normalizer::properties::{
+ CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed,
+};
+use icu_normalizer::ComposingNormalizerBorrowed;
+
+struct BenchDataContent {
+ pub file_name: String,
+ pub pairs: Vec<(char, char)>,
+}
+
+fn strip_headers(content: &str) -> String {
+ content
+ .lines()
+ .filter(|&s| !s.starts_with('#'))
+ .map(|s| s.to_owned())
+ .collect::<Vec<String>>()
+ .join("\n")
+}
+
+fn normalizer_bench_data() -> [BenchDataContent; 16] {
+ let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
+
+ [
+ BenchDataContent {
+ file_name: "TestNames_Latin".to_owned(),
+ pairs: decompose_data(
+ &nfc_normalizer
+ .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))),
+ ),
+ },
+ BenchDataContent {
+ file_name: "TestNames_Japanese_h".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestNames_Japanese_h.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestNames_Japanese_k".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestNames_Japanese_k.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestNames_Korean".to_owned(),
+ pairs: decompose_data(
+ &nfc_normalizer
+ .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))),
+ ),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_ar".to_owned(),
+ #[cfg(debug_assertions)]
+ pairs: Vec::new(),
+ #[cfg(not(debug_assertions))]
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_ar.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_de".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_de.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_el".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_el.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_es".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_es.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_fr".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_fr.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_he".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_he.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_pl".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_pl.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_ru".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_ru.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_th".to_owned(),
+ #[cfg(debug_assertions)]
+ pairs: Vec::new(),
+ #[cfg(not(debug_assertions))]
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_th.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "TestRandomWordsUDHR_tr".to_owned(),
+ pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!(
+ "./data/TestRandomWordsUDHR_tr.txt"
+ )))),
+ },
+ BenchDataContent {
+ file_name: "udhr_vie".to_owned(),
+ pairs: decompose_data(
+ &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))),
+ ),
+ },
+ BenchDataContent {
+ file_name: "udhr_vie_detone".to_owned(),
+ pairs: {
+ let result: Vec<(char, char)> = nfc_normalizer
+ .normalize(&strip_headers(include_str!("data/wotw.txt")))
+ .chars()
+ .filter_map(|c| {
+ let mut iter = std::iter::once(c).decompose_vietnamese_tones(true);
+ if let Some(base) = iter.next() {
+ iter.next().map(|tone| (base, tone))
+ } else {
+ None
+ }
+ })
+ .collect();
+ assert!(!result.is_empty());
+ result
+ },
+ },
+ ]
+}
+
+fn function_under_bench(
+ canonical_composer: &CanonicalCompositionBorrowed,
+ composable_points: &[(char, char)],
+) {
+ for pair in composable_points.iter() {
+ canonical_composer.compose(pair.0, pair.1);
+ }
+}
+
+pub fn criterion_benchmark(criterion: &mut Criterion) {
+ let group_name = "canonical_composition";
+ let mut group = criterion.benchmark_group(group_name);
+
+ let composer = CanonicalCompositionBorrowed::new();
+
+ for bench_data_content in black_box(normalizer_bench_data()) {
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
+ |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)),
+ );
+ }
+
+ group.finish();
+}
+
+fn decompose_data(nfc: &str) -> Vec<(char, char)> {
+ let decomposer = CanonicalDecompositionBorrowed::new();
+ nfc.chars()
+ .map(|c| decomposer.decompose(c))
+ .filter_map(|decomposed| {
+ if let Decomposed::Expansion(a, b) = decomposed {
+ Some((a, b))
+ } else {
+ None
+ }
+ })
+ .collect()
+}
diff --git a/vendor/icu_normalizer/benches/canonical_decomposition.rs b/vendor/icu_normalizer/benches/canonical_decomposition.rs
new file mode 100644
index 00000000..8e5ad5dc
--- /dev/null
+++ b/vendor/icu_normalizer/benches/canonical_decomposition.rs
@@ -0,0 +1,162 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+
+use icu_normalizer::properties::CanonicalDecompositionBorrowed;
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+
+struct BenchDataContent {
+ pub file_name: String,
+ pub nfc: String,
+ pub nfd: String,
+ pub nfkc: String,
+ pub nfkd: String,
+}
+
+fn strip_headers(content: &str) -> String {
+ content
+ .lines()
+ .filter(|&s| !s.starts_with('#'))
+ .map(|s| s.to_owned())
+ .collect::<Vec<String>>()
+ .join("\n")
+}
+
+fn normalizer_bench_data() -> [BenchDataContent; 15] {
+ let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
+ let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
+ let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
+ let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
+
+ let content_latin: (&str, &str) = (
+ "TestNames_Latin",
+ &strip_headers(include_str!("./data/TestNames_Latin.txt")),
+ );
+ let content_jp_h: (&str, &str) = (
+ "TestNames_Japanese_h",
+ &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
+ );
+ let content_jp_k: (&str, &str) = (
+ "TestNames_Japanese_k",
+ &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
+ );
+ let content_korean: (&str, &str) = (
+ "TestNames_Korean",
+ &strip_headers(include_str!("./data/TestNames_Korean.txt")),
+ );
+ let content_random_words_ar: (&str, &str) = (
+ "TestRandomWordsUDHR_ar",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
+ );
+ let content_random_words_de: (&str, &str) = (
+ "TestRandomWordsUDHR_de",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
+ );
+ let content_random_words_el: (&str, &str) = (
+ "TestRandomWordsUDHR_el",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
+ );
+ let content_random_words_es: (&str, &str) = (
+ "TestRandomWordsUDHR_es",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
+ );
+ let content_random_words_fr: (&str, &str) = (
+ "TestRandomWordsUDHR_fr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
+ );
+ let content_random_words_he: (&str, &str) = (
+ "TestRandomWordsUDHR_he",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
+ );
+ let content_random_words_pl: (&str, &str) = (
+ "TestRandomWordsUDHR_pl",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
+ );
+ let content_random_words_ru: (&str, &str) = (
+ "TestRandomWordsUDHR_ru",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
+ );
+ let content_random_words_th: (&str, &str) = (
+ "TestRandomWordsUDHR_th",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
+ );
+ let content_random_words_tr: (&str, &str) = (
+ "TestRandomWordsUDHR_tr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
+ );
+ let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt")));
+
+ [
+ content_latin,
+ content_viet,
+ content_jp_k,
+ content_jp_h,
+ content_korean,
+ content_random_words_ru,
+ content_random_words_ar,
+ content_random_words_el,
+ content_random_words_es,
+ content_random_words_fr,
+ content_random_words_tr,
+ content_random_words_th,
+ content_random_words_pl,
+ content_random_words_he,
+ content_random_words_de,
+ ]
+ .map(|(file_name, raw_content)| BenchDataContent {
+ file_name: file_name.to_owned(),
+ nfc: nfc_normalizer.normalize(raw_content).to_string(),
+ nfd: nfd_normalizer.normalize(raw_content).to_string(),
+ nfkc: nfkc_normalizer.normalize(raw_content).to_string(),
+ nfkd: nfkd_normalizer.normalize(raw_content).to_string(),
+ })
+}
+
+#[cfg(debug_assertions)]
+fn function_under_bench(
+ _canonical_decomposer: &CanonicalDecompositionBorrowed,
+ _decomposable_points: &str,
+) {
+ // Intentionally a no-op: with debug assertions enabled, running the real body fails some tests.
+ // "cargo test --bench bench" will therefore pass, and
+ // "cargo bench" will work as expected, because its profile doesn't include debug assertions.
+}
+
+#[cfg(not(debug_assertions))]
+fn function_under_bench(
+ canonical_decomposer: &CanonicalDecompositionBorrowed,
+ decomposable_points: &str,
+) {
+ decomposable_points.chars().for_each(|point| {
+ canonical_decomposer.decompose(point);
+ });
+}
+
+pub fn criterion_benchmark(criterion: &mut Criterion) {
+ let group_name = "canonical_decomposition";
+ let mut group = criterion.benchmark_group(group_name);
+
+ let decomposer = CanonicalDecompositionBorrowed::new();
+
+ for bench_data_content in black_box(normalizer_bench_data()) {
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
+ |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)),
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
+ |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)),
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
+ |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)),
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
+ |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)),
+ );
+ }
+ group.finish();
+}
diff --git a/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs b/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs
new file mode 100644
index 00000000..e23848dc
--- /dev/null
+++ b/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs
@@ -0,0 +1,230 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+
+struct BenchDataContent {
+ pub file_name: String,
+ pub nfc: String,
+ pub nfd: String,
+ pub nfkc: String,
+ pub nfkd: String,
+ pub nfc_u16: Vec<u16>,
+ pub nfd_u16: Vec<u16>,
+ pub nfkc_u16: Vec<u16>,
+ pub nfkd_u16: Vec<u16>,
+}
+
+fn strip_headers(content: &str) -> String {
+ content
+ .lines()
+ .filter(|&s| !s.starts_with('#'))
+ .map(|s| s.to_owned())
+ .collect::<Vec<String>>()
+ .join("\n")
+}
+
+fn normalizer_bench_data() -> [BenchDataContent; 15] {
+ let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
+ let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
+ let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
+ let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
+
+ let content_latin: (&str, &str) = (
+ "TestNames_Latin",
+ &strip_headers(include_str!("./data/TestNames_Latin.txt")),
+ );
+ let content_jp_h: (&str, &str) = (
+ "TestNames_Japanese_h",
+ &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
+ );
+ let content_jp_k: (&str, &str) = (
+ "TestNames_Japanese_k",
+ &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
+ );
+ let content_korean: (&str, &str) = (
+ "TestNames_Korean",
+ &strip_headers(include_str!("./data/TestNames_Korean.txt")),
+ );
+ let content_random_words_ar: (&str, &str) = (
+ "TestRandomWordsUDHR_ar",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
+ );
+ let content_random_words_de: (&str, &str) = (
+ "TestRandomWordsUDHR_de",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
+ );
+ let content_random_words_el: (&str, &str) = (
+ "TestRandomWordsUDHR_el",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
+ );
+ let content_random_words_es: (&str, &str) = (
+ "TestRandomWordsUDHR_es",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
+ );
+ let content_random_words_fr: (&str, &str) = (
+ "TestRandomWordsUDHR_fr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
+ );
+ let content_random_words_he: (&str, &str) = (
+ "TestRandomWordsUDHR_he",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
+ );
+ let content_random_words_pl: (&str, &str) = (
+ "TestRandomWordsUDHR_pl",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
+ );
+ let content_random_words_ru: (&str, &str) = (
+ "TestRandomWordsUDHR_ru",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
+ );
+ let content_random_words_th: (&str, &str) = (
+ "TestRandomWordsUDHR_th",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
+ );
+ let content_random_words_tr: (&str, &str) = (
+ "TestRandomWordsUDHR_tr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
+ );
+ let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
+
+ [
+ content_latin,
+ content_viet,
+ content_jp_k,
+ content_jp_h,
+ content_korean,
+ content_random_words_ru,
+ content_random_words_ar,
+ content_random_words_el,
+ content_random_words_es,
+ content_random_words_fr,
+ content_random_words_tr,
+ content_random_words_th,
+ content_random_words_pl,
+ content_random_words_he,
+ content_random_words_de,
+ ]
+ .map(|(file_name, raw_content)| {
+ let nfc = &nfc_normalizer.normalize(raw_content);
+ let nfd = &nfd_normalizer.normalize(raw_content);
+ let nfkc = &nfkc_normalizer.normalize(raw_content);
+ let nfkd = &nfkd_normalizer.normalize(raw_content);
+ BenchDataContent {
+ file_name: file_name.to_owned(),
+ nfc: nfc.to_string(),
+ nfd: nfd.to_string(),
+ nfkc: nfkc.to_string(),
+ nfkd: nfkd.to_string(),
+ nfc_u16: nfc.encode_utf16().collect(),
+ nfd_u16: nfd.encode_utf16().collect(),
+ nfkc_u16: nfkc.encode_utf16().collect(),
+ nfkd_u16: nfkd.encode_utf16().collect(),
+ }
+ })
+}
+
+fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) {
+ normalizer.normalize(text);
+}
+
+fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) {
+ normalizer.normalize_utf16(text);
+}
+
+pub fn criterion_benchmark(criterion: &mut Criterion) {
+ let group_name = "composing_normalizer_nfc";
+
+ let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc();
+
+ let mut group = criterion.benchmark_group(group_name);
+
+ for bench_data_content in black_box(normalizer_bench_data()) {
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher
+ .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher
+ .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
+ })
+ },
+ );
+
+ // UTF_16
+ group.bench_function(
+ BenchmarkId::from_parameter(format!(
+ "from_nfc_{}_utf_16",
+ bench_data_content.file_name
+ )),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!(
+ "from_nfd_{}_utf_16",
+ bench_data_content.file_name
+ )),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!(
+ "from_nfkc_{}_utf_16",
+ bench_data_content.file_name
+ )),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_utf16(
+ &normalizer_under_bench,
+ &bench_data_content.nfkc_u16,
+ )
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!(
+ "from_nfkd_{}_utf_16",
+ bench_data_content.file_name
+ )),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_utf16(
+ &normalizer_under_bench,
+ &bench_data_content.nfkd_u16,
+ )
+ })
+ },
+ );
+ }
+ group.finish();
+}
diff --git a/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs b/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs
new file mode 100644
index 00000000..6792c7ee
--- /dev/null
+++ b/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs
@@ -0,0 +1,211 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+
+struct BenchDataContent { // One benchmark input file, pre-normalized into each of the four forms.
+ pub file_name: String, // Label interpolated into the benchmark IDs.
+ pub nfc: String, // Output of the NFC normalizer for the file's text.
+ pub nfd: String, // Output of the NFD normalizer for the file's text.
+ pub nfkc: String, // Output of the NFKC normalizer for the file's text.
+ pub nfkd: String, // Output of the NFKD normalizer for the file's text.
+ pub nfc_u16: Vec<u16>, // `nfc` re-encoded with `encode_utf16`.
+ pub nfd_u16: Vec<u16>, // `nfd` re-encoded with `encode_utf16`.
+ pub nfkc_u16: Vec<u16>, // `nfkc` re-encoded with `encode_utf16`.
+ pub nfkd_u16: Vec<u16>, // `nfkd` re-encoded with `encode_utf16`.
+}
+
+fn strip_headers(content: &str) -> String { // Returns `content` without its '#'-prefixed lines.
+ content
+ .lines()
+ .filter(|&s| !s.starts_with('#')) // Only '#' lines are dropped; blank lines are kept.
+ .map(|s| s.to_owned())
+ .collect::<Vec<String>>()
+ .join("\n") // Rejoined with '\n'; no newline is appended after the last line.
+}
+
+fn normalizer_bench_data() -> [BenchDataContent; 15] { // Builds the 15 pre-normalized benchmark inputs.
+ let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc();
+ let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd();
+ let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc();
+ let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
+
+ let content_latin: (&str, &str) = ( // Each tuple is (label, text returned by `strip_headers`).
+ "TestNames_Latin",
+ &strip_headers(include_str!("./data/TestNames_Latin.txt")), // File is embedded at compile time.
+ );
+ let content_jp_h: (&str, &str) = (
+ "TestNames_Japanese_h",
+ &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")),
+ );
+ let content_jp_k: (&str, &str) = (
+ "TestNames_Japanese_k",
+ &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")),
+ );
+ let content_korean: (&str, &str) = (
+ "TestNames_Korean",
+ &strip_headers(include_str!("./data/TestNames_Korean.txt")),
+ );
+ let content_random_words_ar: (&str, &str) = (
+ "TestRandomWordsUDHR_ar",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")),
+ );
+ let content_random_words_de: (&str, &str) = (
+ "TestRandomWordsUDHR_de",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")),
+ );
+ let content_random_words_el: (&str, &str) = (
+ "TestRandomWordsUDHR_el",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")),
+ );
+ let content_random_words_es: (&str, &str) = (
+ "TestRandomWordsUDHR_es",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")),
+ );
+ let content_random_words_fr: (&str, &str) = (
+ "TestRandomWordsUDHR_fr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")),
+ );
+ let content_random_words_he: (&str, &str) = (
+ "TestRandomWordsUDHR_he",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")),
+ );
+ let content_random_words_pl: (&str, &str) = (
+ "TestRandomWordsUDHR_pl",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")),
+ );
+ let content_random_words_ru: (&str, &str) = (
+ "TestRandomWordsUDHR_ru",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")),
+ );
+ let content_random_words_th: (&str, &str) = (
+ "TestRandomWordsUDHR_th",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")),
+ );
+ let content_random_words_tr: (&str, &str) = (
+ "TestRandomWordsUDHR_tr",
+ &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")),
+ );
+ let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt")));
+
+ [ // The returned array keeps this element order.
+ content_latin,
+ content_viet,
+ content_jp_k,
+ content_jp_h,
+ content_korean,
+ content_random_words_ru,
+ content_random_words_ar,
+ content_random_words_el,
+ content_random_words_es,
+ content_random_words_fr,
+ content_random_words_tr,
+ content_random_words_th,
+ content_random_words_pl,
+ content_random_words_he,
+ content_random_words_de,
+ ]
+ .map(|(file_name, raw_content)| { // Normalize each text four ways; keep UTF-8 and UTF-16 copies.
+ let nfc = &nfc_normalizer.normalize(raw_content);
+ let nfd = &nfd_normalizer.normalize(raw_content);
+ let nfkc = &nfkc_normalizer.normalize(raw_content);
+ let nfkd = &nfkd_normalizer.normalize(raw_content);
+ BenchDataContent {
+ file_name: file_name.to_owned(),
+ nfc: nfc.to_string(),
+ nfd: nfd.to_string(),
+ nfkc: nfkc.to_string(),
+ nfkd: nfkd.to_string(),
+ nfc_u16: nfc.encode_utf16().collect(),
+ nfd_u16: nfd.encode_utf16().collect(),
+ nfkc_u16: nfkc.encode_utf16().collect(),
+ nfkd_u16: nfkd.encode_utf16().collect(),
+ }
+ })
+}
+
+fn function_under_bench(normalizer: &ComposingNormalizerBorrowed, text: &str) { // Measured UTF-8 call.
+ normalizer.normalize(text); // The returned value is discarded; only the call itself is exercised.
+}
+
+fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { // Measured UTF-16 call.
+ normalizer.normalize_utf16(text); // The returned value is discarded; only the call itself is exercised.
+}
+
+pub fn criterion_benchmark(criterion: &mut Criterion) { // Registers eight benchmarks per input file.
+ let group_name = "composing_normalizer_nfkc";
+
+ let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); // Shared by every benchmark below.
+
+ let mut group = criterion.benchmark_group(group_name);
+
+ for bench_data_content in black_box(normalizer_bench_data()) { // One iteration per input file.
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher
+ .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc))
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher
+ .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd))
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd)
+ })
+ },
+ );
+ // UTF-16: the same four starting forms, passed as `&[u16]` to `function_under_bench_u16`.
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16)
+ })
+ },
+ );
+ group.bench_function(
+ BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)),
+ |bencher| {
+ bencher.iter(|| {
+ function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16)
+ })
+ },
+ );
+ }
+ group.finish();
+}
diff --git a/vendor/icu_normalizer/benches/data/README.md b/vendor/icu_normalizer/benches/data/README.md
new file mode 100644
index 00000000..de34f9fc
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/README.md
@@ -0,0 +1,25 @@
+# Generating microbench data
+
+The full versions of these files are located
+[in the ICU repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data).
+
+## Sanitizing the file
+
+```shell
+sed -i '/^#/d' ${filename}
+sed -i '/^$/d' ${filename}
+```
+
+## Shuffling the file
+
+```shell
+shuf -n 20 ${filename} -o ${filename}
+```
+
+## Add back the header (if you plan on submitting the files)
+
+```
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+```
diff --git a/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt b/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt
new file mode 100644
index 00000000..5fb4d944
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+かげやま,みのる
+むらかみ,とおる
+つじさわ,けい
+やすい,たかゆき
+むらさき,としお
+はせがわ,ひであき
+うるしばら,よしひこ
+ままだ,ひろし
+おおぼら,えいじろう
+おおば,まさひで
+きたばたけ,たかひこ
+はまさき,あつし
+ほりい,つねお
+もり,だいいち
+いとう,しんいち
+くにもと,じゅんじ
+おか,のりひと
+たに,よしあき
+しらがき,ひろあき
+しらはま,たけひろ
+むらかみ,やすひろ
+うめはら,たかし
+いわた,ひろし
+すぎえ,かつとし
+てらにし,ひろみつ
+まつおか,だいすけ
+もろほし,すすむ
+いしはら,たかし
+おしま,ひろお
+なかお,ゆうじ
+いかり,はるお
+きまち,まさき
+ふるかわ,みちお
+かねこ,しゅうへい
+なかがわ,ともみ
+ささき,しんご
+うちだ,たくじ
+うめだ,さかえ
+しばた,いくこ
+まきした,けいこ
+まつもと,しんいちろう
+たかの,かずよし
+いしわた,なおひさ
+いうち,まこと
+いまい,りほ
+みずた,のりあき
+かくたに,まなぶ
+わだ,ほまれ
+わかまつ,かずき
+かわぐち,ひろき
diff --git a/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt b/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt
new file mode 100644
index 00000000..b986e7a2
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+ホリモト,ユウジ
+ハナミ,ヤスヒデ
+イシザカ,タカユキ
+ゼンケ,トシオ
+ハトリ,ユウコ
+ナガオカ,トモユキ
+コウダ,ケンイチ
+イシダ,ヒロシ
+ミワ,シゲユキ
+イシカワ,ヒロシ
+スズキ,ユウスケ
+オクダ,ヨシノリ
+シムラ,サカエ
+エビシマ,ヤスユキ
+イブカ,ヨシテル
+タノ,マコト
+ドウゾノ,セイヤ
+ヤマナカ,サツミ
+トミイエ,ハヤト
+アザミ,ツトム
+タナカ,キョウコ
+コジマ,アツシ
+フミハラ,カオリ
+スズキ,マサユキ
+ナトリ,ケンヤ
+スズキ,ユウコ
+スズキ,ヒサエ
+ナカガワ,カツヨシ
+スズキ,マサフミ
+マツヤマ,トシオ
+ヨシナガ,チカエ
+キタムラ,リカコ
+アオキ,タクオ
+ヤマグチ,ヤスヒロ
+スギムラ,シゲオ
+ウエスギ,マサミ
+マツムラ,シンイチ
+クバ,タカシ
+スドウ,タカトシ
+フジモト,ヒロシ
+イトウ,シュウイチ
+コバヤシ,カズミ
+タナカ,ヒロカツ
+イシダ,ツカサ
+ヤマダ,マサコ
+カミヤ,トミエ
+タケモト,ユウジ
+スミノ,コウジ
+ヒロハタ,タクヤ
+ミヒラ,リョウヘイ
diff --git a/vendor/icu_normalizer/benches/data/TestNames_Korean.txt b/vendor/icu_normalizer/benches/data/TestNames_Korean.txt
new file mode 100644
index 00000000..95b19916
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestNames_Korean.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+김명희
+홍차수
+허순재
+강영휘
+김운주
+이종환
+이은국
+강태호
+강일래
+김동현
+곽기자
+차재수
+표봉기
+문대원
+이형기
+최교표
+박식현
+홍종립
+서창수
+김쌍건
+서말도
+이병훈
+김희수
+박학태
+강태종
+조문란
+신범균
+백두진
+이철정
+김태중
+이성현
+김주조
+김강행
+이정길
+김완일
+권수자
+이춘철
+김판근
+김곡리
+이경형
+이운만
+손상철
+유기숙
+박정한
+조윤래
+유신호
+이두수
+김재률
+김성홍
+김혜경
diff --git a/vendor/icu_normalizer/benches/data/TestNames_Latin.txt b/vendor/icu_normalizer/benches/data/TestNames_Latin.txt
new file mode 100644
index 00000000..e5b82ab3
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestNames_Latin.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+González, Joan
+Reinders, Jim
+Applebroog, Ida
+Kidd, Joseph Bartholomew
+Gulácsy, Lajos
+Letendre, Rita
+Zuccaro, Federico
+Apt the Elder, Ulrich
+Drummond, Arthur
+Manley, Thomas
+Broc, Jean
+Ramunno, Tony
+Simone dei Crocifissi
+Lane, Theodore
+Symonds, William Robert
+Johnson, Frank Tenney
+Cox, Gardner
+Bunbury, Charles
+Pedro de la Cuadra
+Payne, William
+Lucas, John Seymour
+Holsman, Elizabeth T.
+de Vries, Auke
+Laszlo, Philip Alexius de
+Shigemasa
+Wolfe, Ruth Mitchell
+Buck, John
+Baselitz, Georg
+Hook, Walter
+Segall, Lasar
+Brush, George deForest
+Master of Jánosrét
+Sutherland, Elizabeth Leveson-Gower, Countess of
+Tuckerman, Jane
+Varley, F.H.
+Fosso, Samuel
+Gardner, Daniel
+Sadler, Walter Dendy
+Clausen, Franciska
+Coman, Charlotte Buell
+Wakelin, Roland
+Payne, Jon, CML
+Campagna, Girolamo
+Wiener, Phyllis
+Sallee, Charles
+Fitzgerald, John Anster
+Gribbroek, Robert
+Laporte, John
+Lévy-Dhurmer, Lucien
+Young, Stephen Scott
diff --git a/vendor/icu_normalizer/benches/data/TestNames_Thai.txt b/vendor/icu_normalizer/benches/data/TestNames_Thai.txt
new file mode 100644
index 00000000..4de72dc6
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestNames_Thai.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+ณรงค์ โต๊ะเงิน
+กิตติ บุญวันต์
+สมหมาย ดาบทองดี
+ธวัชชัย อิสระนิมิตร
+วรรณา โสภณนรินทร์
+วินัย หมู่มิ่ง
+พัชรี ชูจิรวงศ์
+สมปอง จิวไพโรจน์กิจ
+บุญส่ง กวยรักษา
+นิพนธ์ นิ่มใหม่
+พัชรี สุวพรศิลป์
+เจริญ นววัฒนทรัพย์
+อรพินท์ แซ่เจี่ย
+ชัยพร สมใจนึก
+ประนอม โคศิลา
+ฉวีวรรณ ศรสังข์ทอง
+วัชรา เจริญรัตนพร
+สุภัท นกศิริ
+อู๋ มาลาเล็ก
+ประยูร ไชโย
+ละออ อยู่ยืนยง
+สมใจ วิวัฒน์วานิช
+จุมพล จันทรศรีเกษร
+พุฒ ดอกไม้จีน
+บุญชัย วรกิจพรสิน
+สมาน ธูปเทียน
+พงศ์ศักดิ์ แซ่แต้
+อำนาจ ไวจงเจริญ
+พรทิพย์ แซ่ลี้
+อุไรวรรณ สาครสินธุ์
+อำพล วีระตะนนท์
+สมจิตร ใจวังโลก
+สุเทพ ตันวินิจ
+สวาท ทรัพย์มาก
+สมศักดิ์ เจือจันทร์
+ดัสซันซิงห์ กุลาตี
+ธีร ศรแก้ว
+พรรณยุพา ฮ่อสกุล
+สำราญ จันทร์เอี่ยม
+พจน์ มั่นกันนาน
+สุธี บุณยเกียรติ
+บุญโชติ ทิพย์ประเสริฐสิน
+ประดิษฐ์ ทองพสิฐสมบัติ
+จำเนียร เพ็งเจริญ
+สมศักดิ์ อรุณรัตน์
+อนุชา จารุหิรัญสกุล
+พิกุล มโนภิญโญภิญญะ
+ผ่องศรี นกแก้ว
+อารี วิไลวรรณ
+ณรงค์วิทย์ วิทสัทธาวรกุล
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt
new file mode 100644
index 00000000..0cf40fb0
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+ممارسة مراعاة
+العنصرية
+حدود والشيخوخة
+بالحكم كهذا ينتفع
+البلاد
+تربية
+الغير التقدم والعدل
+نحو بالتعليم والحرية
+تأمين متساو
+للتعليم فيها
+آذت اعتداء للتعليم
+ليس المتأصلة
+والمساهمة الضروري تتناقض
+وتأسيس
+رضى
+شرعي الطبية
+لكيلا الجمعية والحرية
+للرجال التزوج
+بالكرامة
+حرية بين
+هذه العيش تنظر
+قيد
+يقررها والصداقة
+اعتُمد وينبغي اجتماعي
+حرمان
+للإدراك بأجر إنتاجه
+التربية القانون
+لإنصافه وتأسيس وسمعته
+أساسه للرجال
+كافة
+المجهود دولي أينما
+وإلى
+بنشاط تجري
+والأمم مثل لحقوق
+الإنسان بشروط بحماية
+شرفه
+كما الوظائف
+حياته ديسمبر
+ولما
+هذه
+غاية جديد إنسان
+حرية
+متهم الوطنية قدمًا
+التملك وضع
+شرعية ويعبر تأدية
+بنظام عمل والأخلاق
+التملك لشخصيته يلجأ
+بحال يضطر ولا
+الانضمام بالكرامة
+عضوا
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt
new file mode 100644
index 00000000..b002a64c
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+Herrschaft Freiheiten Not
+Gewalt
+stets anderer begründet
+erhobenen innerstaatliche
+Heiratsfähige freie
+offenstehen Begrenzung grausamer
+Maßnahmen höchste
+unentbehrlich privat
+erniedrigender
+Verachtung freie
+innezuhaben innerstaatlichen
+kommen
+werden gleichgültig
+Würde überall höchste
+Schutzmaßnahmen den Pflichten
+Wille Bestimmung
+Leibeigenschaft einschließlich für
+gleiche bekräftigt Gewissens
+Wohles
+Generalversammlung
+Volkes
+Völkern gegenwärtig Zusammenarbeit
+Heiratsfähige sowie Jeder
+Stellung
+Lebensstandard
+seinem
+Rede strafbaren Sicherheit
+mit
+Kulthandlungen Grund
+ärztlicher
+Auflösung Anforderungen anzugehören
+Furcht
+keine Geburt
+Wohles Furcht genügen
+befriedigende Medien
+anzugehören Urlaub Vereinigungen
+hinzuwirken verboten Resolution
+kommen
+sozialer vor irgendein
+Bestimmung Bestimmung
+Fall natürliche kein
+Geschlecht Aufhetzung eigenen
+seinen
+über
+Unterlassung Berücksichtigung
+war
+Rufes stets
+Volkes anderer Beschränkungen
+Handlungen dessen
+Die
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt
new file mode 100644
index 00000000..9c71f293
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+προάγει αλληλογραφία
+λογική έχει
+ιδρύει ζωή τεχνική
+δυνατότητες
+περιορισμό συνόλου
+ασκεί παραγνώριση συναφθεί
+αναγνωρίζουν ποινικής εκδηλώνει
+κοινότητας διακυβέρνηση στα
+απέναντι υψηλή
+περιστάσεων αξιόποινη
+σεβασμό
+συντήρησής κατά εξασφαλίσουν
+παραβιάζουν συμπληρώνεται νόμο
+άμεσα
+σημαίνει καθεστώς
+ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων
+ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση
+μέσο
+ίση Εχει
+ειρήνης Κάθε
+μέλη μορφή
+όσο
+κρατείται Στο Διακηρύσσει
+οικονομικών έκφρασης εξασφαλίζεται
+κάθε
+περίπτωση απολαμβάνουν
+ποινικό γεροντική
+είναι μαζί δικαστήρια
+μαζί προοπτική
+δική
+βαρβαρότητας
+οικονομικών εξασφαλίσει
+υποχρεώσεις οδήγησαν
+Οικουμενική Διακήρυξης γονείς
+στις μυστική αντιπροσώπους
+Διακήρυξης άδειες βιοτικό
+αναπηρία ομάδα
+πραγματικό
+καλύτερες
+ανάπαυση
+δίκαιες ένα δικαίου
+μετέχει στους
+θρησκευτικών ποινικής
+Κανείς ίσα
+πεποιθήσεις
+πολιτικές ανάλογα δουλεία
+πολιτικές ιατρική ωσότου
+ηθικής χωρίς
+ανδρών ικανό
+καθώς
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt
new file mode 100644
index 00000000..db0490d3
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+duración común
+delito reconocimiento alimentación
+inalienables
+entre seguridad escogidos
+comportarse dignidad
+autónomo gobierno tiempo
+omisiones
+comisión
+Derechos territorios
+debe
+han
+regresar inalienables
+regresar
+desempleo científico
+arbitrariamente proclamada
+están contraerse esposos
+cualesquiera
+salir carácter desarrollo
+solamente justas
+personalidad una
+cuanto
+garantice resolución
+concepción
+tomar impondrá
+cualquier reconocimiento
+obligatoria obligatoria satisfactoria
+acusación sin
+artísticas penal culturales
+pagadas examen
+Además Organización dignidad
+opresión esposos ejercidos
+barbarie están mientras
+por
+idioma
+recursos pagadas
+materia Nada ella
+con injerencias
+inspirándose
+organización
+gozar jurisdicción
+que
+asegurar
+humana libertad
+nadie equivalente
+escoger remuneración
+torturas
+individuos poder
+disfruten seres Preámbulo
+desempleo
+liberados
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt
new file mode 100644
index 00000000..2e0a38e7
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+conforme êtres fonctions
+non tout généralisé
+premier lui
+faire hommes d’égalité
+peuple volonté bénéficier
+générale nationales
+cruels plus
+d’encourager opinions
+genre l’esprit
+d’origine effectif
+exigences auront
+résultent situation recevoir
+peuples Chacun
+sont d’égalité
+jouissent
+auront l’esprit
+pays telle
+publiquement
+mariage foi
+travail démocratique religieux
+rémunération
+omissions telles
+L’éducation
+raison complétée donner
+invoqué auront arbitraires
+l’amitié suffisant affaires
+travaille l’accomplissement l’intermédiaire
+race
+opinions celles
+assurer par privée
+valeur
+violant traite premier
+inhérente
+bienfaits l’avènement
+Unies s’il actions
+inquiété l’esclavage
+inquiété
+esclaves lieu
+salaire
+par
+toute
+innocente procédure membres
+arts l’idéal envers
+suffrage territoires inhumains
+d’immixtions l’organisation progrès
+comme égalité Unies
+maternité
+violerait suprême sécurité
+impliquant eux loisirs
+nationalité
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt
new file mode 100644
index 00000000..2b6b120a
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+זקנה משפילים
+ינתן חברתי עניניו
+הפוב
+ולהיות זכויות הישגים
+יאסרו מטעמי וללא
+ספרותית השלם
+למנוחה חינם
+וההתאגדות
+לטפח
+באלה במלואן
+יהנו
+ולרווחתם לגבר האדם
+בכבודו שבארצות כבוד
+ובינלאומיים
+בכך לתנאי אישי
+שאינן
+שרירותי
+במשפט
+ולעקרונותיהן מטעם
+שרירותית האשמה יהיה
+החינוך ולבטחון
+סובלנות אשמתו במגילה
+המאוחדות חיוני
+חשוב במקרה
+כלתי העולם
+שמקורה כציבור
+לשויון
+לתקנה
+תלוי ההתאספות
+הדיבור שהוא
+והבלתי והבסיסית
+ולעקרונותיהן יהא וישאף
+ביתנ הבינלאומי
+והזלזול להקנות
+בגלל כולם שיושלם
+לחיים
+בדבר
+לשירות
+זכויות
+לפני
+אדם ולא מזזמנות
+קנינו שהיה ההתאספות
+בינלאומי חיוניות לבקש
+תהיינה
+ובזכות בכורה מהגנה
+מתוך
+ובמצפון מזומנות לאגד
+והחמריים סוציאלי
+אנושיים ובהצבעה
+פראיים
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt
new file mode 100644
index 00000000..b6cd9760
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+uciskowi posiadania prawo
+społecznego największych skazany
+czy
+potrzeby samodzielnie przystępowania
+Krzewi też dokonania
+pełną prawo
+buntu
+moralności
+zapewnienia znaczenie
+nieludzki wypadek Nikt
+zasadności jakikolwiek Każdy
+samowolnie krajem
+międzynarodowego
+członek wielu
+rozwój wynikających obalenia
+rasy
+grudnia która
+jedynie urlopu ani
+małżeńskie stanowi ustaniu
+człowieka postępowych
+prześladowania
+politycznej które zawarcia
+Deklaracja
+ingerować wyłącznie
+studia Nikt
+innego uprawianie zrozumienie
+wybranych swobodę wyznania
+wolni osobowości
+ograniczenie Nie
+równej społecznego uciekać
+będącą POWSZECHNA
+niezdolności poszukiwania międzynarodowej
+konieczne potrzeby posiada
+opinii wychowywania 1948
+międzynarodowej zatrzymać
+przedstawicieli
+przeciw
+wynikających organy pracę
+człowiek grupami
+niezbędnych
+wolności podstawowym
+opinii małżonków wolność
+postępować zdecydowanie komórką
+odniesieniu
+pokoju azyl
+zawodowych powrócić człowiek
+konstytucję
+takiej postaciach powszechnego
+wygnać wygnać
+wspólny poszanowania
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt
new file mode 100644
index 00000000..4ceb0307
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+областях
+будут должен
+обеспечиваются нежели
+котором Уставе
+социального моральных
+совершеннолетия предоставление
+том независимо
+существование
+вмешательства какому ограниченной
+распространять
+находить помощь
+искусством
+унижающим положения искать
+изгнанию член совершеннолетия
+обществом имуществом государственной
+идеи братства
+наслаждаться значение социальной
+осуществления юрисдикцией наказанию
+достойное свою III
+жизнь расторжения инвалидности
+терпимости этого
+целях равны
+обеспечиваются законным
+принуждаем правосубъектности
+пыткам доступа неприкосновенность
+Брак против
+прибегать независимой
+человека человеческой
+быть независимо религии
+публичным
+членам против
+разумом результатом семью
+Принята участие
+беспристрастным тем
+частным основной
+правового
+страной обслуживание
+было свободу полное
+рабочего свободны
+состоянии помощь религиозными
+полное
+владеть власти морали
+меньшей
+братства социальному убежища
+государств
+равны который дети
+терпимости
+получать бесплатным полного
+богослужении
+отдельным
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt
new file mode 100644
index 00000000..bc0d0737
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+คิด ใตัอำ เคลื่อนไหว
+บังคับ บาก
+สิ่ง สิ้น
+วัตถุ
+ชาย อาศัย เท่านั้น
+สิน
+เกา
+ดูแล พิธีกรรม
+ภายใน
+เพศ
+หนัก ประสงค์
+เหตุ
+งาน รักษา
+เพศ ภาษา
+นี้
+คู่ สัญชาติ ต้องการ
+วิธี ระหว่าง ตกลง
+ทำนอง
+สืบ กับ ศิลปกรรม
+เหนือ วรรณกรรม
+คิด การก หน้าที่
+ชาติ ศิลปกรรม แต่
+สามัญ สอด
+เหยียด วิธี จุด
+หน้า ถ้า เบื้อง
+ประชุม
+ศิลปกรรม
+เสรีภาพ โหด ก่อ
+เกียรติศักดิ์ ป่วย เอกราช
+ประหัต มโนธรรม การ
+แทน
+ขัดขืน เวลา เสียง
+กฎบัตร พยายาม
+สิน หน้า
+จำเป็น
+ประชาธิปไตย หน่วย
+กรณี จริงจัง
+ทำนอง
+ทาษ
+เพิ่ม
+บรรดา ขวาง
+กักขัง
+มนุษย์
+ชาย ประกัน มนุษยธรรม
+จะบัน มูลฐาน เถื่อน
+พฤติ
+มิได้
+หญิง คู่
+สมา ปฏิบัติ อนึ่ง
+สิ่ง ทาษ
diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt
new file mode 100644
index 00000000..08129b01
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt
@@ -0,0 +1,54 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+mecburidir ilim
+isnadın sınırları suç
+tutuklanamaz diğer
+memleket korunmasi kullanılamaz
+İnsanlık ilerlemeyi
+bir mülk menfaatlerinin
+usul zümreye herhangi
+mahkeme vicdana ilerleyişe
+zulüm zalimane
+ilim öncelikle çocuk
+mevzubahis ancak
+muamelesi dinlenmeye
+eşitlikle prensiplerine ülkenin
+öğretim bulunmalarına yardım
+memleketler amacıyla
+birbirlerine
+olmalıdır
+bırakılamaz serbestisine
+hürriyetin iyi
+hükmü işbu zalimane
+evlenme memleketi tedbirlerle
+evlenmek ahalisi işini
+hürriyetler
+belirlenmiş kere
+elde cürüme
+tanınan dünyaca yüksek
+müddetinin ailesine
+vicdan kırıcı itibariyle
+geniş inanma
+kendi görevleri Teşkilatı
+yaymak
+öğretim vesayet
+renk kişiliğinin
+tamamlanan
+haklara bulunma
+hükmü uygulanabilecek
+etmiş geliştirilmesini hoşgörü
+sahiptir temel
+giyim
+Bundan temeli
+icaplarını
+mülk karışma tekmil
+vicdana hürriyetine işini
+Herkesin vahşiliklere
+dolaşma dünyanın
+davasının Uluslararasında idamesi
+eşittir
+haklardan hakkı
+kovuşturmalar hürriyetlerden gözönünde
+Evrensel fiilli beyannamesi
diff --git a/vendor/icu_normalizer/benches/data/wotw.txt b/vendor/icu_normalizer/benches/data/wotw.txt
new file mode 100644
index 00000000..5ffb1cf4
--- /dev/null
+++ b/vendor/icu_normalizer/benches/data/wotw.txt
@@ -0,0 +1,58 @@
+# This file is part of ICU4X. For terms of use, please see the file
+# called LICENSE at the top level of the ICU4X source tree
+# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+# The contents of this file have been translated by "Google Translate".
+
+Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này
+đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh
+lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con
+người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và
+nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể
+xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong
+một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới
+này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc
+chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển
+vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là
+nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý
+tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra.
+Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã
+qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có
+những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào
+đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian,
+những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối
+với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm,
+nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra
+những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng
+lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt
+trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà
+nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được.
+Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu
+đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng
+nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó.
+Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng
+tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có
+không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại
+sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù
+phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào
+bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa,
+hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta
+cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng
+ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất
+yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết
+thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng
+ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn
+vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó,
+nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta.
+Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã
+thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp
+của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó.
+Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành
+một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần
+thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái
+tim họ. Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng
+ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm
+về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng
+ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều
+mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây
+trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân.
diff --git a/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs b/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs
new file mode 100644
index 00000000..4ee7590a
--- /dev/null
+++ b/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs
@@ -0,0 +1,213 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+
+/// One benchmark input, pre-normalized into each of the four normalization
+/// forms and stored both as UTF-8 (`String`) and as UTF-16 (`Vec<u16>`).
+struct BenchDataContent {
+ /// Label of the input; interpolated into the benchmark IDs.
+ pub file_name: String,
+ /// The input normalized to NFC.
+ pub nfc: String,
+ /// The input normalized to NFD.
+ pub nfd: String,
+ /// The input normalized to NFKC.
+ pub nfkc: String,
+ /// The input normalized to NFKD.
+ pub nfkd: String,
+ /// UTF-16 encoding of the NFC form.
+ pub nfc_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFD form.
+ pub nfd_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFKC form.
+ pub nfkc_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFKD form.
+ pub nfkd_u16: Vec<u16>,
+}
+
+/// Returns `content` with every line that starts with `#` removed.
+///
+/// The surviving lines are joined with `\n`, so the result has no trailing
+/// newline.
+fn strip_headers(content: &str) -> String {
+    let kept_lines: Vec<&str> = content
+        .lines()
+        .filter(|line| !line.starts_with('#'))
+        .collect();
+    kept_lines.join("\n")
+}
+
+/// Loads the 15 bundled benchmark texts, strips their `#` header lines and
+/// pre-normalizes each text to NFC, NFD, NFKC and NFKD (UTF-8 and UTF-16).
+fn normalizer_bench_data() -> [BenchDataContent; 15] {
+    let to_nfc = ComposingNormalizerBorrowed::new_nfc();
+    let to_nfd = DecomposingNormalizerBorrowed::new_nfd();
+    let to_nfkc = ComposingNormalizerBorrowed::new_nfkc();
+    let to_nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+
+    // (label, raw file text), listed in the order the data sets are returned.
+    let sources: [(&str, &str); 15] = [
+        ("TestNames_Latin", include_str!("./data/TestNames_Latin.txt")),
+        ("wotw", include_str!("./data/wotw.txt")),
+        ("TestNames_Japanese_k", include_str!("./data/TestNames_Japanese_k.txt")),
+        ("TestNames_Japanese_h", include_str!("./data/TestNames_Japanese_h.txt")),
+        ("TestNames_Korean", include_str!("./data/TestNames_Korean.txt")),
+        ("TestRandomWordsUDHR_ru", include_str!("./data/TestRandomWordsUDHR_ru.txt")),
+        ("TestRandomWordsUDHR_ar", include_str!("./data/TestRandomWordsUDHR_ar.txt")),
+        ("TestRandomWordsUDHR_el", include_str!("./data/TestRandomWordsUDHR_el.txt")),
+        ("TestRandomWordsUDHR_es", include_str!("./data/TestRandomWordsUDHR_es.txt")),
+        ("TestRandomWordsUDHR_fr", include_str!("./data/TestRandomWordsUDHR_fr.txt")),
+        ("TestRandomWordsUDHR_tr", include_str!("./data/TestRandomWordsUDHR_tr.txt")),
+        ("TestRandomWordsUDHR_th", include_str!("./data/TestRandomWordsUDHR_th.txt")),
+        ("TestRandomWordsUDHR_pl", include_str!("./data/TestRandomWordsUDHR_pl.txt")),
+        ("TestRandomWordsUDHR_he", include_str!("./data/TestRandomWordsUDHR_he.txt")),
+        ("TestRandomWordsUDHR_de", include_str!("./data/TestRandomWordsUDHR_de.txt")),
+    ];
+
+    sources.map(|(label, raw_text)| {
+        let text = strip_headers(raw_text);
+        let nfc = to_nfc.normalize(&text);
+        let nfd = to_nfd.normalize(&text);
+        let nfkc = to_nfkc.normalize(&text);
+        let nfkd = to_nfkd.normalize(&text);
+        BenchDataContent {
+            file_name: label.to_owned(),
+            nfc: nfc.to_string(),
+            nfd: nfd.to_string(),
+            nfkc: nfkc.to_string(),
+            nfkd: nfkd.to_string(),
+            nfc_u16: nfc.encode_utf16().collect(),
+            nfd_u16: nfd.encode_utf16().collect(),
+            nfkc_u16: nfkc.encode_utf16().collect(),
+            nfkd_u16: nfkd.encode_utf16().collect(),
+        }
+    })
+}
+
+/// The measured operation: normalizes the UTF-8 `text` with `normalizer`.
+///
+/// Both the input and the result go through `black_box` so the optimizer
+/// cannot specialize on the input or discard the otherwise-unused result.
+fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) {
+    black_box(normalizer.normalize(black_box(text)));
+}
+
+/// The measured operation: normalizes the UTF-16 `text` with `normalizer`.
+///
+/// Both the input and the result go through `black_box` so the optimizer
+/// cannot specialize on the input or discard the otherwise-unused result.
+fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) {
+    black_box(normalizer.normalize_utf16(black_box(text)));
+}
+
+/// Registers the NFD decomposition benchmarks.
+///
+/// For every data set, the NFD normalizer is benchmarked on input that is
+/// already in NFC, NFD, NFKC and NFKD: first the four UTF-8 variants, then the
+/// four UTF-16 variants (IDs suffixed with `_u16`).
+pub fn criterion_benchmark(criterion: &mut Criterion) {
+    let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd();
+    let mut group = criterion.benchmark_group("decomposing_normalizer_nfd");
+
+    for data in black_box(normalizer_bench_data()) {
+        let label = &data.file_name;
+
+        let utf8_inputs: [(&str, &str); 4] = [
+            ("nfc", data.nfc.as_str()),
+            ("nfd", data.nfd.as_str()),
+            ("nfkc", data.nfkc.as_str()),
+            ("nfkd", data.nfkd.as_str()),
+        ];
+        for (form, text) in utf8_inputs {
+            group.bench_function(
+                BenchmarkId::from_parameter(format!("from_{form}_{label}")),
+                |bencher| bencher.iter(|| function_under_bench(&normalizer_under_bench, text)),
+            );
+        }
+
+        // UTF 16
+        let utf16_inputs: [(&str, &[u16]); 4] = [
+            ("nfc", data.nfc_u16.as_slice()),
+            ("nfd", data.nfd_u16.as_slice()),
+            ("nfkc", data.nfkc_u16.as_slice()),
+            ("nfkd", data.nfkd_u16.as_slice()),
+        ];
+        for (form, units) in utf16_inputs {
+            group.bench_function(
+                BenchmarkId::from_parameter(format!("from_{form}_{label}_u16")),
+                |bencher| bencher.iter(|| function_under_bench_u16(&normalizer_under_bench, units)),
+            );
+        }
+    }
+
+    group.finish();
+}
diff --git a/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs b/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs
new file mode 100644
index 00000000..4b5d9013
--- /dev/null
+++ b/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs
@@ -0,0 +1,211 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use criterion::{black_box, BenchmarkId, Criterion};
+
+use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed};
+
+/// One benchmark input, pre-normalized into each of the four normalization
+/// forms and stored both as UTF-8 (`String`) and as UTF-16 (`Vec<u16>`).
+struct BenchDataContent {
+ /// Label of the input; interpolated into the benchmark IDs.
+ pub file_name: String,
+ /// The input normalized to NFC.
+ pub nfc: String,
+ /// The input normalized to NFD.
+ pub nfd: String,
+ /// The input normalized to NFKC.
+ pub nfkc: String,
+ /// The input normalized to NFKD.
+ pub nfkd: String,
+ /// UTF-16 encoding of the NFC form.
+ pub nfc_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFD form.
+ pub nfd_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFKC form.
+ pub nfkc_u16: Vec<u16>,
+ /// UTF-16 encoding of the NFKD form.
+ pub nfkd_u16: Vec<u16>,
+}
+
+/// Returns `content` with every line that starts with `#` removed.
+///
+/// The surviving lines are joined with `\n`, so the result has no trailing
+/// newline.
+fn strip_headers(content: &str) -> String {
+    let kept_lines: Vec<&str> = content
+        .lines()
+        .filter(|line| !line.starts_with('#'))
+        .collect();
+    kept_lines.join("\n")
+}
+
+/// Loads the 15 bundled benchmark texts, strips their `#` header lines and
+/// pre-normalizes each text to NFC, NFD, NFKC and NFKD (UTF-8 and UTF-16).
+fn normalizer_bench_data() -> [BenchDataContent; 15] {
+    let to_nfc = ComposingNormalizerBorrowed::new_nfc();
+    let to_nfd = DecomposingNormalizerBorrowed::new_nfd();
+    let to_nfkc = ComposingNormalizerBorrowed::new_nfkc();
+    let to_nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+
+    // (label, raw file text), listed in the order the data sets are returned.
+    let sources: [(&str, &str); 15] = [
+        ("TestNames_Latin", include_str!("./data/TestNames_Latin.txt")),
+        ("wotw", include_str!("./data/wotw.txt")),
+        ("TestNames_Japanese_k", include_str!("./data/TestNames_Japanese_k.txt")),
+        ("TestNames_Japanese_h", include_str!("./data/TestNames_Japanese_h.txt")),
+        ("TestNames_Korean", include_str!("./data/TestNames_Korean.txt")),
+        ("TestRandomWordsUDHR_ru", include_str!("./data/TestRandomWordsUDHR_ru.txt")),
+        ("TestRandomWordsUDHR_ar", include_str!("./data/TestRandomWordsUDHR_ar.txt")),
+        ("TestRandomWordsUDHR_el", include_str!("./data/TestRandomWordsUDHR_el.txt")),
+        ("TestRandomWordsUDHR_es", include_str!("./data/TestRandomWordsUDHR_es.txt")),
+        ("TestRandomWordsUDHR_fr", include_str!("./data/TestRandomWordsUDHR_fr.txt")),
+        ("TestRandomWordsUDHR_tr", include_str!("./data/TestRandomWordsUDHR_tr.txt")),
+        ("TestRandomWordsUDHR_th", include_str!("./data/TestRandomWordsUDHR_th.txt")),
+        ("TestRandomWordsUDHR_pl", include_str!("./data/TestRandomWordsUDHR_pl.txt")),
+        ("TestRandomWordsUDHR_he", include_str!("./data/TestRandomWordsUDHR_he.txt")),
+        ("TestRandomWordsUDHR_de", include_str!("./data/TestRandomWordsUDHR_de.txt")),
+    ];
+
+    sources.map(|(label, raw_text)| {
+        let text = strip_headers(raw_text);
+        let nfc = to_nfc.normalize(&text);
+        let nfd = to_nfd.normalize(&text);
+        let nfkc = to_nfkc.normalize(&text);
+        let nfkd = to_nfkd.normalize(&text);
+        BenchDataContent {
+            file_name: label.to_owned(),
+            nfc: nfc.to_string(),
+            nfd: nfd.to_string(),
+            nfkc: nfkc.to_string(),
+            nfkd: nfkd.to_string(),
+            nfc_u16: nfc.encode_utf16().collect(),
+            nfd_u16: nfd.encode_utf16().collect(),
+            nfkc_u16: nfkc.encode_utf16().collect(),
+            nfkd_u16: nfkd.encode_utf16().collect(),
+        }
+    })
+}
+
+/// The measured operation: normalizes the UTF-8 `text` with `normalizer`.
+///
+/// Both the input and the result go through `black_box` so the optimizer
+/// cannot specialize on the input or discard the otherwise-unused result.
+fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) {
+    black_box(normalizer.normalize(black_box(text)));
+}
+
+/// The measured operation: normalizes the UTF-16 `text` with `normalizer`.
+///
+/// Both the input and the result go through `black_box` so the optimizer
+/// cannot specialize on the input or discard the otherwise-unused result.
+fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) {
+    black_box(normalizer.normalize_utf16(black_box(text)));
+}
+
+/// Registers the NFKD decomposition benchmarks.
+///
+/// For every data set, the NFKD normalizer is benchmarked on input that is
+/// already in NFC, NFD, NFKC and NFKD: first the four UTF-8 variants, then the
+/// four UTF-16 variants (IDs suffixed with `_u16`).
+pub fn criterion_benchmark(criterion: &mut Criterion) {
+    let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd();
+    let mut group = criterion.benchmark_group("decomposing_normalizer_nfkd");
+
+    for data in black_box(normalizer_bench_data()) {
+        let label = &data.file_name;
+
+        let utf8_inputs: [(&str, &str); 4] = [
+            ("nfc", data.nfc.as_str()),
+            ("nfd", data.nfd.as_str()),
+            ("nfkc", data.nfkc.as_str()),
+            ("nfkd", data.nfkd.as_str()),
+        ];
+        for (form, text) in utf8_inputs {
+            group.bench_function(
+                BenchmarkId::from_parameter(format!("from_{form}_{label}")),
+                |bencher| bencher.iter(|| function_under_bench(&normalizer_under_bench, text)),
+            );
+        }
+
+        // UTF 16
+        let utf16_inputs: [(&str, &[u16]); 4] = [
+            ("nfc", data.nfc_u16.as_slice()),
+            ("nfd", data.nfd_u16.as_slice()),
+            ("nfkc", data.nfkc_u16.as_slice()),
+            ("nfkd", data.nfkd_u16.as_slice()),
+        ];
+        for (form, units) in utf16_inputs {
+            group.bench_function(
+                BenchmarkId::from_parameter(format!("from_{form}_{label}_u16")),
+                |bencher| bencher.iter(|| function_under_bench_u16(&normalizer_under_bench, units)),
+            );
+        }
+    }
+    group.finish();
+}
diff --git a/vendor/icu_normalizer/src/lib.rs b/vendor/icu_normalizer/src/lib.rs
new file mode 100644
index 00000000..788b2682
--- /dev/null
+++ b/vendor/icu_normalizer/src/lib.rs
@@ -0,0 +1,2854 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations
+#![cfg_attr(not(any(test, doc)), no_std)]
+#![cfg_attr(
+ not(test),
+ deny(
+ clippy::indexing_slicing,
+ clippy::unwrap_used,
+ clippy::expect_used,
+ clippy::panic,
+ clippy::exhaustive_structs,
+ clippy::exhaustive_enums,
+ clippy::trivially_copy_pass_by_ref,
+ missing_debug_implementations,
+ )
+)]
+#![warn(missing_docs)]
+
+//! Normalizing text into Unicode Normalization Forms.
+//!
+//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/))
+//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project.
+//!
+//! # Functionality
+//!
+//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode
+//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD.
+//!
+//! Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8,
+//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator.
+//!
+//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA
+//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by
+//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the
+//! [`idna`](https://docs.rs/idna/latest/idna/) crate.
+//!
+//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and
+//! the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class
+//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the
+//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate.
+//!
+//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in
+//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive
+//! non-“maybe” answer.
+//!
+//! # Examples
+//!
+//! ```
+//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc();
+//! assert_eq!(nfc.normalize("a\u{0308}"), "ä");
+//! assert!(nfc.is_normalized("ä"));
+//!
+//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd();
+//! assert_eq!(nfd.normalize("ä"), "a\u{0308}");
+//! assert!(!nfd.is_normalized("ä"));
+//! ```
+
+extern crate alloc;
+
+// We don't depend on icu_properties to minimize deps, but we want to be able
+// to ensure we're using the right CCC values
+//
+// `ccc!(Name, num)` expands to a `const` block that evaluates to
+// `CanonicalCombiningClass::from_icu4c_value(num)`. When the `icu_properties`
+// feature is enabled, the block first compares `num` against
+// `icu_properties::props::CanonicalCombiningClass::Name` and panics during
+// constant evaluation if the two disagree.
+macro_rules! ccc {
+ ($name:ident, $num:expr) => {
+ const {
+ #[cfg(feature = "icu_properties")]
+ if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num {
+ panic!("icu_normalizer has incorrect ccc values")
+ }
+ CanonicalCombiningClass::from_icu4c_value($num)
+ }
+ };
+}
+
+pub mod properties;
+pub mod provider;
+pub mod uts46;
+
+use crate::provider::CanonicalCompositions;
+use crate::provider::DecompositionData;
+use crate::provider::NormalizerNfdDataV1;
+use crate::provider::NormalizerNfkdDataV1;
+use crate::provider::NormalizerUts46DataV1;
+use alloc::borrow::Cow;
+use alloc::string::String;
+use core::char::REPLACEMENT_CHARACTER;
+use icu_collections::char16trie::Char16Trie;
+use icu_collections::char16trie::Char16TrieIterator;
+use icu_collections::char16trie::TrieResult;
+use icu_collections::codepointtrie::CodePointTrie;
+#[cfg(feature = "icu_properties")]
+use icu_properties::props::CanonicalCombiningClass;
+use icu_provider::prelude::*;
+use provider::DecompositionTables;
+use provider::NormalizerNfcV1;
+use provider::NormalizerNfdTablesV1;
+use provider::NormalizerNfkdTablesV1;
+use smallvec::SmallVec;
+#[cfg(feature = "utf16_iter")]
+use utf16_iter::Utf16CharsEx;
+#[cfg(feature = "utf8_iter")]
+use utf8_iter::Utf8CharsEx;
+use zerovec::{zeroslice, ZeroSlice};
+
+/// Stand-in for the `icu_properties` `CanonicalCombiningClass`, compiled only
+/// when the `icu_properties` feature is disabled.
+/// It should not be exposed to users.
+#[cfg(not(feature = "icu_properties"))]
+#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
+struct CanonicalCombiningClass(pub(crate) u8);
+
+#[cfg(not(feature = "icu_properties"))]
+impl CanonicalCombiningClass {
+    /// Wraps a raw combining-class value.
+    const fn from_icu4c_value(value: u8) -> Self {
+        CanonicalCombiningClass(value)
+    }
+    /// Returns the raw combining-class value.
+    const fn to_icu4c_value(self) -> u8 {
+        let CanonicalCombiningClass(value) = self;
+        value
+    }
+}
+
+/// Canonical combining class `NotReordered` (value 0).
+const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0);
+/// Canonical combining class `Above` (value 230).
+const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230);
+
+/// Treatment of the ignorable marker (0xFFFFFFFF) in data.
+///
+/// A `Decomposition` stores one of these in its `ignorable_behavior` field.
+#[derive(Debug, PartialEq, Eq)]
+enum IgnorableBehavior {
+ /// 0xFFFFFFFF in data is not supported.
+ Unsupported,
+ /// Ignorables are ignored.
+ Ignored,
+ /// Ignorables are treated as singleton decompositions
+ /// to the REPLACEMENT CHARACTER.
+ ReplacementCharacter,
+}
+
+/// Marker for UTS 46 ignorables (all 32 bits set).
+///
+/// See trie-value-format.md
+const IGNORABLE_MARKER: u32 = 0xFFFFFFFF;
+
+/// Marker that the decomposition does not round trip via NFC (bit 30).
+///
+/// See trie-value-format.md
+const NON_ROUND_TRIP_MARKER: u32 = 1 << 30;
+
+/// Marker that the first character of the decomposition
+/// can combine backwards (bit 31).
+///
+/// See trie-value-format.md
+const BACKWARD_COMBINING_MARKER: u32 = 1 << 31;
+
+/// Mask for the bits that have to be zero for this to be a BMP
+/// singleton decomposition, or value baked into the surrogate
+/// range (bits 16 through 29).
+///
+/// See trie-value-format.md
+const HIGH_ZEROS_MASK: u32 = 0x3FFF0000;
+
+/// Mask for the bits that have to be zero for this to be a complex
+/// decomposition (bits 5 through 15).
+///
+/// See trie-value-format.md
+const LOW_ZEROS_MASK: u32 = 0xFFE0;
+
+/// Checks if a trie value carries a (non-zero) canonical
+/// combining class.
+///
+/// See trie-value-format.md
+fn trie_value_has_ccc(trie_value: u32) -> bool {
+    // Ignores the two top marker bits and the low nine bits; the remaining
+    // bits must match the 0xD800 tag.
+    const TAG_MASK: u32 = 0x3FFFFE00;
+    const CCC_TAG: u32 = 0xD800;
+    let tag = trie_value & TAG_MASK;
+    tag == CCC_TAG
+}
+
+/// Checks if the trie value signifies a special non-starter decomposition.
+///
+/// See trie-value-format.md
+fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool {
+    // Ignores the two top marker bits and the low eight bits; the remaining
+    // bits must match the 0xD900 tag.
+    const TAG_MASK: u32 = 0x3FFFFF00;
+    const SPECIAL_NON_STARTER_TAG: u32 = 0xD900;
+    let tag = trie_value & TAG_MASK;
+    tag == SPECIAL_NON_STARTER_TAG
+}
+
+/// Checks if a trie value signifies a character whose decomposition
+/// starts with a non-starter.
+///
+/// This is the same test as `trie_value_has_ccc`, under a name that
+/// describes this use of it.
+///
+/// See trie-value-format.md
+fn decomposition_starts_with_non_starter(trie_value: u32) -> bool {
+ trie_value_has_ccc(trie_value)
+}
+
+/// Extracts a canonical combining class (possibly zero) from a trie value.
+///
+/// Values that don't carry a class yield `CCC_NOT_REORDERED`.
+///
+/// See trie-value-format.md
+fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass {
+    if !trie_value_has_ccc(trie_value) {
+        return CCC_NOT_REORDERED;
+    }
+    // The class is stored in the low eight bits; the cast truncates to them.
+    CanonicalCombiningClass::from_icu4c_value(trie_value as u8)
+}
+
+/// The tail (everything after the first character) of the NFKD form of U+FDFA
+/// as 16-bit units.
+static FDFA_NFKD: [u16; 17] = [
+ 0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648,
+ 0x633, 0x644, 0x645,
+];
+
+/// Marker value for U+FDFA in NFKD. (Unified with Hangul syllable marker,
+/// but they differ by `NON_ROUND_TRIP_MARKER`.)
+///
+/// See trie-value-format.md
+const FDFA_MARKER: u16 = 1;
+
+// These constants originate from page 143 of Unicode 14.0
+/// Syllable base
+const HANGUL_S_BASE: u32 = 0xAC00;
+/// Lead jamo base
+const HANGUL_L_BASE: u32 = 0x1100;
+/// Vowel jamo base
+const HANGUL_V_BASE: u32 = 0x1161;
+/// Trail jamo base (deliberately off by one to account for the absence of a trail)
+const HANGUL_T_BASE: u32 = 0x11A7;
+/// Lead jamo count
+const HANGUL_L_COUNT: u32 = 19;
+/// Vowel jamo count
+const HANGUL_V_COUNT: u32 = 21;
+/// Trail jamo count (deliberately off by one to account for the absence of a trail)
+const HANGUL_T_COUNT: u32 = 28;
+/// Vowel jamo count times trail jamo count (21 * 28)
+const HANGUL_N_COUNT: u32 = 588;
+/// Syllable count (19 * 588)
+const HANGUL_S_COUNT: u32 = 11172;
+
+/// One past the conjoining jamo block
+const HANGUL_JAMO_LIMIT: u32 = 0x1200;
+
+/// Returns the value inside `opt` when it is `Some`.
+///
+/// When `opt` is `None`, this panics if debug assertions are enabled and
+/// returns `default` if they are not.
+///
+/// Use this only if the only reason why `opt` could be `None` is bogus
+/// data from the provider.
+#[inline(always)]
+fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T {
+    match opt {
+        Some(value) => value,
+        None => {
+            // GIGO case
+            debug_assert!(false);
+            default
+        }
+    }
+}
+
+/// Convert a `u32` _obtained from data provider data_ to `char`.
+///
+/// A value that is not a valid `char` is handled by `unwrap_or_gigo` with
+/// the REPLACEMENT CHARACTER as the fallback.
+#[inline(always)]
+fn char_from_u32(u: u32) -> char {
+ unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER)
+}
+
+/// Convert a `u16` _obtained from data provider data_ to `char`.
+///
+/// Widens the value to `u32` and delegates to `char_from_u32`.
+#[inline(always)]
+fn char_from_u16(u: u16) -> char {
+ char_from_u32(u32::from(u))
+}
+
+/// An empty `ZeroSlice` of `u16`.
+const EMPTY_U16: &ZeroSlice<u16> = zeroslice![];
+
+/// An empty `ZeroSlice` of `char`.
+const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![];
+
+/// Checks whether `c` lies in `start..=end` using a single comparison:
+/// a `c` below `start` wraps around to a large offset and fails the check.
+#[inline(always)]
+fn in_inclusive_range(c: char, start: char, end: char) -> bool {
+    let low = u32::from(start);
+    let offset = u32::from(c).wrapping_sub(low);
+    let span = u32::from(end) - low;
+    offset <= span
+}
+
+/// Checks whether the 16-bit unit `u` lies in `start..=end` using a single
+/// comparison: a `u` below `start` wraps around to a large offset and fails
+/// the check.
+#[inline(always)]
+#[cfg(feature = "utf16_iter")]
+fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool {
+    let offset = u.wrapping_sub(start);
+    let span = end - start;
+    offset <= span
+}
+
+/// Performs canonical composition (including Hangul) on a pair of
+/// characters or returns `None` if these characters don't compose.
+/// Composition exclusions are taken into account.
+///
+/// Hangul is composed arithmetically; every other pair is delegated to
+/// `compose_non_hangul`, which consults `iter`.
+#[inline]
+fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
+ // Offset of `second` from the first vowel jamo. The wrapping subtraction
+ // turns anything below `HANGUL_V_BASE` into a large value, so the single
+ // comparison below rejects everything outside
+ // `HANGUL_V_BASE..HANGUL_JAMO_LIMIT`.
+ let v = u32::from(second).wrapping_sub(HANGUL_V_BASE);
+ if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE {
+ return compose_non_hangul(iter, starter, second);
+ }
+ // `second` is a vowel jamo: it composes only with a lead jamo.
+ if v < HANGUL_V_COUNT {
+ let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
+ if l < HANGUL_L_COUNT {
+ let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
+ // SAFETY: `l < 19` and `v < 21`, so `lv <= 18 * 588 + 20 * 28 = 11144`
+ // and `HANGUL_S_BASE + lv <= 0xD788`, which is below the surrogate
+ // range and therefore a valid scalar value.
+ return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) });
+ }
+ return None;
+ }
+ // `second` is a trail jamo: it composes only with an LV syllable, i.e. a
+ // syllable whose index is a multiple of `HANGUL_T_COUNT`.
+ if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') {
+ let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
+ if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
+ let lvt = lv + (u32::from(second) - HANGUL_T_BASE);
+ // SAFETY: `lv` is a multiple of 28 below 11172, so `lv <= 11144`, and
+ // the trail offset is in `1..=27`; hence `lvt <= 11171` and
+ // `HANGUL_S_BASE + lvt <= 0xD7A3`, a valid scalar value.
+ return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) });
+ }
+ }
+ None
+}
+
+/// Performs (non-Hangul) canonical composition on a pair of characters
+/// or returns `None` if these characters don't compose. Composition
+/// exclusions are taken into account.
+///
+/// Trie results that can only arise from bogus data trip a debug assertion
+/// and yield `None` when debug assertions are disabled.
+fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> {
+    // To make the trie smaller, the pairs are stored second character first.
+    // Given how this method is used in ways where it's known that `second`
+    // is or isn't a starter. We could potentially split the trie into two
+    // tries depending on whether `second` is a starter.
+    match iter.next(second) {
+        TrieResult::NoValue => {}
+        TrieResult::NoMatch => return None,
+        TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => {
+            // GIGO case
+            debug_assert!(false);
+            return None;
+        }
+    }
+    match iter.next(starter) {
+        TrieResult::FinalValue(i) => {
+            let composed = char::from_u32(i as u32);
+            if composed.is_none() {
+                // GIGO case
+                debug_assert!(false);
+            }
+            composed
+        }
+        TrieResult::NoMatch => None,
+        TrieResult::NoValue | TrieResult::Intermediate(_) => {
+            // GIGO case
+            debug_assert!(false);
+            None
+        }
+    }
+}
+
+/// Checks that nothing except the two marker bits is set in `trie_val`.
+///
+/// See trie-value-format.md
+#[inline(always)]
+fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool {
+    // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
+    // and this function needs to ignore that.
+    const IGNORED_MARKERS: u32 = BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER;
+    let remaining_bits = trie_val & !IGNORED_MARKERS;
+    remaining_bits == 0
+}
+
+/// Checks that neither `NON_ROUND_TRIP_MARKER` nor
+/// `BACKWARD_COMBINING_MARKER` is set in `trie_val`.
+///
+/// See trie-value-format.md
+#[inline(always)]
+fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool {
+    const EITHER_MARKER: u32 = NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER;
+    let marker_bits = trie_val & EITHER_MARKER;
+    marker_bits == 0
+}
+
+/// A character together with the value looked up for it from the NFD trie,
+/// kept in a named struct instead of an anonymous pair.
+#[derive(Debug, PartialEq, Eq)]
+struct CharacterAndTrieValue {
+    character: char,
+    /// See trie-value-format.md
+    trie_val: u32,
+}
+
+impl CharacterAndTrieValue {
+    /// Pairs `c` with its trie value.
+    #[inline(always)]
+    pub fn new(c: char, trie_value: u32) -> Self {
+        Self {
+            character: c,
+            trie_val: trie_value,
+        }
+    }
+
+    /// Delegates to `starter_and_decomposes_to_self_impl`.
+    #[inline(always)]
+    pub fn starter_and_decomposes_to_self(&self) -> bool {
+        let trie_val = self.trie_val;
+        starter_and_decomposes_to_self_impl(trie_val)
+    }
+
+    /// See trie-value-format.md
+    #[inline(always)]
+    #[cfg(feature = "utf8_iter")]
+    pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool {
+        // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value
+        // to be compared with zero. U+FFFD has that flag set despite really
+        // being round-tripping in order to make UTF-8 errors
+        // ineligible for passthrough.
+        let without_backward_marker = self.trie_val & !BACKWARD_COMBINING_MARKER;
+        without_backward_marker == 0
+    }
+
+    /// Whether `BACKWARD_COMBINING_MARKER` is set.
+    ///
+    /// See trie-value-format.md
+    #[inline(always)]
+    pub fn can_combine_backwards(&self) -> bool {
+        let marker = self.trie_val & BACKWARD_COMBINING_MARKER;
+        marker != 0
+    }
+    /// Whether `NON_ROUND_TRIP_MARKER` is clear.
+    ///
+    /// See trie-value-format.md
+    #[inline(always)]
+    pub fn potential_passthrough(&self) -> bool {
+        let marker = self.trie_val & NON_ROUND_TRIP_MARKER;
+        marker == 0
+    }
+    /// Whether both marker bits are clear.
+    ///
+    /// See trie-value-format.md
+    #[inline(always)]
+    pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool {
+        let trie_val = self.trie_val;
+        potential_passthrough_and_cannot_combine_backwards_impl(trie_val)
+    }
+}
+
+/// Pack a `char` and a `CanonicalCombiningClass` in
+/// 32 bits (the former in the lower 24 bits and the
+/// latter in the high 8 bits). The latter can be
+/// initialized to 0xFF upon creation, in which case
+/// it can be actually set later by calling
+/// `set_ccc_from_trie_if_not_already_set`. This is
+/// a micro optimization to avoid the Canonical
+/// Combining Class trie lookup when there is only
+/// one combining character in a sequence. This type
+/// is intentionally non-`Copy` to get compiler help
+/// in making sure that the class is set on the
+/// instance on which it is intended to be set
+/// and not on a temporary copy.
+///
+/// Note that 0xFF won't be assigned to an actual
+/// canonical combining class per definition D104
+/// in The Unicode Standard.
+//
+// NOTE: The Pernosco debugger has special knowledge
+// of this struct. Please do not change the bit layout
+// or the crate-module-qualified name of this struct
+// without coordination.
+#[derive(Debug)]
+struct CharacterAndClass(u32);
+
+impl CharacterAndClass {
+ /// Packs `c` into the low 24 bits and the value of `ccc` into the high 8 bits.
+ pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self {
+ CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24))
+ }
+ /// Packs `c` with the 0xFF placeholder in the class bits, to be resolved
+ /// later by `set_ccc_from_trie_if_not_already_set`.
+ pub fn new_with_placeholder(c: char) -> Self {
+ CharacterAndClass(u32::from(c) | ((0xFF) << 24))
+ }
+ /// Packs the character of `c_tv` with the class that
+ /// `ccc_from_trie_value` extracts from its trie value.
+ pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self {
+ Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val))
+ }
+ /// Packs `c` with all class bits zero.
+ pub fn new_starter(c: char) -> Self {
+ CharacterAndClass(u32::from(c))
+ }
+ /// This method must exist for Pernosco to apply its special rendering.
+ /// Also, this must not be dead code!
+ pub fn character(&self) -> char {
+ // Safe, because the low 24 bits came from a `char`
+ // originally.
+ unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) }
+ }
+ /// This method must exist for Pernosco to apply its special rendering.
+ pub fn ccc(&self) -> CanonicalCombiningClass {
+ CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8)
+ }
+
+ /// Returns the character and the class bits as a pair.
+ pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) {
+ (self.character(), self.ccc())
+ }
+ /// If the class bits still hold the 0xFF placeholder, replaces them with
+ /// the class derived from the `trie` value of the character; otherwise
+ /// leaves `self` unchanged.
+ pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) {
+ if self.0 >> 24 != 0xFF {
+ return;
+ }
+ let scalar = self.0 & 0xFFFFFF;
+ self.0 =
+ ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar;
+ }
+}
+
+// This function exists as a borrow check helper.
+//
+// Resolves any placeholder classes in `slice` from `trie` and then sorts the
+// slice by canonical combining class. `sort_by_key` is a stable sort, so
+// entries with equal classes keep their relative order.
+#[inline(always)]
+fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) {
+    // We don't look up the canonical combining class for starters
+    // or for single combining characters between starters. When
+    // there's more than one combining character between starters,
+    // we look up the canonical combining class for each character
+    // exactly once.
+    if slice.len() >= 2 {
+        for entry in slice.iter_mut() {
+            entry.set_ccc_from_trie_if_not_already_set(trie);
+        }
+        slice.sort_by_key(|entry| entry.ccc());
+    }
+}
+
+/// An iterator adaptor that turns an `Iterator` over `char` into
+/// a lazily-decomposed `char` sequence.
+#[derive(Debug)]
+pub struct Decomposition<'data, I>
+where
+ I: Iterator<Item = char>,
+{
+ /// The wrapped iterator that supplies the input characters.
+ delegate: I,
+ buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA
+ /// The index of the next item to be read from `buffer`.
+ /// The purpose of this index is to avoid having to move
+ /// the rest upon every read.
+ buffer_pos: usize,
+ // At the start of `next()` if not `None`, this is a pending unnormalized
+ // starter. When `Decomposition` appears alone, this is never a non-starter.
+ // However, when `Decomposition` appears inside a `Composition`, this
+ // may become a non-starter before `decomposing_next()` is called.
+ pending: Option<CharacterAndTrieValue>, // None at end of stream
+ // See trie-value-format.md
+ trie: &'data CodePointTrie<'data, u32>,
+ // Main decomposition tables; the `supplementary_*` pair below is empty
+ // when no supplementary tables were supplied at construction.
+ scalars16: &'data ZeroSlice<u16>,
+ scalars24: &'data ZeroSlice<char>,
+ supplementary_scalars16: &'data ZeroSlice<u16>,
+ supplementary_scalars24: &'data ZeroSlice<char>,
+ /// The lowest character for which either of the following does
+ /// not hold:
+ /// 1. Decomposes to self.
+ /// 2. Decomposition starts with a non-starter
+ decomposition_passthrough_bound: u32, // never above 0xC0
+ ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter
+}
+
+impl<'data, I> Decomposition<'data, I>
+where
+ I: Iterator<Item = char>,
+{
+ /// Builds a decomposing iterator adapter over `delegate` from references
+ /// to the necessary data, with no supplementary data.
+ ///
+ /// Prefer `DecomposingNormalizer::normalize_iter()` unless there is a
+ /// good reason to call this constructor directly.
+ ///
+ /// Public but hidden so that the collator is able to use it.
+ #[doc(hidden)] // used in collator
+ pub fn new(
+     delegate: I,
+     decompositions: &'data DecompositionData,
+     tables: &'data DecompositionTables,
+ ) -> Self {
+     // Settings for the plain case: no supplementary tables, a passthrough
+     // bound of 0xC0 and no support for ignorable characters.
+     let supplementary_tables = None;
+     let decomposition_passthrough_bound = 0xC0;
+     let ignorable_behavior = IgnorableBehavior::Unsupported;
+     Self::new_with_supplements(
+         delegate,
+         decompositions,
+         tables,
+         supplementary_tables,
+         decomposition_passthrough_bound,
+         ignorable_behavior,
+     )
+ }
+
+ /// Builds a decomposing iterator adapter over `delegate` from references
+ /// to the necessary data, optionally including supplementary tables.
+ ///
+ /// Prefer `DecomposingNormalizer::normalize_iter()` unless there is a
+ /// good reason to call this constructor directly.
+ fn new_with_supplements(
+     delegate: I,
+     decompositions: &'data DecompositionData,
+     tables: &'data DecompositionTables,
+     supplementary_tables: Option<&'data DecompositionTables>,
+     decomposition_passthrough_bound: u8,
+     ignorable_behavior: IgnorableBehavior,
+ ) -> Self {
+     let mut adapter = Self {
+         delegate,
+         buffer: SmallVec::new(), // Normalized
+         buffer_pos: 0,
+         // Seed `pending` with a placeholder starter in case the real
+         // stream begins with a non-starter.
+         pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)),
+         trie: &decompositions.trie,
+         scalars16: &tables.scalars16,
+         scalars24: &tables.scalars24,
+         // Without supplementary tables the empty slices stand in.
+         supplementary_scalars16: match supplementary_tables {
+             Some(supplementary) => &supplementary.scalars16,
+             None => EMPTY_U16,
+         },
+         supplementary_scalars24: match supplementary_tables {
+             Some(supplementary) => &supplementary.scalars24,
+             None => EMPTY_CHAR,
+         },
+         decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound),
+         ignorable_behavior,
+     };
+     let _ = adapter.next(); // Remove the U+FFFF placeholder
+     adapter
+ }
+
+ /// Reads the `len` 16-bit units at `offset` in `slice16`, returns the
+ /// first one as the starter and appends the remaining ones to
+ /// `self.buffer`.
+ ///
+ /// The second tuple item is the index, counted within the appended tail,
+ /// just past the last tail character whose decomposition does not start
+ /// with a non-starter; it is 0 when `only_non_starters_in_trail` is set.
+ fn push_decomposition16(
+     &mut self,
+     offset: usize,
+     len: usize,
+     only_non_starters_in_trail: bool,
+     slice16: &ZeroSlice<u16>,
+ ) -> (char, usize) {
+     let split = slice16
+         .get_subslice(offset..offset + len)
+         .and_then(|slice| slice.split_first());
+     let (starter, tail) = match split {
+         Some((first, trail)) => (char_from_u16(first), trail),
+         None => {
+             // GIGO case
+             debug_assert!(false);
+             (REPLACEMENT_CHARACTER, EMPTY_U16)
+         }
+     };
+     if only_non_starters_in_trail {
+         // Everything after the starter is combining, so the classes can
+         // be filled in later.
+         self.buffer.extend(
+             tail.iter()
+                 .map(|unit| CharacterAndClass::new_with_placeholder(char_from_u16(unit))),
+         );
+         return (starter, 0);
+     }
+     let mut combining_start = 0;
+     for (index, unit) in tail.iter().enumerate() {
+         let ch = char_from_u16(unit);
+         let trie_value = self.trie.get(ch);
+         self.buffer.push(CharacterAndClass::new_with_trie_value(
+             CharacterAndTrieValue::new(ch, trie_value),
+         ));
+         // Half-width kana and iota subscript don't occur in the tails
+         // of these multicharacter decompositions.
+         if !decomposition_starts_with_non_starter(trie_value) {
+             combining_start = index + 1;
+         }
+     }
+     (starter, combining_start)
+ }
+
+ /// Reads the `len` characters at `offset` in `slice32`, returns the
+ /// first one as the starter and appends the remaining ones to
+ /// `self.buffer`.
+ ///
+ /// The second tuple item is the index, counted within the appended tail,
+ /// just past the last tail character whose decomposition does not start
+ /// with a non-starter; it is 0 when `only_non_starters_in_trail` is set.
+ fn push_decomposition32(
+     &mut self,
+     offset: usize,
+     len: usize,
+     only_non_starters_in_trail: bool,
+     slice32: &ZeroSlice<char>,
+ ) -> (char, usize) {
+     let split = slice32
+         .get_subslice(offset..offset + len)
+         .and_then(|slice| slice.split_first());
+     let (starter, tail) = match split {
+         Some(first_and_trail) => first_and_trail,
+         None => {
+             // GIGO case
+             debug_assert!(false);
+             (REPLACEMENT_CHARACTER, EMPTY_CHAR)
+         }
+     };
+     if only_non_starters_in_trail {
+         // Everything after the starter is combining, so the classes can
+         // be filled in later.
+         self.buffer.extend(
+             tail.iter()
+                 .map(|ch| CharacterAndClass::new_with_placeholder(ch)),
+         );
+         return (starter, 0);
+     }
+     let mut combining_start = 0;
+     for (index, ch) in tail.iter().enumerate() {
+         let trie_value = self.trie.get(ch);
+         self.buffer.push(CharacterAndClass::new_with_trie_value(
+             CharacterAndTrieValue::new(ch, trie_value),
+         ));
+         // Half-width kana and iota subscript don't occur in the tails
+         // of these multicharacter decompositions.
+         if !decomposition_starts_with_non_starter(trie_value) {
+             combining_start = index + 1;
+         }
+     }
+     (starter, combining_start)
+ }
+
+ /// Pairs `c` with the value that `self.trie` holds for it.
+ #[inline(always)]
+ fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue {
+     let trie_val = self.trie.get(c);
+     CharacterAndTrieValue::new(c, trie_val)
+ }
+
+ /// Reads the next character from the delegate and pairs it with its
+ /// trie value; `pending` must be empty when this is called.
+ ///
+ /// Characters below `decomposition_passthrough_bound` are returned with
+ /// trie value 0 without consulting the trie. A character whose trie
+ /// value is `IGNORABLE_MARKER` is treated according to
+ /// `ignorable_behavior`. Returns `None` when the delegate is exhausted.
+ fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> {
+     debug_assert!(self.pending.is_none());
+     loop {
+         let c = self.delegate.next()?;
+
+         // TODO(#2384): Measure if this check is actually an optimization.
+         if u32::from(c) < self.decomposition_passthrough_bound {
+             return Some(CharacterAndTrieValue::new(c, 0));
+         }
+
+         let trie_val = self.trie.get(c);
+         // TODO: Can we do something better about the cost of this branch in the
+         // non-UTS 46 case?
+         if trie_val != IGNORABLE_MARKER {
+             return Some(CharacterAndTrieValue::new(c, trie_val));
+         }
+         match self.ignorable_behavior {
+             IgnorableBehavior::Ignored => {
+                 // Drop this character by reading the next one from the delegate.
+                 continue;
+             }
+             IgnorableBehavior::ReplacementCharacter => {
+                 return Some(CharacterAndTrieValue::new(
+                     c,
+                     u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER,
+                 ));
+             }
+             IgnorableBehavior::Unsupported => {
+                 // Not expected to happen; the character is still returned
+                 // with the marker value it had in the trie.
+                 debug_assert!(false);
+                 return Some(CharacterAndTrieValue::new(c, trie_val));
+             }
+         }
+     }
+ }
+
+ /// Returns the pending character if there is one (emptying `pending`),
+ /// otherwise reads the next character from the delegate.
+ fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> {
+     let pending = self.pending.take();
+     if pending.is_some() {
+         // Only happens as part of `Composition` and as part of
+         // the contiguous-buffer methods of `DecomposingNormalizer`.
+         // I.e. does not happen as part of standalone iterator
+         // usage of `Decomposition`.
+         return pending;
+     }
+     self.delegate_next_no_pending()
+ }
+
+ fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char { // Decomposes the given character: returns the first character of its decomposition and leaves the rest, plus the following combining characters, in `self.buffer`.
+ let (starter, combining_start) = { // `combining_start` is the index in `self.buffer` from which sorting by combining class begins
+ let c = c_and_trie_val.character;
+ // See trie-value-format.md
+ let decomposition = c_and_trie_val.trie_val;
+ // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
+ // and that flag needs to be ignored here.
+ if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
+ // The character is its own decomposition
+ (c, 0)
+ } else {
+ let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
+ let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
+ if !high_zeros && !low_zeros {
+ // Decomposition into two BMP characters: starter and non-starter
+ let starter = char_from_u32(decomposition & 0x7FFF);
+ let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
+ self.buffer
+ .push(CharacterAndClass::new_with_placeholder(combining));
+ (starter, 0)
+ } else if high_zeros {
+ // Do the check by looking at `c` instead of looking at a marker
+ // in `singleton` below, because if we looked at the trie value,
+ // we'd still have to check that `c` is in the Hangul syllable
+ // range in order for the subsequent interpretations as `char`
+ // to be safe.
+ // Alternatively, `FDFA_MARKER` and the Hangul marker could
+ // be unified. That would add a branch for Hangul and remove
+ // a branch from singleton decompositions. It seems more
+ // important to favor Hangul syllables than singleton
+ // decompositions.
+ // Note that it would be valid to hoist this Hangul check
+ // one or even two steps earlier in this check hierarchy.
+ // Right now, it's assumed the kind of decompositions into
+ // BMP starter and non-starter, which occur in many languages,
+ // should be checked before Hangul syllables, which are about
+ // one language specifically. Hopefully, we get some
+ // instruction-level parallelism out of the disjointness of
+ // operations on `c` and `decomposition`.
+ let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec
+ if hangul_offset < HANGUL_S_COUNT {
+ debug_assert_eq!(decomposition, 1);
+ // Hangul syllable
+ // The math here comes from page 144 of Unicode 14.0
+ let l = hangul_offset / HANGUL_N_COUNT;
+ let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT;
+ let t = hangul_offset % HANGUL_T_COUNT;
+
+ // SAFETY: The unsafe blocks here are OK, because the values stay
+ // within the Hangul jamo block and, therefore, the scalar
+ // value range by construction.
+ self.buffer.push(CharacterAndClass::new_starter(unsafe {
+ core::char::from_u32_unchecked(HANGUL_V_BASE + v)
+ }));
+ let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) };
+ if t != 0 {
+ self.buffer.push(CharacterAndClass::new_starter(unsafe {
+ core::char::from_u32_unchecked(HANGUL_T_BASE + t)
+ }));
+ (first, 2) // Vowel and trailing consonant were buffered as starters
+ } else {
+ (first, 1) // Only the vowel was buffered
+ }
+ } else {
+ let singleton = decomposition as u16;
+ if singleton != FDFA_MARKER {
+ // Decomposition into one BMP character
+ let starter = char_from_u16(singleton);
+ (starter, 0)
+ } else {
+ // Special case for the NFKD form of U+FDFA.
+ self.buffer.extend(FDFA_NFKD.map(|u| {
+ // SAFETY: `FDFA_NFKD` is known not to contain
+ // surrogates.
+ CharacterAndClass::new_starter(unsafe {
+ core::char::from_u32_unchecked(u32::from(u))
+ })
+ }));
+ ('\u{0635}', 17)
+ }
+ }
+ } else {
+ debug_assert!(low_zeros);
+ // Only 12 of 14 bits used as of Unicode 16.
+ let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
+ // Only 3 of 4 bits used as of Unicode 16.
+ let len_bits = decomposition & 0b1111;
+ let only_non_starters_in_trail = (decomposition & 0b10000) != 0;
+ if offset < self.scalars16.len() { // The offset space runs through scalars16, scalars24 and then the two supplementary tables
+ self.push_decomposition16(
+ offset,
+ (len_bits + 2) as usize,
+ only_non_starters_in_trail,
+ self.scalars16,
+ )
+ } else if offset < self.scalars16.len() + self.scalars24.len() {
+ self.push_decomposition32(
+ offset - self.scalars16.len(),
+ (len_bits + 1) as usize,
+ only_non_starters_in_trail,
+ self.scalars24,
+ )
+ } else if offset
+ < self.scalars16.len()
+ + self.scalars24.len()
+ + self.supplementary_scalars16.len()
+ {
+ self.push_decomposition16(
+ offset - (self.scalars16.len() + self.scalars24.len()),
+ (len_bits + 2) as usize,
+ only_non_starters_in_trail,
+ self.supplementary_scalars16,
+ )
+ } else {
+ self.push_decomposition32(
+ offset
+ - (self.scalars16.len()
+ + self.scalars24.len()
+ + self.supplementary_scalars16.len()),
+ (len_bits + 1) as usize,
+ only_non_starters_in_trail,
+ self.supplementary_scalars24,
+ )
+ }
+ }
+ }
+ };
+ // Either we're inside `Composition` or `self.pending.is_none()`.
+
+ self.gather_and_sort_combining(combining_start);
+ starter
+ }
+
+ fn gather_and_sort_combining(&mut self, combining_start: usize) { // Appends upcoming characters that have a combining class to `self.buffer`, parks the first one without a class in `pending`, then sorts `buffer[combining_start..]` by class.
+ // Not a `for` loop to avoid holding a mutable reference to `self` across
+ // the loop body.
+ while let Some(ch_and_trie_val) = self.delegate_next() {
+ if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
+ self.pending = Some(ch_and_trie_val);
+ break;
+ } else if !trie_value_indicates_special_non_starter_decomposition(
+ ch_and_trie_val.trie_val,
+ ) {
+ self.buffer
+ .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
+ } else {
+ // Special decompositions handled by hard-coded mappings; the Tibetan special cases are starters that decompose into non-starters.
+ let mapped = match ch_and_trie_val.character {
+ '\u{0340}' => {
+ // COMBINING GRAVE TONE MARK
+ CharacterAndClass::new('\u{0300}', CCC_ABOVE)
+ }
+ '\u{0341}' => {
+ // COMBINING ACUTE TONE MARK
+ CharacterAndClass::new('\u{0301}', CCC_ABOVE)
+ }
+ '\u{0343}' => {
+ // COMBINING GREEK KORONIS
+ CharacterAndClass::new('\u{0313}', CCC_ABOVE)
+ }
+ '\u{0344}' => {
+ // COMBINING GREEK DIALYTIKA TONOS
+ self.buffer
+ .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
+ CharacterAndClass::new('\u{0301}', CCC_ABOVE)
+ }
+ '\u{0F73}' => {
+ // TIBETAN VOWEL SIGN II
+ self.buffer
+ .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
+ CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
+ }
+ '\u{0F75}' => {
+ // TIBETAN VOWEL SIGN UU
+ self.buffer
+ .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
+ CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
+ }
+ '\u{0F81}' => {
+ // TIBETAN VOWEL SIGN REVERSED II
+ self.buffer
+ .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
+ CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
+ }
+ '\u{FF9E}' => {
+ // HALFWIDTH KATAKANA VOICED SOUND MARK
+ CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
+ }
+ '\u{FF9F}' => {
+ // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
+ CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
+ }
+ _ => {
+ // GIGO case
+ debug_assert!(false);
+ CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
+ }
+ };
+ self.buffer.push(mapped);
+ }
+ }
+ // Slicing succeeds by construction; we've always ensured that `combining_start`
+ // is in permissible range.
+ #[allow(clippy::indexing_slicing)]
+ sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
+ }
+}
+
+ impl<I> Iterator for Decomposition<'_, I>
+ where
+     I: Iterator<Item = char>,
+ {
+     type Item = char;
+
+     /// Yields the next decomposed character: whatever is still buffered
+     /// comes first, then the result of decomposing the pending character.
+     /// Returns `None` once the buffer is drained and nothing is pending.
+     fn next(&mut self) -> Option<char> {
+         match self.buffer.get(self.buffer_pos).map(|c| c.character()) {
+             Some(ret) => {
+                 self.buffer_pos += 1;
+                 if self.buffer_pos == self.buffer.len() {
+                     // Everything has been read, so start over from an
+                     // empty buffer.
+                     self.buffer.clear();
+                     self.buffer_pos = 0;
+                 }
+                 Some(ret)
+             }
+             None => {
+                 debug_assert_eq!(self.buffer_pos, 0);
+                 let c_and_trie_val = self.pending.take()?;
+                 Some(self.decomposing_next(c_and_trie_val))
+             }
+         }
+     }
+ }
+
+ /// An iterator adaptor that turns an `Iterator` over `char` into
+ /// a lazily-decomposed and then canonically composed `char` sequence.
+ #[derive(Debug)]
+ pub struct Composition<'data, I>
+ where
+ I: Iterator<Item = char>,
+ {
+ /// The decomposing part of the normalizer that operates before
+ /// the canonical composition is performed on its output.
+ decomposition: Decomposition<'data, I>,
+ /// Non-Hangul canonical composition data.
+ canonical_compositions: Char16Trie<'data>,
+ /// To make `next()` yield in cases where there's a non-composing
+ /// starter in the decomposition buffer, we put it here to let it
+ /// wait for the next `next()` call (or a jump forward within the
+ /// `next()` call).
+ unprocessed_starter: Option<char>,
+ /// The lowest character for which any one of the following does
+ /// not hold:
+ /// 1. Roundtrips via decomposition and recomposition.
+ /// 2. Decomposition starts with a non-starter
+ /// 3. Is not a backward-combining starter
+ composition_passthrough_bound: u32,
+ }
+
+ impl<'data, I> Composition<'data, I>
+ where
+     I: Iterator<Item = char>,
+ {
+     /// Wraps `decomposition` into a composing iterator that uses
+     /// `canonical_compositions` and the given passthrough bound. No
+     /// starter is waiting to be processed initially.
+     fn new(
+         decomposition: Decomposition<'data, I>,
+         canonical_compositions: Char16Trie<'data>,
+         composition_passthrough_bound: u16,
+     ) -> Self {
+         // Stored widened, because the field is compared against
+         // `u32::from(char)` values.
+         let composition_passthrough_bound = u32::from(composition_passthrough_bound);
+         Self {
+             decomposition,
+             canonical_compositions,
+             unprocessed_starter: None,
+             composition_passthrough_bound,
+         }
+     }
+
+     /// Canonically composes a pair of characters (Hangul included), or
+     /// returns `None` if the two characters don't compose.
+     /// Composition exclusions are taken into account.
+     #[inline(always)]
+     pub fn compose(&self, starter: char, second: char) -> Option<char> {
+         let trie_iter = self.canonical_compositions.iter();
+         compose(trie_iter, starter, second)
+     }
+
+     /// Canonically composes a pair of characters without considering
+     /// Hangul, or returns `None` if the two characters don't compose.
+     /// Composition exclusions are taken into account.
+     #[inline(always)]
+     fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> {
+         let trie_iter = self.canonical_compositions.iter();
+         compose_non_hangul(trie_iter, starter, second)
+     }
+ }
+
+ impl<I> Iterator for Composition<'_, I>
+ where
+ I: Iterator<Item = char>,
+ {
+ type Item = char;
+
+ #[inline]
+ fn next(&mut self) -> Option<char> { // Yields the next composed character, or `None` when the buffer is empty and nothing is pending.
+ let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use.
+ if self.unprocessed_starter.is_none() {
+ // The loop is only broken out of as goto forward
+ #[allow(clippy::never_loop)]
+ loop {
+ if let Some((character, ccc)) = self
+ .decomposition
+ .buffer
+ .get(self.decomposition.buffer_pos)
+ .map(|c| c.character_and_ccc())
+ {
+ self.decomposition.buffer_pos += 1;
+ if self.decomposition.buffer_pos == self.decomposition.buffer.len() {
+ self.decomposition.buffer.clear();
+ self.decomposition.buffer_pos = 0;
+ }
+ if ccc == CCC_NOT_REORDERED {
+ // Previous decomposition contains a starter. This must
+ // now become the `unprocessed_starter` for it to have
+ // a chance to compose with the upcoming characters.
+ //
+ // E.g. parenthesized Hangul in NFKC comes through here,
+ // but suitable composition exclusion could exercise this
+ // in NFC.
+ self.unprocessed_starter = Some(character);
+ break; // We already have a starter, so skip taking one from `pending`.
+ }
+ return Some(character);
+ }
+ debug_assert_eq!(self.decomposition.buffer_pos, 0);
+ undecomposed_starter = self.decomposition.pending.take()?;
+ if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound
+ || undecomposed_starter.potential_passthrough()
+ {
+ // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming
+ // character is not below `decomposition_passthrough_bound` but is
+ // below `composition_passthrough_bound`, we read from the trie
+ // unnecessarily.
+ if let Some(upcoming) = self.decomposition.delegate_next_no_pending() {
+ let cannot_combine_backwards = u32::from(upcoming.character)
+ < self.composition_passthrough_bound
+ || !upcoming.can_combine_backwards();
+ self.decomposition.pending = Some(upcoming);
+ if cannot_combine_backwards {
+ // Fast-track succeeded!
+ return Some(undecomposed_starter.character);
+ }
+ } else {
+ // End of stream
+ return Some(undecomposed_starter.character);
+ }
+ }
+ break; // Not actually looping
+ }
+ }
+ let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use.
+
+ // The point of having this boolean is to have only one call site to
+ // `self.decomposition.decomposing_next`, which is hopefully beneficial for
+ // code size under inlining.
+ let mut attempt_composition = false;
+ loop {
+ if let Some(unprocessed) = self.unprocessed_starter.take() {
+ debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0));
+ debug_assert_eq!(starter, '\u{0}');
+ starter = unprocessed;
+ } else {
+ debug_assert_eq!(self.decomposition.buffer_pos, 0);
+ let next_starter = self.decomposition.decomposing_next(undecomposed_starter);
+ if !attempt_composition {
+ starter = next_starter;
+ } else if let Some(composed) = self.compose(starter, next_starter) {
+ starter = composed;
+ } else {
+ // This is our yield point. We'll pick this up above in the
+ // next call to `next()`.
+ self.unprocessed_starter = Some(next_starter);
+ return Some(starter);
+ }
+ }
+ // We first loop by index to avoid moving the contents of `buffer`, but
+ // if there's a discontiguous match, we'll start modifying `buffer` instead.
+ loop {
+ let (character, ccc) = if let Some((character, ccc)) = self
+ .decomposition
+ .buffer
+ .get(self.decomposition.buffer_pos)
+ .map(|c| c.character_and_ccc())
+ {
+ (character, ccc)
+ } else {
+ self.decomposition.buffer.clear();
+ self.decomposition.buffer_pos = 0;
+ break;
+ };
+ if let Some(composed) = self.compose(starter, character) {
+ starter = composed;
+ self.decomposition.buffer_pos += 1;
+ continue;
+ }
+ let mut most_recent_skipped_ccc = ccc;
+ {
+ let _ = self
+ .decomposition
+ .buffer
+ .drain(0..self.decomposition.buffer_pos);
+ }
+ self.decomposition.buffer_pos = 0;
+ if most_recent_skipped_ccc == CCC_NOT_REORDERED {
+ // We failed to compose a starter. Discontiguous match not allowed.
+ // We leave the starter in `buffer` for `next()` to find.
+ return Some(starter);
+ }
+ let mut i = 1; // We have skipped one non-starter.
+ while let Some((character, ccc)) = self
+ .decomposition
+ .buffer
+ .get(i)
+ .map(|c| c.character_and_ccc())
+ {
+ if ccc == CCC_NOT_REORDERED {
+ // Discontiguous match not allowed.
+ return Some(starter);
+ }
+ debug_assert!(ccc >= most_recent_skipped_ccc);
+ if ccc != most_recent_skipped_ccc {
+ // Using the non-Hangul version as a micro-optimization, since
+ // we already rejected the case where `second` is a starter
+ // above, and conjoining jamo are starters.
+ if let Some(composed) = self.compose_non_hangul(starter, character) {
+ self.decomposition.buffer.remove(i);
+ starter = composed;
+ continue;
+ }
+ }
+ most_recent_skipped_ccc = ccc;
+ i += 1;
+ }
+ break;
+ }
+
+ debug_assert_eq!(self.decomposition.buffer_pos, 0);
+
+ if !self.decomposition.buffer.is_empty() {
+ return Some(starter);
+ }
+ // Now we need to check if composition with an upcoming starter is possible.
+ #[allow(clippy::unwrap_used)]
+ if self.decomposition.pending.is_some() {
+ // We know that `pending` decomposes to start with a starter.
+ // Otherwise, it would have been moved to `self.decomposition.buffer`
+ // by `self.decomposition.decomposing_next()`. We do this set lookup here in order
+ // to get an opportunity to go back to the fast track.
+ // Note that this check has to happen _after_ checking that `pending`
+ // holds a character, because this flag isn't defined to be meaningful
+ // when `pending` isn't holding a character.
+ let pending = self.decomposition.pending.as_ref().unwrap();
+ if u32::from(pending.character) < self.composition_passthrough_bound
+ || !pending.can_combine_backwards()
+ {
+ // Won't combine backwards anyway.
+ return Some(starter);
+ }
+ // Consume what we peeked. `unwrap` OK, because we checked `is_some()`
+ // above.
+ undecomposed_starter = self.decomposition.pending.take().unwrap();
+ // The following line is OK, because we're about to loop back
+ // to `self.decomposition.decomposing_next(undecomposed_starter);`, which will
+ // restore the between-`next()`-calls invariant of `pending`
+ // before this function returns.
+ attempt_composition = true;
+ continue;
+ }
+ // End of input
+ return Some(starter);
+ }
+ }
+ }
+
+ macro_rules! composing_normalize_to { // Generates a `pub fn $normalize_to` that writes the composed normalization of `$text` to `$sink`; `$fast` is the caller-supplied fast-track block.
+ ($(#[$meta:meta])*,
+ $normalize_to:ident,
+ $write:path,
+ $slice:ty,
+ $prolog:block,
+ $always_valid_utf:literal,
+ $as_slice:ident,
+ $fast:block,
+ $text:ident,
+ $sink:ident,
+ $composition:ident,
+ $composition_passthrough_bound:ident,
+ $undecomposed_starter:ident,
+ $pending_slice:ident,
+ $len_utf:ident,
+ ) => {
+ $(#[$meta])*
+ pub fn $normalize_to<W: $write + ?Sized>(
+ &self,
+ $text: $slice,
+ $sink: &mut W,
+ ) -> core::fmt::Result {
+ $prolog
+ let mut $composition = self.normalize_iter($text.chars());
+ debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
+ for cc in $composition.decomposition.buffer.drain(..) {
+ $sink.write_char(cc.character())?;
+ }
+
+ // Try to get the compiler to hoist the bound to a register.
+ let $composition_passthrough_bound = $composition.composition_passthrough_bound;
+ 'outer: loop {
+ debug_assert_eq!($composition.decomposition.buffer_pos, 0);
+ let mut $undecomposed_starter =
+ if let Some(pending) = $composition.decomposition.pending.take() {
+ pending
+ } else {
+ return Ok(());
+ };
+ // Allowing indexed slicing, because a failure would be a code bug and
+ // not a data issue.
+ #[allow(clippy::indexing_slicing)]
+ if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
+ $undecomposed_starter.potential_passthrough()
+ {
+ // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
+ // was returned in response to an error by the iterator. Assume the
+ // latter for correctness even though it pessimizes the former.
+ if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
+ let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
+ // The `$fast` block must either:
+ // 1. Return due to reaching EOF
+ // 2. Leave a starter with its trie value in `$undecomposed_starter`
+ // and, if there is still more input, leave the next character
+ // and its trie value in `$composition.decomposition.pending`.
+ $fast
+ }
+ }
+ // Fast track above, full algorithm below
+ let mut starter = $composition
+ .decomposition
+ .decomposing_next($undecomposed_starter);
+ 'bufferloop: loop {
+ // We first loop by index to avoid moving the contents of `buffer`, but
+ // if there's a discontiguous match, we'll start modifying `buffer` instead.
+ loop {
+ let (character, ccc) = if let Some((character, ccc)) = $composition
+ .decomposition
+ .buffer
+ .get($composition.decomposition.buffer_pos)
+ .map(|c| c.character_and_ccc())
+ {
+ (character, ccc)
+ } else {
+ $composition.decomposition.buffer.clear();
+ $composition.decomposition.buffer_pos = 0;
+ break;
+ };
+ if let Some(composed) = $composition.compose(starter, character) {
+ starter = composed;
+ $composition.decomposition.buffer_pos += 1;
+ continue;
+ }
+ let mut most_recent_skipped_ccc = ccc;
+ if most_recent_skipped_ccc == CCC_NOT_REORDERED {
+ // We failed to compose a starter. Discontiguous match not allowed.
+ // Write the current `starter` we've been composing, make the unmatched
+ // starter in the buffer the new `starter` (we know it's been decomposed)
+ // and process the rest of the buffer with that as the starter.
+ $sink.write_char(starter)?;
+ starter = character;
+ $composition.decomposition.buffer_pos += 1;
+ continue 'bufferloop;
+ } else {
+ {
+ let _ = $composition
+ .decomposition
+ .buffer
+ .drain(0..$composition.decomposition.buffer_pos);
+ }
+ $composition.decomposition.buffer_pos = 0;
+ }
+ let mut i = 1; // We have skipped one non-starter.
+ while let Some((character, ccc)) = $composition
+ .decomposition
+ .buffer
+ .get(i)
+ .map(|c| c.character_and_ccc())
+ {
+ if ccc == CCC_NOT_REORDERED {
+ // Discontiguous match not allowed.
+ $sink.write_char(starter)?;
+ for cc in $composition.decomposition.buffer.drain(..i) {
+ $sink.write_char(cc.character())?;
+ }
+ starter = character;
+ {
+ let removed = $composition.decomposition.buffer.remove(0);
+ debug_assert_eq!(starter, removed.character());
+ }
+ debug_assert_eq!($composition.decomposition.buffer_pos, 0);
+ continue 'bufferloop;
+ }
+ debug_assert!(ccc >= most_recent_skipped_ccc);
+ if ccc != most_recent_skipped_ccc {
+ // Using the non-Hangul version as a micro-optimization, since
+ // we already rejected the case where `second` is a starter
+ // above, and conjoining jamo are starters.
+ if let Some(composed) =
+ $composition.compose_non_hangul(starter, character)
+ {
+ $composition.decomposition.buffer.remove(i);
+ starter = composed;
+ continue;
+ }
+ }
+ most_recent_skipped_ccc = ccc;
+ i += 1;
+ }
+ break;
+ }
+ debug_assert_eq!($composition.decomposition.buffer_pos, 0);
+
+ if !$composition.decomposition.buffer.is_empty() {
+ $sink.write_char(starter)?;
+ for cc in $composition.decomposition.buffer.drain(..) {
+ $sink.write_char(cc.character())?;
+ }
+ // We had non-empty buffer, so can't compose with upcoming.
+ continue 'outer;
+ }
+ // Now we need to check if composition with an upcoming starter is possible.
+ if $composition.decomposition.pending.is_some() {
+ // We know that `pending` decomposes to start with a starter.
+ // Otherwise, it would have been moved to `$composition.decomposition.buffer`
+ // by `$composition.decomposition.decomposing_next()`. We do this set lookup here in order
+ // to get an opportunity to go back to the fast track.
+ // Note that this check has to happen _after_ checking that `pending`
+ // holds a character, because this flag isn't defined to be meaningful
+ // when `pending` isn't holding a character.
+ let pending = $composition.decomposition.pending.as_ref().unwrap();
+ if u32::from(pending.character) < $composition.composition_passthrough_bound
+ || !pending.can_combine_backwards()
+ {
+ // Won't combine backwards anyway.
+ $sink.write_char(starter)?;
+ continue 'outer;
+ }
+ let pending_starter = $composition.decomposition.pending.take().unwrap();
+ let decomposed = $composition.decomposition.decomposing_next(pending_starter);
+ if let Some(composed) = $composition.compose(starter, decomposed) {
+ starter = composed;
+ } else {
+ $sink.write_char(starter)?;
+ starter = decomposed;
+ }
+ continue 'bufferloop;
+ }
+ // End of input
+ $sink.write_char(starter)?;
+ return Ok(());
+ } // 'bufferloop
+ }
+ }
+ };
+ }
+
+ macro_rules! decomposing_normalize_to { // Generates a `pub fn $normalize_to` that writes the decomposed normalization of `$text` to `$sink`; `$fast` is the caller-supplied fast-track block.
+ ($(#[$meta:meta])*,
+ $normalize_to:ident,
+ $write:path,
+ $slice:ty,
+ $prolog:block,
+ $as_slice:ident,
+ $fast:block,
+ $text:ident,
+ $sink:ident,
+ $decomposition:ident,
+ $decomposition_passthrough_bound:ident,
+ $undecomposed_starter:ident,
+ $pending_slice:ident,
+ $outer:lifetime, // loop labels use lifetime tokens
+ ) => {
+ $(#[$meta])*
+ pub fn $normalize_to<W: $write + ?Sized>(
+ &self,
+ $text: $slice,
+ $sink: &mut W,
+ ) -> core::fmt::Result {
+ $prolog
+
+ let mut $decomposition = self.normalize_iter($text.chars());
+ debug_assert_eq!($decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
+
+ // Try to get the compiler to hoist the bound to a register.
+ let $decomposition_passthrough_bound = $decomposition.decomposition_passthrough_bound;
+ $outer: loop {
+ for cc in $decomposition.buffer.drain(..) { // Flush whatever the previous round left in the buffer
+ $sink.write_char(cc.character())?;
+ }
+ debug_assert_eq!($decomposition.buffer_pos, 0);
+ let mut $undecomposed_starter = if let Some(pending) = $decomposition.pending.take() {
+ pending
+ } else {
+ return Ok(()); // Nothing pending: end of input
+ };
+ // Allowing indexed slicing, because a failure would be a code bug and
+ // not a data issue.
+ #[allow(clippy::indexing_slicing)]
+ if $undecomposed_starter.starter_and_decomposes_to_self() {
+ // Don't bother including `undecomposed_starter` in a contiguous buffer
+ // write: Just write it right away:
+ $sink.write_char($undecomposed_starter.character)?;
+
+ let $pending_slice = $decomposition.delegate.$as_slice();
+ $fast
+ }
+ let starter = $decomposition.decomposing_next($undecomposed_starter);
+ $sink.write_char(starter)?;
+ }
+ }
+ };
+ }
+
+macro_rules! normalizer_methods {
+ () => {
+ /// Normalize a string slice into a `Cow<'a, str>`.
+ pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
+ let (head, tail) = self.split_normalized(text);
+ if tail.is_empty() {
+ return Cow::Borrowed(head);
+ }
+ let mut ret = String::new();
+ ret.reserve(text.len());
+ ret.push_str(head);
+ let _ = self.normalize_to(tail, &mut ret);
+ Cow::Owned(ret)
+ }
+
+ /// Split a string slice into maximum normalized prefix and unnormalized suffix
+ /// such that the concatenation of the prefix and the normalization of the suffix
+ /// is the normalization of the whole input.
+ pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
+ let up_to = self.is_normalized_up_to(text);
+ text.split_at_checked(up_to).unwrap_or_else(|| {
+ // Internal bug, not even GIGO, never supposed to happen
+ debug_assert!(false);
+ ("", text)
+ })
+ }
+
+ /// Return the index a string slice is normalized up to.
+ fn is_normalized_up_to(&self, text: &str) -> usize {
+ let mut sink = IsNormalizedSinkStr::new(text);
+ let _ = self.normalize_to(text, &mut sink);
+ text.len() - sink.remaining_len()
+ }
+
+ /// Check whether a string slice is normalized.
+ pub fn is_normalized(&self, text: &str) -> bool {
+ self.is_normalized_up_to(text) == text.len()
+ }
+
+ /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
+ ///
+ /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
+ /// before normalizing.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
+ let (head, tail) = self.split_normalized_utf16(text);
+ if tail.is_empty() {
+ return Cow::Borrowed(head);
+ }
+ let mut ret = alloc::vec::Vec::with_capacity(text.len());
+ ret.extend_from_slice(head);
+ let _ = self.normalize_utf16_to(tail, &mut ret);
+ Cow::Owned(ret)
+ }
+
+ /// Divides a slice of potentially-invalid UTF-16 into the longest normalized
+ /// (and valid) prefix and the remaining suffix. Concatenating the prefix with
+ /// the normalization of the suffix yields the normalization of the whole input.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
+     let boundary = self.is_normalized_utf16_up_to(text);
+     let Some(parts) = text.split_at_checked(boundary) else {
+         // Internal bug, not even GIGO, never supposed to happen
+         debug_assert!(false);
+         return (&[], text);
+     };
+     parts
+ }
+
+ /// Returns the code unit index up to which a slice of potentially-invalid
+ /// UTF-16 is already normalized.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
+     let total_len = text.len();
+     let mut probe = IsNormalizedSinkUtf16::new(text);
+     // Only the position the sink ends up at matters, so the result of the
+     // normalization call itself is ignored.
+     let _ = self.normalize_utf16_to(text, &mut probe);
+     total_len - probe.remaining_len()
+ }
+
+ /// Returns `true` if the whole slice of potentially-invalid UTF-16 is
+ /// already normalized.
+ ///
+ /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
+     let normalized_len = self.is_normalized_utf16_up_to(text);
+     normalized_len == text.len()
+ }
+
+ /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
+ ///
+ /// When the whole input is already normalized (and therefore valid UTF-8),
+ /// it is returned borrowed; otherwise a new `String` is allocated.
+ ///
+ /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
+ /// according to the WHATWG Encoding Standard.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
+     let (head, tail) = self.split_normalized_utf8(text);
+     if tail.is_empty() {
+         return Cow::Borrowed(head);
+     }
+     // Allocate the buffer in one step, mirroring `normalize_utf16`, which
+     // uses `Vec::with_capacity` for the same purpose.
+     let mut ret = String::with_capacity(text.len());
+     ret.push_str(head);
+     // The result is deliberately discarded: the sink is a `String`, whose
+     // `core::fmt::Write` implementation does not report errors.
+     let _ = self.normalize_utf8_to(tail, &mut ret);
+     Cow::Owned(ret)
+ }
+
+ /// Divides a slice of potentially-invalid UTF-8 into the longest normalized
+ /// (and valid) prefix and the remaining suffix. Concatenating the prefix with
+ /// the normalization of the suffix yields the normalization of the whole input.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
+     let boundary = self.is_normalized_utf8_up_to(text);
+     let (valid_prefix, rest): (&[u8], &[u8]) = match text.split_at_checked(boundary) {
+         Some(parts) => parts,
+         None => {
+             // Internal bug, not even GIGO, never supposed to happen
+             debug_assert!(false);
+             (&[], text)
+         }
+     };
+     // SAFETY: The normalization check also checks for
+     // UTF-8 well-formedness.
+     let prefix_str = unsafe { core::str::from_utf8_unchecked(valid_prefix) };
+     (prefix_str, rest)
+ }
+
+ /// Returns the byte index up to which a slice of potentially-invalid UTF-8
+ /// is already normalized.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
+     let total_len = text.len();
+     let mut probe = IsNormalizedSinkUtf8::new(text);
+     // Only the position the sink ends up at matters, so the result of the
+     // normalization call itself is ignored.
+     let _ = self.normalize_utf8_to(text, &mut probe);
+     total_len - probe.remaining_len()
+ }
+
+ /// Returns `true` if the whole slice of potentially-invalid UTF-8 is
+ /// already normalized.
+ ///
+ /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
+ /// according to the WHATWG Encoding Standard before checking.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
+     let normalized_len = self.is_normalized_utf8_up_to(text);
+     normalized_len == text.len()
+ }
+ };
+}
+
+ /// Borrowed version of a normalizer for performing decomposing normalization; it holds references to the normalization data rather than owned payloads.
+ #[derive(Debug)]
+ pub struct DecomposingNormalizerBorrowed<'a> {
+ decompositions: &'a DecompositionData<'a>, // flavor-specific decomposition data (NFD, NFKD or UTS 46 in the constructors of this file)
+ tables: &'a DecompositionTables<'a>, // the compiled-data constructors of this file always pass the NFD tables here
+ supplementary_tables: Option<&'a DecompositionTables<'a>>, // `None` for NFD; the NFKD tables for NFKD and UTS 46
+ decomposition_passthrough_bound: u8, // never above 0xC0; handed to `Decomposition::new_with_supplements` by `normalize_iter`
+ composition_passthrough_bound: u16, // never above 0x0300; read by `ComposingNormalizerBorrowed` when building a `Composition`
+ }
+
+impl DecomposingNormalizerBorrowed<'static> {
+ /// Cheaply converts a [`DecomposingNormalizerBorrowed<'static>`] into a [`DecomposingNormalizer`]
+ /// by wrapping each `'static` reference in a [`DataPayload`].
+ ///
+ /// Note: Due to branching and indirection, using [`DecomposingNormalizer`] might inhibit some
+ /// compile-time optimizations that are possible with [`DecomposingNormalizerBorrowed`].
+ pub const fn static_to_owned(self) -> DecomposingNormalizer {
+     // `Option::map` is not available in const context, hence the explicit `match`.
+     let supplementary_tables = match self.supplementary_tables {
+         Some(supplement) => Some(DataPayload::from_static_ref(supplement)),
+         None => None,
+     };
+     DecomposingNormalizer {
+         decompositions: DataPayload::from_static_ref(self.decompositions),
+         tables: DataPayload::from_static_ref(self.tables),
+         supplementary_tables,
+         decomposition_passthrough_bound: self.decomposition_passthrough_bound,
+         composition_passthrough_bound: self.composition_passthrough_bound,
+     }
+ }
+
+ /// Constructs an NFD normalizer backed by compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfd() -> Self {
+     // Compile-time check that the combined length of the complex
+     // decomposition tables does not exceed 0xFFF.
+     const _: () = {
+         let nfd_tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
+         assert!(
+             nfd_tables.scalars16.const_len() + nfd_tables.scalars24.const_len() <= 0xFFF,
+             "future extension"
+         );
+     };
+
+     Self {
+         decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
+         tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
+         supplementary_tables: None,
+         decomposition_passthrough_bound: 0xC0,
+         composition_passthrough_bound: 0x0300,
+     }
+ }
+
+ /// Constructs an NFKD normalizer backed by compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfkd() -> Self {
+     // Compile-time check that the combined length of the NFD and NFKD
+     // complex decomposition tables does not exceed 0xFFF.
+     const _: () = {
+         let nfd_tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
+         let nfkd_tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1;
+         assert!(
+             nfd_tables.scalars16.const_len()
+                 + nfd_tables.scalars24.const_len()
+                 + nfkd_tables.scalars16.const_len()
+                 + nfkd_tables.scalars24.const_len()
+                 <= 0xFFF,
+             "future extension"
+         );
+     };
+
+     const _: () = assert!(
+         crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300,
+         "invalid"
+     );
+
+     // Clamp the data-provided cap to each bound's maximum; `min` is spelled
+     // out with `if` because this is a `const fn`.
+     let cap = crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap;
+     let decomposition_capped = if cap < 0xC0 { cap } else { 0xC0 };
+     let composition_capped = if cap < 0x0300 { cap } else { 0x0300 };
+
+     Self {
+         decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1,
+         tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
+         supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
+         decomposition_passthrough_bound: decomposition_capped as u8,
+         composition_passthrough_bound: composition_capped,
+     }
+ }
+
+ #[cfg(feature = "compiled_data")]
+ pub(crate) const fn new_uts46_decomposed() -> Self {
+     // Compile-time check that the combined length of the NFD and NFKD
+     // complex decomposition tables does not exceed 0xFFF.
+     const _: () = {
+         let nfd_tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1;
+         let nfkd_tables = crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1;
+         assert!(
+             nfd_tables.scalars16.const_len()
+                 + nfd_tables.scalars24.const_len()
+                 + nfkd_tables.scalars16.const_len()
+                 + nfkd_tables.scalars24.const_len()
+                 <= 0xFFF,
+             "future extension"
+         );
+     };
+
+     const _: () = assert!(
+         crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300,
+         "invalid"
+     );
+
+     // Clamp the data-provided cap to each bound's maximum; `min` is spelled
+     // out with `if` because this is a `const fn`.
+     let cap = crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap;
+     let decomposition_capped = if cap < 0xC0 { cap } else { 0xC0 };
+     let composition_capped = if cap < 0x0300 { cap } else { 0x0300 };
+
+     Self {
+         decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1,
+         tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
+         supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1),
+         decomposition_passthrough_bound: decomposition_capped as u8,
+         composition_passthrough_bound: composition_capped,
+     }
+ }
+}
+
+impl<'data> DecomposingNormalizerBorrowed<'data> {
+ /// Adapts `iter` into a decomposing iterator that uses the data
+ /// already held by this normalizer.
+ pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> {
+     let passthrough_bound = self.decomposition_passthrough_bound;
+     let supplements = self.supplementary_tables;
+     Decomposition::new_with_supplements(
+         iter,
+         self.decompositions,
+         self.tables,
+         supplements,
+         passthrough_bound,
+         IgnorableBehavior::Unsupported,
+     )
+ }
+
+ normalizer_methods!();
+
+ decomposing_normalize_to!(
+ /// Normalize a string slice into a `Write` sink. Stretches of input that need no change are written to the sink as whole slices.
+ ,
+ normalize_to,
+ core::fmt::Write,
+ &str,
+ {
+ },
+ as_str,
+ {
+ let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 {
+ 0xC3u8 // 0xC3 is the UTF-8 lead byte of U+00C0, so every byte of a scalar value below U+00C0 compares lower
+ } else {
+ decomposition_passthrough_bound.min(0x80) as u8 // otherwise the byte-wise skip is limited to (at most) the ASCII range
+ };
+ // The attribute belongs on an inner statement, but Rust doesn't allow it there.
+ #[allow(clippy::unwrap_used)]
+ 'fast: loop {
+ let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter(); // scan the not-yet-consumed input byte by byte
+ 'fastest: loop {
+ if let Some(&upcoming_byte) = code_unit_iter.next() {
+ if upcoming_byte < decomposition_passthrough_byte_bound {
+ // Fast-track succeeded!
+ continue 'fastest;
+ }
+ decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); // re-anchor the char iterator on the byte that failed the test; the `- 1` undoes its consumption
+ break 'fastest;
+ }
+ // End of stream
+ sink.write_str(pending_slice)?; // everything still pending passed through unchanged
+ return Ok(());
+ }
+
+ // `unwrap()` OK, because the slice is valid UTF-8 and we know there
+ // is an upcoming byte.
+ let upcoming = decomposition.delegate.next().unwrap();
+ let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
+ if upcoming_with_trie_value.starter_and_decomposes_to_self() {
+ continue 'fast; // this character needs no change either: resume the byte scan after it
+ }
+ let consumed_so_far_slice = &pending_slice[..pending_slice.len()
+ - decomposition.delegate.as_str().len()
+ - upcoming.len_utf8()]; // the pending input up to, but excluding, `upcoming`
+ sink.write_str(consumed_so_far_slice)?;
+
+ // Now let's figure out if we got a starter or a non-starter.
+ if decomposition_starts_with_non_starter(
+ upcoming_with_trie_value.trie_val,
+ ) {
+ // Let this trie value be reprocessed in case it is
+ // one of the rare decomposing ones.
+ decomposition.pending = Some(upcoming_with_trie_value);
+ decomposition.gather_and_sort_combining(0);
+ continue 'outer;
+ }
+ undecomposed_starter = upcoming_with_trie_value; // leave the fast path with this starter still to be decomposed
+ debug_assert!(decomposition.pending.is_none());
+ break 'fast;
+ }
+ },
+ text,
+ sink,
+ decomposition,
+ decomposition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ 'outer,
+ );
+
+ decomposing_normalize_to!(
+ /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
+ ///
+ /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
+ /// according to the WHATWG Encoding Standard.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ ,
+ normalize_utf8_to,
+ core::fmt::Write,
+ &[u8],
+ {
+ },
+ as_slice,
+ {
+ let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8; // bytes are only skipped while they are ASCII; anything else is decoded below
+ // The attribute belongs on an inner statement, but Rust doesn't allow it there.
+ #[allow(clippy::unwrap_used)]
+ 'fast: loop {
+ let mut code_unit_iter = decomposition.delegate.as_slice().iter(); // scan the not-yet-consumed input byte by byte
+ 'fastest: loop {
+ if let Some(&upcoming_byte) = code_unit_iter.next() {
+ if upcoming_byte < decomposition_passthrough_byte_bound {
+ // Fast-track succeeded!
+ continue 'fastest;
+ }
+ break 'fastest;
+ }
+ // End of stream
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?; // NOTE(review): soundness relies on `pending_slice` holding only bytes accepted by this fast path (ASCII or characters decoded without yielding U+FFFD) — confirm against the macro definition
+ return Ok(());
+ }
+ decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); // re-anchor the char iterator on the byte that failed the test; the `- 1` undoes its consumption
+
+ // `unwrap()` OK, because the slice is valid UTF-8 and we know there
+ // is an upcoming byte.
+ let upcoming = decomposition.delegate.next().unwrap();
+ let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming);
+ if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() {
+ // Note: The trie value of the REPLACEMENT CHARACTER is
+ // intentionally formatted to fail the
+ // `starter_and_decomposes_to_self` test even though it
+ // really is a starter that decomposes to self. This
+ // allows moving the branch on REPLACEMENT CHARACTER
+ // below this `continue`.
+ continue 'fast;
+ }
+
+ // TODO: Annotate as unlikely.
+ if upcoming == REPLACEMENT_CHARACTER {
+ // We might have an error, so fall out of the fast path.
+
+ // Since the U+FFFD might signify an error, we can't
+ // assume `upcoming.len_utf8()` for the backoff length.
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars();
+ let back = consumed_so_far.next_back(); // drop the trailing U+FFFD by decoding backwards instead of subtracting a fixed length
+ debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
+ let consumed_so_far_slice = consumed_so_far.as_slice();
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?; // NOTE(review): same soundness assumption as the end-of-stream write above — confirm
+
+ // We could call `gather_and_sort_combining` here and
+ // `continue 'outer`, but this should be better for code
+ // size.
+ undecomposed_starter = upcoming_with_trie_value;
+ debug_assert!(decomposition.pending.is_none());
+ break 'fast;
+ }
+
+ let consumed_so_far_slice = &pending_slice[..pending_slice.len()
+ - decomposition.delegate.as_slice().len()
+ - upcoming.len_utf8()]; // the pending input up to, but excluding, `upcoming`
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?; // NOTE(review): same soundness assumption as the end-of-stream write above — confirm
+
+ // Now let's figure out if we got a starter or a non-starter.
+ if decomposition_starts_with_non_starter(
+ upcoming_with_trie_value.trie_val,
+ ) {
+ // Let this trie value be reprocessed in case it is
+ // one of the rare decomposing ones.
+ decomposition.pending = Some(upcoming_with_trie_value);
+ decomposition.gather_and_sort_combining(0);
+ continue 'outer;
+ }
+ undecomposed_starter = upcoming_with_trie_value; // leave the fast path with this starter still to be decomposed
+ debug_assert!(decomposition.pending.is_none());
+ break 'fast;
+ }
+ },
+ text,
+ sink,
+ decomposition,
+ decomposition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ 'outer,
+ );
+
+ decomposing_normalize_to!(
+ /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
+ ///
+ /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
+ /// before normalizing.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ ,
+ normalize_utf16_to,
+ write16::Write16,
+ &[u16],
+ {
+ sink.size_hint(text.len())?; // passes the input length to the sink up front; presumably lets it pre-allocate — confirm against `write16`
+ },
+ as_slice,
+ {
+ let mut code_unit_iter = decomposition.delegate.as_slice().iter(); // scan the not-yet-consumed input code unit by code unit
+ 'fast: loop {
+ if let Some(&upcoming_code_unit) = code_unit_iter.next() {
+ let mut upcoming32 = u32::from(upcoming_code_unit);
+ if upcoming32 < decomposition_passthrough_bound {
+ continue 'fast;
+ }
+ // We might be doing a trie lookup by surrogate. Surrogates get
+ // a decomposition to U+FFFD.
+ let mut trie_value = decomposition.trie.get32(upcoming32);
+ if starter_and_decomposes_to_self_impl(trie_value) {
+ continue 'fast;
+ }
+ // We might now be looking at a surrogate.
+ // The loop is only broken out of as goto forward
+ #[allow(clippy::never_loop)]
+ 'surrogateloop: loop {
+ let surrogate_base = upcoming32.wrapping_sub(0xD800); // wrapping subtraction lets one unsigned comparison test for 0xD800..=0xDFFF
+ if surrogate_base > (0xDFFF - 0xD800) {
+ // Not surrogate
+ break 'surrogateloop;
+ }
+ if surrogate_base <= (0xDBFF - 0xD800) { // high (lead) surrogate: try to pair it with the next code unit
+ let iter_backup = code_unit_iter.clone();
+ if let Some(&low) = code_unit_iter.next() {
+ if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
+ upcoming32 = (upcoming32 << 10) + u32::from(low)
+ - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); // equals 0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00)
+ // Successfully-paired surrogate. Read from the trie again.
+ trie_value = decomposition.trie.get32(upcoming32);
+ if starter_and_decomposes_to_self_impl(trie_value) {
+ continue 'fast;
+ }
+ break 'surrogateloop;
+ } else {
+ code_unit_iter = iter_backup; // not a low surrogate: un-read it so it is examined on its own
+ }
+ }
+ }
+ // unpaired surrogate
+ upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
+ // trie_value already holds a decomposition to U+FFFD.
+ break 'surrogateloop;
+ }
+
+ let upcoming = unsafe { char::from_u32_unchecked(upcoming32) }; // SAFETY: at this point `upcoming32` is a non-surrogate code unit, a combined surrogate pair, or 0xFFFD
+ let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
+
+ let consumed_so_far_slice = &pending_slice[..pending_slice.len()
+ - code_unit_iter.as_slice().len()
+ - upcoming.len_utf16()]; // the pending input up to, but excluding, the code unit(s) of `upcoming`
+ sink.write_slice(consumed_so_far_slice)?;
+
+ // Now let's figure out if we got a starter or a non-starter.
+ if decomposition_starts_with_non_starter(
+ upcoming_with_trie_value.trie_val,
+ ) {
+ // Sync with main iterator
+ decomposition.delegate = code_unit_iter.as_slice().chars();
+ // Let this trie value be reprocessed in case it is
+ // one of the rare decomposing ones.
+ decomposition.pending = Some(upcoming_with_trie_value);
+ decomposition.gather_and_sort_combining(0);
+ continue 'outer;
+ }
+ undecomposed_starter = upcoming_with_trie_value; // leave the fast path with this starter still to be decomposed
+ debug_assert!(decomposition.pending.is_none());
+ break 'fast;
+ }
+ // End of stream
+ sink.write_slice(pending_slice)?; // everything still pending passed through unchanged
+ return Ok(());
+ }
+ // Sync the main iterator
+ decomposition.delegate = code_unit_iter.as_slice().chars();
+ },
+ text,
+ sink,
+ decomposition,
+ decomposition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ 'outer,
+ );
+}
+
+ /// A normalizer for performing decomposing normalization; it owns its data as `DataPayload`s, and `as_borrowed` yields the borrowed form.
+ #[derive(Debug)]
+ pub struct DecomposingNormalizer {
+ decompositions: DataPayload<NormalizerNfdDataV1>, // the NFKD and UTS 46 constructors store their payloads here via `cast()`
+ tables: DataPayload<NormalizerNfdTablesV1>,
+ supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>, // `None` for NFD; `Some` for NFKD and UTS 46
+ decomposition_passthrough_bound: u8, // never above 0xC0
+ composition_passthrough_bound: u16, // never above 0x0300
+ }
+
+impl DecomposingNormalizer {
+ /// Constructs a borrowed version of this type for more efficient querying.
+ ///
+ /// The returned value holds references into the payloads owned by `self`.
+ /// The lifetime is written as `'_` so that the borrow is visible in the
+ /// signature; the type is the same as with the lifetime left out.
+ pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed<'_> {
+     DecomposingNormalizerBorrowed {
+         decompositions: self.decompositions.get(),
+         tables: self.tables.get(),
+         // `as_ref()` borrows the optional payload instead of consuming it.
+         supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()),
+         decomposition_passthrough_bound: self.decomposition_passthrough_bound,
+         composition_passthrough_bound: self.composition_passthrough_bound,
+     }
+ }
+
+ /// Constructs an NFD normalizer backed by compiled data; this forwards to
+ /// [`DecomposingNormalizerBorrowed::new_nfd`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> {
+     DecomposingNormalizerBorrowed::<'static>::new_nfd()
+ }
+
+ icu_provider::gen_buffer_data_constructors!( // NOTE(review): presumably generates `try_new_nfd_with_buffer_provider` on top of `try_new_nfd_unstable` — confirm against the macro in `icu_provider`
+ () -> error: DataError,
+ functions: [
+ new_nfd: skip,
+ try_new_nfd_with_buffer_provider,
+ try_new_nfd_unstable,
+ Self,
+ ]
+ );
+
+ #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)]
+ pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError>
+ where
+     D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized,
+ {
+     let decompositions: DataPayload<NormalizerNfdDataV1> =
+         provider.load(Default::default())?.payload;
+     let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
+
+     let complex_len = {
+         let loaded = tables.get();
+         loaded.scalars16.len() + loaded.scalars24.len()
+     };
+     if complex_len > 0xFFF {
+         // The data is from a future where there exists a normalization flavor whose
+         // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
+         // of space. If a good use case from such a decomposition flavor arises, we can
+         // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
+         // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
+         // since for now the masks are hard-coded, error out.
+         return Err(
+             DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
+         );
+     }
+
+     let cap = decompositions.get().passthrough_cap;
+     if cap > 0x0300 {
+         return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO));
+     }
+
+     Ok(Self {
+         decompositions,
+         tables,
+         supplementary_tables: None,
+         decomposition_passthrough_bound: cap.min(0xC0) as u8,
+         composition_passthrough_bound: cap.min(0x0300),
+     })
+ }
+
+ icu_provider::gen_buffer_data_constructors!( // NOTE(review): presumably generates `try_new_nfkd_with_buffer_provider` on top of `try_new_nfkd_unstable` — confirm against the macro in `icu_provider`
+ () -> error: DataError,
+ functions: [
+ new_nfkd: skip,
+ try_new_nfkd_with_buffer_provider,
+ try_new_nfkd_unstable,
+ Self,
+ ]
+ );
+
+ /// Constructs an NFKD normalizer backed by compiled data; this forwards to
+ /// [`DecomposingNormalizerBorrowed::new_nfkd`].
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> {
+     DecomposingNormalizerBorrowed::<'static>::new_nfkd()
+ }
+
+ #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)]
+ pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError>
+ where
+     D: DataProvider<NormalizerNfkdDataV1>
+         + DataProvider<NormalizerNfdTablesV1>
+         + DataProvider<NormalizerNfkdTablesV1>
+         + ?Sized,
+ {
+     let decompositions: DataPayload<NormalizerNfkdDataV1> =
+         provider.load(Default::default())?.payload;
+     let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
+     let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
+         provider.load(Default::default())?.payload;
+
+     let complex_len = {
+         let base = tables.get();
+         let supplement = supplementary_tables.get();
+         base.scalars16.len()
+             + base.scalars24.len()
+             + supplement.scalars16.len()
+             + supplement.scalars24.len()
+     };
+     if complex_len > 0xFFF {
+         // The data is from a future where there exists a normalization flavor whose
+         // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
+         // of space. If a good use case from such a decomposition flavor arises, we can
+         // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
+         // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
+         // since for now the masks are hard-coded, error out.
+         return Err(
+             DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
+         );
+     }
+
+     let cap = decompositions.get().passthrough_cap;
+     if cap > 0x0300 {
+         return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO));
+     }
+
+     Ok(Self {
+         decompositions: decompositions.cast(),
+         tables,
+         supplementary_tables: Some(supplementary_tables),
+         decomposition_passthrough_bound: cap.min(0xC0) as u8,
+         composition_passthrough_bound: cap.min(0x0300),
+     })
+ }
+
+ /// UTS 46 decomposed constructor (testing only)
+ ///
+ /// This is a special building block normalization for IDNA. It is the decomposed counterpart
+ /// of ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows
+ /// (which ICU4C maps to U+FFFD) and characters that UTS 46 maps to the empty string both
+ /// normalize as in NFD here. In both cases, the UTS 46 processing that runs before
+ /// normalization is expected to deal with these characters. Making the disallowed characters
+ /// behave like this is beneficial to data size, and this normalizer implementation cannot
+ /// deal with a character normalizing to the empty string, which doesn't happen in NFD or
+ /// NFKD as of Unicode 14.
+ ///
+ /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
+ /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
+ /// U+0345 from a reordered character into a non-reordered character before reordering happens.
+ /// Therefore, the output of this normalization may differ for different inputs that are
+ /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
+ /// to other reorderable characters.
+ pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError>
+ where
+     D: DataProvider<NormalizerUts46DataV1>
+         + DataProvider<NormalizerNfdTablesV1>
+         + DataProvider<NormalizerNfkdTablesV1>
+         // UTS 46 tables merged into CompatibilityDecompositionTablesV1
+         + ?Sized,
+ {
+     let decompositions: DataPayload<NormalizerUts46DataV1> =
+         provider.load(Default::default())?.payload;
+     let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
+     let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> =
+         provider.load(Default::default())?.payload;
+
+     let complex_len = {
+         let base = tables.get();
+         let supplement = supplementary_tables.get();
+         base.scalars16.len()
+             + base.scalars24.len()
+             + supplement.scalars16.len()
+             + supplement.scalars24.len()
+     };
+     if complex_len > 0xFFF {
+         // The data is from a future where there exists a normalization flavor whose
+         // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
+         // of space. If a good use case from such a decomposition flavor arises, we can
+         // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
+         // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
+         // since for now the masks are hard-coded, error out.
+         return Err(
+             DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO)
+         );
+     }
+
+     let cap = decompositions.get().passthrough_cap;
+     if cap > 0x0300 {
+         return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO));
+     }
+
+     Ok(Self {
+         decompositions: decompositions.cast(),
+         tables,
+         supplementary_tables: Some(supplementary_tables),
+         decomposition_passthrough_bound: cap.min(0xC0) as u8,
+         composition_passthrough_bound: cap.min(0x0300),
+     })
+ }
+}
+
+ /// Borrowed version of a normalizer for performing composing normalization; it pairs a borrowed decomposing normalizer with a reference to composition data.
+ #[derive(Debug)]
+ pub struct ComposingNormalizerBorrowed<'a> {
+ decomposing_normalizer: DecomposingNormalizerBorrowed<'a>, // supplies the decomposition data and both passthrough bounds
+ canonical_compositions: &'a CanonicalCompositions<'a>, // the compiled-data constructors of this file always use the NFC composition data
+ }
+
+impl ComposingNormalizerBorrowed<'static> {
+ /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`]
+ /// by converting the inner decomposing normalizer and wrapping the composition data.
+ ///
+ /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some
+ /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`].
+ pub const fn static_to_owned(self) -> ComposingNormalizer {
+     let decomposing_normalizer = self.decomposing_normalizer.static_to_owned();
+     let canonical_compositions = DataPayload::from_static_ref(self.canonical_compositions);
+     ComposingNormalizer {
+         decomposing_normalizer,
+         canonical_compositions,
+     }
+ }
+
+ /// Constructs an NFC normalizer backed by compiled data: NFD decomposition
+ /// combined with the NFC composition data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfc() -> Self {
+     let decomposing_normalizer = DecomposingNormalizerBorrowed::new_nfd();
+     Self {
+         decomposing_normalizer,
+         canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
+     }
+ }
+
+ /// Constructs an NFKC normalizer backed by compiled data: NFKD decomposition
+ /// combined with the NFC composition data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new_nfkc() -> Self {
+     let decomposing_normalizer = DecomposingNormalizerBorrowed::new_nfkd();
+     Self {
+         decomposing_normalizer,
+         canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
+     }
+ }
+
+ /// Constructs the special building block normalization for IDNA that implements parts of
+ /// the Map step and the following Normalize step.
+ ///
+ /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior
+ /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns
+ /// U+0345 from a reordered character into a non-reordered character before reordering happens.
+ /// Therefore, the output of this normalization may differ for different inputs that are
+ /// canonically equivalent with each other if they differ by how U+0345 is ordered relative
+ /// to other reorderable characters.
+ #[cfg(feature = "compiled_data")]
+ pub(crate) const fn new_uts46() -> Self {
+     let decomposing_normalizer = DecomposingNormalizerBorrowed::new_uts46_decomposed();
+     Self {
+         decomposing_normalizer,
+         canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
+     }
+ }
+}
+
+impl<'data> ComposingNormalizerBorrowed<'data> {
+ /// Adapts `iter` into a composing iterator that uses the data
+ /// already held by this normalizer.
+ pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> {
+     let ignorable_behavior = IgnorableBehavior::Unsupported;
+     self.normalize_iter_private(iter, ignorable_behavior)
+ }
+
+ fn normalize_iter_private<I: Iterator<Item = char>>(
+     &self,
+     iter: I,
+     ignorable_behavior: IgnorableBehavior,
+ ) -> Composition<'data, I> {
+     // Build the decomposing stage first, then wrap it in the composing stage.
+     let inner = &self.decomposing_normalizer;
+     let decomposition = Decomposition::new_with_supplements(
+         iter,
+         inner.decompositions,
+         inner.tables,
+         inner.supplementary_tables,
+         inner.decomposition_passthrough_bound,
+         ignorable_behavior,
+     );
+     Composition::new(
+         decomposition,
+         self.canonical_compositions.canonical_compositions.clone(),
+         inner.composition_passthrough_bound,
+     )
+ }
+
+ normalizer_methods!();
+
+ composing_normalize_to!(
+ /// Normalize a string slice into a `Write` sink. Stretches of input that need no change are written to the sink as whole slices.
+ ,
+ normalize_to,
+ core::fmt::Write,
+ &str,
+ {},
+ true,
+ as_str,
+ {
+ // Let's hope LICM hoists this outside `'outer`.
+ let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 {
+ 0xCCu8 // 0xCC is the UTF-8 lead byte of U+0300, so every byte of a scalar value below U+0300 compares lower
+ } else {
+ // We can make this fancy if a normalization other than NFC where looking at
+ // non-ASCII lead bytes is worthwhile is ever introduced.
+ composition_passthrough_bound.min(0x80) as u8
+ };
+ // Attributes have to be on blocks, so hoisting all the way here.
+ #[allow(clippy::unwrap_used)]
+ 'fast: loop {
+ let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter(); // scan the not-yet-consumed input byte by byte
+ 'fastest: loop {
+ if let Some(&upcoming_byte) = code_unit_iter.next() {
+ if upcoming_byte < composition_passthrough_byte_bound {
+ // Fast-track succeeded!
+ continue 'fastest;
+ }
+ composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); // re-anchor the char iterator on the byte that failed the test; the `- 1` undoes its consumption
+ break 'fastest;
+ }
+ // End of stream
+ sink.write_str(pending_slice)?; // everything still pending passed through unchanged
+ return Ok(());
+ }
+ // `unwrap()` OK, because the slice is valid UTF-8 and we know there
+ // is an upcoming byte.
+ let upcoming = composition.decomposition.delegate.next().unwrap();
+ let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
+ if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
+ // Can't combine backwards, hence a plain (non-backwards-combining)
+ // starter albeit past `composition_passthrough_bound`
+
+ // Fast-track succeeded!
+ continue 'fast;
+ }
+ // We need to fall off the fast path.
+ composition.decomposition.pending = Some(upcoming_with_trie_value);
+
+ // slicing and unwrap OK, because we've just evidently read enough previously.
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars();
+ // `unwrap` OK, because we've previously managed to read the previous character
+ undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap()); // the character just before `upcoming` is held back from the sink and becomes the starter
+ let consumed_so_far_slice = consumed_so_far.as_str();
+ sink.write_str(consumed_so_far_slice)?;
+ break 'fast;
+ }
+ },
+ text,
+ sink,
+ composition,
+ composition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ len_utf8,
+ );
+
+ composing_normalize_to!(
+ /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink.
+ ///
+ /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
+ /// according to the WHATWG Encoding Standard.
+ ///
+ /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
+ #[cfg(feature = "utf8_iter")]
+ ,
+ normalize_utf8_to,
+ core::fmt::Write,
+ &[u8],
+ {},
+ false,
+ as_slice,
+ // Fast-track block. It advances `composition.decomposition.delegate` for as
+ // long as each character can be passed through unchanged. It then either
+ // writes all of `pending_slice` and returns at end of input, or writes the
+ // untouched prefix, records the state needed by the code that follows the
+ // block (`undecomposed_starter` and `composition.decomposition.pending`)
+ // and breaks out of `'fast`.
+ // NOTE(review): how the surrounding macro uses the other arguments (the
+ // empty `{}` block, the `false` flag, `as_slice`, `len_utf8`) is defined by
+ // `composing_normalize_to!`, which is outside this view.
+ {
+ 'fast: loop {
+ if let Some(upcoming) = composition.decomposition.delegate.next() {
+ if u32::from(upcoming) < composition_passthrough_bound {
+ // Fast-track succeeded!
+ continue 'fast;
+ }
+ // TODO: Be statically aware of fast/small trie.
+ let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming);
+ if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() {
+ // Note: The trie value of the REPLACEMENT CHARACTER is
+ // intentionally formatted to fail the
+ // `potential_passthrough_and_cannot_combine_backwards`
+ // test even though it really is a starter that decomposes
+ // to self and cannot combine backwards. This
+ // allows moving the branch on REPLACEMENT CHARACTER
+ // below this `continue`.
+ continue 'fast;
+ }
+ // We need to fall off the fast path.
+
+ // TODO(#2006): Annotate as unlikely
+ if upcoming == REPLACEMENT_CHARACTER {
+ // Can't tell if this is an error or a literal U+FFFD in
+ // the input. Assuming the former to be sure.
+
+ // Since the U+FFFD might signify an error, we can't
+ // assume `upcoming.len_utf8()` for the backoff length.
+ // Instead, decode everything consumed so far and pop the
+ // last decoded character, which must be the U+FFFD itself.
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars();
+ let back = consumed_so_far.next_back();
+ debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER));
+ let consumed_so_far_slice = consumed_so_far.as_slice();
+ // SAFETY (NOTE(review): confirm against the macro definition):
+ // every character left in `consumed_so_far_slice` passed one of
+ // the fast-track checks above, and per the note above an
+ // error-derived U+FFFD cannot pass them, so these bytes are
+ // presumably well-formed UTF-8.
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?;
+ undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0);
+ composition.decomposition.pending = None;
+ break 'fast;
+ }
+
+ composition.decomposition.pending = Some(upcoming_with_trie_value);
+ // Slicing OK, because we've just evidently read enough previously:
+ // `upcoming` is not U+FFFD here, so it occupies exactly
+ // `upcoming.len_utf8()` bytes right before the unread remainder.
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars();
+ // `unwrap` OK, because we've previously managed to read the previous character
+ #[allow(clippy::unwrap_used)]
+ {
+ // TODO: If the previous character was below the passthrough bound,
+ // we really need to read from the trie. Otherwise, we could maintain
+ // the most-recent trie value. Need to measure what's more expensive:
+ // Remembering the trie value on each iteration or re-reading the
+ // last one after the fast-track run.
+ undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
+ }
+ let consumed_so_far_slice = consumed_so_far.as_slice();
+ // SAFETY (NOTE(review): confirm): same reasoning as in the U+FFFD
+ // branch above; the prefix consists only of characters that
+ // passed the fast-track checks.
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?;
+ break 'fast;
+ }
+ // End of stream
+ // SAFETY (NOTE(review): confirm): reaching this point means every
+ // character of `pending_slice` passed the fast-track checks, so the
+ // whole slice is presumably well-formed UTF-8.
+ sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?;
+ return Ok(());
+ }
+ },
+ text,
+ sink,
+ composition,
+ composition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ len_utf8,
+ );
+
+ composing_normalize_to!(
+ /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink.
+ ///
+ /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
+ /// before normalizing.
+ ///
+ /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
+ #[cfg(feature = "utf16_iter")]
+ ,
+ normalize_utf16_to,
+ write16::Write16,
+ &[u16],
+ {
+ sink.size_hint(text.len())?;
+ },
+ false,
+ as_slice,
+ // Fast-track block. Unlike the UTF-8 variant it walks raw code units with
+ // its own iterator (`code_unit_iter`), pairs surrogates by hand, and only
+ // re-synchronizes `composition.decomposition.delegate` after leaving
+ // `'fast` (see the last statement of the block).
+ // NOTE(review): how the surrounding macro uses the other arguments is
+ // defined by `composing_normalize_to!`, which is outside this view.
+ {
+ let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter();
+ let mut upcoming32;
+ // Declaring this up here is useful for getting compile errors about invalid changes
+ // to the code structure below.
+ let mut trie_value;
+ 'fast: loop {
+ if let Some(&upcoming_code_unit) = code_unit_iter.next() {
+ upcoming32 = u32::from(upcoming_code_unit); // may be surrogate
+ if upcoming32 < composition_passthrough_bound {
+ // No need for surrogate or U+FFFD check, because
+ // `composition_passthrough_bound` cannot be higher than
+ // U+0300.
+ // Fast-track succeeded!
+ // At this point, `trie_value` is out of sync with `upcoming32`.
+ // However, we either 1) reach the end of `code_unit_iter`, at
+ // which point nothing reads `trie_value` anymore or we
+ // execute the line immediately below this loop.
+ continue 'fast;
+ }
+ // We might be doing a trie lookup by surrogate. Surrogates get
+ // a decomposition to U+FFFD.
+ trie_value = composition.decomposition.trie.get32(upcoming32);
+ if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
+ // Can't combine backwards, hence a plain (non-backwards-combining)
+ // starter albeit past `composition_passthrough_bound`
+
+ // Fast-track succeeded!
+ continue 'fast;
+ }
+
+ // We might now be looking at a surrogate.
+ // The loop is only broken out of as goto forward
+ #[allow(clippy::never_loop)]
+ 'surrogateloop: loop {
+ // Wrapping subtraction maps the surrogate range 0xD800..=0xDFFF
+ // to 0..=0x7FF and everything else to a larger value, so one
+ // unsigned comparison classifies the code unit.
+ let surrogate_base = upcoming32.wrapping_sub(0xD800);
+ if surrogate_base > (0xDFFF - 0xD800) {
+ // Not surrogate
+ break 'surrogateloop;
+ }
+ if surrogate_base <= (0xDBFF - 0xD800) {
+ // High surrogate: peek at the next code unit, keeping a
+ // backup of the iterator so the peek can be undone.
+ let iter_backup = code_unit_iter.clone();
+ if let Some(&low) = code_unit_iter.next() {
+ if in_inclusive_range16(low, 0xDC00, 0xDFFF) {
+ // Combine the pair into a supplementary-plane scalar
+ // value: (high - 0xD800) << 10 + (low - 0xDC00) + 0x10000,
+ // with the constants folded into one subtraction.
+ upcoming32 = (upcoming32 << 10) + u32::from(low)
+ - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
+ // Successfully-paired surrogate. Read from the trie again.
+ trie_value = composition.decomposition.trie.get32(upcoming32);
+ if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) {
+ // Fast-track succeeded!
+ continue 'fast;
+ }
+ break 'surrogateloop;
+ } else {
+ // Not a low surrogate: un-read it so that it is
+ // processed as its own code unit on the next round.
+ code_unit_iter = iter_backup;
+ }
+ }
+ }
+ // unpaired surrogate
+ upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check.
+ // trie_value already holds a decomposition to U+FFFD.
+ debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD);
+ break 'surrogateloop;
+ }
+
+ // SAFETY: upcoming32 can no longer be a surrogate: it is either a
+ // non-surrogate code unit, a scalar value combined from a valid
+ // surrogate pair, or the 0xFFFD substituted for an unpaired surrogate.
+ let upcoming = unsafe { char::from_u32_unchecked(upcoming32) };
+ let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value);
+ // We need to fall off the fast path.
+ composition.decomposition.pending = Some(upcoming_with_trie_value);
+ // `upcoming.len_utf16()` is the number of code units just consumed
+ // for `upcoming`: two for a paired surrogate, otherwise one (an
+ // unpaired surrogate was one unit and its U+FFFD stand-in is a
+ // one-unit character).
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars();
+ // `unwrap` OK, because we've previously managed to read the previous character
+ #[allow(clippy::unwrap_used)]
+ {
+ // TODO: If the previous character was below the passthrough bound,
+ // we really need to read from the trie. Otherwise, we could maintain
+ // the most-recent trie value. Need to measure what's more expensive:
+ // Remembering the trie value on each iteration or re-reading the
+ // last one after the fast-track run.
+ undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap());
+ }
+ let consumed_so_far_slice = consumed_so_far.as_slice();
+ sink.write_slice(consumed_so_far_slice)?;
+ break 'fast;
+ }
+ // End of stream
+ sink.write_slice(pending_slice)?;
+ return Ok(());
+ }
+ // Sync the main iterator
+ composition.decomposition.delegate = code_unit_iter.as_slice().chars();
+ },
+ text,
+ sink,
+ composition,
+ composition_passthrough_bound,
+ undecomposed_starter,
+ pending_slice,
+ len_utf16,
+ );
+}
+
+/// A normalizer for performing composing normalization.
+///
+/// Holds a [`DecomposingNormalizer`] together with the canonical
+/// composition data payload.
+#[derive(Debug)]
+pub struct ComposingNormalizer {
+    decomposing_normalizer: DecomposingNormalizer,
+    canonical_compositions: DataPayload<NormalizerNfcV1>,
+}
+
+impl ComposingNormalizer {
+    /// Returns a borrowed view of this normalizer for more efficient querying.
+    pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> {
+        let decomposing_normalizer = self.decomposing_normalizer.as_borrowed();
+        let canonical_compositions = self.canonical_compositions.get();
+        ComposingNormalizerBorrowed {
+            decomposing_normalizer,
+            canonical_compositions,
+        }
+    }
+
+    /// NFC constructor using compiled data.
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> {
+        ComposingNormalizerBorrowed::new_nfc()
+    }
+
+    icu_provider::gen_buffer_data_constructors!(
+        () -> error: DataError,
+        functions: [
+            new_nfc: skip,
+            try_new_nfc_with_buffer_provider,
+            try_new_nfc_unstable,
+            Self,
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)]
+    pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<NormalizerNfdDataV1>
+            + DataProvider<NormalizerNfdTablesV1>
+            + DataProvider<NormalizerNfcV1>
+            + ?Sized,
+    {
+        // Fields are evaluated in source order: first the NFD-based
+        // decomposing normalizer, then the composition data.
+        Ok(Self {
+            decomposing_normalizer: DecomposingNormalizer::try_new_nfd_unstable(provider)?,
+            canonical_compositions: provider.load(Default::default())?.payload,
+        })
+    }
+
+    /// NFKC constructor using compiled data.
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> {
+        ComposingNormalizerBorrowed::new_nfkc()
+    }
+
+    icu_provider::gen_buffer_data_constructors!(
+        () -> error: DataError,
+        functions: [
+            new_nfkc: skip,
+            try_new_nfkc_with_buffer_provider,
+            try_new_nfkc_unstable,
+            Self,
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)]
+    pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<NormalizerNfkdDataV1>
+            + DataProvider<NormalizerNfdTablesV1>
+            + DataProvider<NormalizerNfkdTablesV1>
+            + DataProvider<NormalizerNfcV1>
+            + ?Sized,
+    {
+        // Same shape as the NFC constructor, but built on the NFKD
+        // decomposing normalizer.
+        Ok(Self {
+            decomposing_normalizer: DecomposingNormalizer::try_new_nfkd_unstable(provider)?,
+            canonical_compositions: provider.load(Default::default())?.payload,
+        })
+    }
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)]
+    pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<NormalizerUts46DataV1>
+            + DataProvider<NormalizerNfdTablesV1>
+            + DataProvider<NormalizerNfkdTablesV1>
+            // UTS 46 tables merged into CompatibilityDecompositionTablesV1
+            + DataProvider<NormalizerNfcV1>
+            + ?Sized,
+    {
+        // Same shape again, built on the UTS 46 decomposing normalizer.
+        Ok(Self {
+            decomposing_normalizer: DecomposingNormalizer::try_new_uts46_decomposed_unstable(
+                provider,
+            )?,
+            canonical_compositions: provider.load(Default::default())?.payload,
+        })
+    }
+}
+
+/// Sink that accepts writes only while they reproduce the expected
+/// UTF-16 input, consuming the matched prefix of `expect` as it goes.
+#[cfg(feature = "utf16_iter")]
+struct IsNormalizedSinkUtf16<'a> {
+    expect: &'a [u16],
+}
+
+#[cfg(feature = "utf16_iter")]
+impl<'a> IsNormalizedSinkUtf16<'a> {
+    /// Creates a sink that expects to be fed exactly `slice`.
+    pub fn new(slice: &'a [u16]) -> Self {
+        Self { expect: slice }
+    }
+    /// Number of code units of the expected input not yet matched.
+    pub fn remaining_len(&self) -> usize {
+        self.expect.len()
+    }
+}
+
+#[cfg(feature = "utf16_iter")]
+impl write16::Write16 for IsNormalizedSinkUtf16<'_> {
+    /// Accepts `s` only if it starts at the same address as the unmatched
+    /// remainder of the expected input, then skips past it.
+    fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result {
+        // A slice write is known to be a pass-through of the input, so
+        // comparing start addresses is sufficient.
+        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
+            return Err(core::fmt::Error {});
+        }
+        // Indexing is OK: a failure here would be a code bug rather than
+        // an input or data issue.
+        #[allow(clippy::indexing_slicing)]
+        let rest = &self.expect[s.len()..];
+        self.expect = rest;
+        Ok(())
+    }
+
+    /// Accepts `c` only if it is the next character of the expected input.
+    fn write_char(&mut self, c: char) -> core::fmt::Result {
+        let mut remaining = self.expect.chars();
+        match remaining.next() {
+            Some(found) if found == c => {
+                self.expect = remaining.as_slice();
+                Ok(())
+            }
+            _ => Err(core::fmt::Error {}),
+        }
+    }
+}
+
+/// Sink that accepts writes only while they reproduce the expected
+/// (potentially-invalid) UTF-8 input, consuming the matched prefix of
+/// `expect` as it goes.
+#[cfg(feature = "utf8_iter")]
+struct IsNormalizedSinkUtf8<'a> {
+    expect: &'a [u8],
+}
+
+#[cfg(feature = "utf8_iter")]
+impl<'a> IsNormalizedSinkUtf8<'a> {
+    /// Creates a sink that expects to be fed exactly `slice`.
+    pub fn new(slice: &'a [u8]) -> Self {
+        Self { expect: slice }
+    }
+    /// Number of bytes of the expected input not yet matched.
+    pub fn remaining_len(&self) -> usize {
+        self.expect.len()
+    }
+}
+
+#[cfg(feature = "utf8_iter")]
+impl core::fmt::Write for IsNormalizedSinkUtf8<'_> {
+    /// Accepts `s` only if it starts at the same address as the unmatched
+    /// remainder of the expected input, then skips past it.
+    fn write_str(&mut self, s: &str) -> core::fmt::Result {
+        // A slice write is known to be a pass-through of the input, so
+        // comparing start addresses is sufficient.
+        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
+            return Err(core::fmt::Error {});
+        }
+        // Indexing is OK: a failure here would be a code bug rather than
+        // an input or data issue.
+        #[allow(clippy::indexing_slicing)]
+        let rest = &self.expect[s.len()..];
+        self.expect = rest;
+        Ok(())
+    }
+
+    /// Accepts `c` only if it is the next character decoded from the
+    /// expected input.
+    fn write_char(&mut self, c: char) -> core::fmt::Result {
+        let mut remaining = self.expect.chars();
+        match remaining.next() {
+            Some(found) if found == c => {
+                self.expect = remaining.as_slice();
+                Ok(())
+            }
+            _ => Err(core::fmt::Error {}),
+        }
+    }
+}
+
+/// Sink that accepts writes only while they reproduce the expected string,
+/// consuming the matched prefix of `expect` as it goes.
+struct IsNormalizedSinkStr<'a> {
+    expect: &'a str,
+}
+
+impl<'a> IsNormalizedSinkStr<'a> {
+    /// Creates a sink that expects to be fed exactly `slice`.
+    pub fn new(slice: &'a str) -> Self {
+        Self { expect: slice }
+    }
+    /// Number of bytes of the expected string not yet matched.
+    pub fn remaining_len(&self) -> usize {
+        self.expect.len()
+    }
+}
+
+impl core::fmt::Write for IsNormalizedSinkStr<'_> {
+    /// Accepts `s` only if it starts at the same address as the unmatched
+    /// remainder of the expected string, then skips past it.
+    fn write_str(&mut self, s: &str) -> core::fmt::Result {
+        // A slice write is known to be a pass-through of the input, so
+        // comparing start addresses is sufficient.
+        if !core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) {
+            return Err(core::fmt::Error {});
+        }
+        // Indexing is OK: a failure here would be a code bug rather than
+        // an input or data issue.
+        #[allow(clippy::indexing_slicing)]
+        let rest = &self.expect[s.len()..];
+        self.expect = rest;
+        Ok(())
+    }
+
+    /// Accepts `c` only if it is the next character of the expected string.
+    fn write_char(&mut self, c: char) -> core::fmt::Result {
+        let mut remaining = self.expect.chars();
+        match remaining.next() {
+            Some(found) if found == c => {
+                self.expect = remaining.as_str();
+                Ok(())
+            }
+            _ => Err(core::fmt::Error {}),
+        }
+    }
+}
diff --git a/vendor/icu_normalizer/src/properties.rs b/vendor/icu_normalizer/src/properties.rs
new file mode 100644
index 00000000..948780e1
--- /dev/null
+++ b/vendor/icu_normalizer/src/properties.rs
@@ -0,0 +1,663 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Access to the Unicode properties or property-based operations that
+//! are required for NFC and NFD.
+//!
+//! Applications should generally use the full normalizers that are
+//! provided at the top level of this crate. However, the APIs in this
+//! module are provided for callers such as HarfBuzz that specifically
+//! want access to the raw canonical composition operation e.g. for use in a
+//! glyph-availability-guided custom normalizer.
+
+use crate::char_from_u16;
+use crate::char_from_u32;
+use crate::in_inclusive_range;
+use crate::provider::CanonicalCompositions;
+use crate::provider::DecompositionData;
+use crate::provider::DecompositionTables;
+use crate::provider::NonRecursiveDecompositionSupplement;
+use crate::provider::NormalizerNfcV1;
+use crate::provider::NormalizerNfdDataV1;
+use crate::provider::NormalizerNfdSupplementV1;
+use crate::provider::NormalizerNfdTablesV1;
+use crate::trie_value_has_ccc;
+use crate::CanonicalCombiningClass;
+use crate::BACKWARD_COMBINING_MARKER;
+use crate::FDFA_MARKER;
+use crate::HANGUL_L_BASE;
+use crate::HANGUL_N_COUNT;
+use crate::HANGUL_S_BASE;
+use crate::HANGUL_S_COUNT;
+use crate::HANGUL_T_BASE;
+use crate::HANGUL_T_COUNT;
+use crate::HANGUL_V_BASE;
+use crate::HIGH_ZEROS_MASK;
+use crate::LOW_ZEROS_MASK;
+use crate::NON_ROUND_TRIP_MARKER;
+use icu_provider::prelude::*;
+
+/// Borrowed version of the raw canonical composition operation.
+///
+/// Callers should generally use `ComposingNormalizer` instead of this API.
+/// However, this API is provided for callers such as HarfBuzz that specifically
+/// want access to the raw canonical composition operation e.g. for use in a
+/// glyph-availability-guided custom normalizer.
+#[derive(Debug, Copy, Clone)]
+pub struct CanonicalCompositionBorrowed<'a> {
+ // Borrowed composition data that `compose` iterates over.
+ canonical_compositions: &'a CanonicalCompositions<'a>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalCompositionBorrowed<'static> {
+ /// Equivalent to [`CanonicalCompositionBorrowed::new`].
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl CanonicalCompositionBorrowed<'static> {
+ /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`].
+ ///
+ /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some
+ /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`].
+ pub const fn static_to_owned(self) -> CanonicalComposition {
+ CanonicalComposition {
+ canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions),
+ }
+ }
+
+ /// Constructs a new `CanonicalCompositionBorrowed` using compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new() -> Self {
+ Self {
+ canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1,
+ }
+ }
+}
+
+impl CanonicalCompositionBorrowed<'_> {
+ /// Performs canonical composition (including Hangul) on a pair of
+ /// characters or returns `None` if these characters don't compose.
+ /// Composition exclusions are taken into account.
+ ///
+ /// # Examples
+ ///
+ /// ```
+ /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new();
+ ///
+ /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters
+ /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä'));
+ /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ'));
+ /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion
+ /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter
+ /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV
+ /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT
+ /// ```
+ #[inline(always)]
+ pub fn compose(self, starter: char, second: char) -> Option<char> {
+ // Thin wrapper: all of the work happens in the crate-level `compose`
+ // helper, which is handed an iterator over the composition data.
+ crate::compose(
+ self.canonical_compositions.canonical_compositions.iter(),
+ starter,
+ second,
+ )
+ }
+}
+
+/// The raw canonical composition operation.
+///
+/// Callers should generally use `ComposingNormalizer` instead of this API.
+/// However, this API is provided for callers such as HarfBuzz that specifically
+/// want access to the raw canonical composition operation e.g. for use in a
+/// glyph-availability-guided custom normalizer.
+#[derive(Debug)]
+pub struct CanonicalComposition {
+    canonical_compositions: DataPayload<NormalizerNfcV1>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalComposition {
+    /// Builds the owned type from the compiled-data borrowed instance.
+    fn default() -> Self {
+        CanonicalCompositionBorrowed::new().static_to_owned()
+    }
+}
+
+impl CanonicalComposition {
+    /// Returns a borrowed view of this type for more efficient querying.
+    pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> {
+        let canonical_compositions = self.canonical_compositions.get();
+        CanonicalCompositionBorrowed {
+            canonical_compositions,
+        }
+    }
+
+    /// Constructs a new `CanonicalCompositionBorrowed` using compiled data.
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    #[allow(clippy::new_ret_no_self)]
+    pub const fn new() -> CanonicalCompositionBorrowed<'static> {
+        CanonicalCompositionBorrowed::new()
+    }
+
+    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
+        functions: [
+            new: skip,
+            try_new_with_buffer_provider,
+            try_new_unstable,
+            Self,
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
+    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<NormalizerNfcV1> + ?Sized,
+    {
+        // Load the composition data and wrap its payload; a load error is
+        // returned to the caller unchanged.
+        provider.load(Default::default()).map(|response| Self {
+            canonical_compositions: response.payload,
+        })
+    }
+}
+
+/// The outcome of non-recursive canonical decomposition of a character.
+///
+/// Returned by [`CanonicalDecompositionBorrowed::decompose`].
+// NOTE(review): `exhaustive_enums` is allowed here, presumably because these
+// three outcomes are considered the complete, stable set — confirm.
+#[allow(clippy::exhaustive_enums)]
+#[derive(Debug, PartialEq, Eq)]
+pub enum Decomposed {
+ /// The character is its own canonical decomposition.
+ Default,
+ /// The character decomposes to a single different character.
+ Singleton(char),
+ /// The character decomposes to two characters.
+ Expansion(char, char),
+}
+
+/// Borrowed version of the raw (non-recursive) canonical decomposition operation.
+///
+/// Callers should generally use `DecomposingNormalizer` instead of this API.
+/// However, this API is provided for callers such as HarfBuzz that specifically
+/// want access to non-recursive canonical decomposition e.g. for use in a
+/// glyph-availability-guided custom normalizer.
+#[derive(Debug)]
+pub struct CanonicalDecompositionBorrowed<'a> {
+ // Main decomposition data; its trie is consulted first for every lookup.
+ decompositions: &'a DecompositionData<'a>,
+ // Side tables (`scalars16` / `scalars24`) indexed by offsets stored in trie values.
+ tables: &'a DecompositionTables<'a>,
+ // Supplementary data consulted when the main trie value alone cannot
+ // yield the non-recursive decomposition.
+ non_recursive: &'a NonRecursiveDecompositionSupplement<'a>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalDecompositionBorrowed<'static> {
+ /// Equivalent to [`CanonicalDecompositionBorrowed::new`].
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl CanonicalDecompositionBorrowed<'static> {
+ /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`].
+ ///
+ /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some
+ /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`].
+ pub const fn static_to_owned(self) -> CanonicalDecomposition {
+ CanonicalDecomposition {
+ decompositions: DataPayload::from_static_ref(self.decompositions),
+ tables: DataPayload::from_static_ref(self.tables),
+ non_recursive: DataPayload::from_static_ref(self.non_recursive),
+ }
+ }
+
+ /// Construct from compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new() -> Self {
+ // Compile-time counterpart of the runtime length check performed in
+ // `CanonicalDecomposition::try_new_unstable`: the combined length of
+ // the two scalar tables must not exceed 0xFFF.
+ const _: () = assert!(
+ crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
+ .scalars16
+ .const_len()
+ + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1
+ .scalars24
+ .const_len()
+ <= 0xFFF,
+ "future extension"
+ );
+
+ Self {
+ decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
+ tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1,
+ non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1,
+ }
+ }
+}
+
+impl CanonicalDecompositionBorrowed<'_> {
+ /// Performs non-recursive canonical decomposition (including for Hangul).
+ ///
+ /// Hangul syllables are decomposed arithmetically; every other character
+ /// is delegated to `decompose_non_hangul`.
+ ///
+ /// ```
+ /// use icu::normalizer::properties::Decomposed;
+ /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new();
+ ///
+ /// assert_eq!(decomp.decompose('e'), Decomposed::Default);
+ /// assert_eq!(
+ /// decomp.decompose('ệ'),
+ /// Decomposed::Expansion('ẹ', '\u{0302}')
+ /// );
+ /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ'));
+ /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN
+ /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN
+ /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia
+ /// ```
+ #[inline]
+ pub fn decompose(&self, c: char) -> Decomposed {
+ // Offset of `c` from the first precomposed Hangul syllable. The wrapping
+ // subtraction turns every `c` below `HANGUL_S_BASE` into a huge value, so
+ // the single comparison below rejects characters on both sides of the
+ // syllable block.
+ let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE);
+ if lvt >= HANGUL_S_COUNT {
+ return self.decompose_non_hangul(c);
+ }
+ // Invariant: lvt < HANGUL_S_COUNT
+ // NOTE(review): the `HANGUL_*` constants are defined in the crate root,
+ // outside this view. The numeric bounds quoted below assume the Unicode
+ // Standard values (SCount = 11172, NCount = 588, TCount = 28) — confirm.
+ let t = lvt % HANGUL_T_COUNT;
+ // Invariant: t < HANGUL_T_COUNT (= 28)
+ if t == 0 {
+ // No trailing consonant: an LV syllable, which decomposes to L + V.
+ let l = lvt / HANGUL_N_COUNT;
+ // Invariant: l < (HANGUL_S_COUNT / HANGUL_N_COUNT = 11172 / 588 = 19)
+ let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT;
+ // Invariant: v < (HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21)
+ return Decomposed::Expansion(
+ // Safety: `l` and `v` are small (see the invariants above), so adding
+ // them to the L and V bases stays far below the surrogate range 0xD800,
+ // assuming the bases are the Unicode Standard's 0x1100 and 0x1161.
+ unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) },
+ unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) },
+ );
+ }
+ // LVT syllable: decomposes to the LV syllable plus the trailing consonant.
+ let lv = lvt - t;
+ // Invariant: lv < lvt < HANGUL_S_COUNT
+ // Safe because values known to be in range
+ Decomposed::Expansion(
+ // Safety: `HANGUL_S_BASE + lv` equals `c - t`, so it lies inside the
+ // precomposed syllable block between `HANGUL_S_BASE` and `c` itself;
+ // `HANGUL_T_BASE + t` adds less than `HANGUL_T_COUNT` to the T base.
+ // Both stay below 0xD800, assuming the Unicode Standard values
+ // (syllable block 0xAC00..=0xD7A3, T base 0x11A7).
+ unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) },
+ unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) },
+ )
+ }
+
+ /// Performs non-recursive canonical decomposition except Hangul syllables
+ /// are reported as `Decomposed::Default`.
+ ///
+ /// The main trie value is decoded first. Where it cannot yield the
+ /// non-recursive decomposition directly, the `loop` below is exited with
+ /// `break` and the answer is read from the `non_recursive` supplement.
+ #[inline(always)]
+ fn decompose_non_hangul(&self, c: char) -> Decomposed {
+ let decomposition = self.decompositions.trie.get(c);
+ // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set,
+ // and that flag needs to be ignored here.
+ if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 {
+ return Decomposed::Default;
+ }
+ // The loop is only broken out of as goto forward
+ #[allow(clippy::never_loop)]
+ loop {
+ let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0;
+ let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0;
+ if !high_zeros && !low_zeros {
+ // Decomposition into two BMP characters: starter and non-starter
+ if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') {
+ // Look in the other trie due to oxia singleton
+ // mappings to corresponding character with tonos.
+ break;
+ }
+ // The starter is stored in the low 15 bits and the combining
+ // character in the next 15 bits.
+ let starter = char_from_u32(decomposition & 0x7FFF);
+ let combining = char_from_u32((decomposition >> 15) & 0x7FFF);
+ return Decomposed::Expansion(starter, combining);
+ }
+ if high_zeros {
+ // Decomposition into one BMP character or non-starter
+ if trie_value_has_ccc(decomposition) {
+ // Non-starter
+ if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') {
+ return Decomposed::Default;
+ }
+ // The few non-starters that have a canonical decomposition
+ // are special-cased here.
+ return match c {
+ '\u{0340}' => {
+ // COMBINING GRAVE TONE MARK
+ Decomposed::Singleton('\u{0300}')
+ }
+ '\u{0341}' => {
+ // COMBINING ACUTE TONE MARK
+ Decomposed::Singleton('\u{0301}')
+ }
+ '\u{0343}' => {
+ // COMBINING GREEK KORONIS
+ Decomposed::Singleton('\u{0313}')
+ }
+ '\u{0344}' => {
+ // COMBINING GREEK DIALYTIKA TONOS
+ Decomposed::Expansion('\u{0308}', '\u{0301}')
+ }
+ '\u{0F73}' => {
+ // TIBETAN VOWEL SIGN II
+ Decomposed::Expansion('\u{0F71}', '\u{0F72}')
+ }
+ '\u{0F75}' => {
+ // TIBETAN VOWEL SIGN UU
+ Decomposed::Expansion('\u{0F71}', '\u{0F74}')
+ }
+ '\u{0F81}' => {
+ // TIBETAN VOWEL SIGN REVERSED II
+ Decomposed::Expansion('\u{0F71}', '\u{0F80}')
+ }
+ _ => Decomposed::Default,
+ };
+ }
+ // The low 16 bits are the singleton decomposition itself.
+ let singleton = decomposition as u16;
+ debug_assert_ne!(
+ singleton, FDFA_MARKER,
+ "How come we got the U+FDFA NFKD marker here?"
+ );
+ return Decomposed::Singleton(char_from_u16(singleton));
+ }
+ if c == '\u{212B}' {
+ // ANGSTROM SIGN
+ return Decomposed::Singleton('\u{00C5}');
+ }
+ // Complex decomposition: the high half, with the top two bits masked
+ // off, holds a table offset plus one.
+ // Only 12 of 14 bits used as of Unicode 16.
+ let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1;
+ // Only 3 of 4 bits used as of Unicode 16.
+ let len_bits = decomposition & 0b1111;
+ let tables = self.tables;
+ // Offsets below `scalars16.len()` index `scalars16`; larger offsets
+ // index `scalars24` after subtracting that length.
+ if offset < tables.scalars16.len() {
+ if len_bits != 0 {
+ // i.e. logical len isn't 2
+ break;
+ }
+ if let Some(first) = tables.scalars16.get(offset) {
+ if let Some(second) = tables.scalars16.get(offset + 1) {
+ // Two BMP starters
+ return Decomposed::Expansion(char_from_u16(first), char_from_u16(second));
+ }
+ }
+ // GIGO case
+ debug_assert!(false);
+ return Decomposed::Default;
+ }
+ // On this path the length bits are used as length minus one.
+ let len = len_bits + 1;
+ if len > 2 {
+ break;
+ }
+ let offset24 = offset - tables.scalars16.len();
+ if let Some(first_c) = tables.scalars24.get(offset24) {
+ if len == 1 {
+ return Decomposed::Singleton(first_c);
+ }
+ if let Some(second_c) = tables.scalars24.get(offset24 + 1) {
+ return Decomposed::Expansion(first_c, second_c);
+ }
+ }
+ // GIGO case
+ debug_assert!(false);
+ return Decomposed::Default;
+ }
+ // Reached only via `break` above: consult the supplementary trie.
+ let non_recursive = self.non_recursive;
+ let non_recursive_decomposition = non_recursive.trie.get(c);
+ if non_recursive_decomposition == 0 {
+ // GIGO case
+ debug_assert!(false);
+ return Decomposed::Default;
+ }
+ // High half: trail character, or a table offset when `lead` is zero.
+ let trail_or_complex = (non_recursive_decomposition >> 16) as u16;
+ // Low half: lead character (zero for the table-based case).
+ let lead = non_recursive_decomposition as u16;
+ if lead != 0 && trail_or_complex != 0 {
+ // Decomposition into two BMP characters
+ return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex));
+ }
+ if lead != 0 {
+ // Decomposition into one BMP character
+ return Decomposed::Singleton(char_from_u16(lead));
+ }
+ // Decomposition into two non-BMP characters
+ // `trail_or_complex` (the high half) is an offset into the supplement's
+ // `scalars24` table plus one to keep it non-zero. It cannot be zero
+ // here: `lead` is zero and the whole value was checked to be non-zero.
+ let offset = usize::from(trail_or_complex - 1);
+ if let Some(first) = non_recursive.scalars24.get(offset) {
+ if let Some(second) = non_recursive.scalars24.get(offset + 1) {
+ return Decomposed::Expansion(first, second);
+ }
+ }
+ // GIGO case
+ debug_assert!(false);
+ Decomposed::Default
+ }
+}
+
+/// The raw (non-recursive) canonical decomposition operation.
+///
+/// Callers should generally use `DecomposingNormalizer` instead of this API.
+/// However, this API is provided for callers such as HarfBuzz that specifically
+/// want access to non-recursive canonical decomposition e.g. for use in a
+/// glyph-availability-guided custom normalizer.
+#[derive(Debug)]
+pub struct CanonicalDecomposition {
+    decompositions: DataPayload<NormalizerNfdDataV1>,
+    tables: DataPayload<NormalizerNfdTablesV1>,
+    non_recursive: DataPayload<NormalizerNfdSupplementV1>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalDecomposition {
+    /// Builds the owned type from the compiled-data borrowed instance.
+    fn default() -> Self {
+        CanonicalDecompositionBorrowed::new().static_to_owned()
+    }
+}
+
+impl CanonicalDecomposition {
+    /// Returns a borrowed view of this type for more efficient querying.
+    pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> {
+        let decompositions = self.decompositions.get();
+        let tables = self.tables.get();
+        let non_recursive = self.non_recursive.get();
+        CanonicalDecompositionBorrowed {
+            decompositions,
+            tables,
+            non_recursive,
+        }
+    }
+
+    /// Construct from compiled data.
+    ///
+    /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+    ///
+    /// [📚 Help choosing a constructor](icu_provider::constructors)
+    #[cfg(feature = "compiled_data")]
+    #[allow(clippy::new_ret_no_self)]
+    pub const fn new() -> CanonicalDecompositionBorrowed<'static> {
+        CanonicalDecompositionBorrowed::new()
+    }
+
+    icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
+        functions: [
+            new: skip,
+            try_new_with_buffer_provider,
+            try_new_unstable,
+            Self,
+        ]
+    );
+
+    #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
+    pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
+    where
+        D: DataProvider<NormalizerNfdDataV1>
+            + DataProvider<NormalizerNfdTablesV1>
+            + DataProvider<NormalizerNfdSupplementV1>
+            + ?Sized,
+    {
+        let decompositions: DataPayload<NormalizerNfdDataV1> =
+            provider.load(Default::default())?.payload;
+        let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload;
+
+        let table_data = tables.get();
+        let complex_len = table_data.scalars16.len() + table_data.scalars24.len();
+        if complex_len > 0xFFF {
+            // The data is from a future where there exists a normalization flavor whose
+            // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points
+            // of space. If a good use case from such a decomposition flavor arises, we can
+            // dynamically change the bit masks so that the length mask becomes 0x1FFF instead
+            // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However,
+            // since for now the masks are hard-coded, error out.
+            return Err(DataError::custom("future extension"));
+        }
+
+        // The supplement is loaded only after the tables passed the size check.
+        let non_recursive: DataPayload<NormalizerNfdSupplementV1> =
+            provider.load(Default::default())?.payload;
+
+        Ok(Self {
+            decompositions,
+            tables,
+            non_recursive,
+        })
+    }
+}
+
+/// Borrowed version of lookup of the Canonical_Combining_Class Unicode property.
+///
+/// # Example
+///
+/// ```
+/// use icu::properties::props::CanonicalCombiningClass;
+/// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed;
+///
+/// let map = CanonicalCombiningClassMapBorrowed::new();
+/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A
+/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT
+/// ```
+#[derive(Debug)]
+pub struct CanonicalCombiningClassMapBorrowed<'a> {
+ /// The data trie
+ decompositions: &'a DecompositionData<'a>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalCombiningClassMapBorrowed<'static> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl CanonicalCombiningClassMapBorrowed<'static> {
+ /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`].
+ ///
+ /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some
+ /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`].
+ pub const fn static_to_owned(self) -> CanonicalCombiningClassMap {
+ CanonicalCombiningClassMap {
+ decompositions: DataPayload::from_static_ref(self.decompositions),
+ }
+ }
+
+ /// Construct from compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ pub const fn new() -> Self {
+ CanonicalCombiningClassMapBorrowed {
+ decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1,
+ }
+ }
+}
+
+impl CanonicalCombiningClassMapBorrowed<'_> {
+ /// Look up the canonical combining class for a scalar value.
+ ///
+ /// The return value is a u8 representing the canonical combining class;
+ /// you may enable the `"icu_properties"` feature if you would like to use a typed
+ /// `CanonicalCombiningClass`.
+ #[inline(always)]
+ pub fn get_u8(&self, c: char) -> u8 {
+ self.get32_u8(u32::from(c))
+ }
+
+ /// Look up the canonical combining class for a scalar value
+ /// represented as `u32`. If the argument is outside the scalar
+ /// value range, `Not_Reordered` is returned.
+ ///
+ /// The return value is a u8 representing the canonical combining class;
+ /// you may enable the `"icu_properties"` feature if you would like to use a typed
+ /// `CanonicalCombiningClass`.
+ pub fn get32_u8(&self, c: u32) -> u8 {
+ let trie_value = self.decompositions.trie.get32(c);
+ if trie_value_has_ccc(trie_value) {
+ trie_value as u8 // truncating cast: keeps only the low 8 bits of the trie value
+ } else {
+ ccc!(NotReordered, 0).to_icu4c_value()
+ }
+ }
+
+ /// Look up the canonical combining class for a scalar value.
+ ///
+ /// ✨ *Enabled with the `icu_properties` Cargo feature.*
+ #[inline(always)]
+ #[cfg(feature = "icu_properties")]
+ pub fn get(&self, c: char) -> CanonicalCombiningClass {
+ CanonicalCombiningClass::from_icu4c_value(self.get_u8(c))
+ }
+
+ /// Look up the canonical combining class for a scalar value
+ /// represented as `u32`. If the argument is outside the scalar
+ /// value range, `CanonicalCombiningClass::NotReordered` is returned.
+ ///
+ /// ✨ *Enabled with the `icu_properties` Cargo feature.*
+ #[cfg(feature = "icu_properties")]
+ pub fn get32(&self, c: u32) -> CanonicalCombiningClass {
+ CanonicalCombiningClass::from_icu4c_value(self.get32_u8(c))
+ }
+}
+
+/// Lookup of the Canonical_Combining_Class Unicode property.
+#[derive(Debug)]
+pub struct CanonicalCombiningClassMap {
+ /// The data trie
+ decompositions: DataPayload<NormalizerNfdDataV1>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for CanonicalCombiningClassMap {
+ fn default() -> Self {
+ Self::new().static_to_owned()
+ }
+}
+
+impl CanonicalCombiningClassMap {
+ /// Constructs a borrowed version of this type for more efficient querying.
+ pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> {
+ CanonicalCombiningClassMapBorrowed {
+ decompositions: self.decompositions.get(),
+ }
+ }
+
+ /// Construct from compiled data.
+ ///
+ /// ✨ *Enabled with the `compiled_data` Cargo feature.*
+ ///
+ /// [📚 Help choosing a constructor](icu_provider::constructors)
+ #[cfg(feature = "compiled_data")]
+ #[allow(clippy::new_ret_no_self)]
+ pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> {
+ CanonicalCombiningClassMapBorrowed::new()
+ }
+
+ icu_provider::gen_buffer_data_constructors!(() -> error: DataError,
+ functions: [
+ new: skip,
+ try_new_with_buffer_provider,
+ try_new_unstable,
+ Self,
+ ]);
+
+ #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
+ pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError>
+ where
+ D: DataProvider<NormalizerNfdDataV1> + ?Sized,
+ {
+ let decompositions: DataPayload<NormalizerNfdDataV1> =
+ provider.load(Default::default())?.payload;
+ Ok(CanonicalCombiningClassMap { decompositions })
+ }
+}
diff --git a/vendor/icu_normalizer/src/provider.rs b/vendor/icu_normalizer/src/provider.rs
new file mode 100644
index 00000000..9502f016
--- /dev/null
+++ b/vendor/icu_normalizer/src/provider.rs
@@ -0,0 +1,216 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
+//!
+//! <div class="stab unstable">
+//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
+//! to be stable, their Rust representation might not be. Use with caution.
+//! </div>
+//!
+//! Read more about data providers: [`icu_provider`]
+
+// Provider structs must be stable
+#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
+
+use icu_collections::char16trie::Char16Trie;
+use icu_collections::codepointtrie::CodePointTrie;
+use icu_provider::prelude::*;
+use zerovec::ZeroVec;
+
+#[cfg(feature = "compiled_data")]
+#[derive(Debug)]
+/// Baked data
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
+/// guaranteed to match with this version's `*_unstable` providers. Use with caution.
+/// </div>
+pub struct Baked;
+
+#[cfg(feature = "compiled_data")]
+#[allow(unused_imports)]
+const _: () = {
+ use icu_normalizer_data::*;
+ pub mod icu {
+ pub use crate as normalizer;
+ pub use icu_collections as collections;
+ }
+ make_provider!(Baked);
+ impl_normalizer_nfc_v1!(Baked);
+ impl_normalizer_nfd_data_v1!(Baked);
+ impl_normalizer_nfd_supplement_v1!(Baked);
+ impl_normalizer_nfd_tables_v1!(Baked);
+ impl_normalizer_nfkd_data_v1!(Baked);
+ impl_normalizer_nfkd_tables_v1!(Baked);
+ impl_normalizer_uts46_data_v1!(Baked);
+};
+
+icu_provider::data_marker!(
+ /// Marker for data for canonical decomposition.
+ NormalizerNfdDataV1,
+ "normalizer/nfd/data/v1",
+ DecompositionData<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for additional data for canonical decomposition.
+ NormalizerNfdTablesV1,
+ "normalizer/nfd/tables/v1",
+ DecompositionTables<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for data for compatibility decomposition.
+ NormalizerNfkdDataV1,
+ "normalizer/nfkd/data/v1",
+ DecompositionData<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for additional data for compatibility decomposition.
+ NormalizerNfkdTablesV1,
+ "normalizer/nfkd/tables/v1",
+ DecompositionTables<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for data for UTS-46 decomposition.
+ NormalizerUts46DataV1,
+ "normalizer/uts46/data/v1",
+ DecompositionData<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for data for composition.
+ NormalizerNfcV1,
+ "normalizer/nfc/v1",
+ CanonicalCompositions<'static>,
+ is_singleton = true
+);
+icu_provider::data_marker!(
+ /// Marker for additional data for non-recursive decomposition.
+ NormalizerNfdSupplementV1,
+ "normalizer/nfd/supplement/v1",
+ NonRecursiveDecompositionSupplement<'static>,
+ is_singleton = true
+);
+
+#[cfg(feature = "datagen")]
+/// The latest minimum set of markers required by this component.
+pub const MARKERS: &[DataMarkerInfo] = &[
+ NormalizerNfcV1::INFO,
+ NormalizerNfdDataV1::INFO,
+ NormalizerNfdTablesV1::INFO,
+ NormalizerNfkdDataV1::INFO,
+ NormalizerNfkdTablesV1::INFO,
+ NormalizerNfdSupplementV1::INFO,
+ NormalizerUts46DataV1::INFO,
+];
+
+/// Decomposition data
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct DecompositionData<'data> {
+ /// Trie for decomposition.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub trie: CodePointTrie<'data, u32>,
+ /// The passthrough bounds of NFD/NFC are lowered to this
+ /// maximum instead. (16-bit, because cannot be higher
+ /// than 0x0300, which is the bound for NFC.)
+ pub passthrough_cap: u16,
+}
+
+icu_provider::data_struct!(
+ DecompositionData<'_>,
+ #[cfg(feature = "datagen")]
+);
+
+/// The expansion tables for cases where the decomposition isn't
+/// contained in the trie value
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct DecompositionTables<'data> {
+ /// Decompositions that are fully within the BMP
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub scalars16: ZeroVec<'data, u16>,
+ /// Decompositions with at least one character outside
+ /// the BMP
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub scalars24: ZeroVec<'data, char>,
+}
+
+icu_provider::data_struct!(
+ DecompositionTables<'_>,
+ #[cfg(feature = "datagen")]
+);
+
+/// Non-Hangul canonical compositions
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct CanonicalCompositions<'data> {
+ /// Trie keys are two-`char` strings with the second
+ /// character coming first. The value, if any, is the
+ /// (non-Hangul) canonical composition.
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub canonical_compositions: Char16Trie<'data>,
+}
+
+icu_provider::data_struct!(
+ CanonicalCompositions<'_>,
+ #[cfg(feature = "datagen")]
+);
+
+/// Non-recursive canonical decompositions that differ from
+/// `DecompositionData`.
+///
+/// <div class="stab unstable">
+/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
+/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
+/// to be stable, their Rust representation might not be. Use with caution.
+/// </div>
+#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
+#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
+#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))]
+#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
+pub struct NonRecursiveDecompositionSupplement<'data> {
+ /// Trie for the supplementary non-recursive decompositions
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub trie: CodePointTrie<'data, u32>,
+ /// Decompositions with at least one character outside
+ /// the BMP
+ #[cfg_attr(feature = "serde", serde(borrow))]
+ pub scalars24: ZeroVec<'data, char>,
+}
+
+icu_provider::data_struct!(
+ NonRecursiveDecompositionSupplement<'_>,
+ #[cfg(feature = "datagen")]
+);
diff --git a/vendor/icu_normalizer/src/uts46.rs b/vendor/icu_normalizer/src/uts46.rs
new file mode 100644
index 00000000..672f5c5c
--- /dev/null
+++ b/vendor/icu_normalizer/src/uts46.rs
@@ -0,0 +1,177 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+//! Bundles the part of UTS 46 that makes sense to implement as a
+//! normalization.
+//!
+//! This is meant to be used as a building block of a UTS 46
+//! implementation, such as the `idna` crate.
+
+use crate::ComposingNormalizer;
+use crate::ComposingNormalizerBorrowed;
+use crate::NormalizerNfcV1;
+use crate::NormalizerNfdTablesV1;
+use crate::NormalizerNfkdTablesV1;
+use crate::NormalizerUts46DataV1;
+use icu_provider::DataError;
+use icu_provider::DataProvider;
+
+// Implementation note: Despite merely wrapping a `ComposingNormalizer`,
+// having a `Uts46Mapper` serves two purposes:
+//
+// 1. Denying public access to parts of the `ComposingNormalizer` API
+// that don't work when the data contains markers for ignorables.
+// 2. Providing a place where additional iterator pre-processing or
+// post-processing can take place if needed in the future. (When
+// writing this, it looked like such processing was needed but
+// now isn't needed after all.)
+
+/// A borrowed version of a mapper that knows how to perform the
+/// subsets of UTS 46 processing documented on the methods.
+#[derive(Debug)]
+pub struct Uts46MapperBorrowed<'a> {
+ normalizer: ComposingNormalizerBorrowed<'a>,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for Uts46MapperBorrowed<'static> {
+ fn default() -> Self {
+ Self::new()
+ }
+}
+
+impl Uts46MapperBorrowed<'static> {
+ /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`].
+ ///
+ /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some
+ /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`].
+ pub const fn static_to_owned(self) -> Uts46Mapper {
+ Uts46Mapper {
+ normalizer: self.normalizer.static_to_owned(),
+ }
+ }
+
+ /// Construct with compiled data.
+ #[cfg(feature = "compiled_data")]
+ pub const fn new() -> Self {
+ Uts46MapperBorrowed {
+ normalizer: ComposingNormalizerBorrowed::new_uts46(),
+ }
+ }
+}
+
+impl Uts46MapperBorrowed<'_> {
+ /// Returns an iterator adaptor that turns an `Iterator` over `char`
+ /// into an iterator yielding a `char` sequence that gets the following
+ /// operations from the "Map" and "Normalize" steps of the "Processing"
+ /// section of UTS 46 lazily applied to it:
+ ///
+ /// 1. The _ignored_ characters are ignored.
+ /// 2. The _mapped_ characters are mapped.
+ /// 3. The _disallowed_ characters are replaced with U+FFFD,
+ /// which itself is a disallowed character.
+ /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
+ /// as appropriate.
+ /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
+ /// 6. The _disallowed_STD3_mapped_ characters are treated as
+ /// _mapped_.
+ /// 7. The result is normalized to NFC.
+ ///
+ /// Notably:
+ ///
+ /// * The STD3 or WHATWG ASCII deny list should be implemented as a
+ /// post-processing step.
+ /// * Transitional processing is not performed. Transitional mapping
+ /// would be a pre-processing step, but transitional processing is
+ /// deprecated, and none of Firefox, Safari, or Chrome use it.
+ pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>(
+ &'delegate self,
+ iter: I,
+ ) -> impl Iterator<Item = char> + 'delegate {
+ self.normalizer
+ .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored)
+ }
+
+ /// Returns an iterator adaptor that turns an `Iterator` over `char`
+ /// into an iterator yielding a `char` sequence that gets the following
+ /// operations from the NFC check and status steps of the "Validity
+ /// Criteria" section of UTS 46 lazily applied to it:
+ ///
+ /// 1. The _ignored_ characters are treated as _disallowed_.
+ /// 2. The _mapped_ characters are mapped.
+ /// 3. The _disallowed_ characters are replaced with U+FFFD,
+ /// which itself is a disallowed character.
+ /// 4. The _deviation_ characters are treated as _mapped_ or _valid_
+ /// as appropriate.
+ /// 5. The _disallowed_STD3_valid_ characters are treated as allowed.
+ /// 6. The _disallowed_STD3_mapped_ characters are treated as
+ /// _mapped_.
+ /// 7. The result is normalized to NFC.
+ ///
+ /// Notably:
+ ///
+ /// * The STD3 or WHATWG ASCII deny list should be implemented as a
+ /// post-processing step.
+ /// * Transitional processing is not performed. Transitional mapping
+ /// would be a pre-processing step, but transitional processing is
+ /// deprecated, and none of Firefox, Safari, or Chrome use it.
+ /// * The output needs to be compared with input to see if anything
+ /// changed. This check catches failures to adhere to the normalization
+ /// and status requirements. In particular, this comparison results
+ /// in _mapped_ characters resulting in error like "Validity Criteria"
+ /// requires.
+ pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>(
+ &'delegate self,
+ iter: I,
+ ) -> impl Iterator<Item = char> + 'delegate {
+ self.normalizer
+ .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter)
+ }
+}
+
+/// A mapper that knows how to perform the subsets of UTS 46 processing
+/// documented on the methods.
+#[derive(Debug)]
+pub struct Uts46Mapper {
+ normalizer: ComposingNormalizer,
+}
+
+#[cfg(feature = "compiled_data")]
+impl Default for Uts46Mapper {
+ fn default() -> Self {
+ Self::new().static_to_owned()
+ }
+}
+
+impl Uts46Mapper {
+ /// Constructs a borrowed version of this type for more efficient querying.
+ pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> {
+ Uts46MapperBorrowed {
+ normalizer: self.normalizer.as_borrowed(),
+ }
+ }
+
+ /// Construct with compiled data.
+ #[cfg(feature = "compiled_data")]
+ #[allow(clippy::new_ret_no_self)]
+ pub const fn new() -> Uts46MapperBorrowed<'static> {
+ Uts46MapperBorrowed::new()
+ }
+
+ /// Construct with provider.
+ #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)]
+ pub fn try_new<D>(provider: &D) -> Result<Self, DataError>
+ where
+ D: DataProvider<NormalizerUts46DataV1>
+ + DataProvider<NormalizerNfdTablesV1>
+ + DataProvider<NormalizerNfkdTablesV1>
+ // UTS 46 tables merged into NormalizerNfkdTablesV1
+ + DataProvider<NormalizerNfcV1>
+ + ?Sized,
+ {
+ let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?;
+
+ Ok(Uts46Mapper { normalizer })
+ }
+}
diff --git a/vendor/icu_normalizer/tests/data/NormalizationTest.txt b/vendor/icu_normalizer/tests/data/NormalizationTest.txt
new file mode 100644
index 00000000..0d224b05
--- /dev/null
+++ b/vendor/icu_normalizer/tests/data/NormalizationTest.txt
@@ -0,0 +1,4 @@
+# This is a placeholder in the interest of keeping the repository size smaller.
+# Replace this file with the contents of
+# https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually
+# run the conformance test.
diff --git a/vendor/icu_normalizer/tests/data/README.md b/vendor/icu_normalizer/tests/data/README.md
new file mode 100644
index 00000000..8d407e46
--- /dev/null
+++ b/vendor/icu_normalizer/tests/data/README.md
@@ -0,0 +1,2 @@
+The test data comes from
+https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt
diff --git a/vendor/icu_normalizer/tests/tests.rs b/vendor/icu_normalizer/tests/tests.rs
new file mode 100644
index 00000000..5e6d8770
--- /dev/null
+++ b/vendor/icu_normalizer/tests/tests.rs
@@ -0,0 +1,2083 @@
+// This file is part of ICU4X. For terms of use, please see the file
+// called LICENSE at the top level of the ICU4X source tree
+// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
+
+use icu_normalizer::properties::CanonicalCombiningClassMap;
+use icu_normalizer::properties::CanonicalCombiningClassMapBorrowed;
+use icu_normalizer::properties::CanonicalComposition;
+use icu_normalizer::properties::CanonicalCompositionBorrowed;
+use icu_normalizer::properties::CanonicalDecomposition;
+use icu_normalizer::properties::CanonicalDecompositionBorrowed;
+use icu_normalizer::properties::Decomposed;
+use icu_normalizer::uts46::Uts46Mapper;
+use icu_normalizer::uts46::Uts46MapperBorrowed;
+use icu_normalizer::ComposingNormalizer;
+use icu_normalizer::ComposingNormalizerBorrowed;
+use icu_normalizer::DecomposingNormalizer;
+use icu_normalizer::DecomposingNormalizerBorrowed;
+
+#[test]
+fn test_nfd_basic() {
+ let normalizer = DecomposingNormalizerBorrowed::new_nfd(); // NFD with compiled data
+ assert_eq!(normalizer.normalize("ä"), "a\u{0308}");
+ assert_eq!(normalizer.normalize("Ä"), "A\u{0308}");
+ assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // musical half note, escaped so it stays precomposed
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "\u{FF8D}\u{FF9E}"); // half-width HE + voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "\u{FF8D}\u{FF9F}"); // half-width HE + semi-voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FB01}"), "\u{FB01}"); // fi ligature unchanged
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged
+ assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfd_owned() {
+ let owned =
+ DecomposingNormalizer::try_new_nfd_unstable(&icu_normalizer::provider::Baked).unwrap();
+ let normalizer = owned.as_borrowed(); // same checks as the borrowed NFD test, via the owned type
+ assert_eq!(normalizer.normalize("ä"), "a\u{0308}");
+ assert_eq!(normalizer.normalize("Ä"), "A\u{0308}");
+ assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // musical half note, escaped so it stays precomposed
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "\u{FF8D}\u{FF9E}"); // half-width HE + voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "\u{FF8D}\u{FF9F}"); // half-width HE + semi-voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FB01}"), "\u{FB01}"); // fi ligature unchanged
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged
+ assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfkd_basic() {
+ let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); // NFKD with compiled data
+ assert_eq!(normalizer.normalize("ä"), "a\u{0308}");
+ assert_eq!(normalizer.normalize("Ä"), "A\u{0308}");
+ assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // musical half note, escaped so it stays precomposed
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "ヘ\u{3099}"); // half-width to full-width
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "ヘ\u{309A}"); // half-width to full-width
+ assert_eq!(normalizer.normalize("\u{FB01}"), "fi"); // ligature expanded
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{635}\u{644}\u{649} \u{627}\u{644}\u{644}\u{647} \u{639}\u{644}\u{64A}\u{647} \u{648}\u{633}\u{644}\u{645}");
+ // ligature expanded
+ assert_eq!(normalizer.normalize("㈎"), "(\u{1100}\u{1161})"); // parenthetical expanded
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfkd_owned() {
+ let owned =
+ DecomposingNormalizer::try_new_nfkd_unstable(&icu_normalizer::provider::Baked).unwrap();
+ let normalizer = owned.as_borrowed(); // same checks as the borrowed NFKD test, via the owned type
+ assert_eq!(normalizer.normalize("ä"), "a\u{0308}");
+ assert_eq!(normalizer.normalize("Ä"), "A\u{0308}");
+ assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // musical half note, escaped so it stays precomposed
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "ヘ\u{3099}"); // half-width to full-width
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "ヘ\u{309A}"); // half-width to full-width
+ assert_eq!(normalizer.normalize("\u{FB01}"), "fi"); // ligature expanded
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{635}\u{644}\u{649} \u{627}\u{644}\u{644}\u{647} \u{639}\u{644}\u{64A}\u{647} \u{648}\u{633}\u{644}\u{645}");
+ // ligature expanded
+ assert_eq!(normalizer.normalize("㈎"), "(\u{1100}\u{1161})"); // parenthetical expanded
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfc_basic() {
+ let normalizer = ComposingNormalizerBorrowed::new_nfc(); // NFC with compiled data
+ assert_eq!(normalizer.normalize("a\u{0308}"), "ä");
+ assert_eq!(normalizer.normalize("A\u{0308}"), "Ä");
+ assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ");
+ assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // Composition exclusion (musical half note)
+
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "\u{FF8D}\u{FF9E}"); // half-width HE + voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "\u{FF8D}\u{FF9F}"); // half-width HE + semi-voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FB01}"), "\u{FB01}"); // fi ligature unchanged
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged
+ assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfc_owned() {
+ let owned =
+ ComposingNormalizer::try_new_nfc_unstable(&icu_normalizer::provider::Baked).unwrap();
+ let normalizer = owned.as_borrowed(); // same checks as the borrowed NFC test, via the owned type
+ assert_eq!(normalizer.normalize("a\u{0308}"), "ä");
+ assert_eq!(normalizer.normalize("A\u{0308}"), "Ä");
+ assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ");
+ assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // Composition exclusion (musical half note)
+
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "\u{FF8D}\u{FF9E}"); // half-width HE + voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "\u{FF8D}\u{FF9F}"); // half-width HE + semi-voiced mark unchanged
+ assert_eq!(normalizer.normalize("\u{FB01}"), "\u{FB01}"); // fi ligature unchanged
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged
+ assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfkc_basic() {
+ let normalizer = ComposingNormalizerBorrowed::new_nfkc(); // NFKC with compiled data
+ assert_eq!(normalizer.normalize("a\u{0308}"), "ä");
+ assert_eq!(normalizer.normalize("A\u{0308}"), "Ä");
+ assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ");
+ assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // Composition exclusion (musical half note)
+
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "ベ"); // half-width to full-width, then compose
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "ペ"); // half-width to full-width, then compose
+ assert_eq!(normalizer.normalize("\u{FB01}"), "fi"); // ligature expanded
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}");
+ // ligature expanded
+ assert_eq!(normalizer.normalize("㈎"), "(가)"); // parenthetical expanded and partially recomposed
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_nfkc_owned() {
+ let owned =
+ ComposingNormalizer::try_new_nfkc_unstable(&icu_normalizer::provider::Baked).unwrap();
+ let normalizer = owned.as_borrowed(); // same checks as the borrowed NFKC test, via the owned type
+ assert_eq!(normalizer.normalize("a\u{0308}"), "ä");
+ assert_eq!(normalizer.normalize("A\u{0308}"), "Ä");
+ assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ");
+ assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ");
+ assert_eq!(normalizer.normalize("\u{1D15E}"), "𝅗\u{1D165}"); // Composition exclusion (musical half note)
+
+ assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9E}"), "ベ"); // half-width to full-width, then compose
+ assert_eq!(normalizer.normalize("\u{FF8D}\u{FF9F}"), "ペ"); // half-width to full-width, then compose
+ assert_eq!(normalizer.normalize("\u{FB01}"), "fi"); // ligature expanded
+ assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}");
+ // ligature expanded
+ assert_eq!(normalizer.normalize("㈎"), "(가)"); // parenthetical expanded and partially recomposed
+ assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript
+}
+
+#[test]
+fn test_uts46_map_normalize() {
+ let mapper = Uts46MapperBorrowed::new(); // UTS 46 mapper with compiled data
+ assert_eq!(
+ mapper
+ .map_normalize("a\u{0308}".chars())
+ .collect::<String>(),
+ "ä"
+ );
+ assert_eq!(
+ mapper
+ .map_normalize("A\u{0308}".chars())
+ .collect::<String>(),
+ "ä"
+ );
+ assert_eq!(
+ mapper
+ .map_normalize("e\u{0323}\u{0302}".chars())
+ .collect::<String>(),
+ "ệ"
+ );
+ assert_eq!(
+ mapper
+ .map_normalize("E\u{0323}\u{0302}".chars())
+ .collect::<String>(),
+ "ệ"
+ );
+ assert_eq!(
+ mapper.map_normalize("\u{1D15E}".chars()).collect::<String>(),
+ "𝅗\u{1D165}"
+ ); // Composition exclusion (musical half note)
+
+ assert_eq!(
+ mapper.map_normalize("\u{2126}".chars()).collect::<String>(),
+ "ω"
+ ); // ohm sign
+ assert_eq!(mapper.map_normalize("\u{FF8D}\u{FF9E}".chars()).collect::<String>(), "ベ"); // half-width to full-width, then compose
+ assert_eq!(mapper.map_normalize("\u{FF8D}\u{FF9F}".chars()).collect::<String>(), "ペ"); // half-width to full-width, then compose
+ assert_eq!(mapper.map_normalize("\u{FB01}".chars()).collect::<String>(), "fi"); // ligature expanded
+ assert_eq!(mapper.map_normalize("\u{FDFA}".chars()).collect::<String>(), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}");
+ // ligature expanded
+ assert_eq!(
+ mapper.map_normalize("㈎".chars()).collect::<String>(),
+ "(가)"
+ ); // parenthetical expanded and partially recomposed
+
+ // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4)
+ assert_eq!(
+ mapper.map_normalize("\u{200C}".chars()).collect::<String>(),
+ "\u{200C}"
+ );
+ assert_eq!(
+ mapper.map_normalize("\u{200D}".chars()).collect::<String>(),
+ "\u{200D}"
+ );
+ assert_eq!(mapper.map_normalize("ß".chars()).collect::<String>(), "ß");
+ assert_eq!(mapper.map_normalize("ς".chars()).collect::<String>(), "ς");
+
+ // Iota subscript
+ assert_eq!(
+ mapper.map_normalize("\u{0345}".chars()).collect::<String>(),
+ "ι"
+ );
+
+ // Disallowed
+ assert_eq!(
+ mapper.map_normalize("\u{061C}".chars()).collect::<String>(),
+ "\u{FFFD}"
+ );
+
+ // Ignored
+ assert_eq!(
+ mapper
+ .map_normalize("a\u{180B}b".chars())
+ .collect::<String>(),
+ "ab"
+ );
+}
+
+#[test]
+fn test_uts46_owned() {
+    // Same expectations as the borrowed-constructor test, but the mapper is built
+    // from the baked provider and then borrowed.
+    let owned = Uts46Mapper::try_new(&icu_normalizer::provider::Baked).unwrap();
+    let mapper = owned.as_borrowed();
+    // Feeds the scalar values of `input` through `map_normalize` and compares the
+    // collected output with `expected`.
+    let check = |input: &str, expected: &str| {
+        let mapped: String = mapper.map_normalize(input.chars()).collect();
+        assert_eq!(mapped, expected);
+    };
+
+    check("a\u{0308}", "ä");
+    check("A\u{0308}", "ä");
+    check("e\u{0323}\u{0302}", "ệ");
+    check("E\u{0323}\u{0302}", "ệ");
+    check("𝅗𝅥", "𝅗\u{1D165}"); // Composition exclusion
+
+    check("\u{2126}", "ω"); // ohm sign
+    check("ベ", "ベ"); // half-width to full-width, then compose
+    check("ペ", "ペ"); // half-width to full-width, then compose
+    check("fi", "fi"); // ligature expanded
+    check(
+        "\u{FDFA}",
+        "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}",
+    ); // ligature expanded
+    check("㈎", "(가)"); // parenthetical expanded and partially recomposed
+
+    // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4)
+    check("\u{200C}", "\u{200C}");
+    check("\u{200D}", "\u{200D}");
+    check("ß", "ß");
+    check("ς", "ς");
+
+    // Iota subscript
+    check("\u{0345}", "ι");
+
+    // Disallowed
+    check("\u{061C}", "\u{FFFD}");
+
+    // Ignored
+    check("a\u{180B}b", "ab");
+}
+
+#[test]
+fn test_uts46_normalize_validate() {
+    let mapper = Uts46MapperBorrowed::new();
+    // Feeds the scalar values of `input` through `normalize_validate` and compares
+    // the collected output with `expected`.
+    let check = |input: &str, expected: &str| {
+        let validated: String = mapper.normalize_validate(input.chars()).collect();
+        assert_eq!(validated, expected);
+    };
+
+    check("a\u{0308}", "ä");
+    check("A\u{0308}", "ä");
+    check("e\u{0323}\u{0302}", "ệ");
+    check("E\u{0323}\u{0302}", "ệ");
+    check("𝅗𝅥", "𝅗\u{1D165}"); // Composition exclusion
+
+    check("\u{2126}", "ω"); // ohm sign
+    check("ベ", "ベ"); // half-width to full-width, then compose
+    check("ペ", "ペ"); // half-width to full-width, then compose
+    check("fi", "fi"); // ligature expanded
+    check(
+        "\u{FDFA}",
+        "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}",
+    ); // ligature expanded
+    check("㈎", "(가)"); // parenthetical expanded and partially recomposed
+
+    // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4)
+    check("\u{200C}", "\u{200C}");
+    check("\u{200D}", "\u{200D}");
+    check("ß", "ß");
+    check("ς", "ς");
+
+    // Iota subscript
+    check("\u{0345}", "ι");
+
+    // Disallowed
+    check("\u{061C}", "\u{FFFD}");
+
+    // Ignored: the expectation here is U+FFFD in place of the ignored code point,
+    // not its removal.
+    check("a\u{180B}b", "a\u{FFFD}b");
+}
+
+/// Shorthand for the `arraystring` string type, parameterized by `typenum::U48`, that the
+/// tests in this file use as their output sink and scratch string.
+type StackString = arraystring::ArrayString<arraystring::typenum::U48>;
+
+#[test]
+fn test_nfd_str_to() {
+    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+    let mut buf = StackString::new();
+    // Normalizes `input` into the emptied buffer and compares the buffer with `expected`.
+    let mut check = |input: &str, expected: &str| {
+        buf.clear();
+        assert!(normalizer.normalize_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check("ä", "a\u{0308}");
+    check("ệ", "e\u{0323}\u{0302}");
+}
+
+#[test]
+fn test_nfd_utf8_to() {
+    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+    let mut buf = StackString::new();
+    // Normalizes the UTF-8 bytes of `input` into the emptied buffer and compares the
+    // buffer with `expected`.
+    let mut check = |input: &str, expected: &str| {
+        buf.clear();
+        assert!(normalizer
+            .normalize_utf8_to(input.as_bytes(), &mut buf)
+            .is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check("ä", "a\u{0308}");
+    check("ệ", "e\u{0323}\u{0302}");
+}
+
+/// Shorthand for an `arrayvec::ArrayVec` of `u16` with capacity parameter 32, used by the
+/// tests in this file as the buffer for UTF-16 input and output.
+type StackVec = arrayvec::ArrayVec<u16, 32>;
+
+#[test]
+fn test_nfd_utf16_to() {
+    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+    let mut buf = StackVec::new();
+    // Normalizes the UTF-16 `input` into the emptied buffer and compares the buffer
+    // with `expected`.
+    let mut check = |input: &[u16], expected: &[u16]| {
+        buf.clear();
+        assert!(normalizer.normalize_utf16_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check(&[0x00E4u16], &[0x0061u16, 0x0308u16]);
+    check(&[0x1EC7u16], &[0x0065u16, 0x0323u16, 0x0302u16]);
+}
+
+#[test]
+fn test_nfc_str_to() {
+    let normalizer = ComposingNormalizerBorrowed::new_nfc();
+    let mut buf = StackString::new();
+    // Normalizes `input` into the emptied buffer and compares the buffer with `expected`.
+    let mut check = |input: &str, expected: &str| {
+        buf.clear();
+        assert!(normalizer.normalize_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check("a\u{0308}", "ä");
+    check("e\u{0323}\u{0302}", "ệ");
+}
+
+#[test]
+fn test_nfc_utf8_to() {
+    let normalizer = ComposingNormalizerBorrowed::new_nfc();
+    let mut buf = StackString::new();
+    // Normalizes the UTF-8 bytes of `input` into the emptied buffer and compares the
+    // buffer with `expected`.
+    let mut check = |input: &str, expected: &str| {
+        buf.clear();
+        assert!(normalizer
+            .normalize_utf8_to(input.as_bytes(), &mut buf)
+            .is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check("a\u{0308}", "ä");
+    check("e\u{0323}\u{0302}", "ệ");
+}
+
+#[test]
+fn test_nfc_utf16_to() {
+    let normalizer = ComposingNormalizerBorrowed::new_nfc();
+    let mut buf = StackVec::new();
+    // Normalizes the UTF-16 `input` into the emptied buffer and compares the buffer
+    // with `expected`.
+    let mut check = |input: &[u16], expected: &[u16]| {
+        buf.clear();
+        assert!(normalizer.normalize_utf16_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    check(&[0x0061u16, 0x0308u16], &[0x00E4u16]);
+    check(&[0x0065u16, 0x0323u16, 0x0302u16], &[0x1EC7u16]);
+}
+
+#[test]
+fn test_nfc_utf8_to_errors() {
+    let normalizer = ComposingNormalizerBorrowed::new_nfc();
+    let mut buf = StackString::new();
+    // Normalizes the (ill-formed) UTF-8 `input` into the emptied buffer; the call is
+    // expected to succeed and the buffer to equal `expected`.
+    let mut check = |input: &[u8], expected: &str| {
+        buf.clear();
+        assert!(normalizer.normalize_utf8_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    // Invalid bytes around a composable sequence.
+    check(b"\xFFa\xCC\x88\xFF", "\u{FFFD}ä\u{FFFD}");
+    check(b"\x80e\xCC\xA3\xCC\x82\x80", "\u{FFFD}ệ\u{FFFD}");
+    // Invalid bytes between runs of ASCII.
+    check(b"aaa\xFFaaa\xFFaaa", "aaa\u{FFFD}aaa\u{FFFD}aaa");
+    // Two-byte prefixes of a three-byte sequence between runs of ASCII.
+    check(b"aaa\xE2\x98aaa\xE2\x98aaa", "aaa\u{FFFD}aaa\u{FFFD}aaa");
+}
+
+#[test]
+fn test_nfd_utf8_to_errors() {
+    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+    let mut buf = StackString::new();
+    // Normalizes the (ill-formed) UTF-8 `input` into the emptied buffer; the call is
+    // expected to succeed and the buffer to equal `expected`.
+    let mut check = |input: &[u8], expected: &str| {
+        buf.clear();
+        assert!(normalizer.normalize_utf8_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    // Invalid bytes around a decomposable character.
+    check(b"\xFF\xC3\xA4\xFF", "\u{FFFD}a\u{0308}\u{FFFD}");
+    check(b"\x80\xE1\xBB\x87\x80", "\u{FFFD}e\u{0323}\u{0302}\u{FFFD}");
+    // Invalid bytes between runs of ASCII.
+    check(b"aaa\xFFaaa\xFFaaa", "aaa\u{FFFD}aaa\u{FFFD}aaa");
+    // Two-byte prefixes of a three-byte sequence between runs of ASCII.
+    check(b"aaa\xE2\x98aaa\xE2\x98aaa", "aaa\u{FFFD}aaa\u{FFFD}aaa");
+}
+
+#[test]
+fn test_nfc_utf16_to_errors() {
+    let normalizer = ComposingNormalizerBorrowed::new_nfc();
+    let mut buf = StackVec::new();
+    // Normalizes the UTF-16 `input` (containing unpaired surrogates) into the emptied
+    // buffer; the call is expected to succeed and the buffer to equal `expected`.
+    let mut check = |input: &[u16], expected: &[u16]| {
+        buf.clear();
+        assert!(normalizer.normalize_utf16_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    // Each scenario is exercised with an unpaired 0xD800 and then with an unpaired 0xDC00.
+
+    // Unpaired surrogate before a composable pair.
+    check(
+        &[0xD800u16, 0x0061u16, 0x0308u16],
+        &[0xFFFDu16, 0x00E4u16],
+    );
+    check(
+        &[0xDC00u16, 0x0061u16, 0x0308u16],
+        &[0xFFFDu16, 0x00E4u16],
+    );
+
+    // Unpaired surrogate between a starter and a composable pair.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0061u16, 0x0308u16],
+        &[0x0061u16, 0xFFFDu16, 0x00E4u16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0061u16, 0x0308u16],
+        &[0x0061u16, 0xFFFDu16, 0x00E4u16],
+    );
+
+    // As above, with another unpaired surrogate at the end.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0061u16, 0x0308u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x00E4u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0061u16, 0x0308u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x00E4u16, 0xFFFDu16],
+    );
+
+    // Unpaired surrogates among plain ASCII.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0061u16, 0x0061u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0061u16, 0x0061u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16],
+    );
+
+    // Unpaired surrogate separating a starter from a combining mark: no composition expected.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0308u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0308u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16],
+    );
+}
+
+#[test]
+fn test_nfd_utf16_to_errors() {
+    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
+    let mut buf = StackVec::new();
+    // Normalizes the UTF-16 `input` (containing unpaired surrogates) into the emptied
+    // buffer; the call is expected to succeed and the buffer to equal `expected`.
+    let mut check = |input: &[u16], expected: &[u16]| {
+        buf.clear();
+        assert!(normalizer.normalize_utf16_to(input, &mut buf).is_ok());
+        assert_eq!(&buf, expected);
+    };
+
+    // Each scenario is exercised with an unpaired 0xD800 and then with an unpaired 0xDC00.
+
+    // Unpaired surrogate before a decomposable character.
+    check(
+        &[0xD800u16, 0x00E4u16],
+        &[0xFFFDu16, 0x0061u16, 0x0308u16],
+    );
+    check(
+        &[0xDC00u16, 0x00E4u16],
+        &[0xFFFDu16, 0x0061u16, 0x0308u16],
+    );
+
+    // Unpaired surrogate between a starter and a decomposable character.
+    check(
+        &[0x0061u16, 0xD800u16, 0x00E4u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x00E4u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16],
+    );
+
+    // As above, with another unpaired surrogate at the end.
+    check(
+        &[0x0061u16, 0xD800u16, 0x00E4u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x00E4u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16, 0xFFFDu16],
+    );
+
+    // Unpaired surrogates among plain ASCII.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0061u16, 0x0061u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0061u16, 0x0061u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16],
+    );
+
+    // Unpaired surrogate separating a starter from a combining mark.
+    check(
+        &[0x0061u16, 0xD800u16, 0x0308u16, 0xD800u16],
+        &[0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16],
+    );
+    check(
+        &[0x0061u16, 0xDC00u16, 0x0308u16, 0xDC00u16],
+        &[0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16],
+    );
+}
+
+use atoi::FromRadix16;
+use icu_properties::props::CanonicalCombiningClass;
+
+/// Parse five semicolon-terminated strings consisting of space-separated hexadecimal scalar values
+///
+/// Field `i` of the line is decoded into element `i` of the returned array. Parsing
+/// stops immediately after the fifth `;`, so whatever follows it on the line is never
+/// examined.
+///
+/// # Panics
+///
+/// Panics if a hexadecimal value is expected but no digit is found, if a value is not
+/// a Unicode scalar value, if a field does not fit into a `StackString`, if a value is
+/// followed by anything other than a space or `;`, or if the input ends before the
+/// fifth `;`.
+fn parse_hex(mut hexes: &[u8]) -> [StackString; 5] {
+    let mut strings = [
+        StackString::new(),
+        StackString::new(),
+        StackString::new(),
+        StackString::new(),
+        StackString::new(),
+    ];
+    let mut current = 0;
+    loop {
+        let (scalar, mut offset) = u32::from_radix_16(hexes);
+        // A consumed length of zero means there was no hex digit at this position
+        // (e.g. a doubled separator). Without this check the zero that comes back
+        // would be decoded as U+0000 and silently appended to the field.
+        assert!(offset > 0, "Bad format: Expected hexadecimal digits");
+        let c = core::char::from_u32(scalar).unwrap();
+        strings[current].try_push(c).unwrap();
+        // Checked lookup so that a truncated line reports a format error instead of
+        // an index-out-of-bounds panic.
+        match hexes.get(offset).copied() {
+            Some(b';') => {
+                current += 1;
+                if current == strings.len() {
+                    return strings;
+                }
+                offset += 1;
+            }
+            Some(b' ') => {
+                offset += 1;
+            }
+            Some(_) => {
+                panic!("Bad format: Garbage");
+            }
+            None => {
+                panic!("Bad format: Unexpected end of input");
+            }
+        }
+        hexes = &hexes[offset..];
+    }
+}
+
+#[test]
+fn test_conformance() {
+    let nfd = DecomposingNormalizerBorrowed::new_nfd();
+    let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+    let nfc = ComposingNormalizerBorrowed::new_nfc();
+    let nfkc = ComposingNormalizerBorrowed::new_nfkc();
+
+    // Asserts that each of the four normalizers maps the lone scalar value `c` to itself.
+    let assert_inert = |c: char| {
+        assert!(nfd
+            .normalize_iter(core::iter::once(c))
+            .eq(core::iter::once(c)));
+        assert!(nfkd
+            .normalize_iter(core::iter::once(c))
+            .eq(core::iter::once(c)));
+        assert!(nfc
+            .normalize_iter(core::iter::once(c))
+            .eq(core::iter::once(c)));
+        assert!(nfkc
+            .normalize_iter(core::iter::once(c))
+            .eq(core::iter::once(c)));
+    };
+
+    // Code point of the most recent part 1 source character.
+    let mut prev = 0u32;
+    // Digit of the most recent `@Part` header.
+    let mut part = 0u8;
+    let data = include_bytes!("data/NormalizationTest.txt");
+    for line in data.split(|b| *b == b'\n') {
+        // Blank lines and comment lines carry no test data.
+        if line.is_empty() || line.starts_with(b"#") {
+            continue;
+        }
+        if line.starts_with(b"@Part") {
+            part = line[5] - b'0';
+            if part == 2 {
+                // Part 1 is over: every scalar value above its last entry must be inert.
+                for c in (prev + 1..=0x10FFFF).filter_map(char::from_u32) {
+                    assert_inert(c);
+                }
+            }
+            continue;
+        }
+        let strings = parse_hex(line);
+        // Field layout of `strings`:
+        // 0: source
+        // 1: NFC
+        // 2: NFD
+        // 3: NFKC
+        // 4: NFKD
+        if part == 1 {
+            // The source field of a part 1 line must be exactly one scalar value.
+            let mut source = strings[0].chars();
+            let current = source.next().unwrap();
+            assert_eq!(source.next(), None);
+            let current_u = u32::from(current);
+            // Scalar values strictly between two consecutive part 1 entries must be inert.
+            for c in (prev + 1..current_u).filter_map(char::from_u32) {
+                assert_inert(c);
+            }
+            prev = current_u;
+        }
+
+        // NFC: fields 0..=2 normalize to field 1; fields 3 and 4 normalize to field 3.
+        for (src, dst) in [(0usize, 1usize), (1, 1), (2, 1), (3, 3), (4, 3)] {
+            assert!(nfc
+                .normalize_iter(strings[src].chars())
+                .eq(strings[dst].chars()));
+        }
+
+        // NFD: fields 0..=2 normalize to field 2; fields 3 and 4 normalize to field 4.
+        for (src, dst) in [(0usize, 2usize), (1, 2), (2, 2), (3, 4), (4, 4)] {
+            assert!(nfd
+                .normalize_iter(strings[src].chars())
+                .eq(strings[dst].chars()));
+        }
+
+        // NFKC: every field normalizes to field 3.
+        for field in &strings {
+            assert!(nfkc
+                .normalize_iter(field.chars())
+                .eq(strings[3].chars()));
+        }
+
+        // NFKD: every field normalizes to field 4.
+        for field in &strings {
+            assert!(nfkd
+                .normalize_iter(field.chars())
+                .eq(strings[4].chars()));
+        }
+    }
+}
+
+// Commented out, because we don't currently have a way to force a no-op set for testing.
+// #[test]
+// fn test_hangul() {
+// use icu_collections::codepointinvlist::{CodePointSet, CodePointSetBuilder};
+// use zerofrom::ZeroFrom;
+// let builder = CodePointSetBuilder::new();
+// let set: CodePointSet = builder.build();
+
+// let normalizer: ComposingNormalizer = ComposingNormalizerBorrowed::new_nfc();
+// {
+// let mut norm_iter = normalizer.normalize_iter("A\u{AC00}\u{11A7}".chars());
+// // Pessimize passthrough to avoid hiding bugs.
+// norm_iter
+// .decomposition
+// .potential_passthrough_and_not_backward_combining = Some(ZeroFrom::zero_from(&set));
+// assert!(norm_iter.eq("A\u{AC00}\u{11A7}".chars()));
+// }
+// {
+// let mut norm_iter = normalizer.normalize_iter("A\u{AC00}\u{11C2}".chars());
+// // Pessimize passthrough to avoid hiding bugs.
+// norm_iter
+// .decomposition
+// .potential_passthrough_and_not_backward_combining = Some(ZeroFrom::zero_from(&set));
+// assert!(norm_iter.eq("A\u{AC1B}".chars()));
+// }
+// }
+
+/// Replaces the contents of `sink` with the UTF-16 encoding of `s`.
+///
+/// Panics if `sink` rejects an append.
+fn str_to_utf16(s: &str, sink: &mut StackVec) {
+    // Scratch space for the UTF-16 code units of a single scalar value.
+    let mut units = [0u16; 2];
+    sink.clear();
+    s.chars().for_each(|ch| {
+        let encoded = ch.encode_utf16(&mut units);
+        sink.try_extend_from_slice(encoded).unwrap();
+    });
+}
+
+/// Replaces the contents of `sink` with the UTF-16 encoding of the single scalar value `c`.
+///
+/// Panics if `sink` rejects the append.
+fn char_to_utf16(c: char, sink: &mut StackVec) {
+    let mut units = [0u16; 2];
+    let encoded = c.encode_utf16(&mut units);
+    sink.clear();
+    sink.try_extend_from_slice(encoded).unwrap();
+}
+
+/// Replaces the contents of `sink` with a copy of `s`.
+///
+/// Panics if `sink` rejects the append.
+fn str_to_str(s: &str, sink: &mut StackString) {
+    sink.clear();
+    let appended = sink.try_push_str(s);
+    appended.unwrap();
+}
+
+/// Replaces the contents of `sink` with the single scalar value `c`.
+///
+/// Panics if `sink` rejects the append.
+fn char_to_str(c: char, sink: &mut StackString) {
+    sink.clear();
+    let appended = sink.try_push(c);
+    appended.unwrap();
+}
+
+#[test]
+fn test_conformance_utf16() {
+    let nfd = DecomposingNormalizerBorrowed::new_nfd();
+    let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+    let nfc = ComposingNormalizerBorrowed::new_nfc();
+    let nfkc = ComposingNormalizerBorrowed::new_nfkc();
+
+    let mut input = StackVec::new();
+    let mut normalized = StackVec::new();
+    let mut expected = StackVec::new();
+
+    // Asserts that `$normalizer` maps the UTF-16 form of `$src` to the UTF-16 form of
+    // `$dst`, using the three scratch buffers above.
+    macro_rules! assert_normalizes {
+        ($normalizer:ident, $src:expr, $dst:expr) => {{
+            normalized.clear();
+            str_to_utf16($src, &mut input);
+            str_to_utf16($dst, &mut expected);
+            assert!($normalizer
+                .normalize_utf16_to(&input, &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &expected);
+        }};
+    }
+
+    // Asserts that each of the four normalizers maps the lone scalar value `$c` to
+    // itself. `input` is filled once; the normalizers only read it.
+    macro_rules! assert_inert {
+        ($c:expr) => {{
+            char_to_utf16($c, &mut input);
+
+            normalized.clear();
+            assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok());
+            assert_eq!(&normalized, &input);
+        }};
+    }
+
+    // Code point of the most recent part 1 source character.
+    let mut prev = 0u32;
+    // Digit of the most recent `@Part` header.
+    let mut part = 0u8;
+    let data = include_bytes!("data/NormalizationTest.txt");
+    for line in data.split(|b| *b == b'\n') {
+        // Blank lines and comment lines carry no test data.
+        if line.is_empty() || line.starts_with(b"#") {
+            continue;
+        }
+        if line.starts_with(b"@Part") {
+            part = line[5] - b'0';
+            if part == 2 {
+                // Part 1 is over: every scalar value above its last entry must be inert.
+                for c in (prev + 1..=0x10FFFF).filter_map(char::from_u32) {
+                    assert_inert!(c);
+                }
+            }
+            continue;
+        }
+        let strings = parse_hex(line);
+        // Field layout of `strings`:
+        // 0: source
+        // 1: NFC
+        // 2: NFD
+        // 3: NFKC
+        // 4: NFKD
+        if part == 1 {
+            // The source field of a part 1 line must be exactly one scalar value.
+            let mut source = strings[0].chars();
+            let current = source.next().unwrap();
+            assert_eq!(source.next(), None);
+            let current_u = u32::from(current);
+            // Scalar values strictly between two consecutive part 1 entries must be inert.
+            for c in (prev + 1..current_u).filter_map(char::from_u32) {
+                assert_inert!(c);
+            }
+            prev = current_u;
+        }
+
+        // NFC: fields 0..=2 normalize to field 1; fields 3 and 4 normalize to field 3.
+        for (src, dst) in [(0usize, 1usize), (1, 1), (2, 1), (3, 3), (4, 3)] {
+            assert_normalizes!(nfc, &strings[src], &strings[dst]);
+        }
+
+        // NFD: fields 0..=2 normalize to field 2; fields 3 and 4 normalize to field 4.
+        for (src, dst) in [(0usize, 2usize), (1, 2), (2, 2), (3, 4), (4, 4)] {
+            assert_normalizes!(nfd, &strings[src], &strings[dst]);
+        }
+
+        // NFKC: every field normalizes to field 3.
+        for field in &strings {
+            assert_normalizes!(nfkc, field, &strings[3]);
+        }
+
+        // NFKD: every field normalizes to field 4.
+        for field in &strings {
+            assert_normalizes!(nfkd, field, &strings[4]);
+        }
+    }
+}
+
+#[test]
+fn test_conformance_utf8() {
+    let nfd = DecomposingNormalizerBorrowed::new_nfd();
+    let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+    let nfc = ComposingNormalizerBorrowed::new_nfc();
+    let nfkc = ComposingNormalizerBorrowed::new_nfkc();
+
+    let mut input = StackString::new();
+    let mut normalized = StackString::new();
+    let mut expected = StackString::new();
+
+    // Asserts that `$normalizer` maps the UTF-8 bytes of `$src` to `$dst`, using the
+    // three scratch buffers above.
+    macro_rules! assert_normalizes {
+        ($normalizer:ident, $src:expr, $dst:expr) => {{
+            normalized.clear();
+            str_to_str($src, &mut input);
+            str_to_str($dst, &mut expected);
+            assert!($normalizer
+                .normalize_utf8_to(input.as_bytes(), &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &expected);
+        }};
+    }
+
+    // Asserts that each of the four normalizers maps the lone scalar value `$c` to
+    // itself. `input` is filled once; the normalizers only read it.
+    macro_rules! assert_inert {
+        ($c:expr) => {{
+            char_to_str($c, &mut input);
+
+            normalized.clear();
+            assert!(nfd
+                .normalize_utf8_to(input.as_bytes(), &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfkd
+                .normalize_utf8_to(input.as_bytes(), &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfc
+                .normalize_utf8_to(input.as_bytes(), &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &input);
+
+            normalized.clear();
+            assert!(nfkc
+                .normalize_utf8_to(input.as_bytes(), &mut normalized)
+                .is_ok());
+            assert_eq!(&normalized, &input);
+        }};
+    }
+
+    // Code point of the most recent part 1 source character.
+    let mut prev = 0u32;
+    // Digit of the most recent `@Part` header.
+    let mut part = 0u8;
+    let data = include_bytes!("data/NormalizationTest.txt");
+    for line in data.split(|b| *b == b'\n') {
+        // Blank lines and comment lines carry no test data.
+        if line.is_empty() || line.starts_with(b"#") {
+            continue;
+        }
+        if line.starts_with(b"@Part") {
+            part = line[5] - b'0';
+            if part == 2 {
+                // Part 1 is over: every scalar value above its last entry must be inert.
+                for c in (prev + 1..=0x10FFFF).filter_map(char::from_u32) {
+                    assert_inert!(c);
+                }
+            }
+            continue;
+        }
+        let strings = parse_hex(line);
+        // Field layout of `strings`:
+        // 0: source
+        // 1: NFC
+        // 2: NFD
+        // 3: NFKC
+        // 4: NFKD
+        if part == 1 {
+            // The source field of a part 1 line must be exactly one scalar value.
+            let mut source = strings[0].chars();
+            let current = source.next().unwrap();
+            assert_eq!(source.next(), None);
+            let current_u = u32::from(current);
+            // Scalar values strictly between two consecutive part 1 entries must be inert.
+            for c in (prev + 1..current_u).filter_map(char::from_u32) {
+                assert_inert!(c);
+            }
+            prev = current_u;
+        }
+
+        // NFC: fields 0..=2 normalize to field 1; fields 3 and 4 normalize to field 3.
+        for (src, dst) in [(0usize, 1usize), (1, 1), (2, 1), (3, 3), (4, 3)] {
+            assert_normalizes!(nfc, &strings[src], &strings[dst]);
+        }
+
+        // NFD: fields 0..=2 normalize to field 2; fields 3 and 4 normalize to field 4.
+        for (src, dst) in [(0usize, 2usize), (1, 2), (2, 2), (3, 4), (4, 4)] {
+            assert_normalizes!(nfd, &strings[src], &strings[dst]);
+        }
+
+        // NFKC: every field normalizes to field 3.
+        for field in &strings {
+            assert_normalizes!(nfkc, field, &strings[3]);
+        }
+
+        // NFKD: every field normalizes to field 4.
+        for field in &strings {
+            assert_normalizes!(nfkd, field, &strings[4]);
+        }
+    }
+}
+
+#[test]
+fn test_canonical_composition() {
+    // Each row is (first character, second character, expected result of
+    // composing the pair with the compiled-data composer).
+    let cases = [
+        ('a', 'b', None), // Just two starters
+        ('a', '\u{0308}', Some('ä')),
+        ('A', '\u{0308}', Some('Ä')),
+        ('ẹ', '\u{0302}', Some('ệ')),
+        ('Ẹ', '\u{0302}', Some('Ệ')),
+        ('\u{1D157}', '\u{1D165}', None), // Composition exclusion
+        ('ে', 'া', Some('ো')), // Second is starter; BMP
+        ('𑄱', '𑄧', Some('𑄮')), // Second is starter; non-BMP
+        ('ᄀ', 'ᅡ', Some('가')), // Hangul LV
+        ('가', 'ᆨ', Some('각')), // Hangul LVT
+    ];
+
+    let composer = CanonicalCompositionBorrowed::new();
+    for (first, second, expected) in cases {
+        assert_eq!(composer.compose(first, second), expected);
+    }
+}
+
+#[test]
+fn test_canonical_composition_owned() {
+    // Same expectations as the borrowed-constructor test, but the composer is
+    // built through the owned type from the baked provider.
+    let provider = icu_normalizer::provider::Baked;
+    let owned = CanonicalComposition::try_new_unstable(&provider).unwrap();
+    let composer = owned.as_borrowed();
+
+    // Asserts that composing `first` with `second` yields `want`.
+    let expect = |first: char, second: char, want: Option<char>| {
+        assert_eq!(composer.compose(first, second), want);
+    };
+
+    expect('a', 'b', None); // Just two starters
+
+    expect('a', '\u{0308}', Some('ä'));
+    expect('A', '\u{0308}', Some('Ä'));
+    expect('ẹ', '\u{0302}', Some('ệ'));
+    expect('Ẹ', '\u{0302}', Some('Ệ'));
+    expect('\u{1D157}', '\u{1D165}', None); // Composition exclusion
+
+    expect('ে', 'া', Some('ো')); // Second is starter; BMP
+    expect('𑄱', '𑄧', Some('𑄮')); // Second is starter; non-BMP
+
+    expect('ᄀ', 'ᅡ', Some('가')); // Hangul LV
+    expect('가', 'ᆨ', Some('각')); // Hangul LVT
+}
+
+#[test]
+fn test_canonical_decomposition() {
+    // Each row is (input character, expected single-step decomposition).
+    let cases = [
+        ('ä', Decomposed::Expansion('a', '\u{0308}')),
+        ('Ä', Decomposed::Expansion('A', '\u{0308}')),
+        ('ệ', Decomposed::Expansion('ẹ', '\u{0302}')),
+        ('Ệ', Decomposed::Expansion('Ẹ', '\u{0302}')),
+        (
+            '\u{1D15E}',
+            Decomposed::Expansion('\u{1D157}', '\u{1D165}'),
+        ),
+        ('ো', Decomposed::Expansion('ে', 'া')),
+        ('𑄮', Decomposed::Expansion('𑄱', '𑄧')),
+        ('가', Decomposed::Expansion('ᄀ', 'ᅡ')),
+        ('각', Decomposed::Expansion('가', 'ᆨ')),
+        ('\u{212B}', Decomposed::Singleton('Å')), // ANGSTROM SIGN
+        ('\u{2126}', Decomposed::Singleton('Ω')), // OHM SIGN
+        ('\u{1F71}', Decomposed::Singleton('ά')), // oxia
+        ('\u{1F72}', Decomposed::Expansion('ε', '\u{0300}')), // not oxia but in the oxia range
+        ('ά', Decomposed::Expansion('α', '\u{0301}')), // tonos
+    ];
+
+    let decomposer = CanonicalDecompositionBorrowed::new();
+    for (input, expected) in cases {
+        assert_eq!(decomposer.decompose(input), expected);
+    }
+}
+
+#[test]
+fn test_canonical_decomposition_owned() {
+    // Same expectations as the borrowed-constructor test, but the decomposer
+    // is built through the owned type from the baked provider.
+    let provider = icu_normalizer::provider::Baked;
+    let owned = CanonicalDecomposition::try_new_unstable(&provider).unwrap();
+    let decomposer = owned.as_borrowed();
+
+    // Asserts that `input` decomposes into the pair (`first`, `second`).
+    let expands = |input: char, first: char, second: char| {
+        assert_eq!(
+            decomposer.decompose(input),
+            Decomposed::Expansion(first, second)
+        );
+    };
+    // Asserts that `input` decomposes into the single character `target`.
+    let singleton = |input: char, target: char| {
+        assert_eq!(decomposer.decompose(input), Decomposed::Singleton(target));
+    };
+
+    expands('ä', 'a', '\u{0308}');
+    expands('Ä', 'A', '\u{0308}');
+    expands('ệ', 'ẹ', '\u{0302}');
+    expands('Ệ', 'Ẹ', '\u{0302}');
+    expands('\u{1D15E}', '\u{1D157}', '\u{1D165}');
+    expands('ো', 'ে', 'া');
+    expands('𑄮', '𑄱', '𑄧');
+    expands('가', 'ᄀ', 'ᅡ');
+    expands('각', '가', 'ᆨ');
+
+    singleton('\u{212B}', 'Å'); // ANGSTROM SIGN
+    singleton('\u{2126}', 'Ω'); // OHM SIGN
+
+    singleton('\u{1F71}', 'ά'); // oxia
+    expands('\u{1F72}', 'ε', '\u{0300}'); // not oxia but in the oxia range
+    expands('ά', 'α', '\u{0301}'); // tonos
+}
+
+#[test]
+fn test_ccc() {
+    // Compares the normalizer's canonical combining class map against the
+    // `icu_properties` map for every value in 0..=0x10FFFF.
+    let map = CanonicalCombiningClassMapBorrowed::new();
+    // Build the reference map once instead of once per loop iteration.
+    let reference = icu_properties::CodePointMapData::<CanonicalCombiningClass>::new();
+    for u in 0..=0x10FFFF {
+        // The message names the code point, which `assert_eq!` alone would not show.
+        assert_eq!(
+            map.get32(u),
+            reference.get32(u),
+            "canonical combining class mismatch at U+{:04X}",
+            u
+        );
+    }
+}
+
+#[test]
+fn test_ccc_owned() {
+    // Same comparison as the borrowed-constructor test, but the normalizer's
+    // map is built through the owned type from the baked provider.
+    let owned =
+        CanonicalCombiningClassMap::try_new_unstable(&icu_normalizer::provider::Baked).unwrap();
+    let map = owned.as_borrowed();
+    // Build the reference map once instead of once per loop iteration.
+    let reference = icu_properties::CodePointMapData::<CanonicalCombiningClass>::new();
+    for u in 0..=0x10FFFF {
+        // The message names the code point, which `assert_eq!` alone would not show.
+        assert_eq!(
+            map.get32(u),
+            reference.get32(u),
+            "canonical combining class mismatch at U+{:04X}",
+            u
+        );
+    }
+}
+
+#[test]
+fn test_utf16_basic() {
+    // Each row is (UTF-16 input, expected NFC output as UTF-16 code units).
+    let cases: [(&[u16], &[u16]); 2] = [
+        (&[0x0061], &[0x0061]),
+        // Two combining marks with no base: the test expects them reordered.
+        (&[0x0300, 0x0323], &[0x0323, 0x0300]),
+    ];
+
+    let nfc = ComposingNormalizerBorrowed::new_nfc();
+    for (input, expected) in cases {
+        assert_eq!(nfc.normalize_utf16(input).as_ref(), expected);
+    }
+}
+
+#[test]
+fn test_accented_digraph() {
+    // Both inputs are expected to produce the same NFKD output.
+    let expected = "DZ\u{0323}\u{030C}";
+    let inputs = ["\u{01C4}\u{0323}", "DZ\u{030C}\u{0323}"];
+
+    let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+    for input in inputs {
+        assert_eq!(nfkd.normalize(input), expected);
+    }
+}
+
+#[test]
+fn test_ddd() {
+    // NFD of U+0DDD followed by U+0334: the test expects U+0334 to end up
+    // before the final U+0DCA of the decomposition.
+    let input = "\u{0DDD}\u{0334}";
+    let expected = "\u{0DD9}\u{0DCF}\u{0334}\u{0DCA}";
+
+    let nfd = DecomposingNormalizerBorrowed::new_nfd();
+    assert_eq!(nfd.normalize(input), expected);
+}
+
+#[test] // Covers `is_normalized`, `is_normalized_utf8` and `is_normalized_utf16` on NFD, NFKD, NFC and NFKC normalizers.
+fn test_is_normalized() {
+ let nfd = DecomposingNormalizerBorrowed::new_nfd();
+ let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+ let nfc = ComposingNormalizerBorrowed::new_nfc();
+ let nfkc = ComposingNormalizerBorrowed::new_nfkc();
+ // ASCII-only text: expected to be reported as normalized in every form.
+ let aaa = "aaa";
+ assert!(nfd.is_normalized(aaa));
+ assert!(nfkd.is_normalized(aaa));
+ assert!(nfc.is_normalized(aaa));
+ assert!(nfkc.is_normalized(aaa));
+
+ assert!(nfd.is_normalized_utf8(aaa.as_bytes()));
+ assert!(nfkd.is_normalized_utf8(aaa.as_bytes()));
+ assert!(nfc.is_normalized_utf8(aaa.as_bytes()));
+ assert!(nfkc.is_normalized_utf8(aaa.as_bytes()));
+
+ let aaa16 = [0x0061u16, 0x0061u16, 0x0061u16].as_slice();
+ assert!(nfd.is_normalized_utf16(aaa16));
+ assert!(nfkd.is_normalized_utf16(aaa16));
+ assert!(nfc.is_normalized_utf16(aaa16));
+ assert!(nfkc.is_normalized_utf16(aaa16));
+ // Byte 0xFF is not valid UTF-8; the test expects such input to still be reported as normalized.
+ let affa = b"a\xFFa";
+ assert!(nfd.is_normalized_utf8(affa));
+ assert!(nfkd.is_normalized_utf8(affa));
+ assert!(nfc.is_normalized_utf8(affa));
+ assert!(nfkc.is_normalized_utf8(affa));
+ // 0xD800 is a surrogate code unit with no partner; expected to be reported as normalized as well.
+ let a_surrogate_a = [0x0061u16, 0xD800u16, 0x0061u16].as_slice();
+ assert!(nfd.is_normalized_utf16(a_surrogate_a));
+ assert!(nfkd.is_normalized_utf16(a_surrogate_a));
+ assert!(nfc.is_normalized_utf16(a_surrogate_a));
+ assert!(nfkc.is_normalized_utf16(a_surrogate_a));
+ // A non-BMP character followed by U+1D165: expected to be normalized in all four forms (i.e. left uncomposed by NFC/NFKC).
+ let note = "a𝅗\u{1D165}a";
+ assert!(nfd.is_normalized(note));
+ assert!(nfkd.is_normalized(note));
+ assert!(nfc.is_normalized(note));
+ assert!(nfkc.is_normalized(note));
+
+ assert!(nfd.is_normalized_utf8(note.as_bytes()));
+ assert!(nfkd.is_normalized_utf8(note.as_bytes()));
+ assert!(nfc.is_normalized_utf8(note.as_bytes()));
+ assert!(nfkc.is_normalized_utf8(note.as_bytes()));
+
+ let note16 = [ // 'a', surrogate pair for U+1D157, surrogate pair for U+1D165, 'a'
+ 0x0061u16, 0xD834u16, 0xDD57u16, 0xD834u16, 0xDD65u16, 0x0061u16,
+ ]
+ .as_slice();
+ assert!(nfd.is_normalized_utf16(note16));
+ assert!(nfkd.is_normalized_utf16(note16));
+ assert!(nfc.is_normalized_utf16(note16));
+ assert!(nfkc.is_normalized_utf16(note16));
+ // 'ä' between two 'a's: expected to be normalized for the composing forms only.
+ let umlaut = "aäa";
+ assert!(!nfd.is_normalized(umlaut));
+ assert!(!nfkd.is_normalized(umlaut));
+ assert!(nfc.is_normalized(umlaut));
+ assert!(nfkc.is_normalized(umlaut));
+
+ assert!(!nfd.is_normalized_utf8(umlaut.as_bytes()));
+ assert!(!nfkd.is_normalized_utf8(umlaut.as_bytes()));
+ assert!(nfc.is_normalized_utf8(umlaut.as_bytes()));
+ assert!(nfkc.is_normalized_utf8(umlaut.as_bytes()));
+
+ let umlaut16 = [0x0061u16, 0x00E4u16, 0x0061u16].as_slice();
+ assert!(!nfd.is_normalized_utf16(umlaut16));
+ assert!(!nfkd.is_normalized_utf16(umlaut16));
+ assert!(nfc.is_normalized_utf16(umlaut16));
+ assert!(nfkc.is_normalized_utf16(umlaut16));
+ // '½' between two 'a's: expected to be normalized for NFD/NFC but not for the compatibility forms NFKD/NFKC.
+ let fraction = "a½a";
+ assert!(nfd.is_normalized(fraction));
+ assert!(!nfkd.is_normalized(fraction));
+ assert!(nfc.is_normalized(fraction));
+ assert!(!nfkc.is_normalized(fraction));
+
+ assert!(nfd.is_normalized_utf8(fraction.as_bytes()));
+ assert!(!nfkd.is_normalized_utf8(fraction.as_bytes()));
+ assert!(nfc.is_normalized_utf8(fraction.as_bytes()));
+ assert!(!nfkc.is_normalized_utf8(fraction.as_bytes()));
+
+ let fraction16 = [0x0061u16, 0x00BDu16, 0x0061u16].as_slice();
+ assert!(nfd.is_normalized_utf16(fraction16));
+ assert!(!nfkd.is_normalized_utf16(fraction16));
+ assert!(nfc.is_normalized_utf16(fraction16));
+ assert!(!nfkc.is_normalized_utf16(fraction16));
+}
+
+#[test] // Covers `split_normalized`, `split_normalized_utf8` and `split_normalized_utf16` on NFD, NFKD, NFC and NFKC normalizers.
+fn test_is_normalized_up_to() {
+ let nfd = DecomposingNormalizerBorrowed::new_nfd();
+ let nfkd = DecomposingNormalizerBorrowed::new_nfkd();
+ let nfc = ComposingNormalizerBorrowed::new_nfc();
+ let nfkc = ComposingNormalizerBorrowed::new_nfkc();
+
+ // For each form: split the `&str` with `split_normalized`, normalize the tail onto a copy of the head, and assert the combined string is normalized.
+ let check_str = |input: &str| {
+ // Check nfd
+ let (head, tail) = nfd.split_normalized(input);
+ let mut normalized = String::from(head);
+ let _ = nfd.normalize_to(tail, &mut normalized);
+ assert!(nfd.is_normalized(&normalized));
+
+ // Check nfkd
+ let (head, tail) = nfkd.split_normalized(input);
+ let mut normalized = String::from(head);
+ let _ = nfkd.normalize_to(tail, &mut normalized);
+ assert!(nfkd.is_normalized(&normalized));
+
+ // Check nfc
+ let (head, tail) = nfc.split_normalized(input);
+ let mut normalized = String::from(head);
+ let _ = nfc.normalize_to(tail, &mut normalized);
+ assert!(nfc.is_normalized(&normalized));
+
+ // Check nfkc
+ let (head, tail) = nfkc.split_normalized(input);
+ let mut normalized = String::from(head);
+ let _ = nfkc.normalize_to(tail, &mut normalized);
+ assert!(nfkc.is_normalized(&normalized));
+ };
+
+ // Same check for UTF-8 byte input, using `split_normalized_utf8` and `normalize_utf8_to`.
+ // NOTE(review): the original note here warned that `from_utf8` can panic on invalid UTF-8, but no `from_utf8` call is visible in this closure - confirm whether the note is stale.
+ let check_utf8 = |input: &[u8]| {
+ // Check nfd
+ let (head, tail) = nfd.split_normalized_utf8(input);
+ let mut normalized = String::from(head);
+ let _ = nfd.normalize_utf8_to(tail, &mut normalized);
+ assert!(nfd.is_normalized(&normalized));
+
+ // Check nfkd
+ let (head, tail) = nfkd.split_normalized_utf8(input);
+ let mut normalized = String::from(head);
+ let _ = nfkd.normalize_utf8_to(tail, &mut normalized);
+ assert!(nfkd.is_normalized(&normalized));
+
+ // Check nfc
+ let (head, tail) = nfc.split_normalized_utf8(input);
+ let mut normalized = String::from(head);
+ let _ = nfc.normalize_utf8_to(tail, &mut normalized);
+ assert!(nfc.is_normalized(&normalized));
+
+ // Check nfkc
+ let (head, tail) = nfkc.split_normalized_utf8(input);
+ let mut normalized = String::from(head);
+ let _ = nfkc.normalize_utf8_to(tail, &mut normalized);
+ assert!(nfkc.is_normalized(&normalized));
+ };
+
+ // Same check for UTF-16 code units, using `split_normalized_utf16` and `normalize_utf16_to` with a `Vec<u16>` buffer.
+ let check_utf16 = |input: &[u16]| {
+ // Check nfd
+ let (head, tail) = nfd.split_normalized_utf16(input);
+ let mut normalized = head.to_vec();
+ let _ = nfd.normalize_utf16_to(tail, &mut normalized);
+ assert!(nfd.is_normalized_utf16(&normalized));
+
+ // Check nfkd
+ let (head, tail) = nfkd.split_normalized_utf16(input);
+ let mut normalized = head.to_vec();
+ let _ = nfkd.normalize_utf16_to(tail, &mut normalized);
+ assert!(nfkd.is_normalized_utf16(&normalized));
+
+ // Check nfc
+ let (head, tail) = nfc.split_normalized_utf16(input);
+ let mut normalized = head.to_vec();
+ let _ = nfc.normalize_utf16_to(tail, &mut normalized);
+ assert!(nfc.is_normalized_utf16(&normalized));
+
+ // Check nfkc
+ let (head, tail) = nfkc.split_normalized_utf16(input);
+ let mut normalized = head.to_vec();
+ let _ = nfkc.normalize_utf16_to(tail, &mut normalized);
+ assert!(nfkc.is_normalized_utf16(&normalized));
+ };
+ // ASCII-only text: the head is expected to span the whole input for every form and encoding.
+ let aaa = "aaa";
+ check_str(aaa);
+
+ let aaa_utf8 = aaa.as_bytes();
+ check_utf8(aaa_utf8);
+
+ let aaa_utf16: Vec<u16> = aaa.encode_utf16().collect();
+ check_utf16(&aaa_utf16);
+
+ assert!(nfd.split_normalized(aaa).0.len() == aaa.len());
+ assert!(nfkd.split_normalized(aaa).0.len() == aaa.len());
+ assert!(nfc.split_normalized(aaa).0.len() == aaa.len());
+ assert!(nfkc.split_normalized(aaa).0.len() == aaa.len());
+ assert!(nfd.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len());
+ assert!(nfkd.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len());
+ assert!(nfc.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len());
+ assert!(nfkc.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len());
+ assert!(nfd.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len());
+ assert!(nfkd.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len());
+ assert!(nfc.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len());
+ assert!(nfkc.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len());
+ // A non-BMP character followed by U+1D165: the head is again expected to span the whole input.
+ let note = "a𝅗\u{1D165}a";
+ check_str(note);
+
+ let note_utf8 = note.as_bytes();
+ check_utf8(note_utf8);
+
+ let note_utf16: Vec<u16> = note.encode_utf16().collect();
+ check_utf16(&note_utf16);
+
+ assert!(nfd.split_normalized(note).0.len() == note.len());
+ assert!(nfkd.split_normalized(note).0.len() == note.len());
+ assert!(nfc.split_normalized(note).0.len() == note.len());
+ assert!(nfkc.split_normalized(note).0.len() == note.len());
+ assert!(nfd.split_normalized_utf8(note_utf8).0.len() == note_utf8.len());
+ assert!(nfkd.split_normalized_utf8(note_utf8).0.len() == note_utf8.len());
+ assert!(nfc.split_normalized_utf8(note_utf8).0.len() == note_utf8.len());
+ assert!(nfkc.split_normalized_utf8(note_utf8).0.len() == note_utf8.len());
+ assert!(nfd.split_normalized_utf16(&note_utf16).0.len() == note_utf16.len());
+ assert!(nfkd.split_normalized_utf16(&note_utf16).0.len() == note_utf16.len());
+ assert!(nfc.split_normalized_utf16(&note_utf16).0.len() == note_utf16.len());
+ assert!(nfkc.split_normalized_utf16(&note_utf16).0.len() == note_utf16.len());
+ // 'ä' between two 'a's. Expected head lengths below are bytes for `&str`/UTF-8 and code units for UTF-16.
+ let umlaut = "aäa";
+ check_str(umlaut);
+
+ let umlaut_utf8 = umlaut.as_bytes();
+ check_utf8(umlaut_utf8);
+
+ let umlaut_utf16: Vec<u16> = umlaut.encode_utf16().collect();
+ check_utf16(&umlaut_utf16);
+
+ assert_eq!(nfd.split_normalized(umlaut).0.len(), 1);
+ assert_eq!(nfkd.split_normalized(umlaut).0.len(), 1);
+ assert_eq!(nfc.split_normalized(umlaut).0.len(), 4);
+ assert_eq!(nfkc.split_normalized(umlaut).0.len(), 4);
+ assert_eq!(nfd.split_normalized_utf8(umlaut_utf8).0.len(), 1);
+ assert_eq!(nfkd.split_normalized_utf8(umlaut_utf8).0.len(), 1);
+ assert_eq!(nfc.split_normalized_utf8(umlaut_utf8).0.len(), 4);
+ assert_eq!(nfkc.split_normalized_utf8(umlaut_utf8).0.len(), 4);
+ assert_eq!(nfd.split_normalized_utf16(&umlaut_utf16).0.len(), 1);
+ assert_eq!(nfkd.split_normalized_utf16(&umlaut_utf16).0.len(), 1);
+ assert_eq!(nfc.split_normalized_utf16(&umlaut_utf16).0.len(), 3);
+ assert_eq!(nfkc.split_normalized_utf16(&umlaut_utf16).0.len(), 3);
+ // '½' between two 'a's: the compatibility forms are expected to stop after the first 'a'; NFD/NFC accept the whole input.
+ let fraction = "a½a";
+ check_str(fraction);
+
+ let fraction_utf8 = fraction.as_bytes();
+ check_utf8(fraction_utf8);
+
+ let fraction_utf16: Vec<u16> = fraction.encode_utf16().collect();
+ check_utf16(&fraction_utf16);
+
+ assert_eq!(nfd.split_normalized(fraction).0.len(), 4);
+ assert_eq!(nfkd.split_normalized(fraction).0.len(), 1);
+ assert_eq!(nfc.split_normalized(fraction).0.len(), 4);
+ assert_eq!(nfkc.split_normalized(fraction).0.len(), 1);
+ assert_eq!(nfd.split_normalized_utf8(fraction_utf8).0.len(), 4);
+ assert_eq!(nfkd.split_normalized_utf8(fraction_utf8).0.len(), 1);
+ assert_eq!(nfc.split_normalized_utf8(fraction_utf8).0.len(), 4);
+ assert_eq!(nfkc.split_normalized_utf8(fraction_utf8).0.len(), 1);
+ assert_eq!(nfd.split_normalized_utf16(&fraction_utf16).0.len(), 3);
+ assert_eq!(nfkd.split_normalized_utf16(&fraction_utf16).0.len(), 1);
+ assert_eq!(nfc.split_normalized_utf16(&fraction_utf16).0.len(), 3);
+ assert_eq!(nfkc.split_normalized_utf16(&fraction_utf16).0.len(), 1);
+ // 'e' followed by U+0302 then U+0323: decomposing forms are expected to keep only the 'e' as head; composing forms keep nothing.
+ let reversed_vietnamese = "e\u{0302}\u{0323}";
+ check_str(reversed_vietnamese);
+
+ let reversed_vietnamese_utf8 = reversed_vietnamese.as_bytes();
+ check_utf8(reversed_vietnamese_utf8);
+
+ let reversed_vietnamese_utf16: Vec<u16> = reversed_vietnamese.encode_utf16().collect();
+ check_utf16(&reversed_vietnamese_utf16);
+
+ assert_eq!(nfd.split_normalized(reversed_vietnamese).0.len(), 1);
+ assert_eq!(nfkd.split_normalized(reversed_vietnamese).0.len(), 1);
+ assert_eq!(nfc.split_normalized(reversed_vietnamese).0.len(), 0);
+ assert_eq!(nfkc.split_normalized(reversed_vietnamese).0.len(), 0);
+ assert_eq!(
+ nfd.split_normalized_utf8(reversed_vietnamese_utf8).0.len(),
+ 1
+ );
+ assert_eq!(
+ nfkd.split_normalized_utf8(reversed_vietnamese_utf8).0.len(),
+ 1
+ );
+ assert_eq!(
+ nfc.split_normalized_utf8(reversed_vietnamese_utf8).0.len(),
+ 0
+ );
+ assert_eq!(
+ nfkc.split_normalized_utf8(reversed_vietnamese_utf8).0.len(),
+ 0
+ );
+ assert_eq!(
+ nfd.split_normalized_utf16(&reversed_vietnamese_utf16)
+ .0
+ .len(),
+ 1
+ );
+ assert_eq!(
+ nfkd.split_normalized_utf16(&reversed_vietnamese_utf16)
+ .0
+ .len(),
+ 1
+ );
+ assert_eq!(
+ nfc.split_normalized_utf16(&reversed_vietnamese_utf16)
+ .0
+ .len(),
+ 0
+ );
+ assert_eq!(
+ nfkc.split_normalized_utf16(&reversed_vietnamese_utf16)
+ .0
+ .len(),
+ 0
+ );
+ // 'e' followed by U+0302 only: decomposing forms are expected to accept the whole input; composing forms keep nothing as head.
+ let truncated_vietnamese = "e\u{0302}";
+ check_str(truncated_vietnamese);
+
+ let truncated_vietnamese_utf8 = truncated_vietnamese.as_bytes();
+ check_utf8(truncated_vietnamese_utf8);
+
+ let truncated_vietnamese_utf16: Vec<u16> = truncated_vietnamese.encode_utf16().collect();
+ check_utf16(&truncated_vietnamese_utf16);
+
+ assert_eq!(nfd.split_normalized(truncated_vietnamese).0.len(), 3);
+ assert_eq!(nfkd.split_normalized(truncated_vietnamese).0.len(), 3);
+ assert_eq!(nfc.split_normalized(truncated_vietnamese).0.len(), 0);
+ assert_eq!(nfkc.split_normalized(truncated_vietnamese).0.len(), 0);
+ assert_eq!(
+ nfd.split_normalized_utf8(truncated_vietnamese_utf8).0.len(),
+ 3
+ );
+ assert_eq!(
+ nfkd.split_normalized_utf8(truncated_vietnamese_utf8)
+ .0
+ .len(),
+ 3
+ );
+ assert_eq!(
+ nfc.split_normalized_utf8(truncated_vietnamese_utf8).0.len(),
+ 0
+ );
+ assert_eq!(
+ nfkc.split_normalized_utf8(truncated_vietnamese_utf8)
+ .0
+ .len(),
+ 0
+ );
+ assert_eq!(
+ nfd.split_normalized_utf16(&truncated_vietnamese_utf16)
+ .0
+ .len(),
+ 2
+ );
+ assert_eq!(
+ nfkd.split_normalized_utf16(&truncated_vietnamese_utf16)
+ .0
+ .len(),
+ 2
+ );
+ assert_eq!(
+ nfc.split_normalized_utf16(&truncated_vietnamese_utf16)
+ .0
+ .len(),
+ 0
+ );
+ assert_eq!(
+ nfkc.split_normalized_utf16(&truncated_vietnamese_utf16)
+ .0
+ .len(),
+ 0
+ );
+}