diff options
| author | mo khan <mo@mokhan.ca> | 2025-07-10 13:11:11 -0600 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2025-07-10 13:11:11 -0600 |
| commit | 01959b16a21b22b5df5f16569c2a8e8f92beecef (patch) | |
| tree | 32afa5d747c5466345c59ec52161a7cba3d6d755 /vendor/icu_normalizer | |
| parent | ff30574117a996df332e23d1fb6f65259b316b5b (diff) | |
chore: vendor dependencies
Diffstat (limited to 'vendor/icu_normalizer')
36 files changed, 9397 insertions, 0 deletions
diff --git a/vendor/icu_normalizer/.cargo-checksum.json b/vendor/icu_normalizer/.cargo-checksum.json new file mode 100644 index 00000000..93f500cf --- /dev/null +++ b/vendor/icu_normalizer/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.lock":"1c8fe1c6e24d42329df5fb51aa5c07741ff411e78bcbef0f15cfc0cf400b4335","Cargo.toml":"b3ebc0d4deaf34153984d80c71ecfde9fe30d5621081322a00ff87c73348e57e","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"aec56e279d7e40a901b47a2eccb52197fde6c9499011b349c5ef509363bee6a9","benches/bench.rs":"9cd781e3d0e8d772860cd332b4f403910f3ca52fd69a459f5ac95d28f0e25ac2","benches/canonical_composition.rs":"0aa91d5d400f58da61865f5fabe878c8506e60466c78503f77041ef7257e6dbe","benches/canonical_decomposition.rs":"3b44b8f832e426e8c82e449743117182ab7b138288001b621ccc9325b4c27b6c","benches/composing_normalizer_nfc.rs":"9a7aaae94e0096ccac9f3d1a83585c3f449af87f9f0f8b05615d2a010078e3e8","benches/composing_normalizer_nfkc.rs":"ad92d562a1e9aad3611521526882e1896aa436d2ac59493c8c00686c57bdf31e","benches/data/README.md":"fa79b84815a228c3fbfa5d4c6d12885036994ca8ad61e683b2113cf2b428bb85","benches/data/TestNames_Japanese_h.txt":"6522f8ed794ad348c904079082ec3aa303ae7acf3f68bbc49fa0ee90eebf31e0","benches/data/TestNames_Japanese_k.txt":"e4e18804fe742ecd27ae48bc3564c6bc653180a3c649d43a2ab4d8b7f2607627","benches/data/TestNames_Korean.txt":"9cbf54d5ee16726c0fc9477366e273ba1b82e651c9e88e6c7532df5344f03920","benches/data/TestNames_Latin.txt":"3a30d450d259a6be4a6aee8eeef08d3767d11fcc047b8f58060c542efe1182d1","benches/data/TestNames_Thai.txt":"28d76ddb62d6f47646232860fce7440544f402158443889393fd7e8bf10e9c3d","benches/data/TestRandomWordsUDHR_ar.txt":"02a775153e9746ae938a9db0b60244f2c00d911bb72b611a3593b0991fd95723","benches/data/TestRandomWordsUDHR_de.txt":"100b9502e7ddcb2fcbd055cb7ec9113245105bd1c606cace5e5bc147cc18727b","benches/data/TestRandomWordsUDHR_el.txt":"d1a2f0f9efc9ce663026ca7c285177391937c90008479a8c5b909c300dc86972","benche
s/data/TestRandomWordsUDHR_es.txt":"deeebda09e0ce0f80dd805317e96d1a630908601ff2a4d1ccb0021b00b55814b","benches/data/TestRandomWordsUDHR_fr.txt":"5931edc9f1af2c27a0b35c9624732e70b87b0fd72ab486710f3aa6367c7ad35f","benches/data/TestRandomWordsUDHR_he.txt":"dc77a89ffb9803e5c574d87f4789cb17624df73e40a8a92961df8ea8be103425","benches/data/TestRandomWordsUDHR_pl.txt":"26c378295ee2ef75ccacea691df0456394184a9a5c9ce48b2bada169b2402bbb","benches/data/TestRandomWordsUDHR_ru.txt":"a1c339f6d7b69cf9154e855c290ab09eeaf167ebcdf6d4bcb917de039fba10ee","benches/data/TestRandomWordsUDHR_th.txt":"3ba518be9863c85c3ac80cbb12299e3594e6f5afed3406d910d948007adaaf4e","benches/data/TestRandomWordsUDHR_tr.txt":"815c7babbc7228ef89b56f29638aeb63013aeca0003a49e58994e26b41cba01c","benches/data/wotw.txt":"8f28e68041ce75bbf75e72e186a6145e4c2de9e7e62b9b86ce0621c527a23669","benches/decomposing_normalizer_nfd.rs":"28f3d54c9af813af7ac9d0fbc9d45a7a6d27a25266bd593453eb35c1894280b5","benches/decomposing_normalizer_nfkd.rs":"cbaa2755878ee1cc90170210fddb7c79836457f89eb84f4f32fb51348f350bd5","src/lib.rs":"49621ffe84e82515aecf3c660234355561520ee11066d30d49ef1189181b4ef4","src/properties.rs":"3940f55f1e608fe9a70cb943e71cfd37894339af6b7d13697ae1776d7c1a2cc0","src/provider.rs":"5850afc7ae842c7af74ce029be256944c64f5d0b51d95725a8366f5af22163e9","src/uts46.rs":"a54b6191cbb0538da16d8ef0b6dfb3adfa2ca30e4161aaf37bcaae3e6537de80","tests/data/NormalizationTest.txt":"1b04c22b82064adf871e76fd2148cd749129163f7d05bd7ace923516a65afe02","tests/data/README.md":"521fcd44a1f10f21629df88113fa29ca9f4e1dfbeea79fda19a7dc8ba435e24b","tests/tests.rs":"01db1c9dc1c7c71f80aed528e4309f416349af9eec887d2e438a3a11f2ee7f7c"},"package":"436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979"}
\ No newline at end of file diff --git a/vendor/icu_normalizer/Cargo.lock b/vendor/icu_normalizer/Cargo.lock new file mode 100644 index 00000000..0c89b8b3 --- /dev/null +++ b/vendor/icu_normalizer/Cargo.lock @@ -0,0 +1,970 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" + +[[package]] +name = "arraystring" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d517c467117e1d8ca795bc8cc90857ff7f79790cca0e26f6e9462694ece0185" +dependencies = [ + "typenum", +] + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e578d6ec4194633722ccf9544794b71b1385c3c027efe0c55db226fc880865c" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4df4df40ec50c46000231c914968278b1eb05098cf8f1b3a518a95030e71d1c7" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "cobs" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"67ba02a97a2bd10f4b59b25c7973101c79642302776489e030cd13cdab09ed15" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" + +[[package]] +name = "databake" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" +dependencies = [ + "databake-derive", + "proc-macro2", + "quote", +] + +[[package]] +name = 
"databake-derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6834770958c7b84223607e49758ec0dde273c4df915e734aad50f62968a4c134" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "detone" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5b580660e7375410c9199e84aa298f919925fb53d8cc9b02d8010ff5a14d09" + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "erased-serde" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e004d887f51fcb9fef17317a2f3525c887d8aa3f4f50fed920816a688284a5b7" +dependencies = [ + "serde", + "typeid", +] + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + +[[package]] +name = "hermit-abi" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "databake", + "displaydoc", + "potential_utf", + "serde", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "databake", + "displaydoc", + "litemap", + "serde", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +dependencies = [ + "arraystring", + "arrayvec", + "atoi", + "criterion", + "databake", + "detone", + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "serde", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2549ca8c7241c82f59c80ba2a6f415d931c5b58d24fb8412caa1a1f02c49139a" +dependencies = [ + "databake", + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "serde", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8197e866e47b68f8f7d95249e172903bec06004b18b2937f1095d40a0c57de04" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "databake", + "displaydoc", + "erased-serde", + "icu_locale_core", + "postcard", + "serde", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" +dependencies = [ + "serde", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "oorandom" +version = "11.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + +[[package]] +name = "postcard" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "170a2601f67cc9dba8edd8c4870b15f71a6a2dc196daec8c83f72b59dff628a8" +dependencies = [ + "cobs", + "serde", +] + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "databake", + "serde", + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", 
+] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "rustversion" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.219" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.140" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "smallvec" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8917285742e9f3e1683f0a9c4e6b57960b7314d0b08d30d1ecd426713ee2eee9" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "serde", + "zerovec", +] + +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "typeid" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc7d623258602320d5c55d1bc22793b57daff0ec7efc270ea7d55ce1d5f5471c" + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + 
"quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + 
"windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" +dependencies = [ + "arrayvec", +] + 
+[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "databake", + "displaydoc", + "litemap", + "serde", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "zerovec" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +dependencies = [ + "databake", + "serde", + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/vendor/icu_normalizer/Cargo.toml b/vendor/icu_normalizer/Cargo.toml new file mode 100644 index 00000000..f9eb1163 --- /dev/null +++ b/vendor/icu_normalizer/Cargo.toml @@ -0,0 +1,201 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.82" +name = "icu_normalizer" +version = "2.0.0" +authors = ["The ICU4X Project Developers"] +build = false +include = [ + "data/**/*", + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md", + "build.rs", +] +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "API for normalizing text into Unicode Normalization Forms" +homepage = "https://icu4x.unicode.org" +readme = "README.md" +categories = ["internationalization"] +license = "Unicode-3.0" +repository = "https://github.com/unicode-org/icu4x" + +[package.metadata.docs.rs] +all-features = true + +[features] +compiled_data = [ + "dep:icu_normalizer_data", + "icu_properties?/compiled_data", + "icu_provider/baked", +] +datagen = [ + "serde", + "dep:databake", + "icu_properties", + "icu_collections/databake", + "zerovec/databake", + "icu_properties?/datagen", + "icu_provider/export", +] +default = [ + "compiled_data", + "utf8_iter", + "utf16_iter", +] +experimental = [] +icu_properties = ["dep:icu_properties"] +serde = [ + "dep:serde", + "icu_collections/serde", 
+ "zerovec/serde", + "icu_properties?/serde", + "icu_provider/serde", +] +utf16_iter = [ + "dep:utf16_iter", + "write16", +] +utf8_iter = ["dep:utf8_iter"] + +[lib] +name = "icu_normalizer" +path = "src/lib.rs" + +[[test]] +name = "tests" +path = "tests/tests.rs" + +[[bench]] +name = "bench" +path = "benches/bench.rs" +harness = false +required-features = [ + "utf16_iter", + "utf8_iter", +] + +[[bench]] +name = "canonical_composition" +path = "benches/canonical_composition.rs" + +[[bench]] +name = "canonical_decomposition" +path = "benches/canonical_decomposition.rs" + +[[bench]] +name = "composing_normalizer_nfc" +path = "benches/composing_normalizer_nfc.rs" + +[[bench]] +name = "composing_normalizer_nfkc" +path = "benches/composing_normalizer_nfkc.rs" + +[[bench]] +name = "decomposing_normalizer_nfd" +path = "benches/decomposing_normalizer_nfd.rs" + +[[bench]] +name = "decomposing_normalizer_nfkd" +path = "benches/decomposing_normalizer_nfkd.rs" + +[dependencies.databake] +version = "0.2.0" +features = ["derive"] +optional = true +default-features = false + +[dependencies.displaydoc] +version = "0.2.3" +default-features = false + +[dependencies.icu_collections] +version = "~2.0.0" +default-features = false + +[dependencies.icu_normalizer_data] +version = "~2.0.0" +optional = true +default-features = false + +[dependencies.icu_properties] +version = "~2.0.0" +optional = true +default-features = false + +[dependencies.icu_provider] +version = "2.0.0" +default-features = false + +[dependencies.serde] +version = "1.0.110" +features = [ + "derive", + "alloc", +] +optional = true +default-features = false + +[dependencies.smallvec] +version = "1.10.0" +default-features = false + +[dependencies.utf16_iter] +version = "1.0.2" +optional = true +default-features = false + +[dependencies.utf8_iter] +version = "1.0.2" +optional = true +default-features = false + +[dependencies.write16] +version = "1.0.0" +features = ["alloc"] +optional = true +default-features = false + 
+[dependencies.zerovec] +version = "0.11.1" +default-features = false + +[dev-dependencies.arraystring] +version = "0.3.0" + +[dev-dependencies.arrayvec] +version = "0.7.2" +default-features = false + +[dev-dependencies.atoi] +version = "2.0.0" + +[dev-dependencies.detone] +version = "1.0.0" + +[dev-dependencies.write16] +version = "1.0.0" +features = ["arrayvec"] +default-features = false + +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies.criterion] +version = "0.5.0" diff --git a/vendor/icu_normalizer/LICENSE b/vendor/icu_normalizer/LICENSE new file mode 100644 index 00000000..c9be6012 --- /dev/null +++ b/vendor/icu_normalizer/LICENSE @@ -0,0 +1,46 @@ +UNICODE LICENSE V3 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 2020-2024 Unicode, Inc. + +NOTICE TO USER: Carefully read the following legal agreement. BY +DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR +SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT +DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of data files and any associated documentation (the "Data Files") or +software and any associated documentation (the "Software") to deal in the +Data Files or Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Data Files or Software, and to permit persons to whom the +Data Files or Software are furnished to do so, provided that either (a) +this copyright and permission notice appear with all copies of the Data +Files or Software, or (b) this copyright and permission notice appear in +associated Documentation. 
+ +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. + +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE +BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, +OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA +FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + +SPDX-License-Identifier: Unicode-3.0 + +— + +Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. +ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. diff --git a/vendor/icu_normalizer/README.md b/vendor/icu_normalizer/README.md new file mode 100644 index 00000000..5c9e7409 --- /dev/null +++ b/vendor/icu_normalizer/README.md @@ -0,0 +1,48 @@ +# icu_normalizer [](https://crates.io/crates/icu_normalizer) + +<!-- cargo-rdme start --> + +Normalizing text into Unicode Normalization Forms. + +This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) +and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. + +## Functionality + +The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode +Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. 
+ +Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, +and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. + +The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA +Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by +applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the +[`idna`](https://docs.rs/idna/latest/idna/) crate. + +The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and +the canonical composition operation given two `char`s. It also provides access to the Canonical Combining Class +property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the +[`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate. + +Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in +addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive +non-“maybe” answer. + +## Examples + +```rust +let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); +assert_eq!(nfc.normalize("a\u{0308}"), "ä"); +assert!(nfc.is_normalized("ä")); + +let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); +assert_eq!(nfd.normalize("ä"), "a\u{0308}"); +assert!(!nfd.is_normalized("ä")); +``` + +<!-- cargo-rdme end --> + +## More Information + +For more information on development, authorship, contributing, etc., please visit the [`ICU4X home page`](https://github.com/unicode-org/icu4x). 
diff --git a/vendor/icu_normalizer/benches/bench.rs b/vendor/icu_normalizer/benches/bench.rs new file mode 100644 index 00000000..011478af --- /dev/null +++ b/vendor/icu_normalizer/benches/bench.rs @@ -0,0 +1,24 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use criterion::{criterion_group, criterion_main}; + +mod canonical_composition; +mod canonical_decomposition; +mod composing_normalizer_nfc; +mod composing_normalizer_nfkc; +mod decomposing_normalizer_nfd; +mod decomposing_normalizer_nfkd; + +criterion_group!( + benches, + canonical_composition::criterion_benchmark, + canonical_decomposition::criterion_benchmark, + composing_normalizer_nfc::criterion_benchmark, + composing_normalizer_nfkc::criterion_benchmark, + decomposing_normalizer_nfd::criterion_benchmark, + decomposing_normalizer_nfkd::criterion_benchmark, +); + +criterion_main!(benches); diff --git a/vendor/icu_normalizer/benches/canonical_composition.rs b/vendor/icu_normalizer/benches/canonical_composition.rs new file mode 100644 index 00000000..134c08d8 --- /dev/null +++ b/vendor/icu_normalizer/benches/canonical_composition.rs @@ -0,0 +1,188 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use criterion::{black_box, BenchmarkId, Criterion}; +use detone::IterDecomposeVietnamese; + +use icu_normalizer::properties::{ + CanonicalCompositionBorrowed, CanonicalDecompositionBorrowed, Decomposed, +}; +use icu_normalizer::ComposingNormalizerBorrowed; + +struct BenchDataContent { + pub file_name: String, + pub pairs: Vec<(char, char)>, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 16] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + + [ + BenchDataContent { + file_name: "TestNames_Latin".to_owned(), + pairs: decompose_data( + &nfc_normalizer + .normalize(&strip_headers(include_str!("./data/TestNames_Latin.txt"))), + ), + }, + BenchDataContent { + file_name: "TestNames_Japanese_h".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestNames_Japanese_h.txt" + )))), + }, + BenchDataContent { + file_name: "TestNames_Japanese_k".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestNames_Japanese_k.txt" + )))), + }, + BenchDataContent { + file_name: "TestNames_Korean".to_owned(), + pairs: decompose_data( + &nfc_normalizer + .normalize(&strip_headers(include_str!("./data/TestNames_Korean.txt"))), + ), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_ar".to_owned(), + #[cfg(debug_assertions)] + pairs: Vec::new(), + #[cfg(not(debug_assertions))] + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_ar.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_de".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_de.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_el".to_owned(), + pairs: 
decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_el.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_es".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_es.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_fr".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_fr.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_he".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_he.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_pl".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_pl.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_ru".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_ru.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_th".to_owned(), + #[cfg(debug_assertions)] + pairs: Vec::new(), + #[cfg(not(debug_assertions))] + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_th.txt" + )))), + }, + BenchDataContent { + file_name: "TestRandomWordsUDHR_tr".to_owned(), + pairs: decompose_data(&nfc_normalizer.normalize(&strip_headers(include_str!( + "./data/TestRandomWordsUDHR_tr.txt" + )))), + }, + BenchDataContent { + file_name: "udhr_vie".to_owned(), + pairs: decompose_data( + &nfc_normalizer.normalize(&strip_headers(include_str!("data/wotw.txt"))), + ), + }, + BenchDataContent { + file_name: "udhr_vie_detone".to_owned(), + pairs: { + let result: Vec<(char, char)> = nfc_normalizer + .normalize(&strip_headers(include_str!("data/wotw.txt"))) + .chars() + .filter_map(|c| { + let 
mut iter = std::iter::once(c).decompose_vietnamese_tones(true); + if let Some(base) = iter.next() { + iter.next().map(|tone| (base, tone)) + } else { + None + } + }) + .collect(); + assert!(!result.is_empty()); + result + }, + }, + ] +} + +fn function_under_bench( + canonical_composer: &CanonicalCompositionBorrowed, + composable_points: &[(char, char)], +) { + for pair in composable_points.iter() { + canonical_composer.compose(pair.0, pair.1); + } +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "canonical_composition"; + let mut group = criterion.benchmark_group(group_name); + + let composer = CanonicalCompositionBorrowed::new(); + + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| bencher.iter(|| function_under_bench(&composer, &bench_data_content.pairs)), + ); + } + + group.finish(); +} + +fn decompose_data(nfc: &str) -> Vec<(char, char)> { + let decomposer = CanonicalDecompositionBorrowed::new(); + nfc.chars() + .map(|c| decomposer.decompose(c)) + .filter_map(|decomposed| { + if let Decomposed::Expansion(a, b) = decomposed { + Some((a, b)) + } else { + None + } + }) + .collect() +} diff --git a/vendor/icu_normalizer/benches/canonical_decomposition.rs b/vendor/icu_normalizer/benches/canonical_decomposition.rs new file mode 100644 index 00000000..8e5ad5dc --- /dev/null +++ b/vendor/icu_normalizer/benches/canonical_decomposition.rs @@ -0,0 +1,162 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use criterion::{black_box, BenchmarkId, Criterion}; + +use icu_normalizer::properties::CanonicalDecompositionBorrowed; +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + +struct BenchDataContent { + pub file_name: String, + pub nfc: String, + pub nfd: String, + pub nfkc: String, + pub nfkd: String, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 15] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); + let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); + let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + + let content_latin: (&str, &str) = ( + "TestNames_Latin", + &strip_headers(include_str!("./data/TestNames_Latin.txt")), + ); + let content_jp_h: (&str, &str) = ( + "TestNames_Japanese_h", + &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), + ); + let content_jp_k: (&str, &str) = ( + "TestNames_Japanese_k", + &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), + ); + let content_korean: (&str, &str) = ( + "TestNames_Korean", + &strip_headers(include_str!("./data/TestNames_Korean.txt")), + ); + let content_random_words_ar: (&str, &str) = ( + "TestRandomWordsUDHR_ar", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), + ); + let content_random_words_de: (&str, &str) = ( + "TestRandomWordsUDHR_de", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), + ); + let content_random_words_el: (&str, &str) = ( + "TestRandomWordsUDHR_el", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), + ); + let content_random_words_es: (&str, &str) = ( + "TestRandomWordsUDHR_es", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), + ); + let content_random_words_fr: (&str, 
&str) = ( + "TestRandomWordsUDHR_fr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), + ); + let content_random_words_he: (&str, &str) = ( + "TestRandomWordsUDHR_he", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), + ); + let content_random_words_pl: (&str, &str) = ( + "TestRandomWordsUDHR_pl", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), + ); + let content_random_words_ru: (&str, &str) = ( + "TestRandomWordsUDHR_ru", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), + ); + let content_random_words_th: (&str, &str) = ( + "TestRandomWordsUDHR_th", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), + ); + let content_random_words_tr: (&str, &str) = ( + "TestRandomWordsUDHR_tr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), + ); + let content_viet: (&str, &str) = ("udhr_vie", &strip_headers(include_str!("data/wotw.txt"))); + + [ + content_latin, + content_viet, + content_jp_k, + content_jp_h, + content_korean, + content_random_words_ru, + content_random_words_ar, + content_random_words_el, + content_random_words_es, + content_random_words_fr, + content_random_words_tr, + content_random_words_th, + content_random_words_pl, + content_random_words_he, + content_random_words_de, + ] + .map(|(file_name, raw_content)| BenchDataContent { + file_name: file_name.to_owned(), + nfc: nfc_normalizer.normalize(raw_content).to_string(), + nfd: nfd_normalizer.normalize(raw_content).to_string(), + nfkc: nfkc_normalizer.normalize(raw_content).to_string(), + nfkd: nfkd_normalizer.normalize(raw_content).to_string(), + }) +} + +#[cfg(debug_assertions)] +fn function_under_bench( + _canonical_decomposer: &CanonicalDecompositionBorrowed, + _decomposable_points: &str, +) { + // using debug assertion fails some test. + // "cargo test --bench bench" will pass + // "cargo bench" will work as expected, because the profile doesn't include debug assertions. 
+} + +#[cfg(not(debug_assertions))] +fn function_under_bench( + canonical_decomposer: &CanonicalDecompositionBorrowed, + decomposable_points: &str, +) { + decomposable_points.chars().for_each(|point| { + canonical_decomposer.decompose(point); + }); +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "canonical_decomposition"; + let mut group = criterion.benchmark_group(group_name); + + let decomposer = CanonicalDecompositionBorrowed::new(); + + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfc)), + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), + |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfd)), + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), + |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkc)), + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), + |bencher| bencher.iter(|| function_under_bench(&decomposer, &bench_data_content.nfkd)), + ); + } + group.finish(); +} diff --git a/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs b/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs new file mode 100644 index 00000000..e23848dc --- /dev/null +++ b/vendor/icu_normalizer/benches/composing_normalizer_nfc.rs @@ -0,0 +1,230 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use criterion::{black_box, BenchmarkId, Criterion}; + +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + +struct BenchDataContent { + pub file_name: String, + pub nfc: String, + pub nfd: String, + pub nfkc: String, + pub nfkd: String, + pub nfc_u16: Vec<u16>, + pub nfd_u16: Vec<u16>, + pub nfkc_u16: Vec<u16>, + pub nfkd_u16: Vec<u16>, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 15] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); + let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); + let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + + let content_latin: (&str, &str) = ( + "TestNames_Latin", + &strip_headers(include_str!("./data/TestNames_Latin.txt")), + ); + let content_jp_h: (&str, &str) = ( + "TestNames_Japanese_h", + &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), + ); + let content_jp_k: (&str, &str) = ( + "TestNames_Japanese_k", + &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), + ); + let content_korean: (&str, &str) = ( + "TestNames_Korean", + &strip_headers(include_str!("./data/TestNames_Korean.txt")), + ); + let content_random_words_ar: (&str, &str) = ( + "TestRandomWordsUDHR_ar", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), + ); + let content_random_words_de: (&str, &str) = ( + "TestRandomWordsUDHR_de", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), + ); + let content_random_words_el: (&str, &str) = ( + "TestRandomWordsUDHR_el", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), + ); + let content_random_words_es: (&str, &str) = ( + "TestRandomWordsUDHR_es", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), + ); + 
let content_random_words_fr: (&str, &str) = ( + "TestRandomWordsUDHR_fr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), + ); + let content_random_words_he: (&str, &str) = ( + "TestRandomWordsUDHR_he", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), + ); + let content_random_words_pl: (&str, &str) = ( + "TestRandomWordsUDHR_pl", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), + ); + let content_random_words_ru: (&str, &str) = ( + "TestRandomWordsUDHR_ru", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), + ); + let content_random_words_th: (&str, &str) = ( + "TestRandomWordsUDHR_th", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), + ); + let content_random_words_tr: (&str, &str) = ( + "TestRandomWordsUDHR_tr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), + ); + let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); + + [ + content_latin, + content_viet, + content_jp_k, + content_jp_h, + content_korean, + content_random_words_ru, + content_random_words_ar, + content_random_words_el, + content_random_words_es, + content_random_words_fr, + content_random_words_tr, + content_random_words_th, + content_random_words_pl, + content_random_words_he, + content_random_words_de, + ] + .map(|(file_name, raw_content)| { + let nfc = &nfc_normalizer.normalize(raw_content); + let nfd = &nfd_normalizer.normalize(raw_content); + let nfkc = &nfkc_normalizer.normalize(raw_content); + let nfkd = &nfkd_normalizer.normalize(raw_content); + BenchDataContent { + file_name: file_name.to_owned(), + nfc: nfc.to_string(), + nfd: nfd.to_string(), + nfkc: nfkc.to_string(), + nfkd: nfkd.to_string(), + nfc_u16: nfc.encode_utf16().collect(), + nfd_u16: nfd.encode_utf16().collect(), + nfkc_u16: nfkc.encode_utf16().collect(), + nfkd_u16: nfkd.encode_utf16().collect(), + } + }) +} + +fn function_under_bench(normalizer: 
&ComposingNormalizerBorrowed, text: &str) { + normalizer.normalize(text); +} + +fn function_under_bench_utf16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { + normalizer.normalize_utf16(text); +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "composing_normalizer_nfc"; + + let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfc(); + + let mut group = criterion.benchmark_group(group_name); + + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) + }) + }, + ); + + // UTF_16 + group.bench_function( + BenchmarkId::from_parameter(format!( + "from_nfc_{}_utf_16", + bench_data_content.file_name + )), + |bencher| { + bencher.iter(|| { + function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!( + "from_nfd_{}_utf_16", + bench_data_content.file_name + )), + |bencher| { + bencher.iter(|| { + function_under_bench_utf16(&normalizer_under_bench, &bench_data_content.nfd_u16) + }) + }, + ); + group.bench_function( + 
BenchmarkId::from_parameter(format!( + "from_nfkc_{}_utf_16", + bench_data_content.file_name + )), + |bencher| { + bencher.iter(|| { + function_under_bench_utf16( + &normalizer_under_bench, + &bench_data_content.nfkc_u16, + ) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!( + "from_nfkd_{}_utf_16", + bench_data_content.file_name + )), + |bencher| { + bencher.iter(|| { + function_under_bench_utf16( + &normalizer_under_bench, + &bench_data_content.nfkd_u16, + ) + }) + }, + ); + } + group.finish(); +} diff --git a/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs b/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs new file mode 100644 index 00000000..6792c7ee --- /dev/null +++ b/vendor/icu_normalizer/benches/composing_normalizer_nfkc.rs @@ -0,0 +1,211 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use criterion::{black_box, BenchmarkId, Criterion}; + +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + +struct BenchDataContent { + pub file_name: String, + pub nfc: String, + pub nfd: String, + pub nfkc: String, + pub nfkd: String, + pub nfc_u16: Vec<u16>, + pub nfd_u16: Vec<u16>, + pub nfkc_u16: Vec<u16>, + pub nfkd_u16: Vec<u16>, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 15] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); + let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); + let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + + let content_latin: (&str, &str) = ( + "TestNames_Latin", + &strip_headers(include_str!("./data/TestNames_Latin.txt")), + ); + let content_jp_h: (&str, &str) = ( + "TestNames_Japanese_h", + &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), + ); + let content_jp_k: (&str, &str) = ( + "TestNames_Japanese_k", + &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), + ); + let content_korean: (&str, &str) = ( + "TestNames_Korean", + &strip_headers(include_str!("./data/TestNames_Korean.txt")), + ); + let content_random_words_ar: (&str, &str) = ( + "TestRandomWordsUDHR_ar", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), + ); + let content_random_words_de: (&str, &str) = ( + "TestRandomWordsUDHR_de", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), + ); + let content_random_words_el: (&str, &str) = ( + "TestRandomWordsUDHR_el", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), + ); + let content_random_words_es: (&str, &str) = ( + "TestRandomWordsUDHR_es", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), + ); + 
let content_random_words_fr: (&str, &str) = ( + "TestRandomWordsUDHR_fr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), + ); + let content_random_words_he: (&str, &str) = ( + "TestRandomWordsUDHR_he", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), + ); + let content_random_words_pl: (&str, &str) = ( + "TestRandomWordsUDHR_pl", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), + ); + let content_random_words_ru: (&str, &str) = ( + "TestRandomWordsUDHR_ru", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), + ); + let content_random_words_th: (&str, &str) = ( + "TestRandomWordsUDHR_th", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), + ); + let content_random_words_tr: (&str, &str) = ( + "TestRandomWordsUDHR_tr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), + ); + let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); + + [ + content_latin, + content_viet, + content_jp_k, + content_jp_h, + content_korean, + content_random_words_ru, + content_random_words_ar, + content_random_words_el, + content_random_words_es, + content_random_words_fr, + content_random_words_tr, + content_random_words_th, + content_random_words_pl, + content_random_words_he, + content_random_words_de, + ] + .map(|(file_name, raw_content)| { + let nfc = &nfc_normalizer.normalize(raw_content); + let nfd = &nfd_normalizer.normalize(raw_content); + let nfkc = &nfkc_normalizer.normalize(raw_content); + let nfkd = &nfkd_normalizer.normalize(raw_content); + BenchDataContent { + file_name: file_name.to_owned(), + nfc: nfc.to_string(), + nfd: nfd.to_string(), + nfkc: nfkc.to_string(), + nfkd: nfkd.to_string(), + nfc_u16: nfc.encode_utf16().collect(), + nfd_u16: nfd.encode_utf16().collect(), + nfkc_u16: nfkc.encode_utf16().collect(), + nfkd_u16: nfkd.encode_utf16().collect(), + } + }) +} + +fn function_under_bench(normalizer: 
&ComposingNormalizerBorrowed, text: &str) { + normalizer.normalize(text); +} + +fn function_under_bench_u16(normalizer: &ComposingNormalizerBorrowed, text: &[u16]) { + normalizer.normalize_utf16(text); +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "composing_normalizer_nfkc"; + + let normalizer_under_bench = ComposingNormalizerBorrowed::new_nfkc(); + + let mut group = criterion.benchmark_group(group_name); + + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) + }) + }, + ); + // UTF 16 + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) + }) + }, + ); + group.bench_function( + 
BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) + }) + }, + ); + } + group.finish(); +} diff --git a/vendor/icu_normalizer/benches/data/README.md b/vendor/icu_normalizer/benches/data/README.md new file mode 100644 index 00000000..de34f9fc --- /dev/null +++ b/vendor/icu_normalizer/benches/data/README.md @@ -0,0 +1,25 @@ +# Generating microbench data + +The full versions of these files are located +[in another part of the repository](https://github.com/unicode-org/icu/tree/main/icu4j/perf-tests/data). + +## Sanitizing the file + +```shell +sed -i '/^#/d' ${filename} +sed -i '/^$/d' ${filename} +``` + +## Shuffling the file + +```shell +shuf -n 20 ${filename} -o ${filename} +``` + +## Add back the header (if you plan on submitting the files) + +``` +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). +``` diff --git a/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt b/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt new file mode 100644 index 00000000..5fb4d944 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestNames_Japanese_h.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +かげやま,みのる +むらかみ,とおる +つじさわ,けい +やすい,たかゆき +むらさき,としお +はせがわ,ひであき +うるしばら,よしひこ +ままだ,ひろし +おおぼら,えいじろう +おおば,まさひで +きたばたけ,たかひこ +はまさき,あつし +ほりい,つねお +もり,だいいち +いとう,しんいち +くにもと,じゅんじ +おか,のりひと +たに,よしあき +しらがき,ひろあき +しらはま,たけひろ +むらかみ,やすひろ +うめはら,たかし +いわた,ひろし +すぎえ,かつとし +てらにし,ひろみつ +まつおか,だいすけ +もろほし,すすむ +いしはら,たかし +おしま,ひろお +なかお,ゆうじ +いかり,はるお +きまち,まさき +ふるかわ,みちお +かねこ,しゅうへい +なかがわ,ともみ +ささき,しんご +うちだ,たくじ +うめだ,さかえ +しばた,いくこ +まきした,けいこ +まつもと,しんいちろう +たかの,かずよし +いしわた,なおひさ +いうち,まこと +いまい,りほ +みずた,のりあき +かくたに,まなぶ +わだ,ほまれ +わかまつ,かずき +かわぐち,ひろき diff --git a/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt b/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt new file mode 100644 index 00000000..b986e7a2 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestNames_Japanese_k.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +ホリモト,ユウジ +ハナミ,ヤスヒデ +イシザカ,タカユキ +ゼンケ,トシオ +ハトリ,ユウコ +ナガオカ,トモユキ +コウダ,ケンイチ +イシダ,ヒロシ +ミワ,シゲユキ +イシカワ,ヒロシ +スズキ,ユウスケ +オクダ,ヨシノリ +シムラ,サカエ +エビシマ,ヤスユキ +イブカ,ヨシテル +タノ,マコト +ドウゾノ,セイヤ +ヤマナカ,サツミ +トミイエ,ハヤト +アザミ,ツトム +タナカ,キョウコ +コジマ,アツシ +フミハラ,カオリ +スズキ,マサユキ +ナトリ,ケンヤ +スズキ,ユウコ +スズキ,ヒサエ +ナカガワ,カツヨシ +スズキ,マサフミ +マツヤマ,トシオ +ヨシナガ,チカエ +キタムラ,リカコ +アオキ,タクオ +ヤマグチ,ヤスヒロ +スギムラ,シゲオ +ウエスギ,マサミ +マツムラ,シンイチ +クバ,タカシ +スドウ,タカトシ +フジモト,ヒロシ +イトウ,シュウイチ +コバヤシ,カズミ +タナカ,ヒロカツ +イシダ,ツカサ +ヤマダ,マサコ +カミヤ,トミエ +タケモト,ユウジ +スミノ,コウジ +ヒロハタ,タクヤ +ミヒラ,リョウヘイ diff --git a/vendor/icu_normalizer/benches/data/TestNames_Korean.txt b/vendor/icu_normalizer/benches/data/TestNames_Korean.txt new file mode 100644 index 00000000..95b19916 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestNames_Korean.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +김명희 +홍차수 +허순재 +강영휘 +김운주 +이종환 +이은국 +강태호 +강일래 +김동현 +곽기자 +차재수 +표봉기 +문대원 +이형기 +최교표 +박식현 +홍종립 +서창수 +김쌍건 +서말도 +이병훈 +김희수 +박학태 +강태종 +조문란 +신범균 +백두진 +이철정 +김태중 +이성현 +김주조 +김강행 +이정길 +김완일 +권수자 +이춘철 +김판근 +김곡리 +이경형 +이운만 +손상철 +유기숙 +박정한 +조윤래 +유신호 +이두수 +김재률 +김성홍 +김혜경 diff --git a/vendor/icu_normalizer/benches/data/TestNames_Latin.txt b/vendor/icu_normalizer/benches/data/TestNames_Latin.txt new file mode 100644 index 00000000..e5b82ab3 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestNames_Latin.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +González, Joan +Reinders, Jim +Applebroog, Ida +Kidd, Joseph Bartholomew +Gulácsy, Lajos +Letendre, Rita +Zuccaro, Federico +Apt the Elder, Ulrich +Drummond, Arthur +Manley, Thomas +Broc, Jean +Ramunno, Tony +Simone dei Crocifissi +Lane, Theodore +Symonds, William Robert +Johnson, Frank Tenney +Cox, Gardner +Bunbury, Charles +Pedro de la Cuadra +Payne, William +Lucas, John Seymour +Holsman, Elizabeth T. +de Vries, Auke +Laszlo, Philip Alexius de +Shigemasa +Wolfe, Ruth Mitchell +Buck, John +Baselitz, Georg +Hook, Walter +Segall, Lasar +Brush, George deForest +Master of Jánosrét +Sutherland, Elizabeth Leveson-Gower, Countess of +Tuckerman, Jane +Varley, F.H. +Fosso, Samuel +Gardner, Daniel +Sadler, Walter Dendy +Clausen, Franciska +Coman, Charlotte Buell +Wakelin, Roland +Payne, Jon, CML +Campagna, Girolamo +Wiener, Phyllis +Sallee, Charles +Fitzgerald, John Anster +Gribbroek, Robert +Laporte, John +Lévy-Dhurmer, Lucien +Young, Stephen Scott diff --git a/vendor/icu_normalizer/benches/data/TestNames_Thai.txt b/vendor/icu_normalizer/benches/data/TestNames_Thai.txt new file mode 100644 index 00000000..4de72dc6 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestNames_Thai.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. 
For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +ณรงค์ โต๊ะเงิน +กิตติ บุญวันต์ +สมหมาย ดาบทองดี +ธวัชชัย อิสระนิมิตร +วรรณา โสภณนรินทร์ +วินัย หมู่มิ่ง +พัชรี ชูจิรวงศ์ +สมปอง จิวไพโรจน์กิจ +บุญส่ง กวยรักษา +นิพนธ์ นิ่มใหม่ +พัชรี สุวพรศิลป์ +เจริญ นววัฒนทรัพย์ +อรพินท์ แซ่เจี่ย +ชัยพร สมใจนึก +ประนอม โคศิลา +ฉวีวรรณ ศรสังข์ทอง +วัชรา เจริญรัตนพร +สุภัท นกศิริ +อู๋ มาลาเล็ก +ประยูร ไชโย +ละออ อยู่ยืนยง +สมใจ วิวัฒน์วานิช +จุมพล จันทรศรีเกษร +พุฒ ดอกไม้จีน +บุญชัย วรกิจพรสิน +สมาน ธูปเทียน +พงศ์ศักดิ์ แซ่แต้ +อำนาจ ไวจงเจริญ +พรทิพย์ แซ่ลี้ +อุไรวรรณ สาครสินธุ์ +อำพล วีระตะนนท์ +สมจิตร ใจวังโลก +สุเทพ ตันวินิจ +สวาท ทรัพย์มาก +สมศักดิ์ เจือจันทร์ +ดัสซันซิงห์ กุลาตี +ธีร ศรแก้ว +พรรณยุพา ฮ่อสกุล +สำราญ จันทร์เอี่ยม +พจน์ มั่นกันนาน +สุธี บุณยเกียรติ +บุญโชติ ทิพย์ประเสริฐสิน +ประดิษฐ์ ทองพสิฐสมบัติ +จำเนียร เพ็งเจริญ +สมศักดิ์ อรุณรัตน์ +อนุชา จารุหิรัญสกุล +พิกุล มโนภิญโญภิญญะ +ผ่องศรี นกแก้ว +อารี วิไลวรรณ +ณรงค์วิทย์ วิทสัทธาวรกุล diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt new file mode 100644 index 00000000..0cf40fb0 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ar.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +ممارسة مراعاة +العنصرية +حدود والشيخوخة +بالحكم كهذا ينتفع +البلاد +تربية +الغير التقدم والعدل +نحو بالتعليم والحرية +تأمين متساو +للتعليم فيها +آذت اعتداء للتعليم +ليس المتأصلة +والمساهمة الضروري تتناقض +وتأسيس +رضى +شرعي الطبية +لكيلا الجمعية والحرية +للرجال التزوج +بالكرامة +حرية بين +هذه العيش تنظر +قيد +يقررها والصداقة +اعتُمد وينبغي اجتماعي +حرمان +للإدراك بأجر إنتاجه +التربية القانون +لإنصافه وتأسيس وسمعته +أساسه للرجال +كافة +المجهود دولي أينما +وإلى +بنشاط تجري +والأمم مثل لحقوق +الإنسان بشروط بحماية +شرفه +كما الوظائف +حياته ديسمبر +ولما +هذه +غاية جديد إنسان +حرية +متهم الوطنية قدمًا +التملك وضع +شرعية ويعبر تأدية +بنظام عمل والأخلاق +التملك لشخصيته يلجأ +بحال يضطر ولا +الانضمام بالكرامة +عضوا diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt new file mode 100644 index 00000000..b002a64c --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_de.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +Herrschaft Freiheiten Not +Gewalt +stets anderer begründet +erhobenen innerstaatliche +Heiratsfähige freie +offenstehen Begrenzung grausamer +Maßnahmen höchste +unentbehrlich privat +erniedrigender +Verachtung freie +innezuhaben innerstaatlichen +kommen +werden gleichgültig +Würde überall höchste +Schutzmaßnahmen den Pflichten +Wille Bestimmung +Leibeigenschaft einschließlich für +gleiche bekräftigt Gewissens +Wohles +Generalversammlung +Volkes +Völkern gegenwärtig Zusammenarbeit +Heiratsfähige sowie Jeder +Stellung +Lebensstandard +seinem +Rede strafbaren Sicherheit +mit +Kulthandlungen Grund +ärztlicher +Auflösung Anforderungen anzugehören +Furcht +keine Geburt +Wohles Furcht genügen +befriedigende Medien +anzugehören Urlaub Vereinigungen +hinzuwirken verboten Resolution +kommen +sozialer vor irgendein +Bestimmung Bestimmung +Fall natürliche kein +Geschlecht Aufhetzung eigenen +seinen +über +Unterlassung Berücksichtigung +war +Rufes stets +Volkes anderer Beschränkungen +Handlungen dessen +Die diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt new file mode 100644 index 00000000..9c71f293 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_el.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +προάγει αλληλογραφία +λογική έχει +ιδρύει ζωή τεχνική +δυνατότητες +περιορισμό συνόλου +ασκεί παραγνώριση συναφθεί +αναγνωρίζουν ποινικής εκδηλώνει +κοινότητας διακυβέρνηση στα +απέναντι υψηλή +περιστάσεων αξιόποινη +σεβασμό +συντήρησής κατά εξασφαλίσουν +παραβιάζουν συμπληρώνεται νόμο +άμεσα +σημαίνει καθεστώς +ΑΝΘΡΩΠΙΝΑ θέλησης ανθρωπίνων +ΔΙΑΚΗΡΥΞΗ αθλιότητα ασφάλιση +μέσο +ίση Εχει +ειρήνης Κάθε +μέλη μορφή +όσο +κρατείται Στο Διακηρύσσει +οικονομικών έκφρασης εξασφαλίζεται +κάθε +περίπτωση απολαμβάνουν +ποινικό γεροντική +είναι μαζί δικαστήρια +μαζί προοπτική +δική +βαρβαρότητας +οικονομικών εξασφαλίσει +υποχρεώσεις οδήγησαν +Οικουμενική Διακήρυξης γονείς +στις μυστική αντιπροσώπους +Διακήρυξης άδειες βιοτικό +αναπηρία ομάδα +πραγματικό +καλύτερες +ανάπαυση +δίκαιες ένα δικαίου +μετέχει στους +θρησκευτικών ποινικής +Κανείς ίσα +πεποιθήσεις +πολιτικές ανάλογα δουλεία +πολιτικές ιατρική ωσότου +ηθικής χωρίς +ανδρών ικανό +καθώς diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt new file mode 100644 index 00000000..db0490d3 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_es.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +duración común +delito reconocimiento alimentación +inalienables +entre seguridad escogidos +comportarse dignidad +autónomo gobierno tiempo +omisiones +comisión +Derechos territorios +debe +han +regresar inalienables +regresar +desempleo científico +arbitrariamente proclamada +están contraerse esposos +cualesquiera +salir carácter desarrollo +solamente justas +personalidad una +cuanto +garantice resolución +concepción +tomar impondrá +cualquier reconocimiento +obligatoria obligatoria satisfactoria +acusación sin +artísticas penal culturales +pagadas examen +Además Organización dignidad +opresión esposos ejercidos +barbarie están mientras +por +idioma +recursos pagadas +materia Nada ella +con injerencias +inspirándose +organización +gozar jurisdicción +que +asegurar +humana libertad +nadie equivalente +escoger remuneración +torturas +individuos poder +disfruten seres Preámbulo +desempleo +liberados diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt new file mode 100644 index 00000000..2e0a38e7 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_fr.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +conforme êtres fonctions +non tout généralisé +premier lui +faire hommes d’égalité +peuple volonté bénéficier +générale nationales +cruels plus +d’encourager opinions +genre l’esprit +d’origine effectif +exigences auront +résultent situation recevoir +peuples Chacun +sont d’égalité +jouissent +auront l’esprit +pays telle +publiquement +mariage foi +travail démocratique religieux +rémunération +omissions telles +L’éducation +raison complétée donner +invoqué auront arbitraires +l’amitié suffisant affaires +travaille l’accomplissement l’intermédiaire +race +opinions celles +assurer par privée +valeur +violant traite premier +inhérente +bienfaits l’avènement +Unies s’il actions +inquiété l’esclavage +inquiété +esclaves lieu +salaire +par +toute +innocente procédure membres +arts l’idéal envers +suffrage territoires inhumains +d’immixtions l’organisation progrès +comme égalité Unies +maternité +violerait suprême sécurité +impliquant eux loisirs +nationalité diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt new file mode 100644 index 00000000..2b6b120a --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_he.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +זקנה משפילים +ינתן חברתי עניניו +הפוב +ולהיות זכויות הישגים +יאסרו מטעמי וללא +ספרותית השלם +למנוחה חינם +וההתאגדות +לטפח +באלה במלואן +יהנו +ולרווחתם לגבר האדם +בכבודו שבארצות כבוד +ובינלאומיים +בכך לתנאי אישי +שאינן +שרירותי +במשפט +ולעקרונותיהן מטעם +שרירותית האשמה יהיה +החינוך ולבטחון +סובלנות אשמתו במגילה +המאוחדות חיוני +חשוב במקרה +כלתי העולם +שמקורה כציבור +לשויון +לתקנה +תלוי ההתאספות +הדיבור שהוא +והבלתי והבסיסית +ולעקרונותיהן יהא וישאף +ביתנ הבינלאומי +והזלזול להקנות +בגלל כולם שיושלם +לחיים +בדבר +לשירות +זכויות +לפני +אדם ולא מזזמנות +קנינו שהיה ההתאספות +בינלאומי חיוניות לבקש +תהיינה +ובזכות בכורה מהגנה +מתוך +ובמצפון מזומנות לאגד +והחמריים סוציאלי +אנושיים ובהצבעה +פראיים diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt new file mode 100644 index 00000000..b6cd9760 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_pl.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +uciskowi posiadania prawo +społecznego największych skazany +czy +potrzeby samodzielnie przystępowania +Krzewi też dokonania +pełną prawo +buntu +moralności +zapewnienia znaczenie +nieludzki wypadek Nikt +zasadności jakikolwiek Każdy +samowolnie krajem +międzynarodowego +członek wielu +rozwój wynikających obalenia +rasy +grudnia która +jedynie urlopu ani +małżeńskie stanowi ustaniu +człowieka postępowych +prześladowania +politycznej które zawarcia +Deklaracja +ingerować wyłącznie +studia Nikt +innego uprawianie zrozumienie +wybranych swobodę wyznania +wolni osobowości +ograniczenie Nie +równej społecznego uciekać +będącą POWSZECHNA +niezdolności poszukiwania międzynarodowej +konieczne potrzeby posiada +opinii wychowywania 1948 +międzynarodowej zatrzymać +przedstawicieli +przeciw +wynikających organy pracę +człowiek grupami +niezbędnych +wolności podstawowym +opinii małżonków wolność +postępować zdecydowanie komórką +odniesieniu +pokoju azyl +zawodowych powrócić człowiek +konstytucję +takiej postaciach powszechnego +wygnać wygnać +wspólny poszanowania diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt new file mode 100644 index 00000000..4ceb0307 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_ru.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +областях +будут должен +обеспечиваются нежели +котором Уставе +социального моральных +совершеннолетия предоставление +том независимо +существование +вмешательства какому ограниченной +распространять +находить помощь +искусством +унижающим положения искать +изгнанию член совершеннолетия +обществом имуществом государственной +идеи братства +наслаждаться значение социальной +осуществления юрисдикцией наказанию +достойное свою III +жизнь расторжения инвалидности +терпимости этого +целях равны +обеспечиваются законным +принуждаем правосубъектности +пыткам доступа неприкосновенность +Брак против +прибегать независимой +человека человеческой +быть независимо религии +публичным +членам против +разумом результатом семью +Принята участие +беспристрастным тем +частным основной +правового +страной обслуживание +было свободу полное +рабочего свободны +состоянии помощь религиозными +полное +владеть власти морали +меньшей +братства социальному убежища +государств +равны который дети +терпимости +получать бесплатным полного +богослужении +отдельным diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt new file mode 100644 index 00000000..bc0d0737 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_th.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +คิด ใตัอำ เคลื่อนไหว +บังคับ บาก +สิ่ง สิ้น +วัตถุ +ชาย อาศัย เท่านั้น +สิน +เกา +ดูแล พิธีกรรม +ภายใน +เพศ +หนัก ประสงค์ +เหตุ +งาน รักษา +เพศ ภาษา +นี้ +คู่ สัญชาติ ต้องการ +วิธี ระหว่าง ตกลง +ทำนอง +สืบ กับ ศิลปกรรม +เหนือ วรรณกรรม +คิด การก หน้าที่ +ชาติ ศิลปกรรม แต่ +สามัญ สอด +เหยียด วิธี จุด +หน้า ถ้า เบื้อง +ประชุม +ศิลปกรรม +เสรีภาพ โหด ก่อ +เกียรติศักดิ์ ป่วย เอกราช +ประหัต มโนธรรม การ +แทน +ขัดขืน เวลา เสียง +กฎบัตร พยายาม +สิน หน้า +จำเป็น +ประชาธิปไตย หน่วย +กรณี จริงจัง +ทำนอง +ทาษ +เพิ่ม +บรรดา ขวาง +กักขัง +มนุษย์ +ชาย ประกัน มนุษยธรรม +จะบัน มูลฐาน เถื่อน +พฤติ +มิได้ +หญิง คู่ +สมา ปฏิบัติ อนึ่ง +สิ่ง ทาษ diff --git a/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt new file mode 100644 index 00000000..08129b01 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/TestRandomWordsUDHR_tr.txt @@ -0,0 +1,54 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +mecburidir ilim +isnadın sınırları suç +tutuklanamaz diğer +memleket korunmasi kullanılamaz +İnsanlık ilerlemeyi +bir mülk menfaatlerinin +usul zümreye herhangi +mahkeme vicdana ilerleyişe +zulüm zalimane +ilim öncelikle çocuk +mevzubahis ancak +muamelesi dinlenmeye +eşitlikle prensiplerine ülkenin +öğretim bulunmalarına yardım +memleketler amacıyla +birbirlerine +olmalıdır +bırakılamaz serbestisine +hürriyetin iyi +hükmü işbu zalimane +evlenme memleketi tedbirlerle +evlenmek ahalisi işini +hürriyetler +belirlenmiş kere +elde cürüme +tanınan dünyaca yüksek +müddetinin ailesine +vicdan kırıcı itibariyle +geniş inanma +kendi görevleri Teşkilatı +yaymak +öğretim vesayet +renk kişiliğinin +tamamlanan +haklara bulunma +hükmü uygulanabilecek +etmiş geliştirilmesini hoşgörü +sahiptir temel +giyim +Bundan temeli +icaplarını +mülk karışma tekmil +vicdana hürriyetine işini +Herkesin vahşiliklere +dolaşma dünyanın +davasının Uluslararasında idamesi +eşittir +haklardan hakkı +kovuşturmalar hürriyetlerden gözönünde +Evrensel fiilli beyannamesi diff --git a/vendor/icu_normalizer/benches/data/wotw.txt b/vendor/icu_normalizer/benches/data/wotw.txt new file mode 100644 index 00000000..5ffb1cf4 --- /dev/null +++ b/vendor/icu_normalizer/benches/data/wotw.txt @@ -0,0 +1,58 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +# The contents of this file have been translated by "Google Translate". 
+ +Vào những năm cuối của thế kỷ 19, không ai có thể tin rằng thế giới này +đang được theo dõi một cách sâu sắc và chặt chẽ bởi những trí thông minh +lớn hơn con người nhưng cũng nguy hiểm như chính con người; rằng khi con +người bận rộn với những mối quan tâm khác nhau của họ, họ bị xem xét và +nghiên cứu kỹ lưỡng, có lẽ gần như một người đàn ông với kính hiển vi có thể +xem xét kỹ lưỡng những sinh vật nhất thời tụ tập và sinh sôi nảy nở trong +một giọt nước. Với sự tự mãn vô hạn, con người đi đi lại lại khắp thế giới +này chỉ vì những công việc nhỏ nhặt của họ, thanh thản với niềm tin chắc +chắn về đế chế của họ đối với vật chất. Có thể là infusoria dưới kính hiển +vi cũng làm như vậy. Không ai coi các thế giới cũ hơn trong không gian là +nguồn gây nguy hiểm cho con người, hoặc nghĩ về chúng chỉ để bác bỏ ý +tưởng về sự sống đối với chúng là không thể hoặc không thể xảy ra. +Thật tò mò khi nhớ lại một số thói quen tinh thần của những ngày đã +qua. Hầu hết những người trên trái đất đều tưởng tượng rằng có thể có +những người khác trên sao Hỏa, có lẽ thấp kém hơn họ và sẵn sàng chào +đón một doanh nghiệp truyền giáo. Tuy nhiên, bên kia vịnh không gian, +những bộ óc đối với tâm trí của chúng ta cũng như tâm trí của chúng ta đối +với những con thú bị diệt vong, những bộ óc rộng lớn, lạnh lùng và vô cảm, +nhìn trái đất này với con mắt ghen tị, và dần dần và chắc chắn vạch ra +những kế hoạch chống lại chúng ta. Và đầu thế kỷ 20 đã xảy ra sự vỡ mộng +lớn. Hành tinh sao Hỏa, tôi không cần nhắc độc giả, quay xung quanh mặt +trời ở khoảng cách trung bình 140.000.000 dặm, và ánh sáng và nhiệt mà +nó nhận được từ mặt trời chỉ bằng một nửa so với thế giới này nhận được. +Nếu giả thuyết về tinh vân có bất kỳ sự thật nào, nó phải tồn tại lâu +đời hơn thế giới của chúng ta; và rất lâu trước khi trái đất này ngừng +nóng chảy, sự sống trên bề mặt của nó hẳn đã bắt đầu quá trình của nó. 
+Thực tế là nó chỉ chiếm một phần bảy thể tích của trái đất đã làm tăng +tốc độ nguội đi của nó đến nhiệt độ mà sự sống có thể bắt đầu. Nó có +không khí và nước và tất cả những gì cần thiết để hỗ trợ sự tồn tại +sinh động. Tuy nhiên, con người quá hão huyền và bị mù quáng bởi sự phù +phiếm của mình, đến nỗi cho đến tận cuối thế kỷ 19, không có nhà văn nào +bày tỏ bất kỳ ý tưởng nào rằng sự sống thông minh có thể đã phát triển ở đó xa, +hoặc thực sự là ở tất cả, vượt ra ngoài mức độ trần gian của nó. Người ta +cũng không hiểu một cách tổng quát rằng vì sao Hỏa già hơn trái đất của chúng +ta, chỉ bằng một phần tư diện tích bề mặt và ở xa mặt trời hơn, nên điều tất +yếu dẫn đến là nó không chỉ xa hơn so với thời điểm bắt đầu mà còn gần ngày kết +thúc hơn. Sự nguội lạnh thế tục mà một ngày nào đó phải vượt qua hành tinh của chúng +ta đã thực sự đi xa với người hàng xóm của chúng ta. Tình trạng vật lý của nó phần lớn +vẫn còn là một bí ẩn, nhưng giờ đây chúng ta biết rằng ngay cả ở vùng xích đạo của nó, +nhiệt độ giữa trưa hầu như không bằng nhiệt độ của mùa đông lạnh nhất của chúng ta. +Không khí của nó loãng hơn nhiều so với không khí của chúng ta, các đại dương của nó đã +thu hẹp lại cho đến khi chỉ bao phủ một phần ba bề mặt của nó, và khi các mùa chậm chạp +của nó thay đổi, các chỏm tuyết khổng lồ tụ lại và tan chảy ở hai cực và định kỳ làm ngập các vùng ôn đới của nó. +Giai đoạn cuối cùng của sự kiệt sức, mà đối với chúng ta vẫn còn quá xa vời, đã trở thành +một vấn đề ngày nay đối với các cư dân trên sao Hỏa. Áp lực trước mắt của sự cần +thiết đã làm sáng tỏ trí tuệ của họ, mở rộng sức mạnh của họ và làm chai đá trái +tim họ. 
Và nhìn xuyên qua không gian với các công cụ, và trí thông minh như chúng +ta hiếm khi mơ tới, họ thấy, ở khoảng cách gần nhất chỉ cách họ 35.000.000 dặm +về phía mặt trời, một ngôi sao buổi sáng của hy vọng, hành tinh ấm áp hơn của chúng +ta, màu xanh lục của thảm thực vật và màu xám của nước , với bầu không khí nhiều +mây hùng hồn của sự màu mỡ, với những cái nhìn thoáng qua qua những đám mây +trôi dạt của nó là những dải đất rộng lớn đông dân và những vùng biển chật hẹp đông đúc hải quân. diff --git a/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs b/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs new file mode 100644 index 00000000..4ee7590a --- /dev/null +++ b/vendor/icu_normalizer/benches/decomposing_normalizer_nfd.rs @@ -0,0 +1,213 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use criterion::{black_box, BenchmarkId, Criterion}; + +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + +struct BenchDataContent { + pub file_name: String, + pub nfc: String, + pub nfd: String, + pub nfkc: String, + pub nfkd: String, + pub nfc_u16: Vec<u16>, + pub nfd_u16: Vec<u16>, + pub nfkc_u16: Vec<u16>, + pub nfkd_u16: Vec<u16>, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 15] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); + let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); + let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + + let content_latin: (&str, &str) = ( + "TestNames_Latin", + &strip_headers(include_str!("./data/TestNames_Latin.txt")), + ); + let content_jp_h: 
(&str, &str) = ( + "TestNames_Japanese_h", + &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), + ); + let content_jp_k: (&str, &str) = ( + "TestNames_Japanese_k", + &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), + ); + let content_korean: (&str, &str) = ( + "TestNames_Korean", + &strip_headers(include_str!("./data/TestNames_Korean.txt")), + ); + let content_random_words_ar: (&str, &str) = ( + "TestRandomWordsUDHR_ar", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), + ); + let content_random_words_de: (&str, &str) = ( + "TestRandomWordsUDHR_de", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), + ); + let content_random_words_el: (&str, &str) = ( + "TestRandomWordsUDHR_el", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), + ); + let content_random_words_es: (&str, &str) = ( + "TestRandomWordsUDHR_es", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), + ); + let content_random_words_fr: (&str, &str) = ( + "TestRandomWordsUDHR_fr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), + ); + let content_random_words_he: (&str, &str) = ( + "TestRandomWordsUDHR_he", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), + ); + let content_random_words_pl: (&str, &str) = ( + "TestRandomWordsUDHR_pl", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), + ); + let content_random_words_ru: (&str, &str) = ( + "TestRandomWordsUDHR_ru", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), + ); + let content_random_words_th: (&str, &str) = ( + "TestRandomWordsUDHR_th", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), + ); + let content_random_words_tr: (&str, &str) = ( + "TestRandomWordsUDHR_tr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), + ); + let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); + + [ + content_latin, + 
content_viet, + content_jp_k, + content_jp_h, + content_korean, + content_random_words_ru, + content_random_words_ar, + content_random_words_el, + content_random_words_es, + content_random_words_fr, + content_random_words_tr, + content_random_words_th, + content_random_words_pl, + content_random_words_he, + content_random_words_de, + ] + .map(|(file_name, raw_content)| { + let nfc = &nfc_normalizer.normalize(raw_content); + let nfd = &nfd_normalizer.normalize(raw_content); + let nfkc = &nfkc_normalizer.normalize(raw_content); + let nfkd = &nfkd_normalizer.normalize(raw_content); + BenchDataContent { + file_name: file_name.to_owned(), + nfc: nfc.to_string(), + nfd: nfd.to_string(), + nfkc: nfkc.to_string(), + nfkd: nfkd.to_string(), + nfc_u16: nfc.encode_utf16().collect(), + nfd_u16: nfd.encode_utf16().collect(), + nfkc_u16: nfkc.encode_utf16().collect(), + nfkd_u16: nfkd.encode_utf16().collect(), + } + }) +} + +fn function_under_bench(normalizer: &DecomposingNormalizerBorrowed, text: &str) { + normalizer.normalize(text); +} + +fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { + normalizer.normalize_utf16(text); +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "decomposing_normalizer_nfd"; + + let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfd(); + + let mut group = criterion.benchmark_group(group_name); + + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) + }, + ); + group.bench_function( + 
BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) + }) + }, + ); + + // UTF 16 + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) + }) + }, + ); + } + + group.finish(); +} diff --git a/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs b/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs new file mode 100644 index 00000000..4b5d9013 --- /dev/null +++ b/vendor/icu_normalizer/benches/decomposing_normalizer_nfkd.rs @@ -0,0 +1,211 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use criterion::{black_box, BenchmarkId, Criterion}; + +use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + +struct BenchDataContent { + pub file_name: String, + pub nfc: String, + pub nfd: String, + pub nfkc: String, + pub nfkd: String, + pub nfc_u16: Vec<u16>, + pub nfd_u16: Vec<u16>, + pub nfkc_u16: Vec<u16>, + pub nfkd_u16: Vec<u16>, +} + +fn strip_headers(content: &str) -> String { + content + .lines() + .filter(|&s| !s.starts_with('#')) + .map(|s| s.to_owned()) + .collect::<Vec<String>>() + .join("\n") +} + +fn normalizer_bench_data() -> [BenchDataContent; 15] { + let nfc_normalizer = ComposingNormalizerBorrowed::new_nfc(); + let nfd_normalizer = DecomposingNormalizerBorrowed::new_nfd(); + let nfkc_normalizer = ComposingNormalizerBorrowed::new_nfkc(); + let nfkd_normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + + let content_latin: (&str, &str) = ( + "TestNames_Latin", + &strip_headers(include_str!("./data/TestNames_Latin.txt")), + ); + let content_jp_h: (&str, &str) = ( + "TestNames_Japanese_h", + &strip_headers(include_str!("./data/TestNames_Japanese_h.txt")), + ); + let content_jp_k: (&str, &str) = ( + "TestNames_Japanese_k", + &strip_headers(include_str!("./data/TestNames_Japanese_k.txt")), + ); + let content_korean: (&str, &str) = ( + "TestNames_Korean", + &strip_headers(include_str!("./data/TestNames_Korean.txt")), + ); + let content_random_words_ar: (&str, &str) = ( + "TestRandomWordsUDHR_ar", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ar.txt")), + ); + let content_random_words_de: (&str, &str) = ( + "TestRandomWordsUDHR_de", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_de.txt")), + ); + let content_random_words_el: (&str, &str) = ( + "TestRandomWordsUDHR_el", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_el.txt")), + ); + let content_random_words_es: (&str, &str) = ( + "TestRandomWordsUDHR_es", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_es.txt")), + ); + 
let content_random_words_fr: (&str, &str) = ( + "TestRandomWordsUDHR_fr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_fr.txt")), + ); + let content_random_words_he: (&str, &str) = ( + "TestRandomWordsUDHR_he", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_he.txt")), + ); + let content_random_words_pl: (&str, &str) = ( + "TestRandomWordsUDHR_pl", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_pl.txt")), + ); + let content_random_words_ru: (&str, &str) = ( + "TestRandomWordsUDHR_ru", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_ru.txt")), + ); + let content_random_words_th: (&str, &str) = ( + "TestRandomWordsUDHR_th", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_th.txt")), + ); + let content_random_words_tr: (&str, &str) = ( + "TestRandomWordsUDHR_tr", + &strip_headers(include_str!("./data/TestRandomWordsUDHR_tr.txt")), + ); + let content_viet: (&str, &str) = ("wotw", &strip_headers(include_str!("./data/wotw.txt"))); + + [ + content_latin, + content_viet, + content_jp_k, + content_jp_h, + content_korean, + content_random_words_ru, + content_random_words_ar, + content_random_words_el, + content_random_words_es, + content_random_words_fr, + content_random_words_tr, + content_random_words_th, + content_random_words_pl, + content_random_words_he, + content_random_words_de, + ] + .map(|(file_name, raw_content)| { + let nfc = &nfc_normalizer.normalize(raw_content); + let nfd = &nfd_normalizer.normalize(raw_content); + let nfkc = &nfkc_normalizer.normalize(raw_content); + let nfkd = &nfkd_normalizer.normalize(raw_content); + BenchDataContent { + file_name: file_name.to_owned(), + nfc: nfc.to_string(), + nfd: nfd.to_string(), + nfkc: nfkc.to_string(), + nfkd: nfkd.to_string(), + nfc_u16: nfc.encode_utf16().collect(), + nfd_u16: nfd.encode_utf16().collect(), + nfkc_u16: nfkc.encode_utf16().collect(), + nfkd_u16: nfkd.encode_utf16().collect(), + } + }) +} + +fn function_under_bench(normalizer: 
&DecomposingNormalizerBorrowed, text: &str) { + normalizer.normalize(text); +} + +fn function_under_bench_u16(normalizer: &DecomposingNormalizerBorrowed, text: &[u16]) { + normalizer.normalize_utf16(text); +} + +pub fn criterion_benchmark(criterion: &mut Criterion) { + let group_name = "decomposing_normalizer_nfkd"; + + let normalizer_under_bench = DecomposingNormalizerBorrowed::new_nfkd(); + + let mut group = criterion.benchmark_group(group_name); + for bench_data_content in black_box(normalizer_bench_data()) { + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfc)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}", bench_data_content.file_name)), + |bencher| { + bencher + .iter(|| function_under_bench(&normalizer_under_bench, &bench_data_content.nfd)) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkc_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkc) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench(&normalizer_under_bench, &bench_data_content.nfkd) + }) + }, + ); + + // UTF 16 + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfd_u16) + }) + }, + ); + group.bench_function( + 
BenchmarkId::from_parameter(format!("from_nfkc_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkc_u16) + }) + }, + ); + group.bench_function( + BenchmarkId::from_parameter(format!("from_nfkd_{}_u16", bench_data_content.file_name)), + |bencher| { + bencher.iter(|| { + function_under_bench_u16(&normalizer_under_bench, &bench_data_content.nfkd_u16) + }) + }, + ); + } + group.finish(); +} diff --git a/vendor/icu_normalizer/src/lib.rs b/vendor/icu_normalizer/src/lib.rs new file mode 100644 index 00000000..788b2682 --- /dev/null +++ b/vendor/icu_normalizer/src/lib.rs @@ -0,0 +1,2854 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +// https://github.com/unicode-org/icu4x/blob/main/documents/process/boilerplate.md#library-annotations +#![cfg_attr(not(any(test, doc)), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + clippy::trivially_copy_pass_by_ref, + missing_debug_implementations, + ) +)] +#![warn(missing_docs)] + +//! Normalizing text into Unicode Normalization Forms. +//! +//! This module is published as its own crate ([`icu_normalizer`](https://docs.rs/icu_normalizer/latest/icu_normalizer/)) +//! and as part of the [`icu`](https://docs.rs/icu/latest/icu/) crate. See the latter for more details on the ICU4X project. +//! +//! # Functionality +//! +//! The top level of the crate provides normalization of input into the four normalization forms defined in [UAX #15: Unicode +//! Normalization Forms](https://www.unicode.org/reports/tr15/): NFC, NFD, NFKC, and NFKD. +//! +//! 
Three kinds of contiguous inputs are supported: known-well-formed UTF-8 (`&str`), potentially-not-well-formed UTF-8, +//! and potentially-not-well-formed UTF-16. Additionally, an iterator over `char` can be wrapped in a normalizing iterator. +//! +//! The `uts46` module provides the combination of mapping and normalization operations for [UTS #46: Unicode IDNA +//! Compatibility Processing](https://www.unicode.org/reports/tr46/). This functionality is not meant to be used by +//! applications directly. Instead, it is meant as a building block for a full implementation of UTS #46, such as the +//! [`idna`](https://docs.rs/idna/latest/idna/) crate. +//! +//! The `properties` module provides the non-recursive canonical decomposition operation on a per `char` basis and +//! the canonical compositon operation given two `char`s. It also provides access to the Canonical Combining Class +//! property. These operations are primarily meant for [HarfBuzz](https://harfbuzz.github.io/) via the +//! [`icu_harfbuzz`](https://docs.rs/icu_harfbuzz/latest/icu_harfbuzz/) crate. +//! +//! Notably, this normalizer does _not_ provide the normalization “quick check” that can result in “maybe” in +//! addition to “yes” and “no”. The normalization checks provided by this crate always give a definitive +//! non-“maybe” answer. +//! +//! # Examples +//! +//! ``` +//! let nfc = icu_normalizer::ComposingNormalizerBorrowed::new_nfc(); +//! assert_eq!(nfc.normalize("a\u{0308}"), "ä"); +//! assert!(nfc.is_normalized("ä")); +//! +//! let nfd = icu_normalizer::DecomposingNormalizerBorrowed::new_nfd(); +//! assert_eq!(nfd.normalize("ä"), "a\u{0308}"); +//! assert!(!nfd.is_normalized("ä")); +//! ``` + +extern crate alloc; + +// We don't depend on icu_properties to minimize deps, but we want to be able +// to ensure we're using the right CCC values +macro_rules! 
ccc { + ($name:ident, $num:expr) => { + const { + #[cfg(feature = "icu_properties")] + if icu_properties::props::CanonicalCombiningClass::$name.to_icu4c_value() != $num { + panic!("icu_normalizer has incorrect ccc values") + } + CanonicalCombiningClass::from_icu4c_value($num) + } + }; +} + +pub mod properties; +pub mod provider; +pub mod uts46; + +use crate::provider::CanonicalCompositions; +use crate::provider::DecompositionData; +use crate::provider::NormalizerNfdDataV1; +use crate::provider::NormalizerNfkdDataV1; +use crate::provider::NormalizerUts46DataV1; +use alloc::borrow::Cow; +use alloc::string::String; +use core::char::REPLACEMENT_CHARACTER; +use icu_collections::char16trie::Char16Trie; +use icu_collections::char16trie::Char16TrieIterator; +use icu_collections::char16trie::TrieResult; +use icu_collections::codepointtrie::CodePointTrie; +#[cfg(feature = "icu_properties")] +use icu_properties::props::CanonicalCombiningClass; +use icu_provider::prelude::*; +use provider::DecompositionTables; +use provider::NormalizerNfcV1; +use provider::NormalizerNfdTablesV1; +use provider::NormalizerNfkdTablesV1; +use smallvec::SmallVec; +#[cfg(feature = "utf16_iter")] +use utf16_iter::Utf16CharsEx; +#[cfg(feature = "utf8_iter")] +use utf8_iter::Utf8CharsEx; +use zerovec::{zeroslice, ZeroSlice}; + +/// This type exists as a shim for icu_properties CanonicalCombiningClass when the crate is disabled +/// It should not be exposed to users. +#[cfg(not(feature = "icu_properties"))] +#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] +struct CanonicalCombiningClass(pub(crate) u8); + +#[cfg(not(feature = "icu_properties"))] +impl CanonicalCombiningClass { + const fn from_icu4c_value(v: u8) -> Self { + Self(v) + } + const fn to_icu4c_value(self) -> u8 { + self.0 + } +} + +const CCC_NOT_REORDERED: CanonicalCombiningClass = ccc!(NotReordered, 0); +const CCC_ABOVE: CanonicalCombiningClass = ccc!(Above, 230); + +/// Treatment of the ignorable marker (0xFFFFFFFF) in data. 
+#[derive(Debug, PartialEq, Eq)] +enum IgnorableBehavior { + /// 0xFFFFFFFF in data is not supported. + Unsupported, + /// Ignorables are ignored. + Ignored, + /// Ignorables are treated as singleton decompositions + /// to the REPLACEMENT CHARACTER. + ReplacementCharacter, +} + +/// Marker for UTS 46 ignorables. +/// +/// See trie-value-format.md +const IGNORABLE_MARKER: u32 = 0xFFFFFFFF; + +/// Marker that the decomposition does not round trip via NFC. +/// +/// See trie-value-format.md +const NON_ROUND_TRIP_MARKER: u32 = 1 << 30; + +/// Marker that the first character of the decomposition +/// can combine backwards. +/// +/// See trie-value-format.md +const BACKWARD_COMBINING_MARKER: u32 = 1 << 31; + +/// Mask for the bits have to be zero for this to be a BMP +/// singleton decomposition, or value baked into the surrogate +/// range. +/// +/// See trie-value-format.md +const HIGH_ZEROS_MASK: u32 = 0x3FFF0000; + +/// Mask for the bits have to be zero for this to be a complex +/// decomposition. +/// +/// See trie-value-format.md +const LOW_ZEROS_MASK: u32 = 0xFFE0; + +/// Checks if a trie value carries a (non-zero) canonical +/// combining class. +/// +/// See trie-value-format.md +fn trie_value_has_ccc(trie_value: u32) -> bool { + (trie_value & 0x3FFFFE00) == 0xD800 +} + +/// Checks if the trie signifies a special non-starter decomposition. +/// +/// See trie-value-format.md +fn trie_value_indicates_special_non_starter_decomposition(trie_value: u32) -> bool { + (trie_value & 0x3FFFFF00) == 0xD900 +} + +/// Checks if a trie value signifies a character whose decomposition +/// starts with a non-starter. +/// +/// See trie-value-format.md +fn decomposition_starts_with_non_starter(trie_value: u32) -> bool { + trie_value_has_ccc(trie_value) +} + +/// Extracts a canonical combining class (possibly zero) from a trie value. 
+/// +/// See trie-value-format.md +fn ccc_from_trie_value(trie_value: u32) -> CanonicalCombiningClass { + if trie_value_has_ccc(trie_value) { + CanonicalCombiningClass::from_icu4c_value(trie_value as u8) + } else { + CCC_NOT_REORDERED + } +} + +/// The tail (everything after the first character) of the NFKD form U+FDFA +/// as 16-bit units. +static FDFA_NFKD: [u16; 17] = [ + 0x644, 0x649, 0x20, 0x627, 0x644, 0x644, 0x647, 0x20, 0x639, 0x644, 0x64A, 0x647, 0x20, 0x648, + 0x633, 0x644, 0x645, +]; + +/// Marker value for U+FDFA in NFKD. (Unified with Hangul syllable marker, +/// but they differ by `NON_ROUND_TRIP_MARKER`.) +/// +/// See trie-value-format.md +const FDFA_MARKER: u16 = 1; + +// These constants originate from page 143 of Unicode 14.0 +/// Syllable base +const HANGUL_S_BASE: u32 = 0xAC00; +/// Lead jamo base +const HANGUL_L_BASE: u32 = 0x1100; +/// Vowel jamo base +const HANGUL_V_BASE: u32 = 0x1161; +/// Trail jamo base (deliberately off by one to account for the absence of a trail) +const HANGUL_T_BASE: u32 = 0x11A7; +/// Lead jamo count +const HANGUL_L_COUNT: u32 = 19; +/// Vowel jamo count +const HANGUL_V_COUNT: u32 = 21; +/// Trail jamo count (deliberately off by one to account for the absence of a trail) +const HANGUL_T_COUNT: u32 = 28; +/// Vowel jamo count times trail jamo count +const HANGUL_N_COUNT: u32 = 588; +/// Syllable count +const HANGUL_S_COUNT: u32 = 11172; + +/// One past the conjoining jamo block +const HANGUL_JAMO_LIMIT: u32 = 0x1200; + +/// If `opt` is `Some`, unwrap it. If `None`, panic if debug assertions +/// are enabled and return `default` if debug assertions are not enabled. +/// +/// Use this only if the only reason why `opt` could be `None` is bogus +/// data from the provider. +#[inline(always)] +fn unwrap_or_gigo<T>(opt: Option<T>, default: T) -> T { + if let Some(val) = opt { + val + } else { + // GIGO case + debug_assert!(false); + default + } +} + +/// Convert a `u32` _obtained from data provider data_ to `char`. 
+#[inline(always)] +fn char_from_u32(u: u32) -> char { + unwrap_or_gigo(core::char::from_u32(u), REPLACEMENT_CHARACTER) +} + +/// Convert a `u16` _obtained from data provider data_ to `char`. +#[inline(always)] +fn char_from_u16(u: u16) -> char { + char_from_u32(u32::from(u)) +} + +const EMPTY_U16: &ZeroSlice<u16> = zeroslice![]; + +const EMPTY_CHAR: &ZeroSlice<char> = zeroslice![]; + +#[inline(always)] +fn in_inclusive_range(c: char, start: char, end: char) -> bool { + u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start)) +} + +#[inline(always)] +#[cfg(feature = "utf16_iter")] +fn in_inclusive_range16(u: u16, start: u16, end: u16) -> bool { + u.wrapping_sub(start) <= (end - start) +} + +/// Performs canonical composition (including Hangul) on a pair of +/// characters or returns `None` if these characters don't compose. +/// Composition exclusions are taken into account. +#[inline] +fn compose(iter: Char16TrieIterator, starter: char, second: char) -> Option<char> { + let v = u32::from(second).wrapping_sub(HANGUL_V_BASE); + if v >= HANGUL_JAMO_LIMIT - HANGUL_V_BASE { + return compose_non_hangul(iter, starter, second); + } + if v < HANGUL_V_COUNT { + let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE); + if l < HANGUL_L_COUNT { + let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT; + // Safe, because the inputs are known to be in range. + return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }); + } + return None; + } + if in_inclusive_range(second, '\u{11A8}', '\u{11C2}') { + let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE); + if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 { + let lvt = lv + (u32::from(second) - HANGUL_T_BASE); + // Safe, because the inputs are known to be in range. + return Some(unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) }); + } + } + None +} + +/// Performs (non-Hangul) canonical composition on a pair of characters +/// or returns `None` if these characters don't compose. 
Composition +/// exclusions are taken into account. +fn compose_non_hangul(mut iter: Char16TrieIterator, starter: char, second: char) -> Option<char> { + // To make the trie smaller, the pairs are stored second character first. + // Given how this method is used in ways where it's known that `second` + // is or isn't a starter. We could potentially split the trie into two + // tries depending on whether `second` is a starter. + match iter.next(second) { + TrieResult::NoMatch => None, + TrieResult::NoValue => match iter.next(starter) { + TrieResult::NoMatch => None, + TrieResult::FinalValue(i) => { + if let Some(c) = char::from_u32(i as u32) { + Some(c) + } else { + // GIGO case + debug_assert!(false); + None + } + } + TrieResult::NoValue | TrieResult::Intermediate(_) => { + // GIGO case + debug_assert!(false); + None + } + }, + TrieResult::FinalValue(_) | TrieResult::Intermediate(_) => { + // GIGO case + debug_assert!(false); + None + } + } +} + +/// See trie-value-format.md +#[inline(always)] +fn starter_and_decomposes_to_self_impl(trie_val: u32) -> bool { + // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, + // and this function needs to ignore that. + (trie_val & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 +} + +/// See trie-value-format.md +#[inline(always)] +fn potential_passthrough_and_cannot_combine_backwards_impl(trie_val: u32) -> bool { + (trie_val & (NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER)) == 0 +} + +/// Struct for holding together a character and the value +/// looked up for it from the NFD trie in a more explicit +/// way than an anonymous pair. +/// Also holds a flag about the supplementary-trie provenance. 
+#[derive(Debug, PartialEq, Eq)] +struct CharacterAndTrieValue { + character: char, + /// See trie-value-format.md + trie_val: u32, +} + +impl CharacterAndTrieValue { + #[inline(always)] + pub fn new(c: char, trie_value: u32) -> Self { + CharacterAndTrieValue { + character: c, + trie_val: trie_value, + } + } + + #[inline(always)] + pub fn starter_and_decomposes_to_self(&self) -> bool { + starter_and_decomposes_to_self_impl(self.trie_val) + } + + /// See trie-value-format.md + #[inline(always)] + #[cfg(feature = "utf8_iter")] + pub fn starter_and_decomposes_to_self_except_replacement(&self) -> bool { + // This intentionally leaves `NON_ROUND_TRIP_MARKER` in the value + // to be compared with zero. U+FFFD has that flag set despite really + // being being round-tripping in order to make UTF-8 errors + // ineligible for passthrough. + (self.trie_val & !BACKWARD_COMBINING_MARKER) == 0 + } + + /// See trie-value-format.md + #[inline(always)] + pub fn can_combine_backwards(&self) -> bool { + (self.trie_val & BACKWARD_COMBINING_MARKER) != 0 + } + /// See trie-value-format.md + #[inline(always)] + pub fn potential_passthrough(&self) -> bool { + (self.trie_val & NON_ROUND_TRIP_MARKER) == 0 + } + /// See trie-value-format.md + #[inline(always)] + pub fn potential_passthrough_and_cannot_combine_backwards(&self) -> bool { + potential_passthrough_and_cannot_combine_backwards_impl(self.trie_val) + } +} + +/// Pack a `char` and a `CanonicalCombiningClass` in +/// 32 bits (the former in the lower 24 bits and the +/// latter in the high 8 bits). The latter can be +/// initialized to 0xFF upon creation, in which case +/// it can be actually set later by calling +/// `set_ccc_from_trie_if_not_already_set`. This is +/// a micro optimization to avoid the Canonical +/// Combining Class trie lookup when there is only +/// one combining character in a sequence. 
This type +/// is intentionally non-`Copy` to get compiler help +/// in making sure that the class is set on the +/// instance on which it is intended to be set +/// and not on a temporary copy. +/// +/// Note that 0xFF is won't be assigned to an actual +/// canonical combining class per definition D104 +/// in The Unicode Standard. +// +// NOTE: The Pernosco debugger has special knowledge +// of this struct. Please do not change the bit layout +// or the crate-module-qualified name of this struct +// without coordination. +#[derive(Debug)] +struct CharacterAndClass(u32); + +impl CharacterAndClass { + pub fn new(c: char, ccc: CanonicalCombiningClass) -> Self { + CharacterAndClass(u32::from(c) | (u32::from(ccc.to_icu4c_value()) << 24)) + } + pub fn new_with_placeholder(c: char) -> Self { + CharacterAndClass(u32::from(c) | ((0xFF) << 24)) + } + pub fn new_with_trie_value(c_tv: CharacterAndTrieValue) -> Self { + Self::new(c_tv.character, ccc_from_trie_value(c_tv.trie_val)) + } + pub fn new_starter(c: char) -> Self { + CharacterAndClass(u32::from(c)) + } + /// This method must exist for Pernosco to apply its special rendering. + /// Also, this must not be dead code! + pub fn character(&self) -> char { + // Safe, because the low 24 bits came from a `char` + // originally. + unsafe { char::from_u32_unchecked(self.0 & 0xFFFFFF) } + } + /// This method must exist for Pernosco to apply its special rendering. + pub fn ccc(&self) -> CanonicalCombiningClass { + CanonicalCombiningClass::from_icu4c_value((self.0 >> 24) as u8) + } + + pub fn character_and_ccc(&self) -> (char, CanonicalCombiningClass) { + (self.character(), self.ccc()) + } + pub fn set_ccc_from_trie_if_not_already_set(&mut self, trie: &CodePointTrie<u32>) { + if self.0 >> 24 != 0xFF { + return; + } + let scalar = self.0 & 0xFFFFFF; + self.0 = + ((ccc_from_trie_value(trie.get32_u32(scalar)).to_icu4c_value() as u32) << 24) | scalar; + } +} + +// This function exists as a borrow check helper. 
+#[inline(always)] +fn sort_slice_by_ccc(slice: &mut [CharacterAndClass], trie: &CodePointTrie<u32>) { + // We don't look up the canonical combining class for starters + // of for single combining characters between starters. When + // there's more than one combining character between starters, + // we look up the canonical combining class for each character + // exactly once. + if slice.len() < 2 { + return; + } + slice + .iter_mut() + .for_each(|cc| cc.set_ccc_from_trie_if_not_already_set(trie)); + slice.sort_by_key(|cc| cc.ccc()); +} + +/// An iterator adaptor that turns an `Iterator` over `char` into +/// a lazily-decomposed `char` sequence. +#[derive(Debug)] +pub struct Decomposition<'data, I> +where + I: Iterator<Item = char>, +{ + delegate: I, + buffer: SmallVec<[CharacterAndClass; 17]>, // Enough to hold NFKD for U+FDFA + /// The index of the next item to be read from `buffer`. + /// The purpose if this index is to avoid having to move + /// the rest upon every read. + buffer_pos: usize, + // At the start of `next()` if not `None`, this is a pending unnormalized + // starter. When `Decomposition` appears alone, this is never a non-starter. + // However, when `Decomposition` appears inside a `Composition`, this + // may become a non-starter before `decomposing_next()` is called. + pending: Option<CharacterAndTrieValue>, // None at end of stream + // See trie-value-format.md + trie: &'data CodePointTrie<'data, u32>, + scalars16: &'data ZeroSlice<u16>, + scalars24: &'data ZeroSlice<char>, + supplementary_scalars16: &'data ZeroSlice<u16>, + supplementary_scalars24: &'data ZeroSlice<char>, + /// The lowest character for which either of the following does + /// not hold: + /// 1. Decomposes to self. + /// 2. 
Decomposition starts with a non-starter + decomposition_passthrough_bound: u32, // never above 0xC0 + ignorable_behavior: IgnorableBehavior, // Arguably should be a type parameter +} + +impl<'data, I> Decomposition<'data, I> +where + I: Iterator<Item = char>, +{ + /// Constructs a decomposing iterator adapter from a delegate + /// iterator and references to the necessary data, without + /// supplementary data. + /// + /// Use `DecomposingNormalizer::normalize_iter()` instead unless + /// there's a good reason to use this constructor directly. + /// + /// Public but hidden in order to be able to use this from the + /// collator. + #[doc(hidden)] // used in collator + pub fn new( + delegate: I, + decompositions: &'data DecompositionData, + tables: &'data DecompositionTables, + ) -> Self { + Self::new_with_supplements( + delegate, + decompositions, + tables, + None, + 0xC0, + IgnorableBehavior::Unsupported, + ) + } + + /// Constructs a decomposing iterator adapter from a delegate + /// iterator and references to the necessary data, including + /// supplementary data. + /// + /// Use `DecomposingNormalizer::normalize_iter()` instead unless + /// there's a good reason to use this constructor directly. + fn new_with_supplements( + delegate: I, + decompositions: &'data DecompositionData, + tables: &'data DecompositionTables, + supplementary_tables: Option<&'data DecompositionTables>, + decomposition_passthrough_bound: u8, + ignorable_behavior: IgnorableBehavior, + ) -> Self { + let mut ret = Decomposition::<I> { + delegate, + buffer: SmallVec::new(), // Normalized + buffer_pos: 0, + // Initialize with a placeholder starter in case + // the real stream starts with a non-starter. 
+ pending: Some(CharacterAndTrieValue::new('\u{FFFF}', 0)), + trie: &decompositions.trie, + scalars16: &tables.scalars16, + scalars24: &tables.scalars24, + supplementary_scalars16: if let Some(supplementary) = supplementary_tables { + &supplementary.scalars16 + } else { + EMPTY_U16 + }, + supplementary_scalars24: if let Some(supplementary) = supplementary_tables { + &supplementary.scalars24 + } else { + EMPTY_CHAR + }, + decomposition_passthrough_bound: u32::from(decomposition_passthrough_bound), + ignorable_behavior, + }; + let _ = ret.next(); // Remove the U+FFFF placeholder + ret + } + + fn push_decomposition16( + &mut self, + offset: usize, + len: usize, + only_non_starters_in_trail: bool, + slice16: &ZeroSlice<u16>, + ) -> (char, usize) { + let (starter, tail) = slice16 + .get_subslice(offset..offset + len) + .and_then(|slice| slice.split_first()) + .map_or_else( + || { + // GIGO case + debug_assert!(false); + (REPLACEMENT_CHARACTER, EMPTY_U16) + }, + |(first, trail)| (char_from_u16(first), trail), + ); + if only_non_starters_in_trail { + // All the rest are combining + self.buffer.extend( + tail.iter() + .map(|u| CharacterAndClass::new_with_placeholder(char_from_u16(u))), + ); + (starter, 0) + } else { + let mut i = 0; + let mut combining_start = 0; + for u in tail.iter() { + let ch = char_from_u16(u); + let trie_value = self.trie.get(ch); + self.buffer.push(CharacterAndClass::new_with_trie_value( + CharacterAndTrieValue::new(ch, trie_value), + )); + i += 1; + // Half-width kana and iota subscript don't occur in the tails + // of these multicharacter decompositions. 
+ if !decomposition_starts_with_non_starter(trie_value) { + combining_start = i; + } + } + (starter, combining_start) + } + } + + fn push_decomposition32( + &mut self, + offset: usize, + len: usize, + only_non_starters_in_trail: bool, + slice32: &ZeroSlice<char>, + ) -> (char, usize) { + let (starter, tail) = slice32 + .get_subslice(offset..offset + len) + .and_then(|slice| slice.split_first()) + .unwrap_or_else(|| { + // GIGO case + debug_assert!(false); + (REPLACEMENT_CHARACTER, EMPTY_CHAR) + }); + if only_non_starters_in_trail { + // All the rest are combining + self.buffer + .extend(tail.iter().map(CharacterAndClass::new_with_placeholder)); + (starter, 0) + } else { + let mut i = 0; + let mut combining_start = 0; + for ch in tail.iter() { + let trie_value = self.trie.get(ch); + self.buffer.push(CharacterAndClass::new_with_trie_value( + CharacterAndTrieValue::new(ch, trie_value), + )); + i += 1; + // Half-width kana and iota subscript don't occur in the tails + // of these multicharacter decompositions. + if !decomposition_starts_with_non_starter(trie_value) { + combining_start = i; + } + } + (starter, combining_start) + } + } + + #[inline(always)] + fn attach_trie_value(&self, c: char) -> CharacterAndTrieValue { + CharacterAndTrieValue::new(c, self.trie.get(c)) + } + + fn delegate_next_no_pending(&mut self) -> Option<CharacterAndTrieValue> { + debug_assert!(self.pending.is_none()); + loop { + let c = self.delegate.next()?; + + // TODO(#2384): Measure if this check is actually an optimization. + if u32::from(c) < self.decomposition_passthrough_bound { + return Some(CharacterAndTrieValue::new(c, 0)); + } + + let trie_val = self.trie.get(c); + // TODO: Can we do something better about the cost of this branch in the + // non-UTS 46 case? 
+ if trie_val == IGNORABLE_MARKER { + match self.ignorable_behavior { + IgnorableBehavior::Unsupported => { + debug_assert!(false); + } + IgnorableBehavior::ReplacementCharacter => { + return Some(CharacterAndTrieValue::new( + c, + u32::from(REPLACEMENT_CHARACTER) | NON_ROUND_TRIP_MARKER, + )); + } + IgnorableBehavior::Ignored => { + // Else ignore this character by reading the next one from the delegate. + continue; + } + } + } + return Some(CharacterAndTrieValue::new(c, trie_val)); + } + } + + fn delegate_next(&mut self) -> Option<CharacterAndTrieValue> { + if let Some(pending) = self.pending.take() { + // Only happens as part of `Composition` and as part of + // the contiguous-buffer methods of `DecomposingNormalizer`. + // I.e. does not happen as part of standalone iterator + // usage of `Decomposition`. + Some(pending) + } else { + self.delegate_next_no_pending() + } + } + + fn decomposing_next(&mut self, c_and_trie_val: CharacterAndTrieValue) -> char { + let (starter, combining_start) = { + let c = c_and_trie_val.character; + // See trie-value-format.md + let decomposition = c_and_trie_val.trie_val; + // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, + // and that flag needs to be ignored here. 
+ if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { + // The character is its own decomposition + (c, 0) + } else { + let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0; + let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0; + if !high_zeros && !low_zeros { + // Decomposition into two BMP characters: starter and non-starter + let starter = char_from_u32(decomposition & 0x7FFF); + let combining = char_from_u32((decomposition >> 15) & 0x7FFF); + self.buffer + .push(CharacterAndClass::new_with_placeholder(combining)); + (starter, 0) + } else if high_zeros { + // Do the check by looking at `c` instead of looking at a marker + // in `singleton` below, because if we looked at the trie value, + // we'd still have to check that `c` is in the Hangul syllable + // range in order for the subsequent interpretations as `char` + // to be safe. + // Alternatively, `FDFA_MARKER` and the Hangul marker could + // be unified. That would add a branch for Hangul and remove + // a branch from singleton decompositions. It seems more + // important to favor Hangul syllables than singleton + // decompositions. + // Note that it would be valid to hoist this Hangul check + // one or even two steps earlier in this check hierarchy. + // Right now, it's assumed the kind of decompositions into + // BMP starter and non-starter, which occur in many languages, + // should be checked before Hangul syllables, which are about + // one language specifically. Hopefully, we get some + // instruction-level parallelism out of the disjointness of + // operations on `c` and `decomposition`. 
+ let hangul_offset = u32::from(c).wrapping_sub(HANGUL_S_BASE); // SIndex in the spec + if hangul_offset < HANGUL_S_COUNT { + debug_assert_eq!(decomposition, 1); + // Hangul syllable + // The math here comes from page 144 of Unicode 14.0 + let l = hangul_offset / HANGUL_N_COUNT; + let v = (hangul_offset % HANGUL_N_COUNT) / HANGUL_T_COUNT; + let t = hangul_offset % HANGUL_T_COUNT; + + // The unsafe blocks here are OK, because the values stay + // within the Hangul jamo block and, therefore, the scalar + // value range by construction. + self.buffer.push(CharacterAndClass::new_starter(unsafe { + core::char::from_u32_unchecked(HANGUL_V_BASE + v) + })); + let first = unsafe { core::char::from_u32_unchecked(HANGUL_L_BASE + l) }; + if t != 0 { + self.buffer.push(CharacterAndClass::new_starter(unsafe { + core::char::from_u32_unchecked(HANGUL_T_BASE + t) + })); + (first, 2) + } else { + (first, 1) + } + } else { + let singleton = decomposition as u16; + if singleton != FDFA_MARKER { + // Decomposition into one BMP character + let starter = char_from_u16(singleton); + (starter, 0) + } else { + // Special case for the NFKD form of U+FDFA. + self.buffer.extend(FDFA_NFKD.map(|u| { + // SAFETY: `FDFA_NFKD` is known not to contain + // surrogates. + CharacterAndClass::new_starter(unsafe { + core::char::from_u32_unchecked(u32::from(u)) + }) + })); + ('\u{0635}', 17) + } + } + } else { + debug_assert!(low_zeros); + // Only 12 of 14 bits used as of Unicode 16. + let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1; + // Only 3 of 4 bits used as of Unicode 16. 
/// Collects the run of upcoming characters whose trie value has a
/// canonical combining class (per `trie_value_has_ccc`) into
/// `self.buffer`, then sorts `self.buffer[combining_start..]` by ccc.
///
/// The first character without a ccc ends the run and is parked in
/// `self.pending`. Characters flagged by
/// `trie_value_indicates_special_non_starter_decomposition` are replaced
/// by their hard-coded expansions below.
///
/// `combining_start` is the index in `self.buffer` from which sorting
/// applies; the caller must keep it within the buffer's bounds.
fn gather_and_sort_combining(&mut self, combining_start: usize) {
    // Not a `for` loop to avoid holding a mutable reference to `self` across
    // the loop body.
    while let Some(ch_and_trie_val) = self.delegate_next() {
        if !trie_value_has_ccc(ch_and_trie_val.trie_val) {
            // End of the combining run: park this character for the next round.
            self.pending = Some(ch_and_trie_val);
            break;
        } else if !trie_value_indicates_special_non_starter_decomposition(
            ch_and_trie_val.trie_val,
        ) {
            self.buffer
                .push(CharacterAndClass::new_with_trie_value(ch_and_trie_val));
        } else {
            // Special non-starter decompositions, expanded by hand. Where a
            // character maps to two characters, the first is pushed inside
            // the arm and the second is returned as `mapped`.
            // The Tibetan special cases are starters that decompose into non-starters.
            let mapped = match ch_and_trie_val.character {
                '\u{0340}' => {
                    // COMBINING GRAVE TONE MARK
                    CharacterAndClass::new('\u{0300}', CCC_ABOVE)
                }
                '\u{0341}' => {
                    // COMBINING ACUTE TONE MARK
                    CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                }
                '\u{0343}' => {
                    // COMBINING GREEK KORONIS
                    CharacterAndClass::new('\u{0313}', CCC_ABOVE)
                }
                '\u{0344}' => {
                    // COMBINING GREEK DIALYTIKA TONOS
                    self.buffer
                        .push(CharacterAndClass::new('\u{0308}', CCC_ABOVE));
                    CharacterAndClass::new('\u{0301}', CCC_ABOVE)
                }
                '\u{0F73}' => {
                    // TIBETAN VOWEL SIGN II
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F72}', ccc!(CCC130, 130))
                }
                '\u{0F75}' => {
                    // TIBETAN VOWEL SIGN UU
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F74}', ccc!(CCC132, 132))
                }
                '\u{0F81}' => {
                    // TIBETAN VOWEL SIGN REVERSED II
                    self.buffer
                        .push(CharacterAndClass::new('\u{0F71}', ccc!(CCC129, 129)));
                    CharacterAndClass::new('\u{0F80}', ccc!(CCC130, 130))
                }
                '\u{FF9E}' => {
                    // HALFWIDTH KATAKANA VOICED SOUND MARK
                    CharacterAndClass::new('\u{3099}', ccc!(KanaVoicing, 8))
                }
                '\u{FF9F}' => {
                    // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
                    CharacterAndClass::new('\u{309A}', ccc!(KanaVoicing, 8))
                }
                _ => {
                    // GIGO case
                    debug_assert!(false);
                    CharacterAndClass::new_with_placeholder(REPLACEMENT_CHARACTER)
                }
            };
            self.buffer.push(mapped);
        }
    }
    // Slicing succeeds by construction; we've always ensured that `combining_start`
    // is in permissible range.
    #[allow(clippy::indexing_slicing)]
    sort_slice_by_ccc(&mut self.buffer[combining_start..], self.trie);
}
Is not a backward-combining starter + composition_passthrough_bound: u32, +} + +impl<'data, I> Composition<'data, I> +where + I: Iterator<Item = char>, +{ + fn new( + decomposition: Decomposition<'data, I>, + canonical_compositions: Char16Trie<'data>, + composition_passthrough_bound: u16, + ) -> Self { + Self { + decomposition, + canonical_compositions, + unprocessed_starter: None, + composition_passthrough_bound: u32::from(composition_passthrough_bound), + } + } + + /// Performs canonical composition (including Hangul) on a pair of + /// characters or returns `None` if these characters don't compose. + /// Composition exclusions are taken into account. + #[inline(always)] + pub fn compose(&self, starter: char, second: char) -> Option<char> { + compose(self.canonical_compositions.iter(), starter, second) + } + + /// Performs (non-Hangul) canonical composition on a pair of characters + /// or returns `None` if these characters don't compose. Composition + /// exclusions are taken into account. + #[inline(always)] + fn compose_non_hangul(&self, starter: char, second: char) -> Option<char> { + compose_non_hangul(self.canonical_compositions.iter(), starter, second) + } +} + +impl<I> Iterator for Composition<'_, I> +where + I: Iterator<Item = char>, +{ + type Item = char; + + #[inline] + fn next(&mut self) -> Option<char> { + let mut undecomposed_starter = CharacterAndTrieValue::new('\u{0}', 0); // The compiler can't figure out that this gets overwritten before use. 
+ if self.unprocessed_starter.is_none() { + // The loop is only broken out of as goto forward + #[allow(clippy::never_loop)] + loop { + if let Some((character, ccc)) = self + .decomposition + .buffer + .get(self.decomposition.buffer_pos) + .map(|c| c.character_and_ccc()) + { + self.decomposition.buffer_pos += 1; + if self.decomposition.buffer_pos == self.decomposition.buffer.len() { + self.decomposition.buffer.clear(); + self.decomposition.buffer_pos = 0; + } + if ccc == CCC_NOT_REORDERED { + // Previous decomposition contains a starter. This must + // now become the `unprocessed_starter` for it to have + // a chance to compose with the upcoming characters. + // + // E.g. parenthesized Hangul in NFKC comes through here, + // but suitable composition exclusion could exercise this + // in NFC. + self.unprocessed_starter = Some(character); + break; // We already have a starter, so skip taking one from `pending`. + } + return Some(character); + } + debug_assert_eq!(self.decomposition.buffer_pos, 0); + undecomposed_starter = self.decomposition.pending.take()?; + if u32::from(undecomposed_starter.character) < self.composition_passthrough_bound + || undecomposed_starter.potential_passthrough() + { + // TODO(#2385): In the NFC case (moot for NFKC and UTS46), if the upcoming + // character is not below `decomposition_passthrough_bound` but is + // below `composition_passthrough_bound`, we read from the trie + // unnecessarily. + if let Some(upcoming) = self.decomposition.delegate_next_no_pending() { + let cannot_combine_backwards = u32::from(upcoming.character) + < self.composition_passthrough_bound + || !upcoming.can_combine_backwards(); + self.decomposition.pending = Some(upcoming); + if cannot_combine_backwards { + // Fast-track succeeded! 
+ return Some(undecomposed_starter.character); + } + } else { + // End of stream + return Some(undecomposed_starter.character); + } + } + break; // Not actually looping + } + } + let mut starter = '\u{0}'; // The compiler can't figure out this gets overwritten before use. + + // The point of having this boolean is to have only one call site to + // `self.decomposition.decomposing_next`, which is hopefully beneficial for + // code size under inlining. + let mut attempt_composition = false; + loop { + if let Some(unprocessed) = self.unprocessed_starter.take() { + debug_assert_eq!(undecomposed_starter, CharacterAndTrieValue::new('\u{0}', 0)); + debug_assert_eq!(starter, '\u{0}'); + starter = unprocessed; + } else { + debug_assert_eq!(self.decomposition.buffer_pos, 0); + let next_starter = self.decomposition.decomposing_next(undecomposed_starter); + if !attempt_composition { + starter = next_starter; + } else if let Some(composed) = self.compose(starter, next_starter) { + starter = composed; + } else { + // This is our yield point. We'll pick this up above in the + // next call to `next()`. + self.unprocessed_starter = Some(next_starter); + return Some(starter); + } + } + // We first loop by index to avoid moving the contents of `buffer`, but + // if there's a discontiguous match, we'll start modifying `buffer` instead. 
+ loop { + let (character, ccc) = if let Some((character, ccc)) = self + .decomposition + .buffer + .get(self.decomposition.buffer_pos) + .map(|c| c.character_and_ccc()) + { + (character, ccc) + } else { + self.decomposition.buffer.clear(); + self.decomposition.buffer_pos = 0; + break; + }; + if let Some(composed) = self.compose(starter, character) { + starter = composed; + self.decomposition.buffer_pos += 1; + continue; + } + let mut most_recent_skipped_ccc = ccc; + { + let _ = self + .decomposition + .buffer + .drain(0..self.decomposition.buffer_pos); + } + self.decomposition.buffer_pos = 0; + if most_recent_skipped_ccc == CCC_NOT_REORDERED { + // We failed to compose a starter. Discontiguous match not allowed. + // We leave the starter in `buffer` for `next()` to find. + return Some(starter); + } + let mut i = 1; // We have skipped one non-starter. + while let Some((character, ccc)) = self + .decomposition + .buffer + .get(i) + .map(|c| c.character_and_ccc()) + { + if ccc == CCC_NOT_REORDERED { + // Discontiguous match not allowed. + return Some(starter); + } + debug_assert!(ccc >= most_recent_skipped_ccc); + if ccc != most_recent_skipped_ccc { + // Using the non-Hangul version as a micro-optimization, since + // we already rejected the case where `second` is a starter + // above, and conjoining jamo are starters. + if let Some(composed) = self.compose_non_hangul(starter, character) { + self.decomposition.buffer.remove(i); + starter = composed; + continue; + } + } + most_recent_skipped_ccc = ccc; + i += 1; + } + break; + } + + debug_assert_eq!(self.decomposition.buffer_pos, 0); + + if !self.decomposition.buffer.is_empty() { + return Some(starter); + } + // Now we need to check if composition with an upcoming starter is possible. + #[allow(clippy::unwrap_used)] + if self.decomposition.pending.is_some() { + // We know that `pending_starter` decomposes to start with a starter. 
// Generates a `pub fn $normalize_to<W: $write + ?Sized>(&self, $text: $slice,
// $sink: &mut W) -> core::fmt::Result` method that writes the composed
// normalization of `$text` into `$sink`.
//
// Parameters:
// - `$meta`: attributes (including doc comments) placed on the generated method.
// - `$normalize_to`: name of the generated method.
// - `$write`: trait path the sink type `W` must implement.
// - `$slice`: type of the input text parameter.
// - `$prolog`: block executed first in the generated method.
// - `$always_valid_utf`: literal; when true, the fast path is entered without
//   checking whether the starter is `REPLACEMENT_CHARACTER`.
// - `$as_slice`: method called on the delegate iterator to obtain the
//   not-yet-consumed input.
// - `$fast`: fast-path block; its contract is spelled out at its use site.
// - `$text`, `$sink`, `$composition`, `$composition_passthrough_bound`,
//   `$undecomposed_starter`, `$pending_slice`: identifiers for the
//   corresponding bindings, passed in so that the caller-supplied blocks
//   can name them.
// - `$len_utf`: method called on a `char` to get its encoded length in the
//   units of `$text`.
macro_rules! composing_normalize_to {
    ($(#[$meta:meta])*,
     $normalize_to:ident,
     $write:path,
     $slice:ty,
     $prolog:block,
     $always_valid_utf:literal,
     $as_slice:ident,
     $fast:block,
     $text:ident,
     $sink:ident,
     $composition:ident,
     $composition_passthrough_bound:ident,
     $undecomposed_starter:ident,
     $pending_slice:ident,
     $len_utf:ident,
    ) => {
        $(#[$meta])*
        pub fn $normalize_to<W: $write + ?Sized>(
            &self,
            $text: $slice,
            $sink: &mut W,
        ) -> core::fmt::Result {
            $prolog
            let mut $composition = self.normalize_iter($text.chars());
            debug_assert_eq!($composition.decomposition.ignorable_behavior, IgnorableBehavior::Unsupported);
            // Flush whatever constructing the iterator left in the buffer.
            for cc in $composition.decomposition.buffer.drain(..) {
                $sink.write_char(cc.character())?;
            }

            // Try to get the compiler to hoist the bound to a register.
            let $composition_passthrough_bound = $composition.composition_passthrough_bound;
            'outer: loop {
                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                // No pending character means the input is exhausted.
                let mut $undecomposed_starter =
                    if let Some(pending) = $composition.decomposition.pending.take() {
                        pending
                    } else {
                        return Ok(());
                    };
                // Allowing indexed slicing, because a failure would be a code bug and
                // not a data issue.
                #[allow(clippy::indexing_slicing)]
                if u32::from($undecomposed_starter.character) < $composition_passthrough_bound ||
                    $undecomposed_starter.potential_passthrough()
                {
                    // We don't know if a `REPLACEMENT_CHARACTER` occurred in the slice or
                    // was returned in response to an error by the iterator. Assume the
                    // latter for correctness even though it pessimizes the former.
                    if $always_valid_utf || $undecomposed_starter.character != REPLACEMENT_CHARACTER {
                        // The input from the start of `$undecomposed_starter` onwards:
                        // the unconsumed remainder plus the starter's own encoded length.
                        let $pending_slice = &$text[$text.len() - $composition.decomposition.delegate.$as_slice().len() - $undecomposed_starter.character.$len_utf()..];
                        // The `$fast` block must either:
                        // 1. Return due to reaching EOF
                        // 2. Leave a starter with its trie value in `$undecomposed_starter`
                        //    and, if there is still more input, leave the next character
                        //    and its trie value in `$composition.decomposition.pending`.
                        $fast
                    }
                }
                // Fast track above, full algorithm below
                let mut starter = $composition
                    .decomposition
                    .decomposing_next($undecomposed_starter);
                'bufferloop: loop {
                    // We first loop by index to avoid moving the contents of `buffer`, but
                    // if there's a discontiguous match, we'll start modifying `buffer` instead.
                    loop {
                        let (character, ccc) = if let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get($composition.decomposition.buffer_pos)
                            .map(|c| c.character_and_ccc())
                        {
                            (character, ccc)
                        } else {
                            // Everything in the buffer composed into `starter`.
                            $composition.decomposition.buffer.clear();
                            $composition.decomposition.buffer_pos = 0;
                            break;
                        };
                        if let Some(composed) = $composition.compose(starter, character) {
                            starter = composed;
                            $composition.decomposition.buffer_pos += 1;
                            continue;
                        }
                        let mut most_recent_skipped_ccc = ccc;
                        if most_recent_skipped_ccc == CCC_NOT_REORDERED {
                            // We failed to compose a starter. Discontiguous match not allowed.
                            // Write the current `starter` we've been composing, make the unmatched
                            // starter in the buffer the new `starter` (we know it's been decomposed)
                            // and process the rest of the buffer with that as the starter.
                            $sink.write_char(starter)?;
                            starter = character;
                            $composition.decomposition.buffer_pos += 1;
                            continue 'bufferloop;
                        } else {
                            // Drop the already-composed prefix so that the skipped
                            // non-starter sits at index 0.
                            {
                                let _ = $composition
                                    .decomposition
                                    .buffer
                                    .drain(0..$composition.decomposition.buffer_pos);
                            }
                            $composition.decomposition.buffer_pos = 0;
                        }
                        let mut i = 1; // We have skipped one non-starter.
                        while let Some((character, ccc)) = $composition
                            .decomposition
                            .buffer
                            .get(i)
                            .map(|c| c.character_and_ccc())
                        {
                            if ccc == CCC_NOT_REORDERED {
                                // Discontiguous match not allowed.
                                $sink.write_char(starter)?;
                                for cc in $composition.decomposition.buffer.drain(..i) {
                                    $sink.write_char(cc.character())?;
                                }
                                starter = character;
                                {
                                    let removed = $composition.decomposition.buffer.remove(0);
                                    debug_assert_eq!(starter, removed.character());
                                }
                                debug_assert_eq!($composition.decomposition.buffer_pos, 0);
                                continue 'bufferloop;
                            }
                            debug_assert!(ccc >= most_recent_skipped_ccc);
                            if ccc != most_recent_skipped_ccc {
                                // Using the non-Hangul version as a micro-optimization, since
                                // we already rejected the case where `second` is a starter
                                // above, and conjoining jamo are starters.
                                if let Some(composed) =
                                    $composition.compose_non_hangul(starter, character)
                                {
                                    $composition.decomposition.buffer.remove(i);
                                    starter = composed;
                                    continue;
                                }
                            }
                            most_recent_skipped_ccc = ccc;
                            i += 1;
                        }
                        break;
                    }
                    debug_assert_eq!($composition.decomposition.buffer_pos, 0);

                    if !$composition.decomposition.buffer.is_empty() {
                        $sink.write_char(starter)?;
                        for cc in $composition.decomposition.buffer.drain(..) {
                            $sink.write_char(cc.character())?;
                        }
                        // We had non-empty buffer, so can't compose with upcoming.
                        continue 'outer;
                    }
                    // Now we need to check if composition with an upcoming starter is possible.
                    if $composition.decomposition.pending.is_some() {
                        // We know that `pending_starter` decomposes to start with a starter.
                        // Otherwise, it would have been moved to `composition.decomposition.buffer`
                        // by `composition.decomposing_next()`. We do this set lookup here in order
                        // to get an opportunity to go back to the fast track.
                        // Note that this check has to happen _after_ checking that `pending`
                        // holds a character, because this flag isn't defined to be meaningful
                        // when `pending` isn't holding a character.
                        let pending = $composition.decomposition.pending.as_ref().unwrap();
                        if u32::from(pending.character) < $composition.composition_passthrough_bound
                            || !pending.can_combine_backwards()
                        {
                            // Won't combine backwards anyway.
                            $sink.write_char(starter)?;
                            continue 'outer;
                        }
                        // `unwrap` OK, because `is_some()` was checked above.
                        let pending_starter = $composition.decomposition.pending.take().unwrap();
                        let decomposed = $composition.decomposition.decomposing_next(pending_starter);
                        if let Some(composed) = $composition.compose(starter, decomposed) {
                            starter = composed;
                        } else {
                            $sink.write_char(starter)?;
                            starter = decomposed;
                        }
                        continue 'bufferloop;
                    }
                    // End of input
                    $sink.write_char(starter)?;
                    return Ok(());
                } // 'bufferloop
            }
        }
    };
}
// Generates the convenience methods shared by the normalizer types. The
// expansion site must provide `normalize_to` and, under the respective Cargo
// features, `normalize_utf16_to` and `normalize_utf8_to`.
macro_rules! normalizer_methods {
    () => {
        /// Normalize a string slice into a `Cow<'a, str>`.
        ///
        /// Returns `Cow::Borrowed` when `text` is already normalized.
        pub fn normalize<'a>(&self, text: &'a str) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            // Allocate once up front, as `normalize_utf16` does.
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            // Writing into a `String` does not fail, so the result is ignored.
            let _ = self.normalize_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a string slice into maximum normalized prefix and unnormalized suffix
        /// such that the concatenation of the prefix and the normalization of the suffix
        /// is the normalization of the whole input.
        pub fn split_normalized<'a>(&self, text: &'a str) -> (&'a str, &'a str) {
            let up_to = self.is_normalized_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                ("", text)
            })
        }

        /// Return the index a string slice is normalized up to.
        fn is_normalized_up_to(&self, text: &str) -> usize {
            let mut sink = IsNormalizedSinkStr::new(text);
            // The sink's error only stops the normalization early; the
            // answer is read from how much input the sink has left.
            let _ = self.normalize_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check whether a string slice is normalized.
        pub fn is_normalized(&self, text: &str) -> bool {
            self.is_normalized_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-16 into a `Cow<'a, [u16]>`.
        ///
        /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER
        /// before normalizing.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn normalize_utf16<'a>(&self, text: &'a [u16]) -> Cow<'a, [u16]> {
            let (head, tail) = self.split_normalized_utf16(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            let mut ret = alloc::vec::Vec::with_capacity(text.len());
            ret.extend_from_slice(head);
            let _ = self.normalize_utf16_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-16 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn split_normalized_utf16<'a>(&self, text: &'a [u16]) -> (&'a [u16], &'a [u16]) {
            let up_to = self.is_normalized_utf16_up_to(text);
            text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            })
        }

        /// Return the index a slice of potentially-invalid UTF-16 is normalized up to.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        fn is_normalized_utf16_up_to(&self, text: &[u16]) -> usize {
            let mut sink = IsNormalizedSinkUtf16::new(text);
            let _ = self.normalize_utf16_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Checks whether a slice of potentially-invalid UTF-16 is normalized.
        ///
        /// Unpaired surrogates are treated as the REPLACEMENT CHARACTER.
        ///
        /// ✨ *Enabled with the `utf16_iter` Cargo feature.*
        #[cfg(feature = "utf16_iter")]
        pub fn is_normalized_utf16(&self, text: &[u16]) -> bool {
            self.is_normalized_utf16_up_to(text) == text.len()
        }

        /// Normalize a slice of potentially-invalid UTF-8 into a `Cow<'a, str>`.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn normalize_utf8<'a>(&self, text: &'a [u8]) -> Cow<'a, str> {
            let (head, tail) = self.split_normalized_utf8(text);
            if tail.is_empty() {
                return Cow::Borrowed(head);
            }
            // Allocate once up front, as `normalize_utf16` does.
            let mut ret = String::with_capacity(text.len());
            ret.push_str(head);
            let _ = self.normalize_utf8_to(tail, &mut ret);
            Cow::Owned(ret)
        }

        /// Split a slice of potentially-invalid UTF-8 into maximum normalized (and valid)
        /// prefix and unnormalized suffix such that the concatenation of the prefix and the
        /// normalization of the suffix is the normalization of the whole input.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn split_normalized_utf8<'a>(&self, text: &'a [u8]) -> (&'a str, &'a [u8]) {
            let up_to = self.is_normalized_utf8_up_to(text);
            let (head, tail) = text.split_at_checked(up_to).unwrap_or_else(|| {
                // Internal bug, not even GIGO, never supposed to happen
                debug_assert!(false);
                (&[], text)
            });
            // SAFETY: The normalization check also checks for
            // UTF-8 well-formedness.
            (unsafe { core::str::from_utf8_unchecked(head) }, tail)
        }

        /// Return the index a slice of potentially-invalid UTF-8 is normalized up to
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        fn is_normalized_utf8_up_to(&self, text: &[u8]) -> usize {
            let mut sink = IsNormalizedSinkUtf8::new(text);
            let _ = self.normalize_utf8_to(text, &mut sink);
            text.len() - sink.remaining_len()
        }

        /// Check if a slice of potentially-invalid UTF-8 is normalized.
        ///
        /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER
        /// according to the WHATWG Encoding Standard before checking.
        ///
        /// ✨ *Enabled with the `utf8_iter` Cargo feature.*
        #[cfg(feature = "utf8_iter")]
        pub fn is_normalized_utf8(&self, text: &[u8]) -> bool {
            self.is_normalized_utf8_up_to(text) == text.len()
        }
    };
}
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfd() -> Self { + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars24 + .const_len() + <= 0xFFF, + "future extension" + ); + + DecomposingNormalizerBorrowed { + decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, + tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, + supplementary_tables: None, + decomposition_passthrough_bound: 0xC0, + composition_passthrough_bound: 0x0300, + } + } + + /// NFKD constructor using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfkd() -> Self { + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars24 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1 + .scalars24 + .const_len() + <= 0xFFF, + "future extension" + ); + + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap <= 0x0300, + "invalid" + ); + + let decomposition_capped = + if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0xC0 { + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap + } else { + 0xC0 + }; + let composition_capped = + if crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap < 0x0300 { + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1.passthrough_cap + } else { + 0x0300 + }; 
+ + DecomposingNormalizerBorrowed { + decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_DATA_V1, + tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, + supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1), + decomposition_passthrough_bound: decomposition_capped as u8, + composition_passthrough_bound: composition_capped, + } + } + + #[cfg(feature = "compiled_data")] + pub(crate) const fn new_uts46_decomposed() -> Self { + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars24 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1 + .scalars24 + .const_len() + <= 0xFFF, + "future extension" + ); + + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap <= 0x0300, + "invalid" + ); + + let decomposition_capped = + if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap < 0xC0 { + crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap + } else { + 0xC0 + }; + let composition_capped = if crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1 + .passthrough_cap + < 0x0300 + { + crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1.passthrough_cap + } else { + 0x0300 + }; + + DecomposingNormalizerBorrowed { + decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_UTS46_DATA_V1, + tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, + supplementary_tables: Some(crate::provider::Baked::SINGLETON_NORMALIZER_NFKD_TABLES_V1), + decomposition_passthrough_bound: decomposition_capped as u8, + composition_passthrough_bound: composition_capped, + } + } +} + +impl<'data> DecomposingNormalizerBorrowed<'data> { + /// Wraps a delegate iterator into a 
decomposing iterator + /// adapter by using the data already held by this normalizer. + pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Decomposition<'data, I> { + Decomposition::new_with_supplements( + iter, + self.decompositions, + self.tables, + self.supplementary_tables, + self.decomposition_passthrough_bound, + IgnorableBehavior::Unsupported, + ) + } + + normalizer_methods!(); + + decomposing_normalize_to!( + /// Normalize a string slice into a `Write` sink. + , + normalize_to, + core::fmt::Write, + &str, + { + }, + as_str, + { + let decomposition_passthrough_byte_bound = if decomposition_passthrough_bound == 0xC0 { + 0xC3u8 + } else { + decomposition_passthrough_bound.min(0x80) as u8 + }; + // The attribute belongs on an inner statement, but Rust doesn't allow it there. + #[allow(clippy::unwrap_used)] + 'fast: loop { + let mut code_unit_iter = decomposition.delegate.as_str().as_bytes().iter(); + 'fastest: loop { + if let Some(&upcoming_byte) = code_unit_iter.next() { + if upcoming_byte < decomposition_passthrough_byte_bound { + // Fast-track succeeded! + continue 'fastest; + } + decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); + break 'fastest; + } + // End of stream + sink.write_str(pending_slice)?; + return Ok(()); + } + + // `unwrap()` OK, because the slice is valid UTF-8 and we know there + // is an upcoming byte. + let upcoming = decomposition.delegate.next().unwrap(); + let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming); + if upcoming_with_trie_value.starter_and_decomposes_to_self() { + continue 'fast; + } + let consumed_so_far_slice = &pending_slice[..pending_slice.len() + - decomposition.delegate.as_str().len() + - upcoming.len_utf8()]; + sink.write_str(consumed_so_far_slice)?; + + // Now let's figure out if we got a starter or a non-starter. 
+ if decomposition_starts_with_non_starter( + upcoming_with_trie_value.trie_val, + ) { + // Let this trie value be reprocessed in case it is + // one of the rare decomposing ones. + decomposition.pending = Some(upcoming_with_trie_value); + decomposition.gather_and_sort_combining(0); + continue 'outer; + } + undecomposed_starter = upcoming_with_trie_value; + debug_assert!(decomposition.pending.is_none()); + break 'fast; + } + }, + text, + sink, + decomposition, + decomposition_passthrough_bound, + undecomposed_starter, + pending_slice, + 'outer, + ); + + decomposing_normalize_to!( + /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink. + /// + /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER + /// according to the WHATWG Encoding Standard. + /// + /// ✨ *Enabled with the `utf8_iter` Cargo feature.* + #[cfg(feature = "utf8_iter")] + , + normalize_utf8_to, + core::fmt::Write, + &[u8], + { + }, + as_slice, + { + let decomposition_passthrough_byte_bound = decomposition_passthrough_bound.min(0x80) as u8; + // The attribute belongs on an inner statement, but Rust doesn't allow it there. + #[allow(clippy::unwrap_used)] + 'fast: loop { + let mut code_unit_iter = decomposition.delegate.as_slice().iter(); + 'fastest: loop { + if let Some(&upcoming_byte) = code_unit_iter.next() { + if upcoming_byte < decomposition_passthrough_byte_bound { + // Fast-track succeeded! + continue 'fastest; + } + break 'fastest; + } + // End of stream + sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?; + return Ok(()); + } + decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); + + // `unwrap()` OK, because we know there is an upcoming byte. Note that this + // slice is only potentially-valid UTF-8; U+FFFD is handled separately below. 
+ let upcoming = decomposition.delegate.next().unwrap(); + let upcoming_with_trie_value = decomposition.attach_trie_value(upcoming); + if upcoming_with_trie_value.starter_and_decomposes_to_self_except_replacement() { + // Note: The trie value of the REPLACEMENT CHARACTER is + // intentionally formatted to fail the + // `starter_and_decomposes_to_self` test even though it + // really is a starter that decomposes to self. This + // allows moving the branch on REPLACEMENT CHARACTER + // below this `continue`. + continue 'fast; + } + + // TODO: Annotate as unlikely. + if upcoming == REPLACEMENT_CHARACTER { + // We might have an error, so fall out of the fast path. + + // Since the U+FFFD might signify an error, we can't + // assume `upcoming.len_utf8()` for the backoff length. + let mut consumed_so_far = pending_slice[..pending_slice.len() - decomposition.delegate.as_slice().len()].chars(); + let back = consumed_so_far.next_back(); + debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER)); + let consumed_so_far_slice = consumed_so_far.as_slice(); + // NOTE(review): the unchecked conversion presumably relies on everything before this + // U+FFFD having passed the fast-path checks above (which U+FFFD itself fails), so the + // prefix holds no ill-formed bytes; confirm against how the macro sets `pending_slice`. + sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?; + + // We could call `gather_and_sort_combining` here and + // `continue 'outer`, but this should be better for code + // size. + undecomposed_starter = upcoming_with_trie_value; + debug_assert!(decomposition.pending.is_none()); + break 'fast; + } + + let consumed_so_far_slice = &pending_slice[..pending_slice.len() + - decomposition.delegate.as_slice().len() + - upcoming.len_utf8()]; + sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) } )?; + + // Now let's figure out if we got a starter or a non-starter. + if decomposition_starts_with_non_starter( + upcoming_with_trie_value.trie_val, + ) { + // Let this trie value be reprocessed in case it is + // one of the rare decomposing ones. 
+ decomposition.pending = Some(upcoming_with_trie_value); + decomposition.gather_and_sort_combining(0); + continue 'outer; + } + undecomposed_starter = upcoming_with_trie_value; + debug_assert!(decomposition.pending.is_none()); + break 'fast; + } + }, + text, + sink, + decomposition, + decomposition_passthrough_bound, + undecomposed_starter, + pending_slice, + 'outer, + ); + + decomposing_normalize_to!( + /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink. + /// + /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER + /// before normalizing. + /// + /// ✨ *Enabled with the `utf16_iter` Cargo feature.* + #[cfg(feature = "utf16_iter")] + , + normalize_utf16_to, + write16::Write16, + &[u16], + { + sink.size_hint(text.len())?; + }, + as_slice, + { + let mut code_unit_iter = decomposition.delegate.as_slice().iter(); + 'fast: loop { + if let Some(&upcoming_code_unit) = code_unit_iter.next() { + let mut upcoming32 = u32::from(upcoming_code_unit); + if upcoming32 < decomposition_passthrough_bound { + continue 'fast; + } + // We might be doing a trie lookup by surrogate. Surrogates get + // a decomposition to U+FFFD. + let mut trie_value = decomposition.trie.get32(upcoming32); + if starter_and_decomposes_to_self_impl(trie_value) { + continue 'fast; + } + // We might now be looking at a surrogate. + // The loop is only broken out of as goto forward + #[allow(clippy::never_loop)] + 'surrogateloop: loop { + let surrogate_base = upcoming32.wrapping_sub(0xD800); + if surrogate_base > (0xDFFF - 0xD800) { + // Not surrogate + break 'surrogateloop; + } + if surrogate_base <= (0xDBFF - 0xD800) { + let iter_backup = code_unit_iter.clone(); + if let Some(&low) = code_unit_iter.next() { + if in_inclusive_range16(low, 0xDC00, 0xDFFF) { + upcoming32 = (upcoming32 << 10) + u32::from(low) + - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); + // Successfully-paired surrogate. Read from the trie again. 
+ trie_value = decomposition.trie.get32(upcoming32); + if starter_and_decomposes_to_self_impl(trie_value) { + continue 'fast; + } + break 'surrogateloop; + } else { + code_unit_iter = iter_backup; + } + } + } + // unpaired surrogate + upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check. + // trie_value already holds a decomposition to U+FFFD. + break 'surrogateloop; + } + + // SAFETY: `upcoming32` can no longer be a surrogate: it either was not one to begin + // with, was combined with a low surrogate into a supplementary code point, or was + // replaced with U+FFFD above. + let upcoming = unsafe { char::from_u32_unchecked(upcoming32) }; + let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value); + + let consumed_so_far_slice = &pending_slice[..pending_slice.len() + - code_unit_iter.as_slice().len() + - upcoming.len_utf16()]; + sink.write_slice(consumed_so_far_slice)?; + + // Now let's figure out if we got a starter or a non-starter. + if decomposition_starts_with_non_starter( + upcoming_with_trie_value.trie_val, + ) { + // Sync with main iterator + decomposition.delegate = code_unit_iter.as_slice().chars(); + // Let this trie value be reprocessed in case it is + // one of the rare decomposing ones. + decomposition.pending = Some(upcoming_with_trie_value); + decomposition.gather_and_sort_combining(0); + continue 'outer; + } + undecomposed_starter = upcoming_with_trie_value; + debug_assert!(decomposition.pending.is_none()); + break 'fast; + } + // End of stream + sink.write_slice(pending_slice)?; + return Ok(()); + } + // Sync the main iterator + decomposition.delegate = code_unit_iter.as_slice().chars(); + }, + text, + sink, + decomposition, + decomposition_passthrough_bound, + undecomposed_starter, + pending_slice, + 'outer, + ); +} + +/// A normalizer for performing decomposing normalization. 
+#[derive(Debug)] +pub struct DecomposingNormalizer { + decompositions: DataPayload<NormalizerNfdDataV1>, + tables: DataPayload<NormalizerNfdTablesV1>, + supplementary_tables: Option<DataPayload<NormalizerNfkdTablesV1>>, + decomposition_passthrough_bound: u8, // never above 0xC0 + composition_passthrough_bound: u16, // never above 0x0300 +} + +impl DecomposingNormalizer { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> DecomposingNormalizerBorrowed { + DecomposingNormalizerBorrowed { + decompositions: self.decompositions.get(), + tables: self.tables.get(), + supplementary_tables: self.supplementary_tables.as_ref().map(|s| s.get()), + decomposition_passthrough_bound: self.decomposition_passthrough_bound, + composition_passthrough_bound: self.composition_passthrough_bound, + } + } + + /// NFD constructor using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfd() -> DecomposingNormalizerBorrowed<'static> { + DecomposingNormalizerBorrowed::new_nfd() + } + + icu_provider::gen_buffer_data_constructors!( + () -> error: DataError, + functions: [ + new_nfd: skip, + try_new_nfd_with_buffer_provider, + try_new_nfd_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfd)] + pub fn try_new_nfd_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfdDataV1> + DataProvider<NormalizerNfdTablesV1> + ?Sized, + { + let decompositions: DataPayload<NormalizerNfdDataV1> = + provider.load(Default::default())?.payload; + let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; + + if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { + // The data is from a future where there exists a normalization flavor whose + // complex 
decompositions take more than 0xFFF but fewer than 0x1FFF code points + // of space. If a good use case from such a decomposition flavor arises, we can + // dynamically change the bit masks so that the length mask becomes 0x1FFF instead + // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, + // since for now the masks are hard-coded, error out. + return Err( + DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO) + ); + } + + let cap = decompositions.get().passthrough_cap; + if cap > 0x0300 { + return Err(DataError::custom("invalid").with_marker(NormalizerNfdDataV1::INFO)); + } + let decomposition_capped = cap.min(0xC0); + let composition_capped = cap.min(0x0300); + + Ok(DecomposingNormalizer { + decompositions, + tables, + supplementary_tables: None, + decomposition_passthrough_bound: decomposition_capped as u8, + composition_passthrough_bound: composition_capped, + }) + } + + icu_provider::gen_buffer_data_constructors!( + () -> error: DataError, + functions: [ + new_nfkd: skip, + try_new_nfkd_with_buffer_provider, + try_new_nfkd_unstable, + Self, + ] + ); + + /// NFKD constructor using compiled data. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfkd() -> DecomposingNormalizerBorrowed<'static> { + DecomposingNormalizerBorrowed::new_nfkd() + } + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkd)] + pub fn try_new_nfkd_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfkdDataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfkdTablesV1> + + ?Sized, + { + let decompositions: DataPayload<NormalizerNfkdDataV1> = + provider.load(Default::default())?.payload; + let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; + let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> = + provider.load(Default::default())?.payload; + + if tables.get().scalars16.len() + + tables.get().scalars24.len() + + supplementary_tables.get().scalars16.len() + + supplementary_tables.get().scalars24.len() + > 0xFFF + { + // The data is from a future where there exists a normalization flavor whose + // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points + // of space. If a good use case from such a decomposition flavor arises, we can + // dynamically change the bit masks so that the length mask becomes 0x1FFF instead + // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, + // since for now the masks are hard-coded, error out. 
+ return Err( + DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO) + ); + } + + let cap = decompositions.get().passthrough_cap; + if cap > 0x0300 { + return Err(DataError::custom("invalid").with_marker(NormalizerNfkdDataV1::INFO)); + } + let decomposition_capped = cap.min(0xC0); + let composition_capped = cap.min(0x0300); + + Ok(DecomposingNormalizer { + decompositions: decompositions.cast(), + tables, + supplementary_tables: Some(supplementary_tables), + decomposition_passthrough_bound: decomposition_capped as u8, + composition_passthrough_bound: composition_capped, + }) + } + + /// UTS 46 decomposed constructor (testing only) + /// + /// This is a special building block normalization for IDNA. It is the decomposed counterpart of + /// ICU4C's UTS 46 normalization with two exceptions: characters that UTS 46 disallows and + /// ICU4C maps to U+FFFD and characters that UTS 46 maps to the empty string normalize as in + /// NFD in this normalization. In both cases, the previous UTS 46 processing before using + /// normalization is expected to deal with these characters. Making the disallowed characters + /// behave like this is beneficial to data size, and this normalizer implementation cannot + /// deal with a character normalizing to the empty string, which doesn't happen in NFD or + /// NFKD as of Unicode 14. + /// + /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior + /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns + /// U+0345 from a reordered character into a non-reordered character before reordering happens. + /// Therefore, the output of this normalization may differ for different inputs that are + /// canonically equivalent with each other if they differ by how U+0345 is ordered relative + /// to other reorderable characters. 
+ pub(crate) fn try_new_uts46_decomposed_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerUts46DataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfkdTablesV1> + // UTS 46 tables merged into CompatibilityDecompositionTablesV1 + + ?Sized, + { + let decompositions: DataPayload<NormalizerUts46DataV1> = + provider.load(Default::default())?.payload; + let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; + let supplementary_tables: DataPayload<NormalizerNfkdTablesV1> = + provider.load(Default::default())?.payload; + + if tables.get().scalars16.len() + + tables.get().scalars24.len() + + supplementary_tables.get().scalars16.len() + + supplementary_tables.get().scalars24.len() + > 0xFFF + { + // The data is from a future where there exists a normalization flavor whose + // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points + // of space. If a good use case from such a decomposition flavor arises, we can + // dynamically change the bit masks so that the length mask becomes 0x1FFF instead + // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, + // since for now the masks are hard-coded, error out. + return Err( + DataError::custom("future extension").with_marker(NormalizerNfdTablesV1::INFO) + ); + } + + let cap = decompositions.get().passthrough_cap; + if cap > 0x0300 { + return Err(DataError::custom("invalid").with_marker(NormalizerUts46DataV1::INFO)); + } + let decomposition_capped = cap.min(0xC0); + let composition_capped = cap.min(0x0300); + + Ok(DecomposingNormalizer { + decompositions: decompositions.cast(), + tables, + supplementary_tables: Some(supplementary_tables), + decomposition_passthrough_bound: decomposition_capped as u8, + composition_passthrough_bound: composition_capped, + }) + } +} + +/// Borrowed version of a normalizer for performing composing normalization. 
+#[derive(Debug)] +pub struct ComposingNormalizerBorrowed<'a> { + decomposing_normalizer: DecomposingNormalizerBorrowed<'a>, + canonical_compositions: &'a CanonicalCompositions<'a>, +} + +impl ComposingNormalizerBorrowed<'static> { + /// Cheaply converts a [`ComposingNormalizerBorrowed<'static>`] into a [`ComposingNormalizer`]. + /// + /// Note: Due to branching and indirection, using [`ComposingNormalizer`] might inhibit some + /// compile-time optimizations that are possible with [`ComposingNormalizerBorrowed`]. + pub const fn static_to_owned(self) -> ComposingNormalizer { + ComposingNormalizer { + decomposing_normalizer: self.decomposing_normalizer.static_to_owned(), + canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions), + } + } + + /// NFC constructor using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfc() -> Self { + ComposingNormalizerBorrowed { + decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfd(), + canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, + } + } + + /// NFKC constructor using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfkc() -> Self { + ComposingNormalizerBorrowed { + decomposing_normalizer: DecomposingNormalizerBorrowed::new_nfkd(), + canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, + } + } + + /// This is a special building block normalization for IDNA that implements parts of the Map + /// step and the following Normalize step. 
+ /// + /// Warning: In this normalization, U+0345 COMBINING GREEK YPOGEGRAMMENI exhibits a behavior + /// that no character in Unicode exhibits in NFD, NFKD, NFC, or NFKC: Case folding turns + /// U+0345 from a reordered character into a non-reordered character before reordering happens. + /// Therefore, the output of this normalization may differ for different inputs that are + /// canonically equivalent with each other if they differ by how U+0345 is ordered relative + /// to other reorderable characters. + #[cfg(feature = "compiled_data")] + pub(crate) const fn new_uts46() -> Self { + ComposingNormalizerBorrowed { + decomposing_normalizer: DecomposingNormalizerBorrowed::new_uts46_decomposed(), + canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, + } + } +} + +impl<'data> ComposingNormalizerBorrowed<'data> { + /// Wraps a delegate iterator into a composing iterator + /// adapter by using the data already held by this normalizer. + pub fn normalize_iter<I: Iterator<Item = char>>(&self, iter: I) -> Composition<'data, I> { + self.normalize_iter_private(iter, IgnorableBehavior::Unsupported) + } + + /// Shared implementation behind `normalize_iter` that lets the caller pick the + /// `IgnorableBehavior` passed to the underlying `Decomposition`. + fn normalize_iter_private<I: Iterator<Item = char>>( + &self, + iter: I, + ignorable_behavior: IgnorableBehavior, + ) -> Composition<'data, I> { + Composition::new( + Decomposition::new_with_supplements( + iter, + self.decomposing_normalizer.decompositions, + self.decomposing_normalizer.tables, + self.decomposing_normalizer.supplementary_tables, + self.decomposing_normalizer.decomposition_passthrough_bound, + ignorable_behavior, + ), + self.canonical_compositions.canonical_compositions.clone(), + self.decomposing_normalizer.composition_passthrough_bound, + ) + } + + normalizer_methods!(); + + composing_normalize_to!( + /// Normalize a string slice into a `Write` sink. + , + normalize_to, + core::fmt::Write, + &str, + {}, + true, + as_str, + { + // Let's hope LICM hoists this outside `'outer`. 
+ let composition_passthrough_byte_bound = if composition_passthrough_bound == 0x300 { + 0xCCu8 + } else { + // We can make this fancy if a normalization other than NFC where looking at + // non-ASCII lead bytes is worthwhile is ever introduced. + composition_passthrough_bound.min(0x80) as u8 + }; + // Attributes have to be on blocks, so hoisting all the way here. + #[allow(clippy::unwrap_used)] + 'fast: loop { + let mut code_unit_iter = composition.decomposition.delegate.as_str().as_bytes().iter(); + 'fastest: loop { + if let Some(&upcoming_byte) = code_unit_iter.next() { + if upcoming_byte < composition_passthrough_byte_bound { + // Fast-track succeeded! + continue 'fastest; + } + composition.decomposition.delegate = pending_slice[pending_slice.len() - code_unit_iter.as_slice().len() - 1..].chars(); + break 'fastest; + } + // End of stream + sink.write_str(pending_slice)?; + return Ok(()); + } + // `unwrap()` OK, because the slice is valid UTF-8 and we know there + // is an upcoming byte. + let upcoming = composition.decomposition.delegate.next().unwrap(); + let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming); + if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() { + // Can't combine backwards, hence a plain (non-backwards-combining) + // starter albeit past `composition_passthrough_bound` + + // Fast-track succeeded! + continue 'fast; + } + // We need to fall off the fast path. + composition.decomposition.pending = Some(upcoming_with_trie_value); + + // slicing and unwrap OK, because we've just evidently read enough previously. 
+ let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_str().len() - upcoming.len_utf8()].chars(); + // `unwrap` OK, because we've previously managed to read the previous character + undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap()); + let consumed_so_far_slice = consumed_so_far.as_str(); + sink.write_str(consumed_so_far_slice)?; + break 'fast; + } + }, + text, + sink, + composition, + composition_passthrough_bound, + undecomposed_starter, + pending_slice, + len_utf8, + ); + + composing_normalize_to!( + /// Normalize a slice of potentially-invalid UTF-8 into a `Write` sink. + /// + /// Ill-formed byte sequences are mapped to the REPLACEMENT CHARACTER + /// according to the WHATWG Encoding Standard. + /// + /// ✨ *Enabled with the `utf8_iter` Cargo feature.* + #[cfg(feature = "utf8_iter")] + , + normalize_utf8_to, + core::fmt::Write, + &[u8], + {}, + false, + as_slice, + { + 'fast: loop { + if let Some(upcoming) = composition.decomposition.delegate.next() { + if u32::from(upcoming) < composition_passthrough_bound { + // Fast-track succeeded! + continue 'fast; + } + // TODO: Be statically aware of fast/small trie. + let upcoming_with_trie_value = composition.decomposition.attach_trie_value(upcoming); + if upcoming_with_trie_value.potential_passthrough_and_cannot_combine_backwards() { + // Note: The trie value of the REPLACEMENT CHARACTER is + // intentionally formatted to fail the + // `potential_passthrough_and_cannot_combine_backwards` + // test even though it really is a starter that decomposes + // to self and cannot combine backwards. This + // allows moving the branch on REPLACEMENT CHARACTER + // below this `continue`. + continue 'fast; + } + // We need to fall off the fast path. + + // TODO(#2006): Annotate as unlikely + if upcoming == REPLACEMENT_CHARACTER { + // Can't tell if this is an error or a literal U+FFFD in + // the input. 
Assuming the former to be sure. + + // Since the U+FFFD might signify an error, we can't + // assume `upcoming.len_utf8()` for the backoff length. + let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len()].chars(); + let back = consumed_so_far.next_back(); + debug_assert_eq!(back, Some(REPLACEMENT_CHARACTER)); + let consumed_so_far_slice = consumed_so_far.as_slice(); + sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice) })?; + undecomposed_starter = CharacterAndTrieValue::new(REPLACEMENT_CHARACTER, 0); + composition.decomposition.pending = None; + break 'fast; + } + + composition.decomposition.pending = Some(upcoming_with_trie_value); + // slicing and unwrap OK, because we've just evidently read enough previously. + // `unwrap` OK, because we've previously managed to read the previous character + let mut consumed_so_far = pending_slice[..pending_slice.len() - composition.decomposition.delegate.as_slice().len() - upcoming.len_utf8()].chars(); + #[allow(clippy::unwrap_used)] + { + // TODO: If the previous character was below the passthrough bound, + // we really need to read from the trie. Otherwise, we could maintain + // the most-recent trie value. Need to measure what's more expensive: + // Remembering the trie value on each iteration or re-reading the + // last one after the fast-track run. 
+ undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap()); + } + let consumed_so_far_slice = consumed_so_far.as_slice(); + sink.write_str(unsafe { core::str::from_utf8_unchecked(consumed_so_far_slice)})?; + break 'fast; + } + // End of stream + sink.write_str(unsafe { core::str::from_utf8_unchecked(pending_slice) })?; + return Ok(()); + } + }, + text, + sink, + composition, + composition_passthrough_bound, + undecomposed_starter, + pending_slice, + len_utf8, + ); + + composing_normalize_to!( + /// Normalize a slice of potentially-invalid UTF-16 into a `Write16` sink. + /// + /// Unpaired surrogates are mapped to the REPLACEMENT CHARACTER + /// before normalizing. + /// + /// ✨ *Enabled with the `utf16_iter` Cargo feature.* + #[cfg(feature = "utf16_iter")] + , + normalize_utf16_to, + write16::Write16, + &[u16], + { + sink.size_hint(text.len())?; + }, + false, + as_slice, + { + let mut code_unit_iter = composition.decomposition.delegate.as_slice().iter(); + let mut upcoming32; + // Declaring this up here is useful for getting compile errors about invalid changes + // to the code structure below. + let mut trie_value; + 'fast: loop { + if let Some(&upcoming_code_unit) = code_unit_iter.next() { + upcoming32 = u32::from(upcoming_code_unit); // may be surrogate + if upcoming32 < composition_passthrough_bound { + // No need for surrogate or U+FFFD check, because + // `composition_passthrough_bound` cannot be higher than + // U+0300. + // Fast-track succeeded! + // At this point, `trie_value` is out of sync with `upcoming32`. + // However, we either 1) reach the end of `code_unit_iter`, at + // which point nothing reads `trie_value` anymore or we + // execute the line immediately below this loop. + continue 'fast; + } + // We might be doing a trie lookup by surrogate. Surrogates get + // a decomposition to U+FFFD. 
+ trie_value = composition.decomposition.trie.get32(upcoming32); + if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) { + // Can't combine backwards, hence a plain (non-backwards-combining) + // starter albeit past `composition_passthrough_bound` + + // Fast-track succeeded! + continue 'fast; + } + + // We might now be looking at a surrogate. + // The loop is only broken out of as goto forward + #[allow(clippy::never_loop)] + 'surrogateloop: loop { + let surrogate_base = upcoming32.wrapping_sub(0xD800); + if surrogate_base > (0xDFFF - 0xD800) { + // Not surrogate + break 'surrogateloop; + } + if surrogate_base <= (0xDBFF - 0xD800) { + let iter_backup = code_unit_iter.clone(); + if let Some(&low) = code_unit_iter.next() { + if in_inclusive_range16(low, 0xDC00, 0xDFFF) { + upcoming32 = (upcoming32 << 10) + u32::from(low) + - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32); + // Successfully-paired surrogate. Read from the trie again. + trie_value = composition.decomposition.trie.get32(upcoming32); + if potential_passthrough_and_cannot_combine_backwards_impl(trie_value) { + // Fast-track succeeded! + continue 'fast; + } + break 'surrogateloop; + } else { + code_unit_iter = iter_backup; + } + } + } + // unpaired surrogate + upcoming32 = 0xFFFD; // Safe value for `char::from_u32_unchecked` and matches later potential error check. + // trie_value already holds a decomposition to U+FFFD. + debug_assert_eq!(trie_value, NON_ROUND_TRIP_MARKER | BACKWARD_COMBINING_MARKER | 0xFFFD); + break 'surrogateloop; + } + + // SAFETY: upcoming32 can no longer be a surrogate. + let upcoming = unsafe { char::from_u32_unchecked(upcoming32) }; + let upcoming_with_trie_value = CharacterAndTrieValue::new(upcoming, trie_value); + // We need to fall off the fast path. 
+ composition.decomposition.pending = Some(upcoming_with_trie_value); + let mut consumed_so_far = pending_slice[..pending_slice.len() - code_unit_iter.as_slice().len() - upcoming.len_utf16()].chars(); + // `unwrap` OK, because we've previously managed to read the previous character + #[allow(clippy::unwrap_used)] + { + // TODO: If the previous character was below the passthrough bound, + // we really need to read from the trie. Otherwise, we could maintain + // the most-recent trie value. Need to measure what's more expensive: + // Remembering the trie value on each iteration or re-reading the + // last one after the fast-track run. + undecomposed_starter = composition.decomposition.attach_trie_value(consumed_so_far.next_back().unwrap()); + } + let consumed_so_far_slice = consumed_so_far.as_slice(); + sink.write_slice(consumed_so_far_slice)?; + break 'fast; + } + // End of stream + sink.write_slice(pending_slice)?; + return Ok(()); + } + // Sync the main iterator + composition.decomposition.delegate = code_unit_iter.as_slice().chars(); + }, + text, + sink, + composition, + composition_passthrough_bound, + undecomposed_starter, + pending_slice, + len_utf16, + ); +} + +/// A normalizer for performing composing normalization. +#[derive(Debug)] +pub struct ComposingNormalizer { + decomposing_normalizer: DecomposingNormalizer, + canonical_compositions: DataPayload<NormalizerNfcV1>, +} + +impl ComposingNormalizer { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> ComposingNormalizerBorrowed<'_> { + ComposingNormalizerBorrowed { + decomposing_normalizer: self.decomposing_normalizer.as_borrowed(), + canonical_compositions: self.canonical_compositions.get(), + } + } + + /// NFC constructor using compiled data. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfc() -> ComposingNormalizerBorrowed<'static> { + ComposingNormalizerBorrowed::new_nfc() + } + + icu_provider::gen_buffer_data_constructors!( + () -> error: DataError, + functions: [ + new_nfc: skip, + try_new_nfc_with_buffer_provider, + try_new_nfc_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfc)] + pub fn try_new_nfc_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfdDataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfcV1> + + ?Sized, + { + let decomposing_normalizer = DecomposingNormalizer::try_new_nfd_unstable(provider)?; + + let canonical_compositions: DataPayload<NormalizerNfcV1> = + provider.load(Default::default())?.payload; + + Ok(ComposingNormalizer { + decomposing_normalizer, + canonical_compositions, + }) + } + + /// NFKC constructor using compiled data. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new_nfkc() -> ComposingNormalizerBorrowed<'static> { + ComposingNormalizerBorrowed::new_nfkc() + } + + icu_provider::gen_buffer_data_constructors!( + () -> error: DataError, + functions: [ + new_nfkc: skip, + try_new_nfkc_with_buffer_provider, + try_new_nfkc_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_nfkc)] + pub fn try_new_nfkc_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfkdDataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfkdTablesV1> + + DataProvider<NormalizerNfcV1> + + ?Sized, + { + let decomposing_normalizer = DecomposingNormalizer::try_new_nfkd_unstable(provider)?; + + let canonical_compositions: DataPayload<NormalizerNfcV1> = + provider.load(Default::default())?.payload; + + Ok(ComposingNormalizer { + decomposing_normalizer, + canonical_compositions, + }) + } + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new_uts46)] + pub(crate) fn try_new_uts46_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerUts46DataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfkdTablesV1> + // UTS 46 tables merged into CompatibilityDecompositionTablesV1 + + DataProvider<NormalizerNfcV1> + + ?Sized, + { + let decomposing_normalizer = + DecomposingNormalizer::try_new_uts46_decomposed_unstable(provider)?; + + let canonical_compositions: DataPayload<NormalizerNfcV1> = + provider.load(Default::default())?.payload; + + Ok(ComposingNormalizer { + decomposing_normalizer, + canonical_compositions, + }) + } +} + +#[cfg(feature = "utf16_iter")] +struct IsNormalizedSinkUtf16<'a> { + expect: &'a [u16], +} + +#[cfg(feature = "utf16_iter")] +impl<'a> IsNormalizedSinkUtf16<'a> { + pub fn new(slice: &'a 
[u16]) -> Self { + IsNormalizedSinkUtf16 { expect: slice } + } + pub fn remaining_len(&self) -> usize { + self.expect.len() + } +} + +#[cfg(feature = "utf16_iter")] +impl write16::Write16 for IsNormalizedSinkUtf16<'_> { + fn write_slice(&mut self, s: &[u16]) -> core::fmt::Result { + // We know that if we get a slice, it's a pass-through, + // so we can compare addresses. Indexing is OK, because + // an indexing failure would be a code bug rather than + // an input or data issue. + #[allow(clippy::indexing_slicing)] + if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) { + self.expect = &self.expect[s.len()..]; + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } + + fn write_char(&mut self, c: char) -> core::fmt::Result { + let mut iter = self.expect.chars(); + if iter.next() == Some(c) { + self.expect = iter.as_slice(); + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } +} + +#[cfg(feature = "utf8_iter")] +struct IsNormalizedSinkUtf8<'a> { + expect: &'a [u8], +} + +#[cfg(feature = "utf8_iter")] +impl<'a> IsNormalizedSinkUtf8<'a> { + pub fn new(slice: &'a [u8]) -> Self { + IsNormalizedSinkUtf8 { expect: slice } + } + pub fn remaining_len(&self) -> usize { + self.expect.len() + } +} + +#[cfg(feature = "utf8_iter")] +impl core::fmt::Write for IsNormalizedSinkUtf8<'_> { + fn write_str(&mut self, s: &str) -> core::fmt::Result { + // We know that if we get a slice, it's a pass-through, + // so we can compare addresses. Indexing is OK, because + // an indexing failure would be a code bug rather than + // an input or data issue. 
+ #[allow(clippy::indexing_slicing)] + if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) { + self.expect = &self.expect[s.len()..]; + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } + + fn write_char(&mut self, c: char) -> core::fmt::Result { + let mut iter = self.expect.chars(); + if iter.next() == Some(c) { + self.expect = iter.as_slice(); + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } +} + +struct IsNormalizedSinkStr<'a> { + expect: &'a str, +} + +impl<'a> IsNormalizedSinkStr<'a> { + pub fn new(slice: &'a str) -> Self { + IsNormalizedSinkStr { expect: slice } + } + pub fn remaining_len(&self) -> usize { + self.expect.len() + } +} + +impl core::fmt::Write for IsNormalizedSinkStr<'_> { + fn write_str(&mut self, s: &str) -> core::fmt::Result { + // We know that if we get a slice, it's a pass-through, + // so we can compare addresses. Indexing is OK, because + // an indexing failure would be a code bug rather than + // an input or data issue. + #[allow(clippy::indexing_slicing)] + if core::ptr::eq(s.as_ptr(), self.expect.as_ptr()) { + self.expect = &self.expect[s.len()..]; + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } + + fn write_char(&mut self, c: char) -> core::fmt::Result { + let mut iter = self.expect.chars(); + if iter.next() == Some(c) { + self.expect = iter.as_str(); + Ok(()) + } else { + Err(core::fmt::Error {}) + } + } +} diff --git a/vendor/icu_normalizer/src/properties.rs b/vendor/icu_normalizer/src/properties.rs new file mode 100644 index 00000000..948780e1 --- /dev/null +++ b/vendor/icu_normalizer/src/properties.rs @@ -0,0 +1,663 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Access to the Unicode properties or property-based operations that +//! are required for NFC and NFD. +//! +//! Applications should generally use the full normalizers that are +//! 
provided at the top level of this crate. However, the APIs in this +//! module are provided for callers such as HarfBuzz that specifically +//! want access to the raw canonical composition operation e.g. for use in a +//! glyph-availability-guided custom normalizer. + +use crate::char_from_u16; +use crate::char_from_u32; +use crate::in_inclusive_range; +use crate::provider::CanonicalCompositions; +use crate::provider::DecompositionData; +use crate::provider::DecompositionTables; +use crate::provider::NonRecursiveDecompositionSupplement; +use crate::provider::NormalizerNfcV1; +use crate::provider::NormalizerNfdDataV1; +use crate::provider::NormalizerNfdSupplementV1; +use crate::provider::NormalizerNfdTablesV1; +use crate::trie_value_has_ccc; +use crate::CanonicalCombiningClass; +use crate::BACKWARD_COMBINING_MARKER; +use crate::FDFA_MARKER; +use crate::HANGUL_L_BASE; +use crate::HANGUL_N_COUNT; +use crate::HANGUL_S_BASE; +use crate::HANGUL_S_COUNT; +use crate::HANGUL_T_BASE; +use crate::HANGUL_T_COUNT; +use crate::HANGUL_V_BASE; +use crate::HIGH_ZEROS_MASK; +use crate::LOW_ZEROS_MASK; +use crate::NON_ROUND_TRIP_MARKER; +use icu_provider::prelude::*; + +/// Borrowed version of the raw canonical composition operation. +/// +/// Callers should generally use `ComposingNormalizer` instead of this API. +/// However, this API is provided for callers such as HarfBuzz that specifically +/// want access to the raw canonical composition operation e.g. for use in a +/// glyph-availability-guided custom normalizer. +#[derive(Debug, Copy, Clone)] +pub struct CanonicalCompositionBorrowed<'a> { + canonical_compositions: &'a CanonicalCompositions<'a>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalCompositionBorrowed<'static> { + fn default() -> Self { + Self::new() + } +} + +impl CanonicalCompositionBorrowed<'static> { + /// Cheaply converts a [`CanonicalCompositionBorrowed<'static>`] into a [`CanonicalComposition`]. 
+ /// + /// Note: Due to branching and indirection, using [`CanonicalComposition`] might inhibit some + /// compile-time optimizations that are possible with [`CanonicalCompositionBorrowed`]. + pub const fn static_to_owned(self) -> CanonicalComposition { + CanonicalComposition { + canonical_compositions: DataPayload::from_static_ref(self.canonical_compositions), + } + } + + /// Constructs a new `CanonicalComposition` using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new() -> Self { + Self { + canonical_compositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFC_V1, + } + } +} + +impl CanonicalCompositionBorrowed<'_> { + /// Performs canonical composition (including Hangul) on a pair of + /// characters or returns `None` if these characters don't compose. + /// Composition exclusions are taken into account. + /// + /// # Examples + /// + /// ``` + /// let comp = icu::normalizer::properties::CanonicalCompositionBorrowed::new(); + /// + /// assert_eq!(comp.compose('a', 'b'), None); // Just two non-composing starters + /// assert_eq!(comp.compose('a', '\u{0308}'), Some('ä')); + /// assert_eq!(comp.compose('ẹ', '\u{0302}'), Some('ệ')); + /// assert_eq!(comp.compose('𝅗', '𝅥'), None); // Composition exclusion + /// assert_eq!(comp.compose('ে', 'া'), Some('ো')); // Second is starter + /// assert_eq!(comp.compose('ᄀ', 'ᅡ'), Some('가')); // Hangul LV + /// assert_eq!(comp.compose('가', 'ᆨ'), Some('각')); // Hangul LVT + /// ``` + #[inline(always)] + pub fn compose(self, starter: char, second: char) -> Option<char> { + crate::compose( + self.canonical_compositions.canonical_compositions.iter(), + starter, + second, + ) + } +} + +/// The raw canonical composition operation. +/// +/// Callers should generally use `ComposingNormalizer` instead of this API. 
+/// However, this API is provided for callers such as HarfBuzz that specifically +/// want access to the raw canonical composition operation e.g. for use in a +/// glyph-availability-guided custom normalizer. +#[derive(Debug)] +pub struct CanonicalComposition { + canonical_compositions: DataPayload<NormalizerNfcV1>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalComposition { + fn default() -> Self { + Self::new().static_to_owned() + } +} + +impl CanonicalComposition { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> CanonicalCompositionBorrowed<'_> { + CanonicalCompositionBorrowed { + canonical_compositions: self.canonical_compositions.get(), + } + } + + /// Constructs a new `CanonicalCompositionBorrowed` using compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + #[allow(clippy::new_ret_no_self)] + pub const fn new() -> CanonicalCompositionBorrowed<'static> { + CanonicalCompositionBorrowed::new() + } + + icu_provider::gen_buffer_data_constructors!(() -> error: DataError, + functions: [ + new: skip, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] + pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfcV1> + ?Sized, + { + let canonical_compositions: DataPayload<NormalizerNfcV1> = + provider.load(Default::default())?.payload; + Ok(CanonicalComposition { + canonical_compositions, + }) + } +} + +/// The outcome of non-recursive canonical decomposition of a character. +#[allow(clippy::exhaustive_enums)] +#[derive(Debug, PartialEq, Eq)] +pub enum Decomposed { + /// The character is its own canonical decomposition. + Default, + /// The character decomposes to a single different character. 
+ Singleton(char), + /// The character decomposes to two characters. + Expansion(char, char), +} + +/// Borrowed version of the raw (non-recursive) canonical decomposition operation. +/// +/// Callers should generally use `DecomposingNormalizer` instead of this API. +/// However, this API is provided for callers such as HarfBuzz that specifically +/// want access to non-recursive canonical decomposition e.g. for use in a +/// glyph-availability-guided custom normalizer. +#[derive(Debug)] +pub struct CanonicalDecompositionBorrowed<'a> { + decompositions: &'a DecompositionData<'a>, + tables: &'a DecompositionTables<'a>, + non_recursive: &'a NonRecursiveDecompositionSupplement<'a>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalDecompositionBorrowed<'static> { + fn default() -> Self { + Self::new() + } +} + +impl CanonicalDecompositionBorrowed<'static> { + /// Cheaply converts a [`CanonicalDecompositionBorrowed<'static>`] into a [`CanonicalDecomposition`]. + /// + /// Note: Due to branching and indirection, using [`CanonicalDecomposition`] might inhibit some + /// compile-time optimizations that are possible with [`CanonicalDecompositionBorrowed`]. + pub const fn static_to_owned(self) -> CanonicalDecomposition { + CanonicalDecomposition { + decompositions: DataPayload::from_static_ref(self.decompositions), + tables: DataPayload::from_static_ref(self.tables), + non_recursive: DataPayload::from_static_ref(self.non_recursive), + } + } + + /// Construct from compiled data. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new() -> Self { + const _: () = assert!( + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars16 + .const_len() + + crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1 + .scalars24 + .const_len() + <= 0xFFF, + "future extension" + ); + + Self { + decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, + tables: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_TABLES_V1, + non_recursive: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_SUPPLEMENT_V1, + } + } +} + +impl CanonicalDecompositionBorrowed<'_> { + /// Performs non-recursive canonical decomposition (including for Hangul). + /// + /// Hangul syllables are decomposed arithmetically: a syllable without a trailing + /// consonant yields its leading consonant and vowel jamo, and a syllable with a + /// trailing consonant yields the syllable without it plus the trailing jamo. + /// Every other character is delegated to the trie-based lookup. + /// + /// # Examples + /// + /// ``` + /// use icu::normalizer::properties::Decomposed; + /// let decomp = icu::normalizer::properties::CanonicalDecompositionBorrowed::new(); + /// + /// assert_eq!(decomp.decompose('e'), Decomposed::Default); + /// assert_eq!( + /// decomp.decompose('ệ'), + /// Decomposed::Expansion('ẹ', '\u{0302}') + /// ); + /// assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ')); + /// assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN + /// assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN + /// assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia + /// ``` + #[inline] + pub fn decompose(&self, c: char) -> Decomposed { + let lvt = u32::from(c).wrapping_sub(HANGUL_S_BASE); + if lvt >= HANGUL_S_COUNT { + return self.decompose_non_hangul(c); + } + // Invariant: lvt < HANGUL_S_COUNT (larger values took the early return above). + // NOTE(review): HANGUL_S_COUNT is presumably 11172 (the Unicode Standard's SCount) -- + // confirm against the constant's definition. + let t = lvt % HANGUL_T_COUNT; + // Invariant: t < HANGUL_T_COUNT (it is a remainder modulo HANGUL_T_COUNT) + if t == 0 { + let l = lvt / HANGUL_N_COUNT; + // Invariant: l <= (HANGUL_S_COUNT - 1) / HANGUL_N_COUNT, because lvt < HANGUL_S_COUNT + let v = (lvt % HANGUL_N_COUNT) / HANGUL_T_COUNT; + // Invariant: v < 
(HANGUL_N_COUNT / HANGUL_T_COUNT = 588 / 28 = 21) + return Decomposed::Expansion( + // SAFETY: `l` and `v` are small per the invariants above, so adding them to + // HANGUL_L_BASE / HANGUL_V_BASE (0x1nnn values) keeps the results below 0xD800, + // i.e. below the surrogate range. + unsafe { char::from_u32_unchecked(HANGUL_L_BASE + l) }, + unsafe { char::from_u32_unchecked(HANGUL_V_BASE + v) }, + ); + } + let lv = lvt - t; + // Invariant: lv <= lvt < HANGUL_S_COUNT and 0 < t < HANGUL_T_COUNT + Decomposed::Expansion( + // SAFETY: `HANGUL_S_BASE + lv` lies between HANGUL_S_BASE and `c` itself + // (lv <= lvt), and `HANGUL_T_BASE + t` adds less than HANGUL_T_COUNT to a + // 0x1nnn base, so neither value reaches the surrogate range at 0xD800. + // NOTE(review): this presumes the whole Hangul syllable block ends below + // 0xD800 -- confirm against HANGUL_S_BASE / HANGUL_S_COUNT. + unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) }, + unsafe { char::from_u32_unchecked(HANGUL_T_BASE + t) }, + ) + } + + /// Performs non-recursive canonical decomposition except Hangul syllables + /// are reported as `Decomposed::Default`. + #[inline(always)] + fn decompose_non_hangul(&self, c: char) -> Decomposed { + let decomposition = self.decompositions.trie.get(c); + // The REPLACEMENT CHARACTER has `NON_ROUND_TRIP_MARKER` set, + // and that flag needs to be ignored here. + if (decomposition & !(BACKWARD_COMBINING_MARKER | NON_ROUND_TRIP_MARKER)) == 0 { + return Decomposed::Default; + } + // The loop is only broken out of as goto forward + #[allow(clippy::never_loop)] + loop { + let high_zeros = (decomposition & HIGH_ZEROS_MASK) == 0; + let low_zeros = (decomposition & LOW_ZEROS_MASK) == 0; + if !high_zeros && !low_zeros { + // Decomposition into two BMP characters: starter and non-starter + if in_inclusive_range(c, '\u{1F71}', '\u{1FFB}') { + // Look in the other trie due to oxia singleton + // mappings to corresponding character with tonos. 
+ break; + } + let starter = char_from_u32(decomposition & 0x7FFF); + let combining = char_from_u32((decomposition >> 15) & 0x7FFF); + return Decomposed::Expansion(starter, combining); + } + if high_zeros { + // Decomposition into one BMP character or non-starter + if trie_value_has_ccc(decomposition) { + // Non-starter + if !in_inclusive_range(c, '\u{0340}', '\u{0F81}') { + return Decomposed::Default; + } + return match c { + '\u{0340}' => { + // COMBINING GRAVE TONE MARK + Decomposed::Singleton('\u{0300}') + } + '\u{0341}' => { + // COMBINING ACUTE TONE MARK + Decomposed::Singleton('\u{0301}') + } + '\u{0343}' => { + // COMBINING GREEK KORONIS + Decomposed::Singleton('\u{0313}') + } + '\u{0344}' => { + // COMBINING GREEK DIALYTIKA TONOS + Decomposed::Expansion('\u{0308}', '\u{0301}') + } + '\u{0F73}' => { + // TIBETAN VOWEL SIGN II + Decomposed::Expansion('\u{0F71}', '\u{0F72}') + } + '\u{0F75}' => { + // TIBETAN VOWEL SIGN UU + Decomposed::Expansion('\u{0F71}', '\u{0F74}') + } + '\u{0F81}' => { + // TIBETAN VOWEL SIGN REVERSED II + Decomposed::Expansion('\u{0F71}', '\u{0F80}') + } + _ => Decomposed::Default, + }; + } + let singleton = decomposition as u16; + debug_assert_ne!( + singleton, FDFA_MARKER, + "How come we got the U+FDFA NFKD marker here?" + ); + return Decomposed::Singleton(char_from_u16(singleton)); + } + if c == '\u{212B}' { + // ANGSTROM SIGN + return Decomposed::Singleton('\u{00C5}'); + } + // Only 12 of 14 bits used as of Unicode 16. + let offset = (((decomposition & !(0b11 << 30)) >> 16) as usize) - 1; + // Only 3 of 4 bits used as of Unicode 16. + let len_bits = decomposition & 0b1111; + let tables = self.tables; + if offset < tables.scalars16.len() { + if len_bits != 0 { + // i.e. 
logical len isn't 2 + break; + } + if let Some(first) = tables.scalars16.get(offset) { + if let Some(second) = tables.scalars16.get(offset + 1) { + // Two BMP starters + return Decomposed::Expansion(char_from_u16(first), char_from_u16(second)); + } + } + // GIGO case + debug_assert!(false); + return Decomposed::Default; + } + let len = len_bits + 1; + if len > 2 { + break; + } + let offset24 = offset - tables.scalars16.len(); + if let Some(first_c) = tables.scalars24.get(offset24) { + if len == 1 { + return Decomposed::Singleton(first_c); + } + if let Some(second_c) = tables.scalars24.get(offset24 + 1) { + return Decomposed::Expansion(first_c, second_c); + } + } + // GIGO case + debug_assert!(false); + return Decomposed::Default; + } + let non_recursive = self.non_recursive; + let non_recursive_decomposition = non_recursive.trie.get(c); + if non_recursive_decomposition == 0 { + // GIGO case + debug_assert!(false); + return Decomposed::Default; + } + let trail_or_complex = (non_recursive_decomposition >> 16) as u16; + let lead = non_recursive_decomposition as u16; + if lead != 0 && trail_or_complex != 0 { + // Decomposition into two BMP characters + return Decomposed::Expansion(char_from_u16(lead), char_from_u16(trail_or_complex)); + } + if lead != 0 { + // Decomposition into one BMP character + return Decomposed::Singleton(char_from_u16(lead)); + } + // Decomposition into two non-BMP characters + // Low is offset into a table plus one to keep it non-zero. + let offset = usize::from(trail_or_complex - 1); + if let Some(first) = non_recursive.scalars24.get(offset) { + if let Some(second) = non_recursive.scalars24.get(offset + 1) { + return Decomposed::Expansion(first, second); + } + } + // GIGO case + debug_assert!(false); + Decomposed::Default + } +} + +/// The raw (non-recursive) canonical decomposition operation. +/// +/// Callers should generally use `DecomposingNormalizer` instead of this API. 
+/// However, this API is provided for callers such as HarfBuzz that specifically +/// want access to non-recursive canonical decomposition e.g. for use in a +/// glyph-availability-guided custom normalizer. +#[derive(Debug)] +pub struct CanonicalDecomposition { + decompositions: DataPayload<NormalizerNfdDataV1>, + tables: DataPayload<NormalizerNfdTablesV1>, + non_recursive: DataPayload<NormalizerNfdSupplementV1>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalDecomposition { + fn default() -> Self { + Self::new().static_to_owned() + } +} + +impl CanonicalDecomposition { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> CanonicalDecompositionBorrowed<'_> { + CanonicalDecompositionBorrowed { + decompositions: self.decompositions.get(), + tables: self.tables.get(), + non_recursive: self.non_recursive.get(), + } + } + + /// Construct from compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + #[allow(clippy::new_ret_no_self)] + pub const fn new() -> CanonicalDecompositionBorrowed<'static> { + CanonicalDecompositionBorrowed::new() + } + + icu_provider::gen_buffer_data_constructors!(() -> error: DataError, + functions: [ + new: skip, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ] + ); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] + pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfdDataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfdSupplementV1> + + ?Sized, + { + let decompositions: DataPayload<NormalizerNfdDataV1> = + provider.load(Default::default())?.payload; + let tables: DataPayload<NormalizerNfdTablesV1> = provider.load(Default::default())?.payload; + + if tables.get().scalars16.len() + tables.get().scalars24.len() > 0xFFF { + // 
The data is from a future where there exists a normalization flavor whose + // complex decompositions take more than 0xFFF but fewer than 0x1FFF code points + // of space. If a good use case from such a decomposition flavor arises, we can + // dynamically change the bit masks so that the length mask becomes 0x1FFF instead + // of 0xFFF and the all-non-starters mask becomes 0 instead of 0x1000. However, + // since for now the masks are hard-coded, error out. + return Err(DataError::custom("future extension")); + } + + let non_recursive: DataPayload<NormalizerNfdSupplementV1> = + provider.load(Default::default())?.payload; + + Ok(CanonicalDecomposition { + decompositions, + tables, + non_recursive, + }) + } +} + +/// Borrowed version of lookup of the Canonical_Combining_Class Unicode property. +/// +/// # Example +/// +/// ``` +/// use icu::properties::props::CanonicalCombiningClass; +/// use icu::normalizer::properties::CanonicalCombiningClassMapBorrowed; +/// +/// let map = CanonicalCombiningClassMapBorrowed::new(); +/// assert_eq!(map.get('a'), CanonicalCombiningClass::NotReordered); // U+0061: LATIN SMALL LETTER A +/// assert_eq!(map.get32(0x0301), CanonicalCombiningClass::Above); // U+0301: COMBINING ACUTE ACCENT +/// ``` +#[derive(Debug)] +pub struct CanonicalCombiningClassMapBorrowed<'a> { + /// The data trie + decompositions: &'a DecompositionData<'a>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalCombiningClassMapBorrowed<'static> { + fn default() -> Self { + Self::new() + } +} + +impl CanonicalCombiningClassMapBorrowed<'static> { + /// Cheaply converts a [`CanonicalCombiningClassMapBorrowed<'static>`] into a [`CanonicalCombiningClassMap`]. + /// + /// Note: Due to branching and indirection, using [`CanonicalCombiningClassMap`] might inhibit some + /// compile-time optimizations that are possible with [`CanonicalCombiningClassMapBorrowed`]. 
+ pub const fn static_to_owned(self) -> CanonicalCombiningClassMap { + CanonicalCombiningClassMap { + decompositions: DataPayload::from_static_ref(self.decompositions), + } + } + + /// Construct from compiled data. + /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + pub const fn new() -> Self { + CanonicalCombiningClassMapBorrowed { + decompositions: crate::provider::Baked::SINGLETON_NORMALIZER_NFD_DATA_V1, + } + } +} + +impl CanonicalCombiningClassMapBorrowed<'_> { + /// Look up the canonical combining class for a scalar value. + /// + /// The return value is a u8 representing the canonical combining class, + /// you may enable the `"icu_properties"` feature if you would like to use a typed + /// `CanonicalCombiningClass`. + #[inline(always)] + pub fn get_u8(&self, c: char) -> u8 { + self.get32_u8(u32::from(c)) + } + + /// Look up the canonical combining class for a scalar value + /// represented as `u32`. If the argument is outside the scalar + /// value range, `Not_Reordered` is returned. + /// + /// The return value is a u8 representing the canonical combining class, + /// you may enable the `"icu_properties"` feature if you would like to use a typed + /// `CanonicalCombiningClass`. + pub fn get32_u8(&self, c: u32) -> u8 { + let trie_value = self.decompositions.trie.get32(c); + if trie_value_has_ccc(trie_value) { + trie_value as u8 + } else { + ccc!(NotReordered, 0).to_icu4c_value() + } + } + + /// Look up the canonical combining class for a scalar value + /// + /// ✨ *Enabled with the `icu_properties` Cargo feature.* + #[inline(always)] + #[cfg(feature = "icu_properties")] + pub fn get(&self, c: char) -> CanonicalCombiningClass { + CanonicalCombiningClass::from_icu4c_value(self.get_u8(c)) + } + + /// Look up the canonical combining class for a scalar value + /// represented as `u32`. 
If the argument is outside the scalar + /// value range, `CanonicalCombiningClass::NotReordered` is returned. + /// + /// ✨ *Enabled with the `icu_properties` Cargo feature.* + #[cfg(feature = "icu_properties")] + pub fn get32(&self, c: u32) -> CanonicalCombiningClass { + CanonicalCombiningClass::from_icu4c_value(self.get32_u8(c)) + } +} + +/// Lookup of the Canonical_Combining_Class Unicode property. +#[derive(Debug)] +pub struct CanonicalCombiningClassMap { + /// The data trie + decompositions: DataPayload<NormalizerNfdDataV1>, +} + +#[cfg(feature = "compiled_data")] +impl Default for CanonicalCombiningClassMap { + fn default() -> Self { + Self::new().static_to_owned() + } +} + +impl CanonicalCombiningClassMap { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> CanonicalCombiningClassMapBorrowed<'_> { + CanonicalCombiningClassMapBorrowed { + decompositions: self.decompositions.get(), + } + } + + /// Construct from compiled data. 
+ /// + /// ✨ *Enabled with the `compiled_data` Cargo feature.* + /// + /// [📚 Help choosing a constructor](icu_provider::constructors) + #[cfg(feature = "compiled_data")] + #[allow(clippy::new_ret_no_self)] + pub const fn new() -> CanonicalCombiningClassMapBorrowed<'static> { + CanonicalCombiningClassMapBorrowed::new() + } + + icu_provider::gen_buffer_data_constructors!(() -> error: DataError, + functions: [ + new: skip, + try_new_with_buffer_provider, + try_new_unstable, + Self, + ]); + + #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] + pub fn try_new_unstable<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerNfdDataV1> + ?Sized, + { + let decompositions: DataPayload<NormalizerNfdDataV1> = + provider.load(Default::default())?.payload; + Ok(CanonicalCombiningClassMap { decompositions }) + } +} diff --git a/vendor/icu_normalizer/src/provider.rs b/vendor/icu_normalizer/src/provider.rs new file mode 100644 index 00000000..9502f016 --- /dev/null +++ b/vendor/icu_normalizer/src/provider.rs @@ -0,0 +1,216 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component. +//! +//! <div class="stab unstable"> +//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +//! including in SemVer minor releases. While the serde representation of data structs is guaranteed +//! to be stable, their Rust representation might not be. Use with caution. +//! </div> +//! +//! 
Read more about data providers: [`icu_provider`] + +// Provider structs must be stable +#![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)] + +use icu_collections::char16trie::Char16Trie; +use icu_collections::codepointtrie::CodePointTrie; +use icu_provider::prelude::*; +use zerovec::ZeroVec; + +#[cfg(feature = "compiled_data")] +#[derive(Debug)] +/// Baked data +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. In particular, the `DataProvider` implementations are only +/// guaranteed to match with this version's `*_unstable` providers. Use with caution. +/// </div> +pub struct Baked; + +#[cfg(feature = "compiled_data")] +#[allow(unused_imports)] +const _: () = { + use icu_normalizer_data::*; + pub mod icu { + pub use crate as normalizer; + pub use icu_collections as collections; + } + make_provider!(Baked); + impl_normalizer_nfc_v1!(Baked); + impl_normalizer_nfd_data_v1!(Baked); + impl_normalizer_nfd_supplement_v1!(Baked); + impl_normalizer_nfd_tables_v1!(Baked); + impl_normalizer_nfkd_data_v1!(Baked); + impl_normalizer_nfkd_tables_v1!(Baked); + impl_normalizer_uts46_data_v1!(Baked); +}; + +icu_provider::data_marker!( + /// Marker for data for canonical decomposition. + NormalizerNfdDataV1, + "normalizer/nfd/data/v1", + DecompositionData<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for additional data for canonical decomposition. + NormalizerNfdTablesV1, + "normalizer/nfd/tables/v1", + DecompositionTables<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for data for compatibility decomposition. + NormalizerNfkdDataV1, + "normalizer/nfkd/data/v1", + DecompositionData<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for additional data for compatibility decomposition. 
+ NormalizerNfkdTablesV1, + "normalizer/nfkd/tables/v1", + DecompositionTables<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for data for UTS-46 decomposition. + NormalizerUts46DataV1, + "normalizer/uts46/data/v1", + DecompositionData<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for data for composition. + NormalizerNfcV1, + "normalizer/nfc/v1", + CanonicalCompositions<'static>, + is_singleton = true +); +icu_provider::data_marker!( + /// Marker for additional data for non-recursive decomposition. + NormalizerNfdSupplementV1, + "normalizer/nfd/supplement/v1", + NonRecursiveDecompositionSupplement<'static>, + is_singleton = true +); + +#[cfg(feature = "datagen")] +/// The latest minimum set of markers required by this component. +pub const MARKERS: &[DataMarkerInfo] = &[ + NormalizerNfcV1::INFO, + NormalizerNfdDataV1::INFO, + NormalizerNfdTablesV1::INFO, + NormalizerNfkdDataV1::INFO, + NormalizerNfkdTablesV1::INFO, + NormalizerNfdSupplementV1::INFO, + NormalizerUts46DataV1::INFO, +]; + +/// Decomposition data +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct DecompositionData<'data> { + /// Trie for decomposition. + #[cfg_attr(feature = "serde", serde(borrow))] + pub trie: CodePointTrie<'data, u32>, + /// The passthrough bounds of NFD/NFC are lowered to this + /// maximum instead. 
(16-bit, because cannot be higher + /// than 0x0300, which is the bound for NFC.) + pub passthrough_cap: u16, +} + +icu_provider::data_struct!( + DecompositionData<'_>, + #[cfg(feature = "datagen")] +); + +/// The expansion tables for cases where the decomposition isn't +/// contained in the trie value +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. +/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct DecompositionTables<'data> { + /// Decompositions that are fully within the BMP + #[cfg_attr(feature = "serde", serde(borrow))] + pub scalars16: ZeroVec<'data, u16>, + /// Decompositions with at least one character outside + /// the BMP + #[cfg_attr(feature = "serde", serde(borrow))] + pub scalars24: ZeroVec<'data, char>, +} + +icu_provider::data_struct!( + DecompositionTables<'_>, + #[cfg(feature = "datagen")] +); + +/// Non-Hangul canonical compositions +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct CanonicalCompositions<'data> { + /// Trie keys are two-`char` strings with the second + /// character coming first. The value, if any, is the + /// (non-Hangul) canonical composition. + #[cfg_attr(feature = "serde", serde(borrow))] + pub canonical_compositions: Char16Trie<'data>, +} + +icu_provider::data_struct!( + CanonicalCompositions<'_>, + #[cfg(feature = "datagen")] +); + +/// Non-recursive canonical decompositions that differ from +/// `DecompositionData`. +/// +/// <div class="stab unstable"> +/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, +/// including in SemVer minor releases. While the serde representation of data structs is guaranteed +/// to be stable, their Rust representation might not be. Use with caution. 
+/// </div> +#[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)] +#[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))] +#[cfg_attr(feature = "datagen", databake(path = icu_normalizer::provider))] +#[cfg_attr(feature = "serde", derive(serde::Deserialize))] +pub struct NonRecursiveDecompositionSupplement<'data> { + /// Trie for the supplementary non-recursive decompositions + #[cfg_attr(feature = "serde", serde(borrow))] + pub trie: CodePointTrie<'data, u32>, + /// Decompositions with at least one character outside + /// the BMP + #[cfg_attr(feature = "serde", serde(borrow))] + pub scalars24: ZeroVec<'data, char>, +} + +icu_provider::data_struct!( + NonRecursiveDecompositionSupplement<'_>, + #[cfg(feature = "datagen")] +); diff --git a/vendor/icu_normalizer/src/uts46.rs b/vendor/icu_normalizer/src/uts46.rs new file mode 100644 index 00000000..672f5c5c --- /dev/null +++ b/vendor/icu_normalizer/src/uts46.rs @@ -0,0 +1,177 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! Bundles the part of UTS 46 that makes sense to implement as a +//! normalization. +//! +//! This is meant to be used as a building block of an UTS 46 +//! implementation, such as the `idna` crate. + +use crate::ComposingNormalizer; +use crate::ComposingNormalizerBorrowed; +use crate::NormalizerNfcV1; +use crate::NormalizerNfdTablesV1; +use crate::NormalizerNfkdTablesV1; +use crate::NormalizerUts46DataV1; +use icu_provider::DataError; +use icu_provider::DataProvider; + +// Implementation note: Despite merely wrapping a `ComposingNormalizer`, +// having a `Uts46Mapper` serves two purposes: +// +// 1. Denying public access to parts of the `ComposingNormalizer` API +// that don't work when the data contains markers for ignorables. +// 2. 
Providing a place where additional iterator pre-processing or +// post-processing can take place if needed in the future. (When +// writing this, it looked like such processing was needed but +// now isn't needed after all.) + +/// A borrowed version of a mapper that knows how to perform the +/// subsets of UTS 46 processing documented on the methods. +#[derive(Debug)] +pub struct Uts46MapperBorrowed<'a> { + normalizer: ComposingNormalizerBorrowed<'a>, +} + +#[cfg(feature = "compiled_data")] +impl Default for Uts46MapperBorrowed<'static> { + fn default() -> Self { + Self::new() + } +} + +impl Uts46MapperBorrowed<'static> { + /// Cheaply converts a [`Uts46MapperBorrowed<'static>`] into a [`Uts46Mapper`]. + /// + /// Note: Due to branching and indirection, using [`Uts46Mapper`] might inhibit some + /// compile-time optimizations that are possible with [`Uts46MapperBorrowed`]. + pub const fn static_to_owned(self) -> Uts46Mapper { + Uts46Mapper { + normalizer: self.normalizer.static_to_owned(), + } + } + + /// Construct with compiled data. + #[cfg(feature = "compiled_data")] + pub const fn new() -> Self { + Uts46MapperBorrowed { + normalizer: ComposingNormalizerBorrowed::new_uts46(), + } + } +} + +impl Uts46MapperBorrowed<'_> { + /// Returns an iterator adaptor that turns an `Iterator` over `char` + /// into an iterator yielding a `char` sequence that gets the following + /// operations from the "Map" and "Normalize" steps of the "Processing" + /// section of UTS 46 lazily applied to it: + /// + /// 1. The _ignored_ characters are ignored. + /// 2. The _mapped_ characters are mapped. + /// 3. The _disallowed_ characters are replaced with U+FFFD, + /// which itself is a disallowed character. + /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ + /// as appropriate. + /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. + /// 6. The _disallowed_STD3_mapped_ characters are treated as + /// _mapped_. + /// 7. 
The result is normalized to NFC. + /// + /// Notably: + /// + /// * The STD3 or WHATWG ASCII deny list should be implemented as a + /// post-processing step. + /// * Transitional processing is not performed. Transitional mapping + /// would be a pre-processing step, but transitional processing is + /// deprecated, and none of Firefox, Safari, or Chrome use it. + pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( + &'delegate self, + iter: I, + ) -> impl Iterator<Item = char> + 'delegate { + self.normalizer + .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) + } + + /// Returns an iterator adaptor that turns an `Iterator` over `char` + /// into an iterator yielding a `char` sequence that gets the following + /// operations from the NFC check and status steps of the "Validity + /// Criteria" section of UTS 46 lazily applied to it: + /// + /// 1. The _ignored_ characters are treated as _disallowed_. + /// 2. The _mapped_ characters are mapped. + /// 3. The _disallowed_ characters are replaced with U+FFFD, + /// which itself is a disallowed character. + /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ + /// as appropriate. + /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. + /// 6. The _disallowed_STD3_mapped_ characters are treated as + /// _mapped_. + /// 7. The result is normalized to NFC. + /// + /// Notably: + /// + /// * The STD3 or WHATWG ASCII deny list should be implemented as a + /// post-processing step. + /// * Transitional processing is not performed. Transitional mapping + /// would be a pre-processing step, but transitional processing is + /// deprecated, and none of Firefox, Safari, or Chrome use it. + /// * The output needs to be compared with input to see if anything + /// changed. This check catches failures to adhere to the normalization + /// and status requirements. 
In particular, this comparison results + /// in _mapped_ characters resulting in error like "Validity Criteria" + /// requires. + pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( + &'delegate self, + iter: I, + ) -> impl Iterator<Item = char> + 'delegate { + self.normalizer + .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) + } +} + +/// A mapper that knows how to perform the subsets of UTS 46 processing +/// documented on the methods. +#[derive(Debug)] +pub struct Uts46Mapper { + normalizer: ComposingNormalizer, +} + +#[cfg(feature = "compiled_data")] +impl Default for Uts46Mapper { + fn default() -> Self { + Self::new().static_to_owned() + } +} + +impl Uts46Mapper { + /// Constructs a borrowed version of this type for more efficient querying. + pub fn as_borrowed(&self) -> Uts46MapperBorrowed<'_> { + Uts46MapperBorrowed { + normalizer: self.normalizer.as_borrowed(), + } + } + + /// Construct with compiled data. + #[cfg(feature = "compiled_data")] + #[allow(clippy::new_ret_no_self)] + pub const fn new() -> Uts46MapperBorrowed<'static> { + Uts46MapperBorrowed::new() + } + + /// Construct with provider. 
+ #[doc = icu_provider::gen_buffer_unstable_docs!(UNSTABLE, Self::new)] + pub fn try_new<D>(provider: &D) -> Result<Self, DataError> + where + D: DataProvider<NormalizerUts46DataV1> + + DataProvider<NormalizerNfdTablesV1> + + DataProvider<NormalizerNfkdTablesV1> + // UTS 46 tables merged into NormalizerNfkdTablesV1 + + DataProvider<NormalizerNfcV1> + + ?Sized, + { + let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; + + Ok(Uts46Mapper { normalizer }) + } +} diff --git a/vendor/icu_normalizer/tests/data/NormalizationTest.txt b/vendor/icu_normalizer/tests/data/NormalizationTest.txt new file mode 100644 index 00000000..0d224b05 --- /dev/null +++ b/vendor/icu_normalizer/tests/data/NormalizationTest.txt @@ -0,0 +1,4 @@ +# This is a placeholder in the interest of keeping the repository size smaller. +# Replace this file with the contents of +# https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt to actually +# run the conformance test. diff --git a/vendor/icu_normalizer/tests/data/README.md b/vendor/icu_normalizer/tests/data/README.md new file mode 100644 index 00000000..8d407e46 --- /dev/null +++ b/vendor/icu_normalizer/tests/data/README.md @@ -0,0 +1,2 @@ +The test data comes from +https://www.unicode.org/Public/UCD/latest/ucd/NormalizationTest.txt diff --git a/vendor/icu_normalizer/tests/tests.rs b/vendor/icu_normalizer/tests/tests.rs new file mode 100644 index 00000000..5e6d8770 --- /dev/null +++ b/vendor/icu_normalizer/tests/tests.rs @@ -0,0 +1,2083 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use icu_normalizer::properties::CanonicalCombiningClassMap; +use icu_normalizer::properties::CanonicalCombiningClassMapBorrowed; +use icu_normalizer::properties::CanonicalComposition; +use icu_normalizer::properties::CanonicalCompositionBorrowed; +use icu_normalizer::properties::CanonicalDecomposition; +use icu_normalizer::properties::CanonicalDecompositionBorrowed; +use icu_normalizer::properties::Decomposed; +use icu_normalizer::uts46::Uts46Mapper; +use icu_normalizer::uts46::Uts46MapperBorrowed; +use icu_normalizer::ComposingNormalizer; +use icu_normalizer::ComposingNormalizerBorrowed; +use icu_normalizer::DecomposingNormalizer; +use icu_normalizer::DecomposingNormalizerBorrowed; + +#[test] +fn test_nfd_basic() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + assert_eq!(normalizer.normalize("ä"), "a\u{0308}"); + assert_eq!(normalizer.normalize("Ä"), "A\u{0308}"); + assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width unchanged + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width unchanged + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature unchanged + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged + assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfd_owned() { + let owned = + DecomposingNormalizer::try_new_nfd_unstable(&icu_normalizer::provider::Baked).unwrap(); + let normalizer = owned.as_borrowed(); + assert_eq!(normalizer.normalize("ä"), "a\u{0308}"); + assert_eq!(normalizer.normalize("Ä"), "A\u{0308}"); + assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}"); 
+ assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width unchanged + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width unchanged + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature unchanged + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged + assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfkd_basic() { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + assert_eq!(normalizer.normalize("ä"), "a\u{0308}"); + assert_eq!(normalizer.normalize("Ä"), "A\u{0308}"); + assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("Ệ"), "E\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ヘ\u{3099}"); // half-width to full-width + assert_eq!(normalizer.normalize("ペ"), "ヘ\u{309A}"); // half-width to full-width + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature expanded + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{635}\u{644}\u{649} \u{627}\u{644}\u{644}\u{647} \u{639}\u{644}\u{64A}\u{647} \u{648}\u{633}\u{644}\u{645}"); + // ligature expanded + assert_eq!(normalizer.normalize("㈎"), "(\u{1100}\u{1161})"); // parenthetical expanded + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfkd_owned() { + let owned = + DecomposingNormalizer::try_new_nfkd_unstable(&icu_normalizer::provider::Baked).unwrap(); + let normalizer = owned.as_borrowed(); + assert_eq!(normalizer.normalize("ä"), "a\u{0308}"); + assert_eq!(normalizer.normalize("Ä"), "A\u{0308}"); + assert_eq!(normalizer.normalize("ệ"), "e\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("Ệ"), 
"E\u{0323}\u{0302}"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ヘ\u{3099}"); // half-width to full-width + assert_eq!(normalizer.normalize("ペ"), "ヘ\u{309A}"); // half-width to full-width + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature expanded + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{635}\u{644}\u{649} \u{627}\u{644}\u{644}\u{647} \u{639}\u{644}\u{64A}\u{647} \u{648}\u{633}\u{644}\u{645}"); + // ligature expanded + assert_eq!(normalizer.normalize("㈎"), "(\u{1100}\u{1161})"); // parenthetical expanded + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfc_basic() { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + assert_eq!(normalizer.normalize("a\u{0308}"), "ä"); + assert_eq!(normalizer.normalize("A\u{0308}"), "Ä"); + assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ"); + assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); // Composition exclusion + + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width unchanged + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width unchanged + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature unchanged + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged + assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfc_owned() { + let owned = + ComposingNormalizer::try_new_nfc_unstable(&icu_normalizer::provider::Baked).unwrap(); + let normalizer = owned.as_borrowed(); + assert_eq!(normalizer.normalize("a\u{0308}"), "ä"); + assert_eq!(normalizer.normalize("A\u{0308}"), "Ä"); + assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ"); + 
assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); // Composition exclusion + + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width unchanged + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width unchanged + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature unchanged + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{FDFA}"); // ligature unchanged + assert_eq!(normalizer.normalize("㈎"), "㈎"); // parenthetical unchanged + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfkc_basic() { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + assert_eq!(normalizer.normalize("a\u{0308}"), "ä"); + assert_eq!(normalizer.normalize("A\u{0308}"), "Ä"); + assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ"); + assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); // Composition exclusion + + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width to full-width, the compose + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width to full-width, the compose + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature expanded + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}"); + // ligature expanded + assert_eq!(normalizer.normalize("㈎"), "(가)"); // parenthetical expanded and partially recomposed + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_nfkc_owned() { + let owned = + ComposingNormalizer::try_new_nfkc_unstable(&icu_normalizer::provider::Baked).unwrap(); + let normalizer = owned.as_borrowed(); + assert_eq!(normalizer.normalize("a\u{0308}"), "ä"); + 
assert_eq!(normalizer.normalize("A\u{0308}"), "Ä"); + assert_eq!(normalizer.normalize("e\u{0323}\u{0302}"), "ệ"); + assert_eq!(normalizer.normalize("E\u{0323}\u{0302}"), "Ệ"); + assert_eq!(normalizer.normalize("𝅗𝅥"), "𝅗\u{1D165}"); // Composition exclusion + + assert_eq!(normalizer.normalize("\u{2126}"), "Ω"); // ohm sign + assert_eq!(normalizer.normalize("ベ"), "ベ"); // half-width to full-width, the compose + assert_eq!(normalizer.normalize("ペ"), "ペ"); // half-width to full-width, the compose + assert_eq!(normalizer.normalize("fi"), "fi"); // ligature expanded + assert_eq!(normalizer.normalize("\u{FDFA}"), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}"); + // ligature expanded + assert_eq!(normalizer.normalize("㈎"), "(가)"); // parenthetical expanded and partially recomposed + assert_eq!(normalizer.normalize("\u{0345}"), "\u{0345}"); // Iota subscript +} + +#[test] +fn test_uts46_map_normalize() { + let mapper = Uts46MapperBorrowed::new(); + assert_eq!( + mapper + .map_normalize("a\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .map_normalize("A\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .map_normalize("e\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper + .map_normalize("E\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper.map_normalize("𝅗𝅥".chars()).collect::<String>(), + "𝅗\u{1D165}" + ); // Composition exclusion + + assert_eq!( + mapper.map_normalize("\u{2126}".chars()).collect::<String>(), + "ω" + ); // ohm sign + assert_eq!(mapper.map_normalize("ベ".chars()).collect::<String>(), "ベ"); // half-width to full-width, the compose + assert_eq!(mapper.map_normalize("ペ".chars()).collect::<String>(), "ペ"); // half-width to full-width, the compose + assert_eq!(mapper.map_normalize("fi".chars()).collect::<String>(), "fi"); // ligature expanded + 
assert_eq!(mapper.map_normalize("\u{FDFA}".chars()).collect::<String>(), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}"); + // ligature expanded + assert_eq!( + mapper.map_normalize("㈎".chars()).collect::<String>(), + "(가)" + ); // parenthetical expanded and partially recomposed + + // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4) + assert_eq!( + mapper.map_normalize("\u{200C}".chars()).collect::<String>(), + "\u{200C}" + ); + assert_eq!( + mapper.map_normalize("\u{200D}".chars()).collect::<String>(), + "\u{200D}" + ); + assert_eq!(mapper.map_normalize("ß".chars()).collect::<String>(), "ß"); + assert_eq!(mapper.map_normalize("ς".chars()).collect::<String>(), "ς"); + + // Iota subscript + assert_eq!( + mapper.map_normalize("\u{0345}".chars()).collect::<String>(), + "ι" + ); + + // Disallowed + assert_eq!( + mapper.map_normalize("\u{061C}".chars()).collect::<String>(), + "\u{FFFD}" + ); + + // Ignored + assert_eq!( + mapper + .map_normalize("a\u{180B}b".chars()) + .collect::<String>(), + "ab" + ); +} + +#[test] +fn test_uts46_owned() { + let owned = Uts46Mapper::try_new(&icu_normalizer::provider::Baked).unwrap(); + let mapper = owned.as_borrowed(); + assert_eq!( + mapper + .map_normalize("a\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .map_normalize("A\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .map_normalize("e\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper + .map_normalize("E\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper.map_normalize("𝅗𝅥".chars()).collect::<String>(), + "𝅗\u{1D165}" + ); // Composition exclusion + + assert_eq!( + mapper.map_normalize("\u{2126}".chars()).collect::<String>(), + "ω" + ); // ohm sign + assert_eq!(mapper.map_normalize("ベ".chars()).collect::<String>(), "ベ"); // half-width to full-width, the compose + 
assert_eq!(mapper.map_normalize("ペ".chars()).collect::<String>(), "ペ"); // half-width to full-width, the compose + assert_eq!(mapper.map_normalize("fi".chars()).collect::<String>(), "fi"); // ligature expanded + assert_eq!(mapper.map_normalize("\u{FDFA}".chars()).collect::<String>(), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}"); + // ligature expanded + assert_eq!( + mapper.map_normalize("㈎".chars()).collect::<String>(), + "(가)" + ); // parenthetical expanded and partially recomposed + + // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4) + assert_eq!( + mapper.map_normalize("\u{200C}".chars()).collect::<String>(), + "\u{200C}" + ); + assert_eq!( + mapper.map_normalize("\u{200D}".chars()).collect::<String>(), + "\u{200D}" + ); + assert_eq!(mapper.map_normalize("ß".chars()).collect::<String>(), "ß"); + assert_eq!(mapper.map_normalize("ς".chars()).collect::<String>(), "ς"); + + // Iota subscript + assert_eq!( + mapper.map_normalize("\u{0345}".chars()).collect::<String>(), + "ι" + ); + + // Disallowed + assert_eq!( + mapper.map_normalize("\u{061C}".chars()).collect::<String>(), + "\u{FFFD}" + ); + + // Ignored + assert_eq!( + mapper + .map_normalize("a\u{180B}b".chars()) + .collect::<String>(), + "ab" + ); +} + +#[test] +fn test_uts46_normalize_validate() { + let mapper = Uts46MapperBorrowed::new(); + assert_eq!( + mapper + .normalize_validate("a\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .normalize_validate("A\u{0308}".chars()) + .collect::<String>(), + "ä" + ); + assert_eq!( + mapper + .normalize_validate("e\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper + .normalize_validate("E\u{0323}\u{0302}".chars()) + .collect::<String>(), + "ệ" + ); + assert_eq!( + mapper.normalize_validate("𝅗𝅥".chars()).collect::<String>(), + "𝅗\u{1D165}" + ); // Composition exclusion + + assert_eq!( + mapper + 
.normalize_validate("\u{2126}".chars()) + .collect::<String>(), + "ω" + ); // ohm sign + assert_eq!( + mapper.normalize_validate("ベ".chars()).collect::<String>(), + "ベ" + ); // half-width to full-width, the compose + assert_eq!( + mapper.normalize_validate("ペ".chars()).collect::<String>(), + "ペ" + ); // half-width to full-width, the compose + assert_eq!( + mapper.normalize_validate("fi".chars()).collect::<String>(), + "fi" + ); // ligature expanded + assert_eq!(mapper.normalize_validate("\u{FDFA}".chars()).collect::<String>(), "\u{0635}\u{0644}\u{0649} \u{0627}\u{0644}\u{0644}\u{0647} \u{0639}\u{0644}\u{064A}\u{0647} \u{0648}\u{0633}\u{0644}\u{0645}"); + // ligature expanded + assert_eq!( + mapper.normalize_validate("㈎".chars()).collect::<String>(), + "(가)" + ); // parenthetical expanded and partially recomposed + + // Deviations (UTS 46, 6 Mapping Table Derivation, Step 4) + assert_eq!( + mapper + .normalize_validate("\u{200C}".chars()) + .collect::<String>(), + "\u{200C}" + ); + assert_eq!( + mapper + .normalize_validate("\u{200D}".chars()) + .collect::<String>(), + "\u{200D}" + ); + assert_eq!( + mapper.normalize_validate("ß".chars()).collect::<String>(), + "ß" + ); + assert_eq!( + mapper.normalize_validate("ς".chars()).collect::<String>(), + "ς" + ); + + // Iota subscript + assert_eq!( + mapper + .normalize_validate("\u{0345}".chars()) + .collect::<String>(), + "ι" + ); + + // Disallowed + assert_eq!( + mapper + .normalize_validate("\u{061C}".chars()) + .collect::<String>(), + "\u{FFFD}" + ); + + // Ignored + assert_eq!( + mapper + .normalize_validate("a\u{180B}b".chars()) + .collect::<String>(), + "a\u{FFFD}b" + ); +} + +type StackString = arraystring::ArrayString<arraystring::typenum::U48>; + +#[test] +fn test_nfd_str_to() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + + let mut buf = StackString::new(); + assert!(normalizer.normalize_to("ä", &mut buf).is_ok()); + assert_eq!(&buf, "a\u{0308}"); + + buf.clear(); + 
assert!(normalizer.normalize_to("ệ", &mut buf).is_ok()); + assert_eq!(&buf, "e\u{0323}\u{0302}"); +} + +#[test] +fn test_nfd_utf8_to() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + + let mut buf = StackString::new(); + assert!(normalizer + .normalize_utf8_to("ä".as_bytes(), &mut buf) + .is_ok()); + assert_eq!(&buf, "a\u{0308}"); + + buf.clear(); + assert!(normalizer + .normalize_utf8_to("ệ".as_bytes(), &mut buf) + .is_ok()); + assert_eq!(&buf, "e\u{0323}\u{0302}"); +} + +type StackVec = arrayvec::ArrayVec<u16, 32>; + +#[test] +fn test_nfd_utf16_to() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + + let mut buf = StackVec::new(); + assert!(normalizer + .normalize_utf16_to([0x00E4u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, [0x0061u16, 0x0308u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to([0x1EC7u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, [0x0065u16, 0x0323u16, 0x0302u16].as_slice()); +} + +#[test] +fn test_nfc_str_to() { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + + let mut buf = StackString::new(); + assert!(normalizer.normalize_to("a\u{0308}", &mut buf).is_ok()); + assert_eq!(&buf, "ä"); + + buf.clear(); + assert!(normalizer + .normalize_to("e\u{0323}\u{0302}", &mut buf) + .is_ok()); + assert_eq!(&buf, "ệ"); +} + +#[test] +fn test_nfc_utf8_to() { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + + let mut buf = StackString::new(); + assert!(normalizer + .normalize_utf8_to("a\u{0308}".as_bytes(), &mut buf) + .is_ok()); + assert_eq!(&buf, "ä"); + + buf.clear(); + assert!(normalizer + .normalize_utf8_to("e\u{0323}\u{0302}".as_bytes(), &mut buf) + .is_ok()); + assert_eq!(&buf, "ệ"); +} + +#[test] +fn test_nfc_utf16_to() { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + + let mut buf = StackVec::new(); + assert!(normalizer + .normalize_utf16_to([0x0061u16, 0x0308u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, 
/// NFC over ill-formed UTF-8: the call still succeeds, and the expected
/// outputs show each ill-formed sequence (a stray `0xFF`, a lone continuation
/// byte, a truncated three-byte sequence) appearing as one U+FFFD while the
/// well-formed text around it is composed as usual.
#[test]
fn test_nfc_utf8_to_errors() {
    let normalizer = ComposingNormalizerBorrowed::new_nfc();
    let cases: [(&[u8], &str); 4] = [
        (&b"\xFFa\xCC\x88\xFF"[..], "\u{FFFD}ä\u{FFFD}"),
        (&b"\x80e\xCC\xA3\xCC\x82\x80"[..], "\u{FFFD}ệ\u{FFFD}"),
        (&b"aaa\xFFaaa\xFFaaa"[..], "aaa\u{FFFD}aaa\u{FFFD}aaa"),
        (&b"aaa\xE2\x98aaa\xE2\x98aaa"[..], "aaa\u{FFFD}aaa\u{FFFD}aaa"),
    ];

    let mut buf = StackString::new();
    for &(input, expected) in &cases {
        // The sink is reused, so drop whatever the previous case wrote.
        buf.clear();
        let outcome = normalizer.normalize_utf8_to(input, &mut buf);
        assert!(outcome.is_ok());
        assert_eq!(&buf, expected);
    }
}

/// NFD over ill-formed UTF-8: the call still succeeds, and the expected
/// outputs show each ill-formed sequence appearing as one U+FFFD while the
/// well-formed text around it is decomposed as usual.
#[test]
fn test_nfd_utf8_to_errors() {
    let normalizer = DecomposingNormalizerBorrowed::new_nfd();
    let cases: [(&[u8], &str); 4] = [
        (&b"\xFF\xC3\xA4\xFF"[..], "\u{FFFD}a\u{0308}\u{FFFD}"),
        (
            &b"\x80\xE1\xBB\x87\x80"[..],
            "\u{FFFD}e\u{0323}\u{0302}\u{FFFD}",
        ),
        (&b"aaa\xFFaaa\xFFaaa"[..], "aaa\u{FFFD}aaa\u{FFFD}aaa"),
        (&b"aaa\xE2\x98aaa\xE2\x98aaa"[..], "aaa\u{FFFD}aaa\u{FFFD}aaa"),
    ];

    let mut buf = StackString::new();
    for &(input, expected) in &cases {
        // The sink is reused, so drop whatever the previous case wrote.
        buf.clear();
        let outcome = normalizer.normalize_utf8_to(input, &mut buf);
        assert!(outcome.is_ok());
        assert_eq!(&buf, expected);
    }
}
0x00E4u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to([0xDC00u16, 0x0061u16, 0x0308u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, [0xFFFDu16, 0x00E4u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0061u16, 0x0308u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!(&buf, [0x0061u16, 0xFFFDu16, 0x00E4u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0061u16, 0x0308u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!(&buf, [0x0061u16, 0xFFFDu16, 0x00E4u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0061u16, 0x0308u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x00E4u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0061u16, 0x0308u16, 0xDC00u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x00E4u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0061u16, 0x0061u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0061u16, 0x0061u16, 0xDC00u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0308u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0308u16, 0xDC00u16].as_slice(), + &mut buf + 
) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16].as_slice() + ); +} + +#[test] +fn test_nfd_utf16_to_errors() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + + let mut buf = StackVec::new(); + assert!(normalizer + .normalize_utf16_to([0xD800u16, 0x00E4u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, [0xFFFDu16, 0x0061u16, 0x0308u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to([0xDC00u16, 0x00E4u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!(&buf, [0xFFFDu16, 0x0061u16, 0x0308u16].as_slice()); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to([0x0061u16, 0xD800u16, 0x00E4u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to([0x0061u16, 0xDC00u16, 0x00E4u16].as_slice(), &mut buf) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x00E4u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x00E4u16, 0xDC00u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0308u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0061u16, 0x0061u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0061u16, 0x0061u16, 0xDC00u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0061u16, 0x0061u16, 
0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xD800u16, 0x0308u16, 0xD800u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16].as_slice() + ); + + buf.clear(); + assert!(normalizer + .normalize_utf16_to( + [0x0061u16, 0xDC00u16, 0x0308u16, 0xDC00u16].as_slice(), + &mut buf + ) + .is_ok()); + assert_eq!( + &buf, + [0x0061u16, 0xFFFDu16, 0x0308u16, 0xFFFDu16].as_slice() + ); +} + +use atoi::FromRadix16; +use icu_properties::props::CanonicalCombiningClass; + +/// Parse five semicolon-terminated strings consisting of space-separated hexadecimal scalar values +fn parse_hex(mut hexes: &[u8]) -> [StackString; 5] { + let mut strings = [ + StackString::new(), + StackString::new(), + StackString::new(), + StackString::new(), + StackString::new(), + ]; + let mut current = 0; + loop { + let (scalar, mut offset) = u32::from_radix_16(hexes); + let c = core::char::from_u32(scalar).unwrap(); + strings[current].try_push(c).unwrap(); + match hexes[offset] { + b';' => { + current += 1; + if current == strings.len() { + return strings; + } + offset += 1; + } + b' ' => { + offset += 1; + } + _ => { + panic!("Bad format: Garbage"); + } + } + hexes = &hexes[offset..]; + } +} + +#[test] +fn test_conformance() { + let nfd = DecomposingNormalizerBorrowed::new_nfd(); + let nfkd = DecomposingNormalizerBorrowed::new_nfkd(); + let nfc = ComposingNormalizerBorrowed::new_nfc(); + let nfkc = ComposingNormalizerBorrowed::new_nfkc(); + + let mut prev = 0u32; + let mut part = 0u8; + let data = include_bytes!("data/NormalizationTest.txt"); + let lines = data.split(|b| b == &b'\n'); + for line in lines { + if line.is_empty() { + continue; + } + if line.starts_with(b"#") { + continue; + } + if line.starts_with(&b"@Part"[..]) { + part = line[5] - b'0'; + if part == 2 { + for u in prev + 1..=0x10FFFF { + if let Some(c) = char::from_u32(u) { + assert!(nfd + 
.normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfkd + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfc + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfkc + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + } + } + } + continue; + } + let strings = parse_hex(line); + // 0: source + // 1: NFC + // 2: NFD + // 3: NFKC + // 4: NFKD + if part == 1 { + let mut iter = strings[0].chars(); + let current = iter.next().unwrap(); + assert_eq!(iter.next(), None); + let current_u = u32::from(current); + for u in prev + 1..current_u { + if let Some(c) = char::from_u32(u) { + assert!(nfd + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfkd + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfc + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + assert!(nfkc + .normalize_iter(core::iter::once(c)) + .eq(core::iter::once(c))); + } + } + prev = current_u; + } + // NFC + assert!(nfc + .normalize_iter(strings[0].chars()) + .eq(strings[1].chars())); + assert!(nfc + .normalize_iter(strings[1].chars()) + .eq(strings[1].chars())); + assert!(nfc + .normalize_iter(strings[2].chars()) + .eq(strings[1].chars())); + + assert!(nfc + .normalize_iter(strings[3].chars()) + .eq(strings[3].chars())); + assert!(nfc + .normalize_iter(strings[4].chars()) + .eq(strings[3].chars())); + + // NFD + assert!(nfd + .normalize_iter(strings[0].chars()) + .eq(strings[2].chars())); + assert!(nfd + .normalize_iter(strings[1].chars()) + .eq(strings[2].chars())); + assert!(nfd + .normalize_iter(strings[2].chars()) + .eq(strings[2].chars())); + + assert!(nfd + .normalize_iter(strings[3].chars()) + .eq(strings[4].chars())); + assert!(nfd + .normalize_iter(strings[4].chars()) + .eq(strings[4].chars())); + + // NFKC + assert!(nfkc + .normalize_iter(strings[0].chars()) + .eq(strings[3].chars())); + assert!(nfkc + 
.normalize_iter(strings[1].chars()) + .eq(strings[3].chars())); + assert!(nfkc + .normalize_iter(strings[2].chars()) + .eq(strings[3].chars())); + assert!(nfkc + .normalize_iter(strings[3].chars()) + .eq(strings[3].chars())); + assert!(nfkc + .normalize_iter(strings[4].chars()) + .eq(strings[3].chars())); + + // NFKD + assert!(nfkd + .normalize_iter(strings[0].chars()) + .eq(strings[4].chars())); + assert!(nfkd + .normalize_iter(strings[1].chars()) + .eq(strings[4].chars())); + assert!(nfkd + .normalize_iter(strings[2].chars()) + .eq(strings[4].chars())); + assert!(nfkd + .normalize_iter(strings[3].chars()) + .eq(strings[4].chars())); + assert!(nfkd + .normalize_iter(strings[4].chars()) + .eq(strings[4].chars())); + } +} + +// Commented out, because we don't currently have a way to force a no-op set for testing. +// #[test] +// fn test_hangul() { +// use icu_collections::codepointinvlist::{CodePointSet, CodePointSetBuilder}; +// use zerofrom::ZeroFrom; +// let builder = CodePointSetBuilder::new(); +// let set: CodePointSet = builder.build(); + +// let normalizer: ComposingNormalizer = ComposingNormalizerBorrowed::new_nfc(); +// { +// let mut norm_iter = normalizer.normalize_iter("A\u{AC00}\u{11A7}".chars()); +// // Pessimize passthrough to avoid hiding bugs. +// norm_iter +// .decomposition +// .potential_passthrough_and_not_backward_combining = Some(ZeroFrom::zero_from(&set)); +// assert!(norm_iter.eq("A\u{AC00}\u{11A7}".chars())); +// } +// { +// let mut norm_iter = normalizer.normalize_iter("A\u{AC00}\u{11C2}".chars()); +// // Pessimize passthrough to avoid hiding bugs. 
+// norm_iter +// .decomposition +// .potential_passthrough_and_not_backward_combining = Some(ZeroFrom::zero_from(&set)); +// assert!(norm_iter.eq("A\u{AC1B}".chars())); +// } +// } + +fn str_to_utf16(s: &str, sink: &mut StackVec) { + sink.clear(); + let mut buf = [0u16; 2]; + for c in s.chars() { + sink.try_extend_from_slice(c.encode_utf16(&mut buf)) + .unwrap(); + } +} + +fn char_to_utf16(c: char, sink: &mut StackVec) { + sink.clear(); + let mut buf = [0u16; 2]; + sink.try_extend_from_slice(c.encode_utf16(&mut buf)) + .unwrap(); +} + +fn str_to_str(s: &str, sink: &mut StackString) { + sink.clear(); + sink.try_push_str(s).unwrap(); +} + +fn char_to_str(c: char, sink: &mut StackString) { + sink.clear(); + sink.try_push(c).unwrap(); +} + +#[test] +fn test_conformance_utf16() { + let nfd = DecomposingNormalizerBorrowed::new_nfd(); + let nfkd = DecomposingNormalizerBorrowed::new_nfkd(); + let nfc = ComposingNormalizerBorrowed::new_nfc(); + let nfkc = ComposingNormalizerBorrowed::new_nfkc(); + + let mut input = StackVec::new(); + let mut normalized = StackVec::new(); + let mut expected = StackVec::new(); + + let mut prev = 0u32; + let mut part = 0u8; + let data = include_bytes!("data/NormalizationTest.txt"); + let lines = data.split(|b| b == &b'\n'); + for line in lines { + if line.is_empty() { + continue; + } + if line.starts_with(b"#") { + continue; + } + if line.starts_with(&b"@Part"[..]) { + part = line[5] - b'0'; + if part == 2 { + for u in prev + 1..=0x10FFFF { + if let Some(c) = char::from_u32(u) { + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, 
&input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + } + } + } + continue; + } + let strings = parse_hex(line); + // 0: source + // 1: NFC + // 2: NFD + // 3: NFKC + // 4: NFKD + if part == 1 { + let mut iter = strings[0].chars(); + let current = iter.next().unwrap(); + assert_eq!(iter.next(), None); + let current_u = u32::from(current); + for u in prev + 1..current_u { + if let Some(c) = char::from_u32(u) { + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_utf16(c, &mut input); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &input); + } + } + prev = current_u; + } + // NFC + normalized.clear(); + str_to_utf16(&strings[0], &mut input); + str_to_utf16(&strings[1], &mut expected); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[1], &mut input); + str_to_utf16(&strings[1], &mut expected); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[2], &mut input); + str_to_utf16(&strings[1], &mut expected); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[3], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfc.normalize_utf16_to(&input, 
&mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[4], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + // NFD + normalized.clear(); + str_to_utf16(&strings[0], &mut input); + str_to_utf16(&strings[2], &mut expected); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[1], &mut input); + str_to_utf16(&strings[2], &mut expected); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[2], &mut input); + str_to_utf16(&strings[2], &mut expected); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[3], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[4], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + // NFKC + normalized.clear(); + str_to_utf16(&strings[0], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[1], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[2], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + 
assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[3], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[4], &mut input); + str_to_utf16(&strings[3], &mut expected); + assert!(nfkc.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + // NFKD + normalized.clear(); + str_to_utf16(&strings[0], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[1], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[2], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[3], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_utf16(&strings[4], &mut input); + str_to_utf16(&strings[4], &mut expected); + assert!(nfkd.normalize_utf16_to(&input, &mut normalized).is_ok()); + assert_eq!(&normalized, &expected); + } +} + +#[test] +fn test_conformance_utf8() { + let nfd = DecomposingNormalizerBorrowed::new_nfd(); + let nfkd = DecomposingNormalizerBorrowed::new_nfkd(); + let nfc = ComposingNormalizerBorrowed::new_nfc(); + let nfkc = ComposingNormalizerBorrowed::new_nfkc(); + + let mut input = StackString::new(); + let mut normalized = StackString::new(); + let mut expected = StackString::new(); + + let mut prev = 0u32; + let mut part = 0u8; + let 
data = include_bytes!("data/NormalizationTest.txt"); + let lines = data.split(|b| b == &b'\n'); + for line in lines { + if line.is_empty() { + continue; + } + if line.starts_with(b"#") { + continue; + } + if line.starts_with(&b"@Part"[..]) { + part = line[5] - b'0'; + if part == 2 { + for u in prev + 1..=0x10FFFF { + if let Some(c) = char::from_u32(u) { + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfkd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + } + } + } + continue; + } + let strings = parse_hex(line); + // 0: source + // 1: NFC + // 2: NFD + // 3: NFKC + // 4: NFKD + if part == 1 { + let mut iter = strings[0].chars(); + let current = iter.next().unwrap(); + assert_eq!(iter.next(), None); + let current_u = u32::from(current); + for u in prev + 1..current_u { + if let Some(c) = char::from_u32(u) { + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfkd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + + normalized.clear(); + char_to_str(c, &mut input); + assert!(nfkc + 
.normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &input); + } + } + prev = current_u; + } + // NFC + normalized.clear(); + str_to_str(&strings[0], &mut input); + str_to_str(&strings[1], &mut expected); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[1], &mut input); + str_to_str(&strings[1], &mut expected); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[2], &mut input); + str_to_str(&strings[1], &mut expected); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[3], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[4], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + // NFD + normalized.clear(); + str_to_str(&strings[0], &mut input); + str_to_str(&strings[2], &mut expected); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[1], &mut input); + str_to_str(&strings[2], &mut expected); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[2], &mut input); + str_to_str(&strings[2], &mut expected); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + 
str_to_str(&strings[3], &mut input); + str_to_str(&strings[4], &mut expected); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[4], &mut input); + str_to_str(&strings[4], &mut expected); + assert!(nfd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + // NFKC + normalized.clear(); + str_to_str(&strings[0], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[1], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[2], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[3], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[4], &mut input); + str_to_str(&strings[3], &mut expected); + assert!(nfkc + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + // NFKD + normalized.clear(); + str_to_str(&strings[0], &mut input); + str_to_str(&strings[4], &mut expected); + assert!(nfkd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + assert_eq!(&normalized, &expected); + + normalized.clear(); + str_to_str(&strings[1], &mut input); + str_to_str(&strings[4], &mut expected); + assert!(nfkd + .normalize_utf8_to(input.as_bytes(), &mut normalized) + .is_ok()); + 
/// Pairwise canonical composition via `CanonicalCompositionBorrowed::compose`:
/// each row is `(first, second, expected)`, where `None` means the pair does
/// not compose.
#[test]
fn test_canonical_composition() {
    let comp = CanonicalCompositionBorrowed::new();

    let cases: [(char, char, Option<char>); 10] = [
        ('a', 'b', None), // Just two starters
        ('a', '\u{0308}', Some('ä')),
        ('A', '\u{0308}', Some('Ä')),
        ('ẹ', '\u{0302}', Some('ệ')),
        ('Ẹ', '\u{0302}', Some('Ệ')),
        ('\u{1D157}', '\u{1D165}', None), // Composition exclusion
        ('ে', 'া', Some('ো')),            // Second is starter; BMP
        ('𑄱', '𑄧', Some('𑄮')),            // Second is starter; non-BMP
        ('ᄀ', 'ᅡ', Some('가')),           // Hangul LV
        ('가', 'ᆨ', Some('각')),           // Hangul LVT
    ];

    for &(first, second, expected) in &cases {
        assert_eq!(comp.compose(first, second), expected);
    }
}
/// Single-step canonical decomposition via
/// `CanonicalDecompositionBorrowed::decompose`: each row pairs an input
/// character with the `Decomposed` value it must yield.
#[test]
fn test_canonical_decomposition() {
    let decomp = CanonicalDecompositionBorrowed::new();

    let cases = [
        ('ä', Decomposed::Expansion('a', '\u{0308}')),
        ('Ä', Decomposed::Expansion('A', '\u{0308}')),
        ('ệ', Decomposed::Expansion('ẹ', '\u{0302}')),
        ('Ệ', Decomposed::Expansion('Ẹ', '\u{0302}')),
        (
            '\u{1D15E}',
            Decomposed::Expansion('\u{1D157}', '\u{1D165}'),
        ),
        ('ো', Decomposed::Expansion('ে', 'া')),
        ('𑄮', Decomposed::Expansion('𑄱', '𑄧')),
        ('가', Decomposed::Expansion('ᄀ', 'ᅡ')),
        ('각', Decomposed::Expansion('가', 'ᆨ')),
        ('\u{212B}', Decomposed::Singleton('Å')), // ANGSTROM SIGN
        ('\u{2126}', Decomposed::Singleton('Ω')), // OHM SIGN
        ('\u{1F71}', Decomposed::Singleton('ά')), // oxia
        // not oxia but in the oxia range
        ('\u{1F72}', Decomposed::Expansion('ε', '\u{0300}')),
        // tonos
        ('ά', Decomposed::Expansion('α', '\u{0301}')),
    ];

    for (input, expected) in &cases {
        assert_eq!(decomp.decompose(*input), *expected);
    }
}
+ assert_eq!( + decomp.decompose('ệ'), + Decomposed::Expansion('ẹ', '\u{0302}') + ); + assert_eq!( + decomp.decompose('Ệ'), + Decomposed::Expansion('Ẹ', '\u{0302}') + ); + assert_eq!( + decomp.decompose('\u{1D15E}'), + Decomposed::Expansion('\u{1D157}', '\u{1D165}') + ); + assert_eq!(decomp.decompose('ো'), Decomposed::Expansion('ে', 'া')); + assert_eq!(decomp.decompose('𑄮'), Decomposed::Expansion('𑄱', '𑄧')); + assert_eq!(decomp.decompose('가'), Decomposed::Expansion('ᄀ', 'ᅡ')); + assert_eq!(decomp.decompose('각'), Decomposed::Expansion('가', 'ᆨ')); + + assert_eq!(decomp.decompose('\u{212B}'), Decomposed::Singleton('Å')); // ANGSTROM SIGN + assert_eq!(decomp.decompose('\u{2126}'), Decomposed::Singleton('Ω')); // OHM SIGN + + assert_eq!(decomp.decompose('\u{1F71}'), Decomposed::Singleton('ά')); // oxia + assert_eq!( + decomp.decompose('\u{1F72}'), + Decomposed::Expansion('ε', '\u{0300}') + ); // not oxia but in the oxia range + assert_eq!( + decomp.decompose('ά'), + Decomposed::Expansion('α', '\u{0301}') + ); // tonos +} + +#[test] +fn test_ccc() { + let map = CanonicalCombiningClassMapBorrowed::new(); + for u in 0..=0x10FFFF { + assert_eq!( + map.get32(u), + icu_properties::CodePointMapData::<CanonicalCombiningClass>::new().get32(u) + ); + } +} + +#[test] +fn test_ccc_owned() { + let owned = + CanonicalCombiningClassMap::try_new_unstable(&icu_normalizer::provider::Baked).unwrap(); + let map = owned.as_borrowed(); + for u in 0..=0x10FFFF { + assert_eq!( + map.get32(u), + icu_properties::CodePointMapData::<CanonicalCombiningClass>::new().get32(u) + ); + } +} + +#[test] +fn test_utf16_basic() { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + + assert_eq!( + normalizer.normalize_utf16(&[0x0061]).as_ref(), + [0x0061].as_slice() + ); + assert_eq!( + normalizer.normalize_utf16(&[0x0300, 0x0323]).as_ref(), + [0x0323, 0x0300].as_slice() + ); +} + +#[test] +fn test_accented_digraph() { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + assert_eq!( + 
normalizer.normalize("\u{01C4}\u{0323}"), + "DZ\u{0323}\u{030C}" + ); + assert_eq!( + normalizer.normalize("DZ\u{030C}\u{0323}"), + "DZ\u{0323}\u{030C}" + ); +} + +#[test] +fn test_ddd() { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + assert_eq!( + normalizer.normalize("\u{0DDD}\u{0334}"), + "\u{0DD9}\u{0DCF}\u{0334}\u{0DCA}" + ); +} + +#[test] +fn test_is_normalized() { + let nfd = DecomposingNormalizerBorrowed::new_nfd(); + let nfkd = DecomposingNormalizerBorrowed::new_nfkd(); + let nfc = ComposingNormalizerBorrowed::new_nfc(); + let nfkc = ComposingNormalizerBorrowed::new_nfkc(); + + let aaa = "aaa"; + assert!(nfd.is_normalized(aaa)); + assert!(nfkd.is_normalized(aaa)); + assert!(nfc.is_normalized(aaa)); + assert!(nfkc.is_normalized(aaa)); + + assert!(nfd.is_normalized_utf8(aaa.as_bytes())); + assert!(nfkd.is_normalized_utf8(aaa.as_bytes())); + assert!(nfc.is_normalized_utf8(aaa.as_bytes())); + assert!(nfkc.is_normalized_utf8(aaa.as_bytes())); + + let aaa16 = [0x0061u16, 0x0061u16, 0x0061u16].as_slice(); + assert!(nfd.is_normalized_utf16(aaa16)); + assert!(nfkd.is_normalized_utf16(aaa16)); + assert!(nfc.is_normalized_utf16(aaa16)); + assert!(nfkc.is_normalized_utf16(aaa16)); + + let affa = b"a\xFFa"; + assert!(nfd.is_normalized_utf8(affa)); + assert!(nfkd.is_normalized_utf8(affa)); + assert!(nfc.is_normalized_utf8(affa)); + assert!(nfkc.is_normalized_utf8(affa)); + + let a_surrogate_a = [0x0061u16, 0xD800u16, 0x0061u16].as_slice(); + assert!(nfd.is_normalized_utf16(a_surrogate_a)); + assert!(nfkd.is_normalized_utf16(a_surrogate_a)); + assert!(nfc.is_normalized_utf16(a_surrogate_a)); + assert!(nfkc.is_normalized_utf16(a_surrogate_a)); + + let note = "a𝅗\u{1D165}a"; + assert!(nfd.is_normalized(note)); + assert!(nfkd.is_normalized(note)); + assert!(nfc.is_normalized(note)); + assert!(nfkc.is_normalized(note)); + + assert!(nfd.is_normalized_utf8(note.as_bytes())); + assert!(nfkd.is_normalized_utf8(note.as_bytes())); + 
assert!(nfc.is_normalized_utf8(note.as_bytes())); + assert!(nfkc.is_normalized_utf8(note.as_bytes())); + + let note16 = [ + 0x0061u16, 0xD834u16, 0xDD57u16, 0xD834u16, 0xDD65u16, 0x0061u16, + ] + .as_slice(); + assert!(nfd.is_normalized_utf16(note16)); + assert!(nfkd.is_normalized_utf16(note16)); + assert!(nfc.is_normalized_utf16(note16)); + assert!(nfkc.is_normalized_utf16(note16)); + + let umlaut = "aäa"; + assert!(!nfd.is_normalized(umlaut)); + assert!(!nfkd.is_normalized(umlaut)); + assert!(nfc.is_normalized(umlaut)); + assert!(nfkc.is_normalized(umlaut)); + + assert!(!nfd.is_normalized_utf8(umlaut.as_bytes())); + assert!(!nfkd.is_normalized_utf8(umlaut.as_bytes())); + assert!(nfc.is_normalized_utf8(umlaut.as_bytes())); + assert!(nfkc.is_normalized_utf8(umlaut.as_bytes())); + + let umlaut16 = [0x0061u16, 0x00E4u16, 0x0061u16].as_slice(); + assert!(!nfd.is_normalized_utf16(umlaut16)); + assert!(!nfkd.is_normalized_utf16(umlaut16)); + assert!(nfc.is_normalized_utf16(umlaut16)); + assert!(nfkc.is_normalized_utf16(umlaut16)); + + let fraction = "a½a"; + assert!(nfd.is_normalized(fraction)); + assert!(!nfkd.is_normalized(fraction)); + assert!(nfc.is_normalized(fraction)); + assert!(!nfkc.is_normalized(fraction)); + + assert!(nfd.is_normalized_utf8(fraction.as_bytes())); + assert!(!nfkd.is_normalized_utf8(fraction.as_bytes())); + assert!(nfc.is_normalized_utf8(fraction.as_bytes())); + assert!(!nfkc.is_normalized_utf8(fraction.as_bytes())); + + let fraction16 = [0x0061u16, 0x00BDu16, 0x0061u16].as_slice(); + assert!(nfd.is_normalized_utf16(fraction16)); + assert!(!nfkd.is_normalized_utf16(fraction16)); + assert!(nfc.is_normalized_utf16(fraction16)); + assert!(!nfkc.is_normalized_utf16(fraction16)); +} + +#[test] +fn test_is_normalized_up_to() { + let nfd = DecomposingNormalizerBorrowed::new_nfd(); + let nfkd = DecomposingNormalizerBorrowed::new_nfkd(); + let nfc = ComposingNormalizerBorrowed::new_nfc(); + let nfkc = ComposingNormalizerBorrowed::new_nfkc(); + + // 
Check a string slice is normalized up to where is_normalized_up_to reports + let check_str = |input: &str| { + // Check nfd + let (head, tail) = nfd.split_normalized(input); + let mut normalized = String::from(head); + let _ = nfd.normalize_to(tail, &mut normalized); + assert!(nfd.is_normalized(&normalized)); + + // Check nfkd + let (head, tail) = nfkd.split_normalized(input); + let mut normalized = String::from(head); + let _ = nfkd.normalize_to(tail, &mut normalized); + assert!(nfkd.is_normalized(&normalized)); + + // Check nfc + let (head, tail) = nfc.split_normalized(input); + let mut normalized = String::from(head); + let _ = nfc.normalize_to(tail, &mut normalized); + assert!(nfc.is_normalized(&normalized)); + + // Check nfkc + let (head, tail) = nfkc.split_normalized(input); + let mut normalized = String::from(head); + let _ = nfkc.normalize_to(tail, &mut normalized); + assert!(nfkc.is_normalized(&normalized)); + }; + + // Check a string of UTF8 bytes is normalized up to where is_normalized_up_to reports + // note: from_utf8 can panic with invalid UTF8 input + let check_utf8 = |input: &[u8]| { + // Check nfd + let (head, tail) = nfd.split_normalized_utf8(input); + let mut normalized = String::from(head); + let _ = nfd.normalize_utf8_to(tail, &mut normalized); + assert!(nfd.is_normalized(&normalized)); + + // Check nfkd + let (head, tail) = nfkd.split_normalized_utf8(input); + let mut normalized = String::from(head); + let _ = nfkd.normalize_utf8_to(tail, &mut normalized); + assert!(nfkd.is_normalized(&normalized)); + + // Check nfc + let (head, tail) = nfc.split_normalized_utf8(input); + let mut normalized = String::from(head); + let _ = nfc.normalize_utf8_to(tail, &mut normalized); + assert!(nfc.is_normalized(&normalized)); + + // Check nfkc + let (head, tail) = nfkc.split_normalized_utf8(input); + let mut normalized = String::from(head); + let _ = nfkc.normalize_utf8_to(tail, &mut normalized); + assert!(nfkc.is_normalized(&normalized)); + }; + + // Check a 
string of UTF-16 code units is normalized up to where is_normalized_up_to reports + let check_utf16 = |input: &[u16]| { + // Check nfd + let (head, tail) = nfd.split_normalized_utf16(input); + let mut normalized = head.to_vec(); + let _ = nfd.normalize_utf16_to(tail, &mut normalized); + assert!(nfd.is_normalized_utf16(&normalized)); + + // Check nfkd + let (head, tail) = nfkd.split_normalized_utf16(input); + let mut normalized = head.to_vec(); + let _ = nfkd.normalize_utf16_to(tail, &mut normalized); + assert!(nfkd.is_normalized_utf16(&normalized)); + + // Check nfc + let (head, tail) = nfc.split_normalized_utf16(input); + let mut normalized = head.to_vec(); + let _ = nfc.normalize_utf16_to(tail, &mut normalized); + assert!(nfc.is_normalized_utf16(&normalized)); + + // Check nfkc + let (head, tail) = nfkc.split_normalized_utf16(input); + let mut normalized = head.to_vec(); + let _ = nfkc.normalize_utf16_to(tail, &mut normalized); + assert!(nfkc.is_normalized_utf16(&normalized)); + }; + + let aaa = "aaa"; + check_str(aaa); + + let aaa_utf8 = aaa.as_bytes(); + check_utf8(aaa_utf8); + + let aaa_utf16: Vec<u16> = aaa.encode_utf16().collect(); + check_utf16(&aaa_utf16); + + assert!(nfd.split_normalized(aaa).0.len() == aaa.len()); + assert!(nfkd.split_normalized(aaa).0.len() == aaa.len()); + assert!(nfc.split_normalized(aaa).0.len() == aaa.len()); + assert!(nfkc.split_normalized(aaa).0.len() == aaa.len()); + assert!(nfd.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len()); + assert!(nfkd.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len()); + assert!(nfc.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len()); + assert!(nfkc.split_normalized_utf8(aaa_utf8).0.len() == aaa_utf8.len()); + assert!(nfd.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len()); + assert!(nfkd.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len()); + assert!(nfc.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len()); + 
assert!(nfkc.split_normalized_utf16(&aaa_utf16).0.len() == aaa_utf16.len()); + + let note = "a𝅗\u{1D165}a"; + check_str(note); + + let note_utf8 = note.as_bytes(); + check_utf8(note_utf8); + + let note_utf16: Vec<u16> = note.encode_utf16().collect(); + check_utf16(¬e_utf16); + + assert!(nfd.split_normalized(note).0.len() == note.len()); + assert!(nfkd.split_normalized(note).0.len() == note.len()); + assert!(nfc.split_normalized(note).0.len() == note.len()); + assert!(nfkc.split_normalized(note).0.len() == note.len()); + assert!(nfd.split_normalized_utf8(note_utf8).0.len() == note_utf8.len()); + assert!(nfkd.split_normalized_utf8(note_utf8).0.len() == note_utf8.len()); + assert!(nfc.split_normalized_utf8(note_utf8).0.len() == note_utf8.len()); + assert!(nfkc.split_normalized_utf8(note_utf8).0.len() == note_utf8.len()); + assert!(nfd.split_normalized_utf16(¬e_utf16).0.len() == note_utf16.len()); + assert!(nfkd.split_normalized_utf16(¬e_utf16).0.len() == note_utf16.len()); + assert!(nfc.split_normalized_utf16(¬e_utf16).0.len() == note_utf16.len()); + assert!(nfkc.split_normalized_utf16(¬e_utf16).0.len() == note_utf16.len()); + + let umlaut = "aäa"; + check_str(umlaut); + + let umlaut_utf8 = umlaut.as_bytes(); + check_utf8(umlaut_utf8); + + let umlaut_utf16: Vec<u16> = umlaut.encode_utf16().collect(); + check_utf16(¨aut_utf16); + + assert_eq!(nfd.split_normalized(umlaut).0.len(), 1); + assert_eq!(nfkd.split_normalized(umlaut).0.len(), 1); + assert_eq!(nfc.split_normalized(umlaut).0.len(), 4); + assert_eq!(nfkc.split_normalized(umlaut).0.len(), 4); + assert_eq!(nfd.split_normalized_utf8(umlaut_utf8).0.len(), 1); + assert_eq!(nfkd.split_normalized_utf8(umlaut_utf8).0.len(), 1); + assert_eq!(nfc.split_normalized_utf8(umlaut_utf8).0.len(), 4); + assert_eq!(nfkc.split_normalized_utf8(umlaut_utf8).0.len(), 4); + assert_eq!(nfd.split_normalized_utf16(¨aut_utf16).0.len(), 1); + assert_eq!(nfkd.split_normalized_utf16(¨aut_utf16).0.len(), 1); + 
assert_eq!(nfc.split_normalized_utf16(¨aut_utf16).0.len(), 3); + assert_eq!(nfkc.split_normalized_utf16(¨aut_utf16).0.len(), 3); + + let fraction = "a½a"; + check_str(fraction); + + let fraction_utf8 = fraction.as_bytes(); + check_utf8(fraction_utf8); + + let fraction_utf16: Vec<u16> = fraction.encode_utf16().collect(); + check_utf16(&fraction_utf16); + + assert_eq!(nfd.split_normalized(fraction).0.len(), 4); + assert_eq!(nfkd.split_normalized(fraction).0.len(), 1); + assert_eq!(nfc.split_normalized(fraction).0.len(), 4); + assert_eq!(nfkc.split_normalized(fraction).0.len(), 1); + assert_eq!(nfd.split_normalized_utf8(fraction_utf8).0.len(), 4); + assert_eq!(nfkd.split_normalized_utf8(fraction_utf8).0.len(), 1); + assert_eq!(nfc.split_normalized_utf8(fraction_utf8).0.len(), 4); + assert_eq!(nfkc.split_normalized_utf8(fraction_utf8).0.len(), 1); + assert_eq!(nfd.split_normalized_utf16(&fraction_utf16).0.len(), 3); + assert_eq!(nfkd.split_normalized_utf16(&fraction_utf16).0.len(), 1); + assert_eq!(nfc.split_normalized_utf16(&fraction_utf16).0.len(), 3); + assert_eq!(nfkc.split_normalized_utf16(&fraction_utf16).0.len(), 1); + + let reversed_vietnamese = "e\u{0302}\u{0323}"; + check_str(reversed_vietnamese); + + let reversed_vietnamese_utf8 = reversed_vietnamese.as_bytes(); + check_utf8(reversed_vietnamese_utf8); + + let reversed_vietnamese_utf16: Vec<u16> = reversed_vietnamese.encode_utf16().collect(); + check_utf16(&reversed_vietnamese_utf16); + + assert_eq!(nfd.split_normalized(reversed_vietnamese).0.len(), 1); + assert_eq!(nfkd.split_normalized(reversed_vietnamese).0.len(), 1); + assert_eq!(nfc.split_normalized(reversed_vietnamese).0.len(), 0); + assert_eq!(nfkc.split_normalized(reversed_vietnamese).0.len(), 0); + assert_eq!( + nfd.split_normalized_utf8(reversed_vietnamese_utf8).0.len(), + 1 + ); + assert_eq!( + nfkd.split_normalized_utf8(reversed_vietnamese_utf8).0.len(), + 1 + ); + assert_eq!( + nfc.split_normalized_utf8(reversed_vietnamese_utf8).0.len(), + 0 + ); 
+ assert_eq!( + nfkc.split_normalized_utf8(reversed_vietnamese_utf8).0.len(), + 0 + ); + assert_eq!( + nfd.split_normalized_utf16(&reversed_vietnamese_utf16) + .0 + .len(), + 1 + ); + assert_eq!( + nfkd.split_normalized_utf16(&reversed_vietnamese_utf16) + .0 + .len(), + 1 + ); + assert_eq!( + nfc.split_normalized_utf16(&reversed_vietnamese_utf16) + .0 + .len(), + 0 + ); + assert_eq!( + nfkc.split_normalized_utf16(&reversed_vietnamese_utf16) + .0 + .len(), + 0 + ); + + let truncated_vietnamese = "e\u{0302}"; + check_str(truncated_vietnamese); + + let truncated_vietnamese_utf8 = truncated_vietnamese.as_bytes(); + check_utf8(truncated_vietnamese_utf8); + + let truncated_vietnamese_utf16: Vec<u16> = truncated_vietnamese.encode_utf16().collect(); + check_utf16(&truncated_vietnamese_utf16); + + assert_eq!(nfd.split_normalized(truncated_vietnamese).0.len(), 3); + assert_eq!(nfkd.split_normalized(truncated_vietnamese).0.len(), 3); + assert_eq!(nfc.split_normalized(truncated_vietnamese).0.len(), 0); + assert_eq!(nfkc.split_normalized(truncated_vietnamese).0.len(), 0); + assert_eq!( + nfd.split_normalized_utf8(truncated_vietnamese_utf8).0.len(), + 3 + ); + assert_eq!( + nfkd.split_normalized_utf8(truncated_vietnamese_utf8) + .0 + .len(), + 3 + ); + assert_eq!( + nfc.split_normalized_utf8(truncated_vietnamese_utf8).0.len(), + 0 + ); + assert_eq!( + nfkc.split_normalized_utf8(truncated_vietnamese_utf8) + .0 + .len(), + 0 + ); + assert_eq!( + nfd.split_normalized_utf16(&truncated_vietnamese_utf16) + .0 + .len(), + 2 + ); + assert_eq!( + nfkd.split_normalized_utf16(&truncated_vietnamese_utf16) + .0 + .len(), + 2 + ); + assert_eq!( + nfc.split_normalized_utf16(&truncated_vietnamese_utf16) + .0 + .len(), + 0 + ); + assert_eq!( + nfkc.split_normalized_utf16(&truncated_vietnamese_utf16) + .0 + .len(), + 0 + ); +} |
