diff options
Diffstat (limited to 'vendor/potential_utf')
| -rw-r--r-- | vendor/potential_utf/.cargo-checksum.json | 1 | ||||
| -rw-r--r-- | vendor/potential_utf/Cargo.lock | 140 | ||||
| -rw-r--r-- | vendor/potential_utf/Cargo.toml | 82 | ||||
| -rw-r--r-- | vendor/potential_utf/LICENSE | 46 | ||||
| -rw-r--r-- | vendor/potential_utf/README.md | 11 | ||||
| -rw-r--r-- | vendor/potential_utf/src/lib.rs | 33 | ||||
| -rw-r--r-- | vendor/potential_utf/src/uchar.rs | 375 | ||||
| -rw-r--r-- | vendor/potential_utf/src/ustr.rs | 270 | ||||
| -rw-r--r-- | vendor/potential_utf/src/writeable.rs | 159 |
9 files changed, 1117 insertions, 0 deletions
diff --git a/vendor/potential_utf/.cargo-checksum.json b/vendor/potential_utf/.cargo-checksum.json new file mode 100644 index 00000000..12b50d7a --- /dev/null +++ b/vendor/potential_utf/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.lock":"79dcdab66c87309641bfc8e01f682d1b37b6b2dd85c273ea43a2279b3171e681","Cargo.toml":"82fd80e6633b773f2c8c72016c7966f4dec32f36a27c6ae634490cfec34e0445","LICENSE":"f367c1b8e1aa262435251e442901da4607b4650e0e63a026f5044473ecfb90f2","README.md":"693ab95a7920db4ca6655b352efe33cfe5ee42792369f591257a3f5af735e918","src/lib.rs":"58fe437a809c113ce98e30c92ed4ccf9a78a31374e033bf56bb3384d76a0827d","src/uchar.rs":"157d4dd70360083d9d3e2f3f67d004ce282bcec0f378608dfac33835ffaf0e95","src/ustr.rs":"3b9cbec56c5cfa233335220e459e9be1c3ef48e18ee07691c4bf3e29152a5e70","src/writeable.rs":"9c381775e40a47db29f0601b6090897b5e649d9294ef74f11ac76dbf42c6448c"},"package":"e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585"}
\ No newline at end of file diff --git a/vendor/potential_utf/Cargo.lock b/vendor/potential_utf/Cargo.lock new file mode 100644 index 00000000..4d1bec81 --- /dev/null +++ b/vendor/potential_utf/Cargo.lock @@ -0,0 +1,140 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "databake" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff6ee9e2d2afb173bcdeee45934c89ec341ab26f91c9933774fc15c2b58f83ef" +dependencies = [ + "proc-macro2", + "quote", +] + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "potential_utf" +version = "0.1.2" +dependencies = [ + "bincode", + "databake", + "serde", + "serde_json", + "writeable", + "zerovec", +] + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "ryu" +version = "1.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" + +[[package]] +name = "serde" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.218" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.139" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f86c3acccc9c65b153fe1b85a3be07fe5515274ec9f0653b4a0875731c72a6" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "syn" +version = "2.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36147f1a48ae0ec2b5b3bc5b537d267457555a10dc06f3dbc8cb11ba3006d3b1" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe" + +[[package]] +name = "writeable" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74b3b5b7c6114bf7253093603034e102d479ecc8501deca33b6c1c816418b6d2" + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" + +[[package]] +name = "zerovec" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94e62113720e311984f461c56b00457ae9981c0bc7859d22306cc2ae2f95571c" +dependencies = [ + "zerofrom", +] diff --git a/vendor/potential_utf/Cargo.toml b/vendor/potential_utf/Cargo.toml new file mode 100644 index 00000000..a0328ef4 --- /dev/null +++ b/vendor/potential_utf/Cargo.toml @@ -0,0 +1,82 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2021" +rust-version = "1.81" +name = "potential_utf" +version = "0.1.2" +authors = ["The ICU4X Project Developers"] +build = false +include = [ + "data/**/*", + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md", +] +autolib = false +autobins = false +autoexamples = false +autotests = false +autobenches = false +description = "Unvalidated string and character types" +homepage = "https://icu4x.unicode.org" +readme = "README.md" +categories = ["internationalization"] +license = "Unicode-3.0" +repository = "https://github.com/unicode-org/icu4x" + +[features] +alloc = [ + "serde?/alloc", + "zerovec?/alloc", +] +databake = ["dep:databake"] +serde = ["dep:serde"] +writeable = [ + "dep:writeable", + "alloc", +] +zerovec = ["dep:zerovec"] + +[lib] +name = "potential_utf" +path = "src/lib.rs" + +[dependencies.databake] +version = "0.2.0" +optional = true +default-features = false + +[dependencies.serde] +version = "1.0.110" +optional = true +default-features = false + +[dependencies.writeable] +version = "0.6.0" +optional = true +default-features = false + +[dependencies.zerovec] +version = "0.11.1" +optional = true +default-features = false + +[dev-dependencies.bincode] +version = "1.3.1" + +[dev-dependencies.serde_json] +version = "1.0.45" diff --git a/vendor/potential_utf/LICENSE b/vendor/potential_utf/LICENSE new file mode 100644 index 00000000..c9be6012 --- /dev/null +++ b/vendor/potential_utf/LICENSE @@ -0,0 +1,46 @@ +UNICODE LICENSE V3 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 2020-2024 Unicode, Inc. + +NOTICE TO USER: Carefully read the following legal agreement. BY +DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR +SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT +DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of data files and any associated documentation (the "Data Files") or +software and any associated documentation (the "Software") to deal in the +Data Files or Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Data Files or Software, and to permit persons to whom the +Data Files or Software are furnished to do so, provided that either (a) +this copyright and permission notice appear with all copies of the Data +Files or Software, or (b) this copyright and permission notice appear in +associated Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. + +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE +BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, +OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA +FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. + +SPDX-License-Identifier: Unicode-3.0 + +— + +Portions of ICU4X may have been adapted from ICU4C and/or ICU4J. +ICU 1.8.1 to ICU 57.1 © 1995-2016 International Business Machines Corporation and others. diff --git a/vendor/potential_utf/README.md b/vendor/potential_utf/README.md new file mode 100644 index 00000000..5251f098 --- /dev/null +++ b/vendor/potential_utf/README.md @@ -0,0 +1,11 @@ +# unvalidated_utf [](https://crates.io/crates/unvalidated_utf) + +<!-- cargo-rdme start --> + +A crate providing unvalidated string and character types. + +<!-- cargo-rdme end --> + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/vendor/potential_utf/src/lib.rs b/vendor/potential_utf/src/lib.rs new file mode 100644 index 00000000..2b343c9c --- /dev/null +++ b/vendor/potential_utf/src/lib.rs @@ -0,0 +1,33 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#![cfg_attr(not(any(test, doc)), no_std)] +#![cfg_attr( + not(test), + deny( + clippy::indexing_slicing, + clippy::unwrap_used, + clippy::expect_used, + clippy::panic, + clippy::exhaustive_structs, + clippy::exhaustive_enums, + clippy::trivially_copy_pass_by_ref, + missing_debug_implementations, + ) +)] + +//! A crate providing unvalidated string and character types. + +#[cfg(feature = "alloc")] +extern crate alloc; + +mod uchar; +mod ustr; + +pub use uchar::PotentialCodePoint; +pub use ustr::PotentialUtf16; +pub use ustr::PotentialUtf8; + +#[cfg(feature = "writeable")] +mod writeable; diff --git a/vendor/potential_utf/src/uchar.rs b/vendor/potential_utf/src/uchar.rs new file mode 100644 index 00000000..280964ca --- /dev/null +++ b/vendor/potential_utf/src/uchar.rs @@ -0,0 +1,375 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use core::cmp::Ordering; +use core::fmt; + +/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not +/// validated as such. +/// +/// Use this type instead of `char` when you want to deal with data that is expected to be valid +/// Unicode scalar values, but you want control over when or if you validate that assumption. +/// +/// # Examples +/// +/// ``` +/// use potential_utf::PotentialCodePoint; +/// +/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h')); +/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i')); +/// assert_eq!( +/// PotentialCodePoint::from_u24(0x1F44B).try_to_char(), +/// Ok('👋') +/// ); +/// +/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err()); +/// assert_eq!( +/// PotentialCodePoint::from_u24(0xDE01).to_char_lossy(), +/// char::REPLACEMENT_CHARACTER +/// ); +/// ``` +#[repr(transparent)] +#[allow(clippy::exhaustive_structs)] // transparent newtype +#[derive(PartialEq, Eq, Clone, Copy, Hash)] +pub struct PotentialCodePoint([u8; 3]); + +impl PotentialCodePoint { + /// Create a [`PotentialCodePoint`] from a `char`. + /// + /// # Examples + /// + /// ``` + /// use potential_utf::PotentialCodePoint; + /// + /// let a = PotentialCodePoint::from_char('a'); + /// assert_eq!(a.try_to_char().unwrap(), 'a'); + /// ``` + #[inline] + pub const fn from_char(c: char) -> Self { + let [u0, u1, u2, _u3] = (c as u32).to_le_bytes(); + Self([u0, u1, u2]) + } + + /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits. + #[inline] + pub const fn from_u24(c: u32) -> Self { + let [u0, u1, u2, _u3] = c.to_le_bytes(); + Self([u0, u1, u2]) + } + + /// Attempt to convert a [`PotentialCodePoint`] to a `char`. + /// + /// # Examples + /// + /// ``` + /// use potential_utf::PotentialCodePoint; + /// use zerovec::ule::AsULE; + /// + /// let a = PotentialCodePoint::from_char('a'); + /// assert_eq!(a.try_to_char(), Ok('a')); + /// + /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); + /// assert!(matches!(b.try_to_char(), Err(_))); + /// ``` + #[inline] + pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> { + char::try_from(u32::from(self)) + } + + /// Convert a [`PotentialCodePoint`] to a `char', returning [`char::REPLACEMENT_CHARACTER`] + /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value. + /// + /// # Examples + /// + /// ``` + /// use potential_utf::PotentialCodePoint; + /// use zerovec::ule::AsULE; + /// + /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into()); + /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER); + /// ``` + #[inline] + pub fn to_char_lossy(self) -> char { + self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER) + } + + /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is + /// a valid Unicode scalar value. + /// + /// # Safety + /// + /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order. + /// + /// # Examples + /// + /// ``` + /// use potential_utf::PotentialCodePoint; + /// + /// let a = PotentialCodePoint::from_char('a'); + /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a'); + /// ``` + #[inline] + pub unsafe fn to_char_unchecked(self) -> char { + char::from_u32_unchecked(u32::from(self)) + } + + /// For converting to the ULE type in a const context + /// + /// Can be removed once const traits are a thing + #[inline] + #[cfg(feature = "zerovec")] + pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> { + zerovec::ule::RawBytesULE(self.0) + } +} + +/// This impl requires enabling the optional `zerovec` Cargo feature +#[cfg(feature = "zerovec")] +impl zerovec::ule::AsULE for PotentialCodePoint { + type ULE = zerovec::ule::RawBytesULE<3>; + + #[inline] + fn to_unaligned(self) -> Self::ULE { + zerovec::ule::RawBytesULE(self.0) + } + + #[inline] + fn from_unaligned(unaligned: Self::ULE) -> Self { + Self(unaligned.0) + } +} + +// Safety: PotentialCodePoint is always the little-endian representation of a char, +// which corresponds to its AsULE::ULE type +/// This impl requires enabling the optional `zerovec` Cargo feature +#[cfg(feature = "zerovec")] +unsafe impl zerovec::ule::EqULE for PotentialCodePoint {} + +impl fmt::Debug for PotentialCodePoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Debug as a char if possible + match self.try_to_char() { + Ok(c) => fmt::Debug::fmt(&c, f), + Err(_) => fmt::Debug::fmt(&self.0, f), + } + } +} + +impl PartialOrd for PotentialCodePoint { + fn partial_cmp(&self, other: &Self) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl PartialEq<char> for PotentialCodePoint { + fn eq(&self, other: &char) -> bool { + self.eq(&Self::from_char(*other)) + } +} + +impl PartialOrd<char> for PotentialCodePoint { + fn partial_cmp(&self, other: &char) -> Option<Ordering> { + self.partial_cmp(&Self::from_char(*other)) + } +} + +impl PartialEq<PotentialCodePoint> for char { + fn eq(&self, other: &PotentialCodePoint) -> bool { + PotentialCodePoint::from_char(*self).eq(other) + } +} + +impl PartialOrd<PotentialCodePoint> for char { + fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> { + PotentialCodePoint::from_char(*self).partial_cmp(other) + } +} + +impl Ord for PotentialCodePoint { + // custom implementation, as derived Ord would compare lexicographically + fn cmp(&self, other: &Self) -> Ordering { + let a = u32::from(*self); + let b = u32::from(*other); + a.cmp(&b) + } +} + +impl From<PotentialCodePoint> for u32 { + fn from(x: PotentialCodePoint) -> Self { + let [a0, a1, a2] = x.0; + u32::from_le_bytes([a0, a1, a2, 0]) + } +} + +impl TryFrom<u32> for PotentialCodePoint { + type Error = (); + fn try_from(x: u32) -> Result<Self, ()> { + let [u0, u1, u2, u3] = x.to_le_bytes(); + if u3 != 0 { + return Err(()); + } + Ok(Self([u0, u1, u2])) + } +} + +impl From<char> for PotentialCodePoint { + #[inline] + fn from(value: char) -> Self { + Self::from_char(value) + } +} + +impl TryFrom<PotentialCodePoint> for char { + type Error = core::char::CharTryFromError; + + #[inline] + fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> { + value.try_to_char() + } +} + +/// This impl requires enabling the optional `serde` Cargo feature +#[cfg(feature = "serde")] +impl serde::Serialize for PotentialCodePoint { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::Error; + let c = self + .try_to_char() + .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?; + if serializer.is_human_readable() { + serializer.serialize_char(c) + } else { + self.0.serialize(serializer) + } + } +} + +/// This impl requires enabling the optional `serde` Cargo feature +#[cfg(feature = "serde")] +impl<'de> serde::Deserialize<'de> for PotentialCodePoint { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + if deserializer.is_human_readable() { + let c = <char>::deserialize(deserializer)?; + Ok(PotentialCodePoint::from_char(c)) + } else { + let bytes = <[u8; 3]>::deserialize(deserializer)?; + Ok(PotentialCodePoint(bytes)) + } + } +} + +/// This impl requires enabling the optional `databake` Cargo feature +#[cfg(feature = "databake")] +impl databake::Bake for PotentialCodePoint { + fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream { + match self.try_to_char() { + Ok(ch) => { + env.insert("potential_utf"); + let ch = ch.bake(env); + databake::quote! { + potential_utf::PotentialCodePoint::from_char(#ch) + } + } + Err(_) => { + env.insert("potential_utf"); + let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]); + databake::quote! { + potential_utf::PotentialCodePoint::from_u24(#u24) + } + } + } + } +} + +#[cfg(test)] +mod test { + use super::*; + use zerovec::ZeroVec; + + #[test] + fn test_serde_fail() { + let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]); + serde_json::to_string(&uc).expect_err("serialize invalid char bytes"); + bincode::serialize(&uc).expect_err("serialize invalid char bytes"); + } + + #[test] + fn test_serde_json() { + let c = '🙃'; + let uc = PotentialCodePoint::from_char(c); + let json_ser = serde_json::to_string(&uc).unwrap(); + + assert_eq!(json_ser, r#""🙃""#); + + let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap(); + + assert_eq!(uc, json_de); + } + + #[test] + fn test_serde_bincode() { + let c = '🙃'; + let uc = PotentialCodePoint::from_char(c); + let bytes_ser = bincode::serialize(&uc).unwrap(); + + assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]); + + let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap(); + + assert_eq!(uc, bytes_de); + } + + #[test] + fn test_representation() { + let chars = ['w', 'ω', '文', '𑄃', '🙃']; + + // backed by [PotentialCodePoint] + let uvchars: Vec<_> = chars + .iter() + .copied() + .map(PotentialCodePoint::from_char) + .collect(); + // backed by [RawBytesULE<3>] + let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect(); + + let ule_bytes = zvec.as_bytes(); + let uvbytes; + unsafe { + let ptr = &uvchars[..] as *const _ as *const u8; + uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len()); + } + + // PotentialCodePoint is defined as little-endian, so this must be true on all platforms + // also asserts that to_unaligned/from_unaligned are no-ops + assert_eq!(uvbytes, ule_bytes); + + assert_eq!( + &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1], + ule_bytes + ); + } + + #[test] + fn test_char_bake() { + databake::test_bake!( + PotentialCodePoint, + const, + crate::PotentialCodePoint::from_char('b'), + potential_utf + ); + // surrogate code point + databake::test_bake!( + PotentialCodePoint, + const, + crate::PotentialCodePoint::from_u24(55296u32), + potential_utf + ); + } +} diff --git a/vendor/potential_utf/src/ustr.rs b/vendor/potential_utf/src/ustr.rs new file mode 100644 index 00000000..216f629b --- /dev/null +++ b/vendor/potential_utf/src/ustr.rs @@ -0,0 +1,270 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +#[cfg(feature = "alloc")] +use alloc::boxed::Box; +use core::cmp::Ordering; +use core::fmt; +use core::ops::Deref; + +/// A byte slice that is expected to be a UTF-8 string but does not enforce that invariant. +/// +/// Use this type instead of `str` if you don't need to enforce UTF-8 during deserialization. For +/// example, strings that are keys of a map don't need to ever be reified as `str`s. +/// +/// [`PotentialUtf8`] derefs to `[u8]`. To obtain a `str`, use [`Self::try_as_str()`]. +/// +/// The main advantage of this type over `[u8]` is that it serializes as a string in +/// human-readable formats like JSON. +/// +/// # Examples +/// +/// Using an [`PotentialUtf8`] as the key of a [`ZeroMap`]: +/// +/// ``` +/// use potential_utf::PotentialUtf8; +/// use zerovec::ZeroMap; +/// +/// // This map is cheap to deserialize, as we don't need to perform UTF-8 validation. +/// let map: ZeroMap<PotentialUtf8, u8> = [ +/// (PotentialUtf8::from_bytes(b"abc"), 11), +/// (PotentialUtf8::from_bytes(b"def"), 22), +/// (PotentialUtf8::from_bytes(b"ghi"), 33), +/// ] +/// .into_iter() +/// .collect(); +/// +/// let key = "abc"; +/// let value = map.get_copied(PotentialUtf8::from_str(key)); +/// assert_eq!(Some(11), value); +/// ``` +/// +/// [`ZeroMap`]: zerovec::ZeroMap +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +#[allow(clippy::exhaustive_structs)] // transparent newtype +pub struct PotentialUtf8(pub [u8]); + +impl fmt::Debug for PotentialUtf8 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Debug as a string if possible + match self.try_as_str() { + Ok(s) => fmt::Debug::fmt(s, f), + Err(_) => fmt::Debug::fmt(&self.0, f), + } + } +} + +impl PotentialUtf8 { + /// Create a [`PotentialUtf8`] from a byte slice. + #[inline] + pub const fn from_bytes(other: &[u8]) -> &Self { + // Safety: PotentialUtf8 is transparent over [u8] + unsafe { core::mem::transmute(other) } + } + + /// Create a [`PotentialUtf8`] from a string slice. + #[inline] + pub const fn from_str(s: &str) -> &Self { + Self::from_bytes(s.as_bytes()) + } + + /// Create a [`PotentialUtf8`] from boxed bytes. + #[inline] + #[cfg(feature = "alloc")] + pub fn from_boxed_bytes(other: Box<[u8]>) -> Box<Self> { + // Safety: PotentialUtf8 is transparent over [u8] + unsafe { core::mem::transmute(other) } + } + + /// Create a [`PotentialUtf8`] from a boxed `str`. + #[inline] + #[cfg(feature = "alloc")] + pub fn from_boxed_str(other: Box<str>) -> Box<Self> { + Self::from_boxed_bytes(other.into_boxed_bytes()) + } + + /// Get the bytes from a [`PotentialUtf8]. + #[inline] + pub const fn as_bytes(&self) -> &[u8] { + &self.0 + } + + /// Attempt to convert a [`PotentialUtf8`] to a `str`. + /// + /// # Examples + /// + /// ``` + /// use potential_utf::PotentialUtf8; + /// + /// static A: &PotentialUtf8 = PotentialUtf8::from_bytes(b"abc"); + /// + /// let b = A.try_as_str().unwrap(); + /// assert_eq!(b, "abc"); + /// ``` + // Note: this is const starting in 1.63 + #[inline] + pub fn try_as_str(&self) -> Result<&str, core::str::Utf8Error> { + core::str::from_utf8(&self.0) + } +} + +impl<'a> From<&'a str> for &'a PotentialUtf8 { + #[inline] + fn from(other: &'a str) -> Self { + PotentialUtf8::from_str(other) + } +} + +impl PartialEq<str> for PotentialUtf8 { + fn eq(&self, other: &str) -> bool { + self.eq(Self::from_str(other)) + } +} + +impl PartialOrd<str> for PotentialUtf8 { + fn partial_cmp(&self, other: &str) -> Option<Ordering> { + self.partial_cmp(Self::from_str(other)) + } +} + +impl PartialEq<PotentialUtf8> for str { + fn eq(&self, other: &PotentialUtf8) -> bool { + PotentialUtf8::from_str(self).eq(other) + } +} + +impl PartialOrd<PotentialUtf8> for str { + fn partial_cmp(&self, other: &PotentialUtf8) -> Option<Ordering> { + PotentialUtf8::from_str(self).partial_cmp(other) + } +} + +#[cfg(feature = "alloc")] +impl From<Box<str>> for Box<PotentialUtf8> { + #[inline] + fn from(other: Box<str>) -> Self { + PotentialUtf8::from_boxed_str(other) + } +} + +impl Deref for PotentialUtf8 { + type Target = [u8]; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +/// This impl requires enabling the optional `zerovec` Cargo feature +#[cfg(all(feature = "zerovec", feature = "alloc"))] +impl<'a> zerovec::maps::ZeroMapKV<'a> for PotentialUtf8 { + type Container = zerovec::VarZeroVec<'a, PotentialUtf8>; + type Slice = zerovec::VarZeroSlice<PotentialUtf8>; + type GetType = PotentialUtf8; + type OwnedType = Box<PotentialUtf8>; +} + +// Safety (based on the safety checklist on the VarULE trait): +// 1. PotentialUtf8 does not include any uninitialized or padding bytes (transparent over a ULE) +// 2. PotentialUtf8 is aligned to 1 byte (transparent over a ULE) +// 3. The impl of `validate_bytes()` returns an error if any byte is not valid (impossible) +// 4. The impl of `validate_bytes()` returns an error if the slice cannot be used in its entirety (impossible) +// 5. The impl of `from_bytes_unchecked()` returns a reference to the same data (returns the argument directly) +// 6. All other methods are defaulted +// 7. `[T]` byte equality is semantic equality (transparent over a ULE) +/// This impl requires enabling the optional `zerovec` Cargo feature +#[cfg(feature = "zerovec")] +unsafe impl zerovec::ule::VarULE for PotentialUtf8 { + #[inline] + fn validate_bytes(_: &[u8]) -> Result<(), zerovec::ule::UleError> { + Ok(()) + } + #[inline] + unsafe fn from_bytes_unchecked(bytes: &[u8]) -> &Self { + PotentialUtf8::from_bytes(bytes) + } +} + +/// This impl requires enabling the optional `serde` Cargo feature +#[cfg(feature = "serde")] +impl serde::Serialize for PotentialUtf8 { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::Error; + let s = self + .try_as_str() + .map_err(|_| S::Error::custom("invalid UTF-8 in PotentialUtf8"))?; + if serializer.is_human_readable() { + serializer.serialize_str(s) + } else { + serializer.serialize_bytes(s.as_bytes()) + } + } +} + +/// This impl requires enabling the optional `serde` Cargo feature +#[cfg(all(feature = "serde", feature = "alloc"))] +impl<'de> serde::Deserialize<'de> for Box<PotentialUtf8> { + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + if deserializer.is_human_readable() { + let boxed_str = Box::<str>::deserialize(deserializer)?; + Ok(PotentialUtf8::from_boxed_str(boxed_str)) + } else { + let boxed_bytes = Box::<[u8]>::deserialize(deserializer)?; + Ok(PotentialUtf8::from_boxed_bytes(boxed_bytes)) + } + } +} + +/// This impl requires enabling the optional `serde` Cargo feature +#[cfg(feature = "serde")] +impl<'de, 'a> serde::Deserialize<'de> for &'a PotentialUtf8 +where + 'de: 'a, +{ + fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> + where + D: serde::Deserializer<'de>, + { + if deserializer.is_human_readable() { + let s = <&str>::deserialize(deserializer)?; + Ok(PotentialUtf8::from_str(s)) + } else { + let bytes = <&[u8]>::deserialize(deserializer)?; + Ok(PotentialUtf8::from_bytes(bytes)) + } + } +} + +#[repr(transparent)] +#[derive(PartialEq, Eq, PartialOrd, Ord)] +#[allow(clippy::exhaustive_structs)] // transparent newtype +pub struct PotentialUtf16(pub [u16]); + +impl fmt::Debug for PotentialUtf16 { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // Debug as a string if possible + for c in char::decode_utf16(self.0.iter().copied()) { + match c { + Ok(c) => write!(f, "{c}")?, + Err(e) => write!(f, "\\0x{:x}", e.unpaired_surrogate())?, + } + } + Ok(()) + } +} + +impl PotentialUtf16 { + /// Create a [`PotentialUtf16`] from a u16 slice. + #[inline] + pub const fn from_slice(other: &[u16]) -> &Self { + // Safety: PotentialUtf16 is transparent over [u16] + unsafe { core::mem::transmute(other) } + } +} diff --git a/vendor/potential_utf/src/writeable.rs b/vendor/potential_utf/src/writeable.rs new file mode 100644 index 00000000..cd489914 --- /dev/null +++ b/vendor/potential_utf/src/writeable.rs @@ -0,0 +1,159 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::{PotentialUtf16, PotentialUtf8}; +use alloc::borrow::Cow; +use core::fmt::Write; +use writeable::{LengthHint, Part, PartsWrite, TryWriteable}; + +use core::{char::DecodeUtf16Error, fmt, str::Utf8Error}; + +/// This impl requires enabling the optional `writeable` Cargo feature +impl TryWriteable for &'_ PotentialUtf8 { + type Error = Utf8Error; + + fn try_write_to_parts<S: PartsWrite + ?Sized>( + &self, + sink: &mut S, + ) -> Result<Result<(), Self::Error>, fmt::Error> { + let mut remaining = &self.0; + let mut r = Ok(()); + loop { + match core::str::from_utf8(remaining) { + Ok(valid) => { + sink.write_str(valid)?; + return Ok(r); + } + Err(e) => { + // SAFETY: By Utf8Error invariants + let valid = unsafe { + core::str::from_utf8_unchecked(remaining.get_unchecked(..e.valid_up_to())) + }; + sink.write_str(valid)?; + sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; + if r.is_ok() { + r = Err(e); + } + let Some(error_len) = e.error_len() else { + return Ok(r); // end of string + }; + // SAFETY: By Utf8Error invariants + remaining = unsafe { remaining.get_unchecked(e.valid_up_to() + error_len..) } + } + } + } + } + + fn writeable_length_hint(&self) -> LengthHint { + // Lower bound is all valid UTF-8, upper bound is all bytes with the high bit, which become replacement characters. + LengthHint::between(self.0.len(), self.0.len() * 3) + } + + fn try_write_to_string(&self) -> Result<Cow<str>, (Self::Error, Cow<str>)> { + match core::str::from_utf8(&self.0) { + Ok(valid) => Ok(Cow::Borrowed(valid)), + Err(e) => { + // SAFETY: By Utf8Error invariants + let valid = unsafe { + core::str::from_utf8_unchecked(self.0.get_unchecked(..e.valid_up_to())) + }; + + // Let's assume this is the only error + let mut out = alloc::string::String::with_capacity( + self.0.len() + char::REPLACEMENT_CHARACTER.len_utf8() + - e.error_len().unwrap_or(0), + ); + + out.push_str(valid); + out.push(char::REPLACEMENT_CHARACTER); + + // If there's more, we can use `try_write_to` + if let Some(error_len) = e.error_len() { + // SAFETY: By Utf8Error invariants + let remaining = unsafe { self.0.get_unchecked(e.valid_up_to() + error_len..) }; + let _discard = PotentialUtf8::from_bytes(remaining).try_write_to(&mut out); + } + + Err((e, Cow::Owned(out))) + } + } + } +} + +/// This impl requires enabling the optional `writeable` Cargo feature +impl TryWriteable for &'_ PotentialUtf16 { + type Error = DecodeUtf16Error; + + fn try_write_to_parts<S: PartsWrite + ?Sized>( + &self, + sink: &mut S, + ) -> Result<Result<(), Self::Error>, fmt::Error> { + let mut r = Ok(()); + for c in core::char::decode_utf16(self.0.iter().copied()) { + match c { + Ok(c) => sink.write_char(c)?, + Err(e) => { + if r.is_ok() { + r = Err(e); + } + sink.with_part(Part::ERROR, |s| s.write_char(char::REPLACEMENT_CHARACTER))?; + } + } + } + Ok(r) + } + + fn writeable_length_hint(&self) -> LengthHint { + // Lower bound is all ASCII, upper bound is all 3-byte code points (including replacement character) + LengthHint::between(self.0.len(), self.0.len() * 3) + } +} + +#[cfg(test)] +mod test { + #![allow(invalid_from_utf8)] // only way to construct the error + use super::*; + use writeable::assert_try_writeable_parts_eq; + + #[test] + fn test_utf8() { + assert_try_writeable_parts_eq!( + PotentialUtf8::from_bytes(b"Foo Bar"), + "Foo Bar", + Ok(()), + [] + ); + assert_try_writeable_parts_eq!( + PotentialUtf8::from_bytes(b"Foo\xFDBar"), + "Foo�Bar", + Err(core::str::from_utf8(b"Foo\xFDBar").unwrap_err()), + [(3, 6, Part::ERROR)] + ); + assert_try_writeable_parts_eq!( + PotentialUtf8::from_bytes(b"Foo\xFDBar\xff"), + "Foo�Bar�", + Err(core::str::from_utf8(b"Foo\xFDBar\xff").unwrap_err()), + [(3, 6, Part::ERROR), (9, 12, Part::ERROR)], + ); + } + + #[test] + fn test_utf16() { + assert_try_writeable_parts_eq!( + PotentialUtf16::from_slice(&[0xD83E, 0xDD73]), + "🥳", + Ok(()), + [] + ); + assert_try_writeable_parts_eq!( + PotentialUtf16::from_slice(&[0xD83E, 0x20, 0xDD73]), + "� �", + Err(core::char::decode_utf16([0xD83E].into_iter()) + .next() + .unwrap() + .unwrap_err()), + [(0, 3, Part::ERROR), (4, 7, Part::ERROR)] + ); + } +} |
