// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::cmp::Ordering;
use core::fmt;

/// A 24-bit numeric data type that is expected to be a Unicode scalar value, but is not
/// validated as such.
///
/// Use this type instead of `char` when you want to deal with data that is expected to be valid
/// Unicode scalar values, but you want control over when or if you validate that assumption.
///
/// The value is stored as three bytes in little-endian order.
///
/// # Examples
///
/// ```
/// use potential_utf::PotentialCodePoint;
///
/// assert_eq!(PotentialCodePoint::from_u24(0x68).try_to_char(), Ok('h'));
/// assert_eq!(PotentialCodePoint::from_char('i').try_to_char(), Ok('i'));
/// assert_eq!(
///     PotentialCodePoint::from_u24(0x1F44B).try_to_char(),
///     Ok('👋')
/// );
///
/// assert!(PotentialCodePoint::from_u24(0xDE01).try_to_char().is_err());
/// assert_eq!(
///     PotentialCodePoint::from_u24(0xDE01).to_char_lossy(),
///     char::REPLACEMENT_CHARACTER
/// );
/// ```
#[repr(transparent)]
#[allow(clippy::exhaustive_structs)] // transparent newtype
#[derive(PartialEq, Eq, Clone, Copy, Hash)]
pub struct PotentialCodePoint([u8; 3]);

impl PotentialCodePoint {
    /// Create a [`PotentialCodePoint`] from a `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char().unwrap(), 'a');
    /// ```
    #[inline]
    pub const fn from_char(c: char) -> Self {
        // The fourth (most significant) byte is dropped; only three bytes are stored.
        let [u0, u1, u2, _u3] = (c as u32).to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Create [`PotentialCodePoint`] from a u32 value, ignoring the most significant 8 bits.
    #[inline]
    pub const fn from_u24(c: u32) -> Self {
        let [u0, u1, u2, _u3] = c.to_le_bytes();
        Self([u0, u1, u2])
    }

    /// Attempt to convert a [`PotentialCodePoint`] to a `char`.
    ///
    /// # Errors
    ///
    /// Returns the error produced by `char::try_from` when the stored 24-bit value
    /// is not a valid `char`.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(a.try_to_char(), Ok('a'));
    ///
    /// let b = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert!(matches!(b.try_to_char(), Err(_)));
    /// ```
    #[inline]
    pub fn try_to_char(self) -> Result<char, core::char::CharTryFromError> {
        char::try_from(u32::from(self))
    }

    /// Convert a [`PotentialCodePoint`] to a `char`, returning [`char::REPLACEMENT_CHARACTER`]
    /// if the `PotentialCodePoint` does not represent a valid Unicode scalar value.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    /// use zerovec::ule::AsULE;
    ///
    /// let a = PotentialCodePoint::from_unaligned([0xFF, 0xFF, 0xFF].into());
    /// assert_eq!(a.to_char_lossy(), char::REPLACEMENT_CHARACTER);
    /// ```
    #[inline]
    pub fn to_char_lossy(self) -> char {
        self.try_to_char().unwrap_or(char::REPLACEMENT_CHARACTER)
    }

    /// Convert a [`PotentialCodePoint`] to a `char` without checking that it is
    /// a valid Unicode scalar value.
    ///
    /// # Safety
    ///
    /// The `PotentialCodePoint` must be a valid Unicode scalar value in little-endian order.
    ///
    /// # Examples
    ///
    /// ```
    /// use potential_utf::PotentialCodePoint;
    ///
    /// let a = PotentialCodePoint::from_char('a');
    /// assert_eq!(unsafe { a.to_char_unchecked() }, 'a');
    /// ```
    #[inline]
    pub unsafe fn to_char_unchecked(self) -> char {
        char::from_u32_unchecked(u32::from(self))
    }

    /// For converting to the ULE type in a const context
    ///
    /// Can be removed once const traits are a thing
    #[inline]
    #[cfg(feature = "zerovec")]
    pub const fn to_unaligned(self) -> zerovec::ule::RawBytesULE<3> {
        zerovec::ule::RawBytesULE(self.0)
    }
}

/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
impl zerovec::ule::AsULE for PotentialCodePoint {
    type ULE = zerovec::ule::RawBytesULE<3>;

    #[inline]
    fn to_unaligned(self) -> Self::ULE {
        zerovec::ule::RawBytesULE(self.0)
    }

    #[inline]
    fn from_unaligned(unaligned: Self::ULE) -> Self {
        Self(unaligned.0)
    }
}

// Safety: PotentialCodePoint is a `repr(transparent)` wrapper around three little-endian
// bytes, and `to_unaligned`/`from_unaligned` move those bytes unchanged, so its
// representation corresponds to its AsULE::ULE type
/// This impl requires enabling the optional `zerovec` Cargo feature
#[cfg(feature = "zerovec")]
unsafe impl zerovec::ule::EqULE for PotentialCodePoint {}

/// Formats as the `char` when the value is a valid one, otherwise as the raw byte array.
impl fmt::Debug for PotentialCodePoint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Debug as a char if possible
        match self.try_to_char() {
            Ok(c) => fmt::Debug::fmt(&c, f),
            Err(_) => fmt::Debug::fmt(&self.0, f),
        }
    }
}

impl PartialOrd for PotentialCodePoint {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

/// Compares against a `char` by first converting the `char` with
/// [`PotentialCodePoint::from_char`].
impl PartialEq<char> for PotentialCodePoint {
    fn eq(&self, other: &char) -> bool {
        self.eq(&Self::from_char(*other))
    }
}

/// Orders against a `char` by first converting the `char` with
/// [`PotentialCodePoint::from_char`].
impl PartialOrd<char> for PotentialCodePoint {
    fn partial_cmp(&self, other: &char) -> Option<Ordering> {
        self.partial_cmp(&Self::from_char(*other))
    }
}

/// Mirror of `PartialEq<char> for PotentialCodePoint`, so `char == PotentialCodePoint` works.
impl PartialEq<PotentialCodePoint> for char {
    fn eq(&self, other: &PotentialCodePoint) -> bool {
        PotentialCodePoint::from_char(*self).eq(other)
    }
}

/// Mirror of `PartialOrd<char> for PotentialCodePoint`, so `char < PotentialCodePoint` works.
impl PartialOrd<PotentialCodePoint> for char {
    fn partial_cmp(&self, other: &PotentialCodePoint) -> Option<Ordering> {
        PotentialCodePoint::from_char(*self).partial_cmp(other)
    }
}

impl Ord for PotentialCodePoint {
    // custom implementation, as derived Ord would compare lexicographically
    // (i.e. byte-by-byte over the little-endian array, which is not numeric order)
    fn cmp(&self, other: &Self) -> Ordering {
        let a = u32::from(*self);
        let b = u32::from(*other);
        a.cmp(&b)
    }
}

/// Widens the stored three little-endian bytes to a `u32`; the top byte is always zero.
impl From<PotentialCodePoint> for u32 {
    fn from(x: PotentialCodePoint) -> Self {
        let [a0, a1, a2] = x.0;
        u32::from_le_bytes([a0, a1, a2, 0])
    }
}

/// Fallible narrowing from `u32`: fails with `()` when the most significant byte is non-zero,
/// i.e. when the value does not fit in 24 bits. No Unicode validity check is performed.
impl TryFrom<u32> for PotentialCodePoint {
    type Error = ();
    fn try_from(x: u32) -> Result<Self, ()> {
        let [u0, u1, u2, u3] = x.to_le_bytes();
        if u3 != 0 {
            return Err(());
        }
        Ok(Self([u0, u1, u2]))
    }
}

impl From<char> for PotentialCodePoint {
    #[inline]
    fn from(value: char) -> Self {
        Self::from_char(value)
    }
}

/// Equivalent to [`PotentialCodePoint::try_to_char`].
impl TryFrom<PotentialCodePoint> for char {
    type Error = core::char::CharTryFromError;

    #[inline]
    fn try_from(value: PotentialCodePoint) -> Result<char, Self::Error> {
        value.try_to_char()
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl serde::Serialize for PotentialCodePoint {
    /// Serializes as a `char` for human-readable formats and as the raw three bytes otherwise.
    ///
    /// In both cases serialization fails with a custom error if the value is not a valid `char`.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::Error;
        // Validation happens before the format check, so invalid values are rejected
        // for binary formats too.
        let c = self
            .try_to_char()
            .map_err(|_| S::Error::custom("invalid Unicode scalar value in PotentialCodePoint"))?;
        if serializer.is_human_readable() {
            serializer.serialize_char(c)
        } else {
            self.0.serialize(serializer)
        }
    }
}

/// This impl requires enabling the optional `serde` Cargo feature
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for PotentialCodePoint {
    /// Deserializes from a `char` for human-readable formats and from three raw bytes otherwise.
    ///
    /// The raw-bytes path stores the bytes as-is, without validating them.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        if deserializer.is_human_readable() {
            let c = <char>::deserialize(deserializer)?;
            Ok(PotentialCodePoint::from_char(c))
        } else {
            let bytes = <[u8; 3]>::deserialize(deserializer)?;
            Ok(PotentialCodePoint(bytes))
        }
    }
}

/// This impl requires enabling the optional `databake` Cargo feature
#[cfg(feature = "databake")]
impl databake::Bake for PotentialCodePoint {
    /// Bakes to a `from_char` call when the value is a valid `char`, and to a `from_u24`
    /// call otherwise.
    fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
        match self.try_to_char() {
            Ok(ch) => {
                env.insert("potential_utf");
                let ch = ch.bake(env);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_char(#ch)
                }
            }
            Err(_) => {
                env.insert("potential_utf");
                let u24 = u32::from_le_bytes([self.0[0], self.0[1], self.0[2], 0]);
                databake::quote! {
                    potential_utf::PotentialCodePoint::from_u24(#u24)
                }
            }
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    #[cfg(feature = "zerovec")]
    use zerovec::ZeroVec;

    // Each test below is gated on the Cargo feature that provides the impl it exercises,
    // so that a test build without that feature still compiles.

    #[cfg(feature = "serde")]
    #[test]
    fn test_serde_fail() {
        let uc = PotentialCodePoint([0xFF, 0xFF, 0xFF]);
        serde_json::to_string(&uc).expect_err("serialize invalid char bytes");
        bincode::serialize(&uc).expect_err("serialize invalid char bytes");
    }

    #[cfg(feature = "serde")]
    #[test]
    fn test_serde_json() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let json_ser = serde_json::to_string(&uc).unwrap();

        assert_eq!(json_ser, r#""🙃""#);

        let json_de: PotentialCodePoint = serde_json::from_str(&json_ser).unwrap();

        assert_eq!(uc, json_de);
    }

    #[cfg(feature = "serde")]
    #[test]
    fn test_serde_bincode() {
        let c = '🙃';
        let uc = PotentialCodePoint::from_char(c);
        let bytes_ser = bincode::serialize(&uc).unwrap();

        assert_eq!(bytes_ser, [0x43, 0xF6, 0x01]);

        let bytes_de: PotentialCodePoint = bincode::deserialize(&bytes_ser).unwrap();

        assert_eq!(uc, bytes_de);
    }

    #[cfg(feature = "zerovec")]
    #[test]
    fn test_representation() {
        let chars = ['w', 'ω', '文', '𑄃', '🙃'];

        // backed by [PotentialCodePoint]
        let uvchars: Vec<_> = chars
            .iter()
            .copied()
            .map(PotentialCodePoint::from_char)
            .collect();
        // backed by [RawBytesULE<3>]
        let zvec: ZeroVec<_> = uvchars.clone().into_iter().collect();

        let ule_bytes = zvec.as_bytes();
        let uvbytes;
        // SAFETY: `uvchars` is alive for the rest of this function and holds
        // `repr(transparent)` wrappers over `[u8; 3]`, so its buffer is `3 * uvchars.len()`
        // initialized bytes. This assumes `ule_bytes.len()` equals that byte count
        // (three bytes per element) — presumably guaranteed by `ZeroVec::as_bytes`.
        unsafe {
            let ptr = &uvchars[..] as *const _ as *const u8;
            uvbytes = core::slice::from_raw_parts(ptr, ule_bytes.len());
        }

        // PotentialCodePoint is defined as little-endian, so this must be true on all platforms
        // also asserts that to_unaligned/from_unaligned are no-ops
        assert_eq!(uvbytes, ule_bytes);

        assert_eq!(
            &[119, 0, 0, 201, 3, 0, 135, 101, 0, 3, 17, 1, 67, 246, 1],
            ule_bytes
        );
    }

    #[cfg(feature = "databake")]
    #[test]
    fn test_char_bake() {
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_char('b'),
            potential_utf
        );
        // surrogate code point
        databake::test_bake!(
            PotentialCodePoint,
            const,
            crate::PotentialCodePoint::from_u24(55296u32),
            potential_utf
        );
    }
}