diff options
| author | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
| commit | 8cdfa445d6629ffef4cb84967ff7017654045bc2 (patch) | |
| tree | 22f0b0907c024c78d26a731e2e1f5219407d8102 /vendor/unicode-script/src/lib.rs | |
| parent | 4351c74c7c5f97156bc94d3a8549b9940ac80e3f (diff) | |
chore: add vendor directory
Diffstat (limited to 'vendor/unicode-script/src/lib.rs')
| -rw-r--r-- | vendor/unicode-script/src/lib.rs | 560 |
1 files changed, 560 insertions, 0 deletions
diff --git a/vendor/unicode-script/src/lib.rs b/vendor/unicode-script/src/lib.rs new file mode 100644 index 00000000..a8e3026b --- /dev/null +++ b/vendor/unicode-script/src/lib.rs @@ -0,0 +1,560 @@ +//! This crate exposes the Unicode `Script` and `Script_Extension` +//! properties from [UAX #24](http://www.unicode.org/reports/tr24/) + +#![cfg_attr(not(test), no_std)] +#![cfg_attr(feature = "bench", feature(test))] + +mod tables; + +use core::convert::TryFrom; +use core::fmt; +use core::u64; +pub use tables::script_extensions; +use tables::{get_script, get_script_extension, NEXT_SCRIPT}; +pub use tables::{Script, UNICODE_VERSION}; + +impl Script { + /// Get the full name of a script. + pub fn full_name(self) -> &'static str { + self.inner_full_name() + } + + /// Attempts to parse script name from the provided string. + /// Returns `None` if the provided string does not represent a valid + /// script full name. + pub fn from_full_name(input: &str) -> Option<Self> { + Self::inner_from_full_name(input) + } + + /// Get the four-character short name of a script. + pub fn short_name(self) -> &'static str { + self.inner_short_name() + } + + /// Attempts to parse script name from the provided string. + /// Returns `None` if the provided string does not represent a valid + /// script four-character short name. + pub fn from_short_name(input: &str) -> Option<Self> { + Self::inner_from_short_name(input) + } + + /// Is this script "Recommended" according to + /// [UAX #31](www.unicode.org/reports/tr31/#Table_Recommended_Scripts)? + pub fn is_recommended(self) -> bool { + use Script::*; + match self { + Common | Inherited | Arabic | Armenian | Bengali | Bopomofo | Cyrillic | Devanagari + | Ethiopic | Georgian | Greek | Gujarati | Gurmukhi | Han | Hangul | Hebrew + | Hiragana | Kannada | Katakana | Khmer | Lao | Latin | Malayalam | Myanmar | Oriya + | Sinhala | Tamil | Telugu | Thaana | Thai | Tibetan => true, + _ => false, + } + } +} + +impl From<Script> for ScriptExtension { + fn from(script: Script) -> Self { + if script == Script::Common { + ScriptExtension::new_common() + } else if script == Script::Inherited { + ScriptExtension::new_inherited() + } else if script == Script::Unknown { + ScriptExtension::new_unknown() + } else { + let mut first = 0; + let mut second = 0; + let mut third = 0; + let bit = script as u8; + // Find out which field it's in, and set the appropriate bit there + if bit < 64 { + first = 1 << bit as u64; + } else if bit < 128 { + // offset by 64 since `bit` is an absolute number, + // not relative to the chunk + second = 1 << (bit - 64) as u64; + } else { + third = 1 << (bit - 128) as u32; + } + ScriptExtension::new(first, second, third) + } + } +} + +impl TryFrom<ScriptExtension> for Script { + type Error = (); + fn try_from(ext: ScriptExtension) -> Result<Self, ()> { + if ext.is_common_or_inherited() { + if ext.common { + Ok(Script::Common) + } else { + Ok(Script::Inherited) + } + } else if ext.is_empty() { + Ok(Script::Unknown) + } else { + // filled elements will have set ones + let fo = ext.first.count_ones(); + let so = ext.second.count_ones(); + let to = ext.third.count_ones(); + // only one bit set, in the first chunk + if fo == 1 && so == 0 && to == 0 { + // use trailing_zeroes() to figure out which bit it is + Ok(Script::for_integer(ext.first.trailing_zeros() as u8)) + // only one bit set, in the second chunk + } else if fo == 0 && so == 1 && to == 0 { + Ok(Script::for_integer(64 + ext.second.trailing_zeros() as u8)) + // only one bit set, in the third chunk + } else if fo == 0 && so == 0 && to == 1 { + Ok(Script::for_integer(128 + ext.third.trailing_zeros() as u8)) + } else { + Err(()) + } + } + } +} + +impl Default for Script { + fn default() -> Self { + Script::Common + } +} + +impl From<char> for Script { + fn from(o: char) -> Self { + o.script() + } +} + +impl fmt::Display for Script { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.full_name()) + } +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +#[non_exhaustive] +/// A value for the `Script_Extension` property +/// +/// [`ScriptExtension`] is one or more [`Script`] +/// +/// This is essentially an optimized version of `Vec<Script>` that uses bitfields +pub struct ScriptExtension { + // A bitset for the first 64 scripts + first: u64, + // A bitset for the scripts 65-128 + second: u64, + // A bitset for scripts after 128 + third: u64, + // Both Common and Inherited are represented by all used bits being set, + // this flag lets us distinguish the two. + common: bool, +} + +impl ScriptExtension { + // We don't use the complete u64 of `third`, so the "all" value is not just u32::MAX + // Instead, we take the number of the next (unused) script bit, subtract 128 to bring + // it in the range of `third`, create a u64 with just that bit set, and subtract 1 + // to create one with all the lower bits set. + const THIRD_MAX: u64 = ((1 << (NEXT_SCRIPT - 128)) - 1); + + pub(crate) const fn new(first: u64, second: u64, third: u64) -> Self { + ScriptExtension { + first, + second, + third, + common: false, + } + } + + pub(crate) const fn new_common() -> Self { + ScriptExtension { + first: u64::MAX, + second: u64::MAX, + third: Self::THIRD_MAX, + common: true, + } + } + + pub(crate) const fn new_inherited() -> Self { + ScriptExtension { + first: u64::MAX, + second: u64::MAX, + third: Self::THIRD_MAX, + common: false, + } + } + + pub(crate) const fn new_unknown() -> Self { + ScriptExtension { + first: 0, + second: 0, + third: 0, + common: false, + } + } + + const fn is_common_or_inherited(self) -> bool { + (self.first == u64::MAX) & (self.second == u64::MAX) & (self.third == Self::THIRD_MAX) + } + + /// Checks if the script extension is Common + pub const fn is_common(self) -> bool { + self.is_common_or_inherited() & self.common + } + + /// Checks if the script extension is Inherited + pub const fn is_inherited(self) -> bool { + self.is_common_or_inherited() & !self.common + } + + /// Checks if the script extension is empty (unknown) + pub const fn is_empty(self) -> bool { + (self.first == 0) & (self.second == 0) & (self.third == 0) + } + + /// Returns the number of scripts in the script extension + pub fn len(self) -> usize { + if self.is_common_or_inherited() { + 1 + } else { + (self.first.count_ones() + self.second.count_ones() + self.third.count_ones()) as usize + } + } + + /// Intersect this `ScriptExtension` with another `ScriptExtension`. Produces `Unknown` if things + /// do not intersect. This is equivalent to [`ScriptExtension::intersection`] but it stores the result + /// in `self` + /// + /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting + /// everything, the intersection of `Common` and `Inherited` is `Inherited` + pub fn intersect_with(&mut self, other: Self) { + *self = self.intersection(other) + } + + /// Find the intersection between two ScriptExtensions. Returns Unknown if things + /// do not intersect. + /// + /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting + /// everything, the intersection of `Common` and `Inherited` is `Inherited` + pub const fn intersection(self, other: Self) -> Self { + let first = self.first & other.first; + let second = self.second & other.second; + let third = self.third & other.third; + let common = self.common & other.common; + ScriptExtension { + first, + second, + third, + common, + } + } + + /// Find the union between two ScriptExtensions. + /// + /// "Common" (`Zyyy`) and "Inherited" (`Zinh`) are considered as intersecting + /// everything, the union of `Common` and `Inherited` is `Common` + pub const fn union(self, other: Self) -> Self { + let first = self.first | other.first; + let second = self.second | other.second; + let third = self.third | other.third; + let common = self.common | other.common; + ScriptExtension { + first, + second, + third, + common, + } + } + + /// Check if this ScriptExtension contains the given script + /// + /// Should be used with specific scripts only, this will + /// return `true` if `self` is not `Unknown` and `script` is + /// `Common` or `Inherited` + pub fn contains_script(self, script: Script) -> bool { + !self.intersection(script.into()).is_empty() + } + + /// Get the intersection of script extensions of all characters + /// in a string. + pub fn for_str(x: &str) -> Self { + let mut ext = ScriptExtension::default(); + for ch in x.chars() { + ext.intersect_with(ch.into()); + } + ext + } + + /// Iterate over the scripts in this script extension + /// + /// Will never yield Script::Unknown + pub fn iter(self) -> ScriptIterator { + ScriptIterator { ext: self } + } +} + +impl Default for ScriptExtension { + fn default() -> Self { + ScriptExtension::new_common() + } +} + +impl From<char> for ScriptExtension { + fn from(o: char) -> Self { + o.script_extension() + } +} + +impl From<&'_ str> for ScriptExtension { + fn from(o: &'_ str) -> Self { + Self::for_str(o) + } +} + +impl fmt::Debug for ScriptExtension { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "ScriptExtension(")?; + fmt::Display::fmt(self, f)?; + write!(f, ")") + } +} + +impl fmt::Display for ScriptExtension { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.is_common() { + write!(f, "Common")?; + } else if self.is_inherited() { + write!(f, "Inherited")?; + } else if self.is_empty() { + write!(f, "Unknown")?; + } else { + let mut first = true; + for script in self.iter() { + if !first { + write!(f, " + ")?; + first = false; + } + script.full_name().fmt(f)?; + } + } + Ok(()) + } +} + +/// Extension trait on `char` for calculating script properties +pub trait UnicodeScript { + /// Get the script for a given character + fn script(&self) -> Script; + /// Get the Script_Extension for a given character + fn script_extension(&self) -> ScriptExtension; +} + +impl UnicodeScript for char { + fn script(&self) -> Script { + get_script(*self).unwrap_or(Script::Unknown) + } + + fn script_extension(&self) -> ScriptExtension { + get_script_extension(*self).unwrap_or_else(|| self.script().into()) + } +} + +/// Iterator over scripts in a [ScriptExtension]. +/// +/// Can be obtained ia [ScriptExtension::iter()] +pub struct ScriptIterator { + ext: ScriptExtension, +} + +impl Iterator for ScriptIterator { + type Item = Script; + + fn next(&mut self) -> Option<Script> { + if self.ext.is_common_or_inherited() { + let common = self.ext.common; + self.ext = ScriptExtension::new_unknown(); + if common { + Some(Script::Common) + } else { + Some(Script::Inherited) + } + // Are there bits left in the first chunk? + } else if self.ext.first != 0 { + // Find the next bit + let bit = self.ext.first.trailing_zeros(); + // unset just that bit + self.ext.first &= !(1 << bit); + Some(Script::for_integer(bit as u8)) + // Are there bits left in the second chunk? + } else if self.ext.second != 0 { + let bit = self.ext.second.trailing_zeros(); + self.ext.second &= !(1 << bit); + Some(Script::for_integer(64 + bit as u8)) + // Are there bits left in the third chunk? + } else if self.ext.third != 0 { + let bit = self.ext.third.trailing_zeros(); + self.ext.third &= !(1 << bit); + Some(Script::for_integer(128 + bit as u8)) + } else { + // Script::Unknown + None + } + } +} + +#[cfg(test)] +mod tests { + use crate::*; + use std::collections::HashSet; + use std::convert::TryInto; + + #[cfg(feature = "bench")] + use test::bench::Bencher; + #[cfg(feature = "bench")] + extern crate test; + + #[test] + fn test_conversion() { + let mut seen_scripts = HashSet::new(); + let mut seen_exts = HashSet::new(); + for bit in 0..NEXT_SCRIPT { + let script = Script::for_integer(bit); + let ext = script.into(); + if seen_scripts.contains(&script) { + panic!("Found script {:?} twice!", script) + } + if seen_exts.contains(&ext) { + panic!("Found extension {:?} twice!", ext) + } + seen_scripts.insert(script); + seen_exts.insert(ext); + assert_eq!(script as u8, bit); + assert!(!ScriptExtension::new_common().intersection(ext).is_empty()); + assert!(!ScriptExtension::new_inherited() + .intersection(ext) + .is_empty()); + assert!(ScriptExtension::new_unknown().intersection(ext).is_empty()); + assert_eq!(ext.iter().collect::<Vec<_>>(), vec![script]); + assert_eq!(Ok(script), ext.try_into()); + } + } + + #[test] + fn test_specific() { + let s = "सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे."; + let ext = ScriptExtension::for_str(s); + assert_eq!(ext, script_extensions::DEVA); + println!( + "{:?}", + script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH + ); + println!( + "{:?}", + ext.intersection( + script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH + ) + ); + assert!(!ext + .intersection(script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH) + .is_empty()); + + let u = ext.union(Script::Dogra.into()); + assert_eq!( + u.intersection( + script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH + ), + u + ); + } + + #[test] + fn test_specific_ext() { + let ext = script_extensions::DEVA_DOGR_GUJR_GURU_KHOJ_KTHI_MAHJ_MODI_SIND_TAKR_TIRH; + + let all: HashSet<_> = ext.iter().collect(); + + for bit in 0..NEXT_SCRIPT { + let script = Script::for_integer(bit); + + if all.contains(&script) { + assert!(ext.contains_script(script)) + } else { + assert!(!ext.contains_script(script)) + } + } + + assert!(ext.contains_script(Script::Devanagari)); + assert!(ext.contains_script(Script::Dogra)); + assert!(ext.contains_script(Script::Gujarati)); + assert!(ext.contains_script(Script::Gurmukhi)); + assert!(ext.contains_script(Script::Khojki)); + assert!(ext.contains_script(Script::Kaithi)); + assert!(ext.contains_script(Script::Mahajani)); + assert!(ext.contains_script(Script::Modi)); + assert!(ext.contains_script(Script::Khudawadi)); + assert!(ext.contains_script(Script::Takri)); + assert!(ext.contains_script(Script::Tirhuta)); + + let scr: Result<Script, _> = ext.try_into(); + assert!(scr.is_err()); + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_script_intersection(b: &mut Bencher) { + b.iter(|| { + let script = test::black_box(Script::Devanagari); + let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); + test::black_box(ext.intersection(script.into())); + }) + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_ext_to_script(b: &mut Bencher) { + let ext: ScriptExtension = Script::Devanagari.into(); + b.iter(|| { + let ext = test::black_box(ext); + let script: Result<Script, _> = ext.try_into(); + let _ = test::black_box(script); + }) + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_script_to_ext(b: &mut Bencher) { + b.iter(|| { + let script = test::black_box(Script::Devanagari); + let ext: ScriptExtension = script.into(); + test::black_box(ext); + }) + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_ext_intersection(b: &mut Bencher) { + b.iter(|| { + let e1 = test::black_box(script_extensions::ARAB_GARA_NKOO_ROHG_SYRC_THAA_YEZI); + let e2 = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); + test::black_box(e2.intersection(e1)); + }) + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_to_vec(b: &mut Bencher) { + b.iter(|| { + let ext = test::black_box(script_extensions::BENG_DEVA_DOGR_GONG_GONM_GRAN_GUJR_GURU_KNDA_MAHJ_MLYM_NAND_ONAO_ORYA_SIND_SINH_SYLO_TAKR_TAML_TELU_TIRH); + test::black_box(ext.iter().collect::<Vec<_>>()); + }) + } + + #[cfg(feature = "bench")] + #[bench] + fn bench_string_ext(b: &mut Bencher) { + b.iter(|| { + let s = test::black_box("सवव मानवी व्यद्क् जन्मतःच स्वतींत्र आहेत व त्ाींना समान प्रवतष्ठा व समान अविकार आहेत. त्ाींना ववचारशद्क् व सवविे कबुद्द्धलाभलेली आहे. व त्ाींनी एकमेकाींशी बींिुत्वाचाभावनेने आचरण करावे."); + test::black_box(ScriptExtension::for_str(s)); + }) + } +} |
