// This file is part of ICU4X. For terms of use, please see the file // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). //! # Internal layout of ZeroTrie //! //! A ZeroTrie is composed of a series of nodes stored in sequence in a byte slice. //! //! There are 4 types of nodes: //! //! 1. ASCII (`0xxxxxxx`): matches a literal ASCII byte. //! 2. Span (`101xxxxx`): matches a span of non-ASCII bytes. //! 3. Value (`100xxxxx`): associates a value with a string //! 4. Branch (`11xxxxxx`): matches one of a set of bytes. //! //! Span, Value, and Branch nodes contain a varint, which has different semantics for each: //! //! - Span varint: length of the span //! - Value varint: value associated with the string //! - Branch varint: number of edges in the branch and width of the offset table //! //! If reading an ASCII, Span, or Branch node, one or more bytes are consumed from the input //! string. If the next byte(s) in the input string do not match the node, we return `None`. //! If reading a Value node, if the string is empty, return `Some(value)`; otherwise, we skip //! the Value node and continue on to the next node. //! //! When a node is consumed, a shorter, well-formed ZeroTrie remains. //! //! ### Basic Example //! //! Here is an example ZeroTrie without branch nodes: //! //! ``` //! use zerotrie::ZeroTriePerfectHash; //! //! let bytes = [ //! b'a', // ASCII literal //! 0b10001010, // value 10 //! b'b', // ASCII literal //! 0b10100011, // span of 3 //! 0x81, // first byte in span //! 0x91, // second byte in span //! 0xA1, // third and final byte in span //! 0b10000100, // value 4 //! ]; //! //! let trie = ZeroTriePerfectHash::from_bytes(&bytes); //! //! // First value: "a" → 10 //! assert_eq!(trie.get(b"a"), Some(10)); //! //! // Second value: "ab\x81\x91\xA1" → 4 //! assert_eq!(trie.get(b"ab\x81\x91\xA1"), Some(4)); //! //! // A few examples of strings that do NOT have values in the trie: //! assert_eq!(trie.get(b"ab"), None); //! assert_eq!(trie.get(b"b"), None); //! assert_eq!(trie.get(b"b\x81\x91\xA1"), None); //! ``` //! //! ## Branch Nodes //! //! There are two types of branch nodes: binary search and perfect hash. `ZeroTrieSimpleAscii` //! contains only binary search nodes, whereas `ZeroTriePerfectHash` can contain either. //! //! The head node of the branch has a varint that encodes two things: //! //! - Bottom 8 bits: number of edges in the branch (`N`); if N = 0, set N to 256 //! - Bits 9 and 10: width of the offset table (`W`) //! //! Note that N is always in the range [1, 256]. There can't be more than 256 edges because //! there are only 256 unique u8 values. //! //! A few examples of the head node of the branch: //! //! - `0b11000000`: varint bits `0`: N = 0 which means N = 256; W = 0 //! - `0b11000110`: varint bits `110`: N = 6; W = 0 //! - `0b11100000 0b00000101`: varint bits `1000101`: N = 69; W = 0 //! - `0b11100010 0b00000000`: varint bits `101000000`: N = 64; W = 1 //! //! In `ZeroTriePerfectHash`, if N <= 15, the branch is assumed to be a binary search, and if //! N > 15, the branch is assumed to be a perfect hash. //! //! ### Binary Search Branch Nodes //! //! A binary search branch node is used when: //! //! 1. The trie is a `ZeroTrieSimpleAscii`, OR //! 2. There are 15 or fewer items in the branch. //! //! The head branch node is followed by N sorted bytes. When evaluating a branch node, one byte //! is consumed from the input. If it is one of the N sorted bytes (scanned using binary search), //! the index `i` of the byte within the list is used to index into the offset table (described //! below). If the byte is not in the list, the string is not in the trie, so return `None`. //! //! ### Perfect Hash Branch Nodes //! //! A perfect hash branch node is used when: //! //! 1. The trie is NOT a `ZeroTrieSimpleAscii`, AND //! 2. There are 16 or more items in the branch. //! //! The head branch node is followed by 1 byte containing parameter `p`, N bytes containing //! parameters `q`, and N bytes containing the bytes to match. From these parameters, either an //! index within the hash table `i` is resolved and used as input to index into the offset //! table (described below), or the value is determined to not be present and `None` is //! returned. For more detail on resolving the perfect hash function, see [`crate::byte_phf`]. //! //! ### Offset Tables //! //! The _offset table_ encodes the range of the remaining buffer containing the trie reachable //! from the byte matched in the branch node. Both types of branch nodes include an offset //! table followig the key lookup. Given the index `i` from the first step, the range //! `[s_i, s_(i+1))` brackets the next step in the trie. //! //! Offset tables utilize the `W` parameter stored in the branch head node. The special case //! when `W == 0`, with `N - 1` bytes, is easiest to understand: //! //! **Offset table, W = 0:** `[s_1, s_2, ..., s_(N-1)]` //! //! Note that `s_0` is always 0 and `s_N` is always the length of the remaining slice, so those //! values are not explicitly included in the offset table. //! //! When W > 0, the high and low bits of the offsets are in separate bytes, arranged as follows: //! //! **Generalized offset table:** `[a_1, a_2, ..., a_(N-1), b_1, b_2, ..., b_(N-1), c_1, ...]` //! //! where `s_i = (a_i << 8 + b_i) << 8 + c_i ...` (high bits first, low bits last) //! //! ### Advanced Example //! //! The following trie encodes the following map. It has multiple varints and branch nodes, which //! are all binary search with W = 0. Note that there is a value for the empty string. //! //! - "" → 0 //! - "axb" → 100 //! - "ayc" → 2 //! - "azd" → 3 //! - "bxe" → 4 //! - "bxefg" → 500 //! - "bxefh" → 6 //! - "bxei" → 7 //! - "bxeikl" → 8 //! //! ``` //! use zerotrie::ZeroTrieSimpleAscii; //! //! let bytes = [ //! 0b10000000, // value 0 //! 0b11000010, // branch of 2 //! b'a', // //! b'b', // //! 13, // //! 0b11000011, // start of 'a' subtree: branch of 3 //! b'x', // //! b'y', // //! b'z', // //! 3, // //! 5, // //! b'b', // //! 0b10010000, // value 100 (lead) //! 0x54, // value 100 (trail) //! b'c', // //! 0b10000010, // value 2 //! b'd', // //! 0b10000011, // value 3 //! b'x', // start of 'b' subtree //! b'e', // //! 0b10000100, // value 4 //! 0b11000010, // branch of 2 //! b'f', // //! b'i', // //! 7, // //! 0b11000010, // branch of 2 //! b'g', // //! b'h', // //! 2, // //! 0b10010011, // value 500 (lead) //! 0x64, // value 500 (trail) //! 0b10000110, // value 6 //! 0b10000111, // value 7 //! b'k', // //! b'l', // //! 0b10001000, // value 8 //! ]; //! //! let trie = ZeroTrieSimpleAscii::from_bytes(&bytes); //! //! // Assert that the specified items are in the map //! assert_eq!(trie.get(b""), Some(0)); //! assert_eq!(trie.get(b"axb"), Some(100)); //! assert_eq!(trie.get(b"ayc"), Some(2)); //! assert_eq!(trie.get(b"azd"), Some(3)); //! assert_eq!(trie.get(b"bxe"), Some(4)); //! assert_eq!(trie.get(b"bxefg"), Some(500)); //! assert_eq!(trie.get(b"bxefh"), Some(6)); //! assert_eq!(trie.get(b"bxei"), Some(7)); //! assert_eq!(trie.get(b"bxeikl"), Some(8)); //! //! // Assert that some other items are not in the map //! assert_eq!(trie.get(b"a"), None); //! assert_eq!(trie.get(b"bx"), None); //! assert_eq!(trie.get(b"xba"), None); //! ``` use crate::byte_phf::PerfectByteHashMap; use crate::cursor::AsciiProbeResult; use crate::helpers::*; use crate::options::*; use crate::varint::read_varint_meta2; use crate::varint::read_varint_meta3; #[cfg(feature = "alloc")] use alloc::string::String; /// Given a slice starting with an offset table, returns the trie for the given index. /// /// Arguments: /// - `trie` = a trie pointing at an offset table (after the branch node and search table) /// - `i` = the desired index within the offset table /// - `n` = the number of items in the offset table /// - `w` = the width of the offset table items minus one #[inline] fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> &[u8] { let mut p = 0usize; let mut q = 0usize; loop { let indices; (indices, trie) = trie.debug_split_at(n - 1); p = (p << 8) + if i == 0 { 0 } else { *indices.get(i - 1).debug_unwrap_or(&0) as usize }; q = match indices.get(i) { Some(x) => (q << 8) + *x as usize, None => trie.len(), }; if w == 0 { break; } w -= 1; } trie.get(p..q).debug_unwrap_or(&[]) } /// Version of [`get_branch()`] specialized for the case `w == 0` for performance #[inline] fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> &[u8] { let indices; (indices, trie) = trie.debug_split_at(n - 1); let p = if i == 0 { 0 } else { *indices.get(i - 1).debug_unwrap_or(&0) as usize }; let q = match indices.get(i) { Some(x) => *x as usize, None => trie.len(), }; trie.get(p..q).debug_unwrap_or(&[]) } /// The node type. See the module-level docs for more explanation of the four node types. enum NodeType { /// An ASCII node. Contains a single literal ASCII byte and no varint. Ascii, /// A span node. Contains a varint indicating how big the span is. Span, /// A value node. Contains a varint representing the value. Value, /// A branch node. Contains a varint of the number of output nodes, plus W in the high bits. Branch, } impl core::fmt::Debug for NodeType { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use NodeType::*; f.write_str(match *self { Ascii => "a", Span => "s", Value => "v", Branch => "m", }) } } #[inline] fn byte_type(b: u8) -> NodeType { match b & 0b11100000 { 0b10000000 => NodeType::Value, 0b10100000 => NodeType::Span, 0b11000000 => NodeType::Branch, 0b11100000 => NodeType::Branch, _ => NodeType::Ascii, } } #[inline] pub(crate) fn get_parameterized( mut trie: &[u8], mut ascii: &[u8], ) -> Option { loop { let (b, x, i, search); (b, trie) = trie.split_first()?; let byte_type = byte_type(*b); (x, trie) = match byte_type { NodeType::Ascii => (0, trie), NodeType::Span => { if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) { read_varint_meta3(*b, trie) } else { debug_assert!(false, "Span node found in ASCII trie!"); return None; } } NodeType::Value => read_varint_meta3(*b, trie), NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { b.eq_ignore_ascii_case(c) } else { b == c }; if is_match { // Matched a byte ascii = temp; continue; } else { // Byte that doesn't match return None; } } if matches!(byte_type, NodeType::Value) { // Value node, but not at end of string continue; } if matches!(T::OPTIONS.ascii_mode, AsciiMode::BinarySpans) && matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); (trie_span, trie) = trie.debug_split_at(x); (ascii_span, ascii) = ascii.split_at_checked(x)?; if trie_span == ascii_span { // Matched a byte span continue; } else { // Byte span that doesn't match return None; } } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; let w = if matches!(T::OPTIONS.capacity_mode, CapacityMode::Extended) { w } else { // See the table below regarding this assertion debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); w & 0x3 }; let x = if x == 0 { 256 } else { x }; if matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly) || x < 16 { // binary search (search, trie) = trie.debug_split_at(x); let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { search.binary_search_by_key(&c.to_ascii_lowercase(), |x| { x.to_ascii_lowercase() }) } else { search.binary_search(c) }; i = bsearch_result.ok()?; } else { // phf (search, trie) = trie.debug_split_at(x * 2 + 1); i = PerfectByteHashMap::from_store(search).get(*c)?; } trie = if w == 0 { get_branch_w0(trie, i, x) } else { get_branch(trie, i, x, w) }; ascii = temp; continue; } else { if matches!(byte_type, NodeType::Value) { // Value node at end of string return Some(x); } return None; } } } // DISCUSS: This function is 7% faster *on aarch64* if we assert a max on w. // // | Bench | No Assert, x86_64 | No Assert, aarch64 | Assertion, x86_64 | Assertion, aarch64 | // |---------------|-------------------|--------------------|-------------------|--------------------| // | basic | ~187.51 ns | ~97.586 ns | ~199.11 ns | ~99.236 ns | // | subtags_10pct | ~9.5557 µs | ~4.8696 µs | ~9.5779 µs | ~4.5649 µs | // | subtags_full | ~137.75 µs | ~76.016 µs | ~142.02 µs | ~70.254 µs | /// Steps one node into the trie assuming all branch nodes are binary search and that /// there are no span nodes. /// /// The input-output argument `trie` starts at the original trie and ends pointing to /// the sub-trie reachable by `c`. #[inline] pub(crate) fn step_parameterized( trie: &mut &[u8], c: u8, ) -> Option { // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`. // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie. // If a span node is encountered, `None` is returned later in this function. debug_assert!( matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly), "Spans not yet implemented in step function" ); // PHF can be easily implemented but the code is not yet reachable debug_assert!( matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly), "PHF not yet implemented in step function" ); // Extended Capacity can be easily implemented but the code is not yet reachable debug_assert!( matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal), "Extended capacity not yet implemented in step function" ); let (mut b, x, search); loop { (b, *trie) = match trie.split_first() { Some(v) => v, None => { // Empty trie or only a value node return None; } }; match byte_type(*b) { NodeType::Ascii => { let is_match = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { b.eq_ignore_ascii_case(&c) } else { *b == c }; if is_match { // Matched a byte return Some(*b); } else { // Byte that doesn't match *trie = &[]; return None; } } NodeType::Branch => { // Proceed to the branch node logic below (x, *trie) = read_varint_meta2(*b, trie); break; } NodeType::Span => { // Question: Should we put the trie back into a valid state? // Currently this code is unreachable so let's not worry about it. debug_assert!(false, "Span node found in ASCII trie!"); return None; } NodeType::Value => { // Skip the value node and go to the next node (_, *trie) = read_varint_meta3(*b, trie); continue; } }; } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; // See comment above regarding this assertion debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); let w = w & 0x3; let x = if x == 0 { 256 } else { x }; // Always use binary search (search, *trie) = trie.debug_split_at(x); let bsearch_result = if matches!(T::OPTIONS.case_sensitivity, CaseSensitivity::IgnoreCase) { search.binary_search_by_key(&c.to_ascii_lowercase(), |x| x.to_ascii_lowercase()) } else { search.binary_search(&c) }; match bsearch_result { Ok(i) => { // Matched a byte *trie = if w == 0 { get_branch_w0(trie, i, x) } else { get_branch(trie, i, x, w) }; Some(search[i]) } Err(_) => { // Byte that doesn't match *trie = &[]; None } } } /// Steps one node into the trie, assuming all branch nodes are binary search and that /// there are no span nodes, using an index. /// /// The input-output argument `trie` starts at the original trie and ends pointing to /// the sub-trie indexed by `index`. #[inline] pub(crate) fn probe_parameterized( trie: &mut &[u8], index: usize, ) -> Option { // Currently, the only option `step_parameterized` supports is `CaseSensitivity::IgnoreCase`. // `AsciiMode::BinarySpans` is tricky because the state can no longer be simply a trie. // If a span node is encountered, `None` is returned later in this function. debug_assert!( matches!(T::OPTIONS.ascii_mode, AsciiMode::AsciiOnly), "Spans not yet implemented in step function" ); // PHF can be easily implemented but the code is not yet reachable debug_assert!( matches!(T::OPTIONS.phf_mode, PhfMode::BinaryOnly), "PHF not yet implemented in step function" ); // Extended Capacity can be easily implemented but the code is not yet reachable debug_assert!( matches!(T::OPTIONS.capacity_mode, CapacityMode::Normal), "Extended capacity not yet implemented in step function" ); let (mut b, x, search); loop { (b, *trie) = match trie.split_first() { Some(v) => v, None => { // Empty trie or only a value node return None; } }; match byte_type(*b) { NodeType::Ascii => { if index > 0 { *trie = &[]; return None; } return Some(AsciiProbeResult { byte: *b, total_siblings: 1, }); } NodeType::Branch => { // Proceed to the branch node logic below (x, *trie) = read_varint_meta2(*b, trie); break; } NodeType::Span => { // Question: Should we put the trie back into a valid state? // Currently this code is unreachable so let's not worry about it. debug_assert!(false, "Span node found in ASCII trie!"); return None; } NodeType::Value => { // Skip the value node and go to the next node (_, *trie) = read_varint_meta3(*b, trie); continue; } }; } // Branch node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; debug_assert!(u8::try_from(x).is_ok()); let total_siblings = x as u8; // See comment above regarding this assertion debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); let w = w & 0x3; let x = if x == 0 { 256 } else { x }; if index >= x { *trie = &[]; return None; } (search, *trie) = trie.debug_split_at(x); *trie = if w == 0 { get_branch_w0(trie, index, x) } else { get_branch(trie, index, x, w) }; Some(AsciiProbeResult { byte: search[index], total_siblings, }) } /// Steps one node into the trie if the head node is a value node, returning the value. /// If the head node is not a value node, no change is made. /// /// The input-output argument `trie` starts at the original trie and ends pointing to /// the sub-trie with the value node removed. pub(crate) fn take_value(trie: &mut &[u8]) -> Option { let (b, new_trie) = trie.split_first()?; match byte_type(*b) { NodeType::Ascii | NodeType::Span | NodeType::Branch => None, NodeType::Value => { let x; (x, *trie) = read_varint_meta3(*b, new_trie); Some(x) } } } #[cfg(feature = "alloc")] use alloc::vec::Vec; /// Iterator type for walking the byte sequences contained in a ZeroTrie. #[cfg(feature = "alloc")] #[derive(Debug)] pub struct ZeroTrieIterator<'a> { /// Whether the PHF is enabled on this trie. use_phf: bool, /// Intermediate state during iteration: /// 1. A trie (usually a slice of the original, bigger trie) /// 2. The string that leads to the trie /// 3. If the trie's lead node is a branch node, the current index being evaluated state: Vec<(&'a [u8], Vec, usize)>, } #[cfg(feature = "alloc")] impl<'a> ZeroTrieIterator<'a> { pub(crate) fn new + ?Sized>(store: &'a S, use_phf: bool) -> Self { ZeroTrieIterator { use_phf, state: alloc::vec![(store.as_ref(), alloc::vec![], 0)], } } } #[cfg(feature = "alloc")] impl Iterator for ZeroTrieIterator<'_> { type Item = (Vec, usize); fn next(&mut self) -> Option { let (mut trie, mut string, mut branch_idx); (trie, string, branch_idx) = self.state.pop()?; loop { let (b, x, span, search); let return_trie = trie; (b, trie) = match trie.split_first() { Some(tpl) => tpl, None => { // At end of current branch; step back to the branch node. // If there are no more branches, we are finished. (trie, string, branch_idx) = self.state.pop()?; continue; } }; let byte_type = byte_type(*b); if matches!(byte_type, NodeType::Ascii) { string.push(*b); continue; } (x, trie) = match byte_type { NodeType::Ascii => (0, trie), NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), NodeType::Branch => read_varint_meta2(*b, trie), }; if matches!(byte_type, NodeType::Span) { (span, trie) = trie.debug_split_at(x); string.extend(span); continue; } if matches!(byte_type, NodeType::Value) { let retval = string.clone(); // Return to this position on the next step self.state.push((trie, string, 0)); return Some((retval, x)); } // Match node let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; let x = if x == 0 { 256 } else { x }; if branch_idx + 1 < x { // Return to this branch node at the next index self.state .push((return_trie, string.clone(), branch_idx + 1)); } let byte = if x < 16 || !self.use_phf { // binary search (search, trie) = trie.debug_split_at(x); debug_unwrap!(search.get(branch_idx), return None) } else { // phf (search, trie) = trie.debug_split_at(x * 2 + 1); debug_unwrap!(search.get(branch_idx + x + 1), return None) }; string.push(*byte); trie = if w == 0 { get_branch_w0(trie, branch_idx, x) } else { get_branch(trie, branch_idx, x, w) }; branch_idx = 0; } } } #[cfg(feature = "alloc")] pub(crate) fn get_iter_phf + ?Sized>(store: &S) -> ZeroTrieIterator<'_> { ZeroTrieIterator::new(store, true) } /// # Panics /// Panics if the trie contains non-ASCII items. #[cfg(feature = "alloc")] #[allow(clippy::type_complexity)] pub(crate) fn get_iter_ascii_or_panic + ?Sized>( store: &S, ) -> core::iter::Map, fn((Vec, usize)) -> (String, usize)> { ZeroTrieIterator::new(store, false).map(|(k, v)| { #[allow(clippy::unwrap_used)] // in signature of function let ascii_str = String::from_utf8(k).unwrap(); (ascii_str, v) }) }