diff options
| author | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
| commit | 8cdfa445d6629ffef4cb84967ff7017654045bc2 (patch) | |
| tree | 22f0b0907c024c78d26a731e2e1f5219407d8102 /vendor/logos-codegen/src/graph/mod.rs | |
| parent | 4351c74c7c5f97156bc94d3a8549b9940ac80e3f (diff) | |
chore: add vendor directory
Diffstat (limited to 'vendor/logos-codegen/src/graph/mod.rs')
| -rw-r--r-- | vendor/logos-codegen/src/graph/mod.rs | 566 |
1 files changed, 566 insertions, 0 deletions
diff --git a/vendor/logos-codegen/src/graph/mod.rs b/vendor/logos-codegen/src/graph/mod.rs new file mode 100644 index 00000000..6d218e8c --- /dev/null +++ b/vendor/logos-codegen/src/graph/mod.rs @@ -0,0 +1,566 @@ +use std::cmp::Ordering; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap as Map; +use std::hash::{Hash, Hasher}; +use std::num::NonZeroU32; +use std::ops::Index; + +use fnv::FnvHasher; + +mod fork; +mod impls; +mod meta; +mod range; +mod regex; +mod rope; + +pub use self::fork::Fork; +pub use self::meta::Meta; +pub use self::range::Range; +pub use self::rope::Rope; + +/// Disambiguation error during the attempt to merge two leaf +/// nodes with the same priority +#[derive(Debug)] +pub struct DisambiguationError(pub NodeId, pub NodeId); + +pub struct Graph<Leaf> { + /// Internal storage of all allocated nodes. Once a node is + /// put here, it should never be mutated. + nodes: Vec<Option<Node<Leaf>>>, + /// When merging two nodes into a new node, we store the two + /// entry keys and the result, so that we don't merge the same + /// two nodes multiple times. + /// + /// Most of the time the entry we want to find will be the last + /// one that has been inserted, so we can use a vec with reverse + /// order search to get O(1) searches much faster than any *Map. + merges: Map<Merge, NodeId>, + /// Another map used for accounting. Before `.push`ing a new node + /// onto the graph (inserts are exempt), we hash it and find if + /// an identical(!) node has been created before. + hashes: Map<u64, NodeId>, + /// Instead of handling errors over return types, opt to collect + /// them internally. + errors: Vec<DisambiguationError>, + /// Deferred merges. When when attempting to merge a node with an + /// empty reserved slot, the merging operation is deferred until + /// the reserved slot is populated. This is a stack that keeps track + /// of all such deferred merges + deferred: Vec<DeferredMerge>, +} + +/// Trait to be implemented on `Leaf` nodes in order to disambiguate +/// between them. +pub trait Disambiguate { + fn cmp(left: &Self, right: &Self) -> Ordering; +} + +/// Id of a Node in the graph. `NodeId` can be referencing an empty +/// slot that is going to be populated later in time. +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct NodeId(NonZeroU32); + +impl Hash for NodeId { + fn hash<H: Hasher>(&self, state: &mut H) { + // Always use little-endian byte order for hashing to avoid + // different code generation on big-endian platforms due to + // iteration over a HashMap, + // see https://github.com/maciejhirsz/logos/issues/427. + state.write(&self.0.get().to_le_bytes()) + } +} + +impl NodeId { + fn get(self) -> usize { + self.0.get() as usize + } + + fn new(n: usize) -> NodeId { + NodeId(NonZeroU32::new(n as u32).expect("Invalid NodeId")) + } +} + +/// Unique reserved `NodeId` that is guaranteed to point to an +/// empty allocated slot in the graph. It's safe to create multiple +/// `NodeId` copies of `ReservedId`, however API users should never +/// be able to clone a `ReservedId`, or create a new one from `NodeId`. +/// +/// `ReservedId` is consumed once passed into `Graph::insert`. +#[derive(Debug)] +pub struct ReservedId(NodeId); + +impl ReservedId { + pub fn get(&self) -> NodeId { + self.0 + } +} + +/// Merge key used to lookup whether two nodes have been previously +/// mered, so we can avoid duplicating merges, potentially running into +/// loops that blow out the stack. +/// +/// `Merge::new(a, b)` should always equal to `Merge::new(b, a)` to ensure +/// that node merges are symmetric. +#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)] +struct Merge(NodeId, NodeId); + +impl Merge { + fn new(a: NodeId, b: NodeId) -> Self { + if a < b { + Merge(a, b) + } else { + Merge(b, a) + } + } +} + +/// When attempting to merge two nodes, one of which was not yet created, +/// we can record such attempt, and execute the merge later on when the +/// `awaiting` has been `insert`ed into the graph. +#[derive(Debug)] +pub struct DeferredMerge { + awaiting: NodeId, + with: NodeId, + into: ReservedId, +} + +impl<Leaf> Graph<Leaf> { + pub fn new() -> Self { + Graph { + // Start with an empty slot so we can start + // counting NodeIds from 1 and use NonZero + // optimizations + nodes: vec![None], + merges: Map::new(), + hashes: Map::new(), + errors: Vec::new(), + deferred: Vec::new(), + } + } + + pub fn errors(&self) -> &[DisambiguationError] { + &self.errors + } + + fn next_id(&self) -> NodeId { + NodeId::new(self.nodes.len()) + } + + /// Reserve an empty slot for a node on the graph and return an + /// id for it. `ReservedId` cannot be cloned, and must be consumed + /// by calling `insert` on the graph. + pub fn reserve(&mut self) -> ReservedId { + let id = self.next_id(); + + self.nodes.push(None); + + ReservedId(id) + } + + /// Insert a node at a given, previously reserved id. Returns the + /// inserted `NodeId`. + pub fn insert<N>(&mut self, reserved: ReservedId, node: N) -> NodeId + where + N: Into<Node<Leaf>>, + Leaf: Disambiguate, + { + let id = reserved.get(); + + self.nodes[id.get()] = Some(node.into()); + + let mut awaiting = Vec::new(); + + // Partition out all `DeferredMerge`s that can be completed + // now that this `ReservedId` has a `Node` inserted into it. + for idx in (0..self.deferred.len()).rev() { + if self.deferred[idx].awaiting == id { + awaiting.push(self.deferred.remove(idx)); + } + } + + // Complete deferred merges. We've collected them from the back, + // so we must iterate through them from the back as well to restore + // proper order of merges in case there is some cascading going on. + for DeferredMerge { + awaiting, + with, + into, + } in awaiting.into_iter().rev() + { + self.merge_unchecked(awaiting, with, into); + } + + id + } + + /// Push a node onto the graph and get an id to it. If an identical + /// node has already been pushed on the graph, it will return the id + /// of that node instead. + pub fn push<B>(&mut self, node: B) -> NodeId + where + B: Into<Node<Leaf>>, + { + let node = node.into(); + + if let Node::Leaf(_) = node { + return self.push_unchecked(node); + } + + let mut hasher = FnvHasher::default(); + node.hash(&mut hasher); + + let next_id = self.next_id(); + + match self.hashes.entry(hasher.finish()) { + Entry::Occupied(occupied) => { + let id = *occupied.get(); + + if self[id].eq(&node) { + return id; + } + } + Entry::Vacant(vacant) => { + vacant.insert(next_id); + } + } + + self.push_unchecked(node) + } + + fn push_unchecked(&mut self, node: Node<Leaf>) -> NodeId { + let id = self.next_id(); + + self.nodes.push(Some(node)); + + id + } + + /// If nodes `a` and `b` have been already merged, return the + /// `NodeId` of the node they have been merged into. + fn find_merged(&self, a: NodeId, b: NodeId) -> Option<NodeId> { + let probe = Merge::new(a, b); + + self.merges.get(&probe).copied() + } + + /// Mark that nodes `a` and `b` have been merged into `product`. + /// + /// This will also mark merging `a` and `product`, as well as + /// `b` and `product` into `product`, since those are symmetric + /// operations. + /// + /// This is necessary to break out asymmetric merge loops. + fn set_merged(&mut self, a: NodeId, b: NodeId, product: NodeId) { + self.merges.insert(Merge::new(a, b), product); + self.merges.insert(Merge::new(a, product), product); + self.merges.insert(Merge::new(b, product), product); + } + + /// Merge the nodes at id `a` and `b`, returning a new id. + pub fn merge(&mut self, a: NodeId, b: NodeId) -> NodeId + where + Leaf: Disambiguate, + { + if a == b { + return a; + } + + // If the id pair is already merged (or is being merged), just return the id + if let Some(id) = self.find_merged(a, b) { + return id; + } + + match (self.get(a), self.get(b)) { + (None, None) => { + panic!( + "Merging two reserved nodes! This is a bug, please report it:\n\ + \n\ + https://github.com/maciejhirsz/logos/issues" + ); + } + (None, Some(_)) => { + let reserved = self.reserve(); + let id = reserved.get(); + self.deferred.push(DeferredMerge { + awaiting: a, + with: b, + into: reserved, + }); + self.set_merged(a, b, id); + + return id; + } + (Some(_), None) => { + let reserved = self.reserve(); + let id = reserved.get(); + self.deferred.push(DeferredMerge { + awaiting: b, + with: a, + into: reserved, + }); + self.set_merged(a, b, id); + + return id; + } + (Some(Node::Leaf(left)), Some(Node::Leaf(right))) => { + return match Disambiguate::cmp(left, right) { + Ordering::Less => b, + Ordering::Greater => a, + Ordering::Equal => { + self.errors.push(DisambiguationError(a, b)); + + a + } + }; + } + _ => (), + } + + // Reserve the id for the merge and save it. Since the graph can contain loops, + // this prevents us from trying to merge the same id pair in a loop, blowing up + // the stack. + let reserved = self.reserve(); + self.set_merged(a, b, reserved.get()); + + self.merge_unchecked(a, b, reserved) + } + + /// Unchecked merge of `a` and `b`. This fn assumes that `a` and `b` are + /// not pointing to empty slots. + fn merge_unchecked(&mut self, a: NodeId, b: NodeId, reserved: ReservedId) -> NodeId + where + Leaf: Disambiguate, + { + let merged_rope = match (self.get(a), self.get(b)) { + (Some(Node::Rope(rope)), _) => { + let rope = rope.clone(); + + self.merge_rope(rope, b) + } + (_, Some(Node::Rope(rope))) => { + let rope = rope.clone(); + + self.merge_rope(rope, a) + } + _ => None, + }; + + if let Some(rope) = merged_rope { + return self.insert(reserved, rope); + } + + let mut fork = self.fork_off(a); + fork.merge(self.fork_off(b), self); + + let mut stack = vec![reserved.get()]; + + // Flatten the fork + while let Some(miss) = fork.miss { + if stack.contains(&miss) { + break; + } + stack.push(miss); + + let other = match self.get(miss) { + Some(Node::Fork(other)) => other.clone(), + Some(Node::Rope(other)) => other.clone().into_fork(self), + _ => break, + }; + match other.miss { + Some(id) if self.get(id).is_none() => break, + _ => (), + } + fork.miss = None; + fork.merge(other, self); + } + + self.insert(reserved, fork) + } + + fn merge_rope(&mut self, rope: Rope, other: NodeId) -> Option<Rope> + where + Leaf: Disambiguate, + { + match self.get(other) { + Some(Node::Fork(fork)) if rope.miss.is_none() => { + // Count how many consecutive ranges in this rope would + // branch into the fork that results in a loop. + // + // e.g.: for rope "foobar" and a looping fork [a-z]: 6 + let count = rope + .pattern + .iter() + .take_while(|range| fork.contains(**range) == Some(other)) + .count(); + + let mut rope = rope.split_at(count, self)?.miss_any(other); + + rope.then = self.merge(rope.then, other); + + Some(rope) + } + Some(Node::Rope(other)) => { + let (prefix, miss) = rope.prefix(other)?; + + let (a, b) = (rope, other.clone()); + + let a = a.remainder(prefix.len(), self); + let b = b.remainder(prefix.len(), self); + + let rope = Rope::new(prefix, self.merge(a, b)).miss(miss); + + Some(rope) + } + Some(Node::Leaf(_)) | None => { + if rope.miss.is_none() { + Some(rope.miss(other)) + } else { + None + } + } + _ => None, + } + } + + pub fn fork_off(&mut self, id: NodeId) -> Fork + where + Leaf: Disambiguate, + { + match self.get(id) { + Some(Node::Fork(fork)) => fork.clone(), + Some(Node::Rope(rope)) => rope.clone().into_fork(self), + Some(Node::Leaf(_)) | None => Fork::new().miss(id), + } + } + + pub fn nodes(&self) -> &[Option<Node<Leaf>>] { + &self.nodes + } + + /// Find all nodes that have no references and remove them. + pub fn shake(&mut self, root: NodeId) { + let mut filter = vec![false; self.nodes.len()]; + + filter[root.get()] = true; + + self[root].shake(self, &mut filter); + + for (id, referenced) in filter.into_iter().enumerate() { + if !referenced { + self.nodes[id] = None; + } + } + } + + pub fn get(&self, id: NodeId) -> Option<&Node<Leaf>> { + self.nodes.get(id.get())?.as_ref() + } +} + +impl<Leaf> Index<NodeId> for Graph<Leaf> { + type Output = Node<Leaf>; + + fn index(&self, id: NodeId) -> &Node<Leaf> { + self.get(id).expect( + "Indexing into an empty node. This is a bug, please report it at:\n\ + \n\ + https://github.com/maciejhirsz/logos/issues", + ) + } +} + +impl std::fmt::Display for NodeId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } +} + +#[cfg_attr(test, derive(PartialEq))] +pub enum Node<Leaf> { + /// Fork node, can lead to more than one state + Fork(Fork), + /// Rope node, can lead to one state on match, one state on miss + Rope(Rope), + /// Leaf node, terminal state + Leaf(Leaf), +} + +impl<Leaf> Node<Leaf> { + pub fn miss(&self) -> Option<NodeId> { + match self { + Node::Rope(rope) => rope.miss.first(), + Node::Fork(fork) => fork.miss, + Node::Leaf(_) => None, + } + } + + fn eq(&self, other: &Node<Leaf>) -> bool { + match (self, other) { + (Node::Fork(a), Node::Fork(b)) => a == b, + (Node::Rope(a), Node::Rope(b)) => a == b, + _ => false, + } + } + + fn shake(&self, graph: &Graph<Leaf>, filter: &mut [bool]) { + match self { + Node::Fork(fork) => fork.shake(graph, filter), + Node::Rope(rope) => rope.shake(graph, filter), + Node::Leaf(_) => (), + } + } + + pub fn unwrap_leaf(&self) -> &Leaf { + match self { + Node::Fork(_) => panic!("Internal Error: called unwrap_leaf on a fork"), + Node::Rope(_) => panic!("Internal Error: called unwrap_leaf on a rope"), + Node::Leaf(leaf) => leaf, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn leaf_stack_size() { + use std::mem::size_of; + + const WORD: usize = size_of::<usize>(); + const NODE: usize = size_of::<Node<()>>(); + + assert!(NODE <= 6 * WORD, "Size of Node<()> is {} bytes!", NODE); + } + + #[test] + fn create_a_loop() { + let mut graph = Graph::new(); + + let token = graph.push(Node::Leaf("IDENT")); + let id = graph.reserve(); + let fork = Fork::new().branch('a'..='z', id.get()).miss(token); + let root = graph.insert(id, fork); + + assert_eq!(graph[token], Node::Leaf("IDENT")); + assert_eq!(graph[root], Fork::new().branch('a'..='z', root).miss(token),); + } + + #[test] + fn fork_off() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = graph.push(Rope::new("rope", leaf)); + let fork = graph.push(Fork::new().branch(b'!', leaf)); + + assert_eq!(graph.fork_off(leaf), Fork::new().miss(leaf)); + assert_eq!( + graph.fork_off(rope), + Fork::new().branch(b'r', NodeId::new(graph.nodes.len() - 1)) + ); + assert_eq!(graph.fork_off(fork), Fork::new().branch(b'!', leaf)); + } +} |
