summaryrefslogtreecommitdiff
path: root/vendor/logos-codegen/src/graph/regex.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/logos-codegen/src/graph/regex.rs')
-rw-r--r--vendor/logos-codegen/src/graph/regex.rs295
1 files changed, 295 insertions, 0 deletions
diff --git a/vendor/logos-codegen/src/graph/regex.rs b/vendor/logos-codegen/src/graph/regex.rs
new file mode 100644
index 00000000..d55c490f
--- /dev/null
+++ b/vendor/logos-codegen/src/graph/regex.rs
@@ -0,0 +1,295 @@
+use std::fmt::Debug;
+
+use regex_syntax::utf8::Utf8Sequences;
+
+use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
+use crate::mir::{Class, ClassUnicode, Literal, Mir};
+
+impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
+ pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
+ self.parse_mir(&mir, then, None, None, false)
+ }
+
+ fn parse_mir(
+ &mut self,
+ mir: &Mir,
+ then: NodeId,
+ miss: Option<NodeId>,
+ reserved: Option<ReservedId>,
+ repeated: bool,
+ ) -> NodeId {
+ match mir {
+ Mir::Empty => then,
+ Mir::Loop(mir) => {
+ let reserved_first = reserved.unwrap_or_else(|| self.reserve());
+
+ let (new_then, new_miss);
+ if let Some(old_miss) = miss {
+ // We have to separate the first iteration from the other iterations,
+ // because the `old_miss` path must only be taken if we miss the first
+ // iteration.
+ let reserved_next = self.reserve();
+ new_then = self.parse_mir(
+ mir,
+ reserved_next.get(),
+ Some(then),
+ Some(reserved_next),
+ true,
+ );
+ new_miss = self.merge(old_miss, then);
+ } else {
+ new_then = reserved_first.get();
+ new_miss = then;
+ }
+
+ self.parse_mir(mir, new_then, Some(new_miss), Some(reserved_first), true)
+ }
+ Mir::Maybe(mir) => {
+ let miss = match miss {
+ Some(id) => self.merge(id, then),
+ None => then,
+ };
+
+ self.parse_mir(mir, then, Some(miss), reserved, true)
+ }
+ Mir::Alternation(alternation) => {
+ let mut fork = Fork::new().miss(miss);
+
+ for mir in alternation {
+ let id = self.parse_mir(mir, then, None, None, repeated);
+ let alt = self.fork_off(id);
+
+ fork.merge(alt, self);
+ }
+
+ self.insert_or_push(reserved, fork)
+ }
+ Mir::Literal(literal) => {
+ let pattern = literal.0.to_vec();
+
+ self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
+ }
+ Mir::Concat(concat) => {
+ // Take an initial guess at the capacity - estimates a little worse than an average case
+ // scenario by assuming every concat element is singular but has a full code-point unicode literal.
+ // The only way to get the actual size of the Vec is if every sub-concat node is added up.
+ let mut ropebuf: Vec<Range> = Vec::with_capacity(concat.len() * 4);
+ let mut then = then;
+
+ let mut handle_bytes = |graph: &mut Self, mir: &Mir, then: &mut NodeId| match mir {
+ Mir::Literal(Literal(bytes)) => {
+ ropebuf.extend(bytes.iter().rev().map(Into::<Range>::into));
+ true
+ }
+ Mir::Class(Class::Unicode(class)) if is_one_ascii(class, repeated) => {
+ ropebuf.push(class.ranges()[0].into());
+ true
+ }
+ Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => {
+ ropebuf.push(class.ranges()[0].into());
+ true
+ }
+ _ => {
+ if !ropebuf.is_empty() {
+ let rope =
+ Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), *then);
+
+ *then = graph.push(rope);
+ ropebuf = Vec::with_capacity(concat.len() * 4);
+ }
+ false
+ }
+ };
+
+ for mir in concat[1..].iter().rev() {
+ if !handle_bytes(self, mir, &mut then) {
+ then = self.parse_mir(mir, then, None, None, false);
+ }
+ }
+
+ let first_mir = &concat[0];
+ if handle_bytes(self, first_mir, &mut then) {
+ let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), then)
+ .miss(miss);
+ self.insert_or_push(reserved, rope)
+ } else {
+ self.parse_mir(first_mir, then, miss, reserved, false)
+ }
+ }
+ Mir::Class(Class::Unicode(class)) if !is_ascii(class, repeated) => {
+ let mut ropes = class
+ .iter()
+ .flat_map(|range| Utf8Sequences::new(range.start(), range.end()))
+ .map(|sequence| Rope::new(sequence.as_slice(), then))
+ .collect::<Vec<_>>();
+
+ if ropes.len() == 1 {
+ let rope = ropes.remove(0);
+
+ return self.insert_or_push(reserved, rope.miss(miss));
+ }
+
+ let mut root = Fork::new().miss(miss);
+
+ for rope in ropes {
+ let fork = rope.into_fork(self);
+ root.merge(fork, self);
+ }
+
+ self.insert_or_push(reserved, root)
+ }
+ Mir::Class(class) => {
+ let mut fork = Fork::new().miss(miss);
+
+ let class: Vec<Range> = match class {
+ Class::Unicode(u) => u.iter().copied().map(Into::into).collect(),
+ Class::Bytes(b) => b.iter().copied().map(Into::into).collect(),
+ };
+
+ for range in class {
+ fork.add_branch(range, then, self);
+ }
+
+ self.insert_or_push(reserved, fork)
+ }
+ }
+ }
+
+ fn insert_or_push<N>(&mut self, id: Option<ReservedId>, node: N) -> NodeId
+ where
+ N: Into<Node<Leaf>>,
+ {
+ match id {
+ Some(id) => self.insert(id, node),
+ None => self.push(node),
+ }
+ }
+}
+
+/// Return whether current class unicode is ascii.
+///
+/// Because unicode ranges are iterated in increasing order,
+/// it is only necessary to check the last range.
+///
+/// If the check is performed in a repetition,
+/// a fast path is used by checking if end of range is 0x0010_FFFF.
+fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool {
+ class.iter().last().map_or(true, |range| {
+ let start = range.start() as u32;
+ let end = range.end() as u32;
+ end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
+ })
+}
+
+/// Return whether current class unicode is ascii and only contains
+/// one range.
+///
+/// See [`is_ascii`] function for more details.
+fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool {
+ if class.ranges().len() != 1 {
+ return false;
+ }
+
+ let range = &class.ranges()[0];
+ let start = range.start() as u32;
+ let end = range.end() as u32;
+ end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
+}
+
+#[cfg(test)]
+mod tests {
+ use std::num::NonZeroU32;
+
+ use super::*;
+ use crate::graph::Node;
+ use pretty_assertions::assert_eq;
+
+ #[test]
+ fn rope() {
+ let mut graph = Graph::new();
+
+ let mir = Mir::utf8("foobar").unwrap();
+
+ assert_eq!(mir.priority(), 12);
+
+ let leaf = graph.push(Node::Leaf("LEAF"));
+ let id = graph.regex(mir, leaf);
+
+ assert_eq!(graph[id], Node::Rope(Rope::new("foobar", leaf)),)
+ }
+
+ #[test]
+ fn alternation() {
+ let mut graph = Graph::new();
+
+ let mir = Mir::utf8("a|b").unwrap();
+
+ assert_eq!(mir.priority(), 2);
+
+ let leaf = graph.push(Node::Leaf("LEAF"));
+ let id = graph.regex(mir, leaf);
+
+ assert_eq!(
+ graph[id],
+ Node::Fork(Fork::new().branch(b'a', leaf).branch(b'b', leaf)),
+ );
+ }
+
+ #[test]
+ fn repeat() {
+ let mut graph = Graph::new();
+
+ let mir = Mir::utf8("[a-z]*").unwrap();
+
+ assert_eq!(mir.priority(), 0);
+
+ let leaf = graph.push(Node::Leaf("LEAF"));
+ let id = graph.regex(mir, leaf);
+
+ assert_eq!(
+ graph[id],
+ Node::Fork(
+ Fork::new()
+ .branch('a'..='z', id) // goto self == loop
+ .miss(leaf)
+ ),
+ );
+ }
+
+ #[test]
+ fn maybe() {
+ let mut graph = Graph::new();
+
+ let mir = Mir::utf8("[a-z]?").unwrap();
+
+ assert_eq!(mir.priority(), 0);
+
+ let leaf = graph.push(Node::Leaf("LEAF"));
+ let id = graph.regex(mir, leaf);
+
+ assert_eq!(
+ graph[id],
+ Node::Fork(Fork::new().branch('a'..='z', leaf).miss(leaf)),
+ );
+ }
+
+ #[test]
+ fn long_concat_389() {
+ let mut graph = Graph::new();
+
+ let mir = Mir::utf8("abcdefghijklmnopqrstuvwxyz*").unwrap();
+
+ assert_eq!(mir.priority(), 50);
+
+ let leaf = graph.push(Node::Leaf("LEAF"));
+ let id = graph.regex(mir, leaf);
+ let sub_id = NodeId(NonZeroU32::new(2).unwrap());
+
+ assert_eq!(
+ graph[id],
+ Node::Rope(Rope::new("abcdefghijklmnopqrstuvwxy", sub_id))
+ );
+
+ assert_eq!(graph[sub_id], Node::Rope(Rope::new("z", sub_id).miss(leaf)))
+ }
+}