1 files changed, 295 insertions, 0 deletions
diff --git a/vendor/logos-codegen/src/graph/regex.rs b/vendor/logos-codegen/src/graph/regex.rs
new file mode 100644
index 00000000..d55c490f
--- /dev/null
+++ b/vendor/logos-codegen/src/graph/regex.rs
@@ -0,0 +1,295 @@
+use std::fmt::Debug;
+
+use regex_syntax::utf8::Utf8Sequences;
+
+use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope};
+use crate::mir::{Class, ClassUnicode, Literal, Mir};
+
+impl<Leaf: Disambiguate + Debug> Graph<Leaf> {
+    pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId {
+        self.parse_mir(&mir, then, None, None, false)
+    }
+
+    fn parse_mir(
+        &mut self,
+        mir: &Mir,
+        then: NodeId,
+        miss: Option<NodeId>,
+        reserved: Option<ReservedId>,
+        repeated: bool,
+    ) -> NodeId {
+        match mir {
+            Mir::Empty => then,
+            Mir::Loop(mir) => {
+                let reserved_first = reserved.unwrap_or_else(|| self.reserve());
+
+                let (new_then, new_miss);
+                if let Some(old_miss) = miss {
+                    // We have to separate the first iteration from the other iterations,
+                    // because the `old_miss` path must only be taken if we miss the first
+                    // iteration.
+                    let reserved_next = self.reserve();
+                    new_then = self.parse_mir(
+                        mir,
+                        reserved_next.get(),
+                        Some(then),
+                        Some(reserved_next),
+                        true,
+                    );
+                    new_miss = self.merge(old_miss, then);
+                } else {
+                    new_then = reserved_first.get();
+                    new_miss = then;
+                }
+
+                self.parse_mir(mir, new_then, Some(new_miss), Some(reserved_first), true)
+            }
+            Mir::Maybe(mir) => {
+                let miss = match miss {
+                    Some(id) => self.merge(id, then),
+                    None => then,
+                };
+
+                self.parse_mir(mir, then, Some(miss), reserved, true)
+            }
+            Mir::Alternation(alternation) => {
+                let mut fork = Fork::new().miss(miss);
+
+                for mir in alternation {
+                    let id = self.parse_mir(mir, then, None, None, repeated);
+                    let alt = self.fork_off(id);
+
+                    fork.merge(alt, self);
+                }
+
+                self.insert_or_push(reserved, fork)
+            }
+            Mir::Literal(literal) => {
+                let pattern = literal.0.to_vec();
+
+                self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss))
+            }
+            Mir::Concat(concat) => {
+                // Take an initial guess at the capacity - estimates a little worse than an average case
+                // scenario by assuming every concat element is singular but has a full code-point unicode literal.
+                // The only way to get the actual size of the Vec is if every sub-concat node is added up.
+                let mut ropebuf: Vec<Range> = Vec::with_capacity(concat.len() * 4);
+                let mut then = then;
+
+                let mut handle_bytes = |graph: &mut Self, mir: &Mir, then: &mut NodeId| match mir {
+                    Mir::Literal(Literal(bytes)) => {
+                        ropebuf.extend(bytes.iter().rev().map(Into::<Range>::into));
+                        true
+                    }
+                    Mir::Class(Class::Unicode(class)) if is_one_ascii(class, repeated) => {
+                        ropebuf.push(class.ranges()[0].into());
+                        true
+                    }
+                    Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => {
+                        ropebuf.push(class.ranges()[0].into());
+                        true
+                    }
+                    _ => {
+                        if !ropebuf.is_empty() {
+                            let rope =
+                                Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), *then);
+
+                            *then = graph.push(rope);
+                            ropebuf = Vec::with_capacity(concat.len() * 4);
+                        }
+                        false
+                    }
+                };
+
+                for mir in concat[1..].iter().rev() {
+                    if !handle_bytes(self, mir, &mut then) {
+                        then = self.parse_mir(mir, then, None, None, false);
+                    }
+                }
+
+                let first_mir = &concat[0];
+                if handle_bytes(self, first_mir, &mut then) {
+                    let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), then)
+                        .miss(miss);
+                    self.insert_or_push(reserved, rope)
+                } else {
+                    self.parse_mir(first_mir, then, miss, reserved, false)
+                }
+            }
+            Mir::Class(Class::Unicode(class)) if !is_ascii(class, repeated) => {
+                let mut ropes = class
+                    .iter()
+                    .flat_map(|range| Utf8Sequences::new(range.start(), range.end()))
+                    .map(|sequence| Rope::new(sequence.as_slice(), then))
+                    .collect::<Vec<_>>();
+
+                if ropes.len() == 1 {
+                    let rope = ropes.remove(0);
+
+                    return self.insert_or_push(reserved, rope.miss(miss));
+                }
+
+                let mut root = Fork::new().miss(miss);
+
+                for rope in ropes {
+                    let fork = rope.into_fork(self);
+                    root.merge(fork, self);
+                }
+
+                self.insert_or_push(reserved, root)
+            }
+            Mir::Class(class) => {
+                let mut fork = Fork::new().miss(miss);
+
+                let class: Vec<Range> = match class {
+                    Class::Unicode(u) => u.iter().copied().map(Into::into).collect(),
+                    Class::Bytes(b) => b.iter().copied().map(Into::into).collect(),
+                };
+
+                for range in class {
+                    fork.add_branch(range, then, self);
+                }
+
+                self.insert_or_push(reserved, fork)
+            }
+        }
+    }
+
+    fn insert_or_push<N>(&mut self, id: Option<ReservedId>, node: N) -> NodeId
+    where
+        N: Into<Node<Leaf>>,
+    {
+        match id {
+            Some(id) => self.insert(id, node),
+            None => self.push(node),
+        }
+    }
+}
+
+/// Return whether current class unicode is ascii.
+///
+/// Because unicode ranges are iterated in increasing order,
+/// it is only necessary to check the last range.
+///
+/// If the check is performed in a repetition,
+/// a fast path is used by checking if end of range is 0x0010_FFFF.
+fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool {
+    class.iter().last().map_or(true, |range| {
+        let start = range.start() as u32;
+        let end = range.end() as u32;
+        end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
+    })
+}
+
+/// Return whether current class unicode is ascii and only contains
+/// one range.
+///
+/// See [`is_ascii`] function for more details.
+fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool {
+    if class.ranges().len() != 1 {
+        return false;
+    }
+
+    let range = &class.ranges()[0];
+    let start = range.start() as u32;
+    let end = range.end() as u32;
+    end < 128 || (repeated && start < 128 && end == 0x0010_FFFF)
+}
+
+#[cfg(test)]
+mod tests {
+    use std::num::NonZeroU32;
+
+    use super::*;
+    use crate::graph::Node;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn rope() {
+        let mut graph = Graph::new();
+
+        let mir = Mir::utf8("foobar").unwrap();
+
+        assert_eq!(mir.priority(), 12);
+
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+
+        assert_eq!(graph[id], Node::Rope(Rope::new("foobar", leaf)),)
+    }
+
+    #[test]
+    fn alternation() {
+        let mut graph = Graph::new();
+
+        let mir = Mir::utf8("a|b").unwrap();
+
+        assert_eq!(mir.priority(), 2);
+
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+
+        assert_eq!(
+            graph[id],
+            Node::Fork(Fork::new().branch(b'a', leaf).branch(b'b', leaf)),
+        );
+    }
+
+    #[test]
+    fn repeat() {
+        let mut graph = Graph::new();
+
+        let mir = Mir::utf8("[a-z]*").unwrap();
+
+        assert_eq!(mir.priority(), 0);
+
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+
+        assert_eq!(
+            graph[id],
+            Node::Fork(
+                Fork::new()
+                    .branch('a'..='z', id) // goto self == loop
+                    .miss(leaf)
+            ),
+        );
+    }
+
+    #[test]
+    fn maybe() {
+        let mut graph = Graph::new();
+
+        let mir = Mir::utf8("[a-z]?").unwrap();
+
+        assert_eq!(mir.priority(), 0);
+
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+
+        assert_eq!(
+            graph[id],
+            Node::Fork(Fork::new().branch('a'..='z', leaf).miss(leaf)),
+        );
+    }
+
+    #[test]
+    fn long_concat_389() {
+        let mut graph = Graph::new();
+
+        let mir = Mir::utf8("abcdefghijklmnopqrstuvwxyz*").unwrap();
+
+        assert_eq!(mir.priority(), 50);
+
+        let leaf = graph.push(Node::Leaf("LEAF"));
+        let id = graph.regex(mir, leaf);
+        let sub_id = NodeId(NonZeroU32::new(2).unwrap());
+
+        assert_eq!(
+            graph[id],
+            Node::Rope(Rope::new("abcdefghijklmnopqrstuvwxy", sub_id))
+        );
+
+        assert_eq!(graph[sub_id], Node::Rope(Rope::new("z", sub_id).miss(leaf)))
+    }
+}