From 8cdfa445d6629ffef4cb84967ff7017654045bc2 Mon Sep 17 00:00:00 2001 From: mo khan Date: Wed, 2 Jul 2025 18:36:06 -0600 Subject: chore: add vendor directory --- vendor/logos-codegen/src/parser/ignore_flags.rs | 499 ++++++++++++++++++++++++ 1 file changed, 499 insertions(+) create mode 100644 vendor/logos-codegen/src/parser/ignore_flags.rs (limited to 'vendor/logos-codegen/src/parser/ignore_flags.rs') diff --git a/vendor/logos-codegen/src/parser/ignore_flags.rs b/vendor/logos-codegen/src/parser/ignore_flags.rs new file mode 100644 index 00000000..3a79d31b --- /dev/null +++ b/vendor/logos-codegen/src/parser/ignore_flags.rs @@ -0,0 +1,499 @@ +use std::ops::{BitAnd, BitOr}; + +use proc_macro2::{Ident, TokenStream, TokenTree}; + +use crate::parser::Parser; +use crate::util::is_punct; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct IgnoreFlags { + bits: u8, +} + +#[allow(non_upper_case_globals)] +impl IgnoreFlags { + pub const Empty: Self = Self::new(0x00); + pub const IgnoreCase: Self = Self::new(0x01); + pub const IgnoreAsciiCase: Self = Self::new(0x02); + + #[inline] + pub const fn new(bits: u8) -> Self { + Self { bits } + } + + /// Enables a variant. + #[inline] + pub fn enable(&mut self, variant: Self) { + self.bits |= variant.bits; + } + + /// Checks if this `IgnoreFlags` contains *any* of the given variants. + #[inline] + pub fn contains(&self, variants: Self) -> bool { + self.bits & variants.bits != 0 + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.bits == 0 + } + + /// Parses an identifier an enables it for `self`. + /// + /// Valid inputs are (that produces `true`): + /// * `"case"` (incompatible with `"ascii_case"`) + /// * `"ascii_case"` (incompatible with `"case"`) + /// + /// An error causes this function to return `false` and emits an error to + /// the given `Parser`. + fn parse_ident(&mut self, ident: Ident, parser: &mut Parser) -> bool { + match ident.to_string().as_str() { + "case" => { + if self.contains(Self::IgnoreAsciiCase) { + parser.err( + "\ + The flag \"case\" cannot be used along with \"ascii_case\"\ + ", + ident.span(), + ); + false + } else { + self.enable(Self::IgnoreCase); + true + } + } + "ascii_case" => { + if self.contains(Self::IgnoreCase) { + parser.err( + "\ + The flag \"ascii_case\" cannot be used along with \"case\"\ + ", + ident.span(), + ); + false + } else { + self.enable(Self::IgnoreAsciiCase); + true + } + } + unknown => { + parser.err( + format!( + "\ + Unknown flag: {}\n\ + \n\ + Expected one of: case, ascii_case\ + ", + unknown + ), + ident.span(), + ); + false + } + } + } + + pub fn parse_group(&mut self, name: Ident, tokens: TokenStream, parser: &mut Parser) { + // Little finite state machine to parse "(,)*,?" + + // FSM description for future maintenance + // 0: Initial state + // -> 1 + // _ -> error + // 1: A flag was found + // , -> 2 + // None -> done + // _ -> error + // 2: A comma was found (after a ) + // -> 1 + // None -> done + // _ -> error + let mut state = 0u8; + + let mut tokens = tokens.into_iter(); + + loop { + state = match state { + 0 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + _ => { + parser.err( + "\ + Invalid ignore flag\n\ + \n\ + Expected one of: case, ascii_case\ + ", + name.span(), + ); + return; + } + }, + 1 => match tokens.next() { + Some(tt) if is_punct(&tt, ',') => 2, + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + 2 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + _ => unreachable!("Internal Error: invalid state ({})", state), + } + } + } +} + +impl BitOr for IgnoreFlags { + type Output = Self; + + fn bitor(self, other: Self) -> Self { + Self::new(self.bits | other.bits) + } +} + +impl BitAnd for IgnoreFlags { + type Output = Self; + + fn bitand(self, other: Self) -> Self { + Self::new(self.bits & other.bits) + } +} + +pub mod ascii_case { + use regex_syntax::hir; + + use crate::mir::Mir; + use crate::parser::Literal; + + macro_rules! literal { + ($byte:expr) => { + hir::Literal(Box::new([$byte])) + }; + (@char $c:expr) => { + hir::Literal( + $c.encode_utf8(&mut [0; 4]) + .as_bytes() + .to_vec() + .into_boxed_slice(), + ) + }; + } + + pub trait MakeAsciiCaseInsensitive { + /// Creates a equivalent regular expression which ignore the letter casing + /// of ascii characters. + fn make_ascii_case_insensitive(self) -> Mir; + } + + impl MakeAsciiCaseInsensitive for u8 { + fn make_ascii_case_insensitive(self) -> Mir { + if self.is_ascii_lowercase() { + Mir::Alternation(vec![ + Mir::Literal(literal!(self - 32)), + Mir::Literal(literal!(self)), + ]) + } else if self.is_ascii_uppercase() { + Mir::Alternation(vec![ + Mir::Literal(literal!(self)), + Mir::Literal(literal!(self + 32)), + ]) + } else { + Mir::Literal(literal!(self)) + } + } + } + + impl MakeAsciiCaseInsensitive for char { + fn make_ascii_case_insensitive(self) -> Mir { + if self.is_ascii() { + (self as u8).make_ascii_case_insensitive() + } else { + Mir::Literal(literal!(@char self)) + } + } + } + + impl MakeAsciiCaseInsensitive for hir::Literal { + fn make_ascii_case_insensitive(self) -> Mir { + Mir::Concat( + self.0 + .iter() + .map(|x| x.make_ascii_case_insensitive()) + .collect(), + ) + } + } + + impl MakeAsciiCaseInsensitive for hir::ClassBytes { + fn make_ascii_case_insensitive(mut self) -> Mir { + self.case_fold_simple(); + Mir::Class(hir::Class::Bytes(self)) + } + } + + impl MakeAsciiCaseInsensitive for hir::ClassUnicode { + fn make_ascii_case_insensitive(mut self) -> Mir { + use std::cmp; + + // Manuall implementation to only perform the case folding on ascii characters. + + let mut ranges = Vec::new(); + + for range in self.ranges() { + #[inline] + fn overlaps(st1: u8, end1: u8, st2: u8, end2: u8) -> bool { + (st2 <= st1 && st1 <= end2) || (st1 <= st2 && st2 <= end1) + } + + #[inline] + fn make_ascii(c: char) -> Option { + if c.is_ascii() { + Some(c as u8) + } else { + None + } + } + + match (make_ascii(range.start()), make_ascii(range.end())) { + (Some(start), Some(end)) => { + if overlaps(b'a', b'z', start, end) { + let lower = cmp::max(start, b'a'); + let upper = cmp::min(end, b'z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower - 32) as char, + (upper - 32) as char, + )) + } + + if overlaps(b'A', b'Z', start, end) { + let lower = cmp::max(start, b'A'); + let upper = cmp::min(end, b'Z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower + 32) as char, + (upper + 32) as char, + )) + } + } + (Some(start), None) => { + if overlaps(b'a', b'z', start, b'z') { + let lower = cmp::max(start, b'a'); + ranges.push(hir::ClassUnicodeRange::new((lower - 32) as char, 'Z')) + } + + if overlaps(b'A', b'Z', start, b'Z') { + let lower = cmp::max(start, b'A'); + ranges.push(hir::ClassUnicodeRange::new((lower + 32) as char, 'Z')) + } + } + _ => (), + } + } + + self.union(&hir::ClassUnicode::new(ranges)); + + Mir::Class(hir::Class::Unicode(self)) + } + } + + impl MakeAsciiCaseInsensitive for hir::Class { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + hir::Class::Bytes(b) => b.make_ascii_case_insensitive(), + hir::Class::Unicode(u) => u.make_ascii_case_insensitive(), + } + } + } + + impl MakeAsciiCaseInsensitive for &Literal { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + Literal::Bytes(bytes) => Mir::Concat( + bytes + .value() + .into_iter() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + Literal::Utf8(s) => Mir::Concat( + s.value() + .chars() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + } + } + } + + impl MakeAsciiCaseInsensitive for Mir { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + Mir::Empty => Mir::Empty, + Mir::Loop(l) => Mir::Loop(Box::new(l.make_ascii_case_insensitive())), + Mir::Maybe(m) => Mir::Maybe(Box::new(m.make_ascii_case_insensitive())), + Mir::Concat(c) => Mir::Concat( + c.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Alternation(a) => Mir::Alternation( + a.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Class(c) => c.make_ascii_case_insensitive(), + Mir::Literal(l) => l.make_ascii_case_insensitive(), + } + } + } + + #[cfg(test)] + mod tests { + use super::MakeAsciiCaseInsensitive; + use crate::mir::{Class, Mir}; + use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange}; + + fn assert_range(in_s: char, in_e: char, expected: &[(char, char)]) { + let range = ClassUnicodeRange::new(in_s, in_e); + let class = ClassUnicode::new(vec![range]); + + let expected = + ClassUnicode::new(expected.iter().map(|&(a, b)| ClassUnicodeRange::new(a, b))); + + if let Mir::Class(Class::Unicode(result)) = class.make_ascii_case_insensitive() { + assert_eq!(result, expected); + } else { + panic!("Not a unicode class"); + }; + } + + #[test] + fn no_letters_left() { + assert_range(' ', '+', &[(' ', '+')]); + } + + #[test] + fn no_letters_right() { + assert_range('{', '~', &[('{', '~')]); + } + + #[test] + fn no_letters_middle() { + assert_range('[', '`', &[('[', '`')]); + } + + #[test] + fn lowercase_left_edge() { + assert_range('a', 'd', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn lowercase_right_edge() { + assert_range('r', 'z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn lowercase_total() { + assert_range('a', 'z', &[('a', 'z'), ('A', 'Z')]); + } + + #[test] + fn uppercase_left_edge() { + assert_range('A', 'D', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn uppercase_right_edge() { + assert_range('R', 'Z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn uppercase_total() { + assert_range('A', 'Z', &[('a', 'z'), ('A', 'Z')]); + } + + #[test] + fn lowercase_cross_left() { + assert_range('[', 'h', &[('[', 'h'), ('A', 'H')]); + } + + #[test] + fn lowercase_cross_right() { + assert_range('d', '}', &[('d', '}'), ('D', 'Z')]); + } + + #[test] + fn uppercase_cross_left() { + assert_range(';', 'H', &[(';', 'H'), ('a', 'h')]); + } + + #[test] + fn uppercase_cross_right() { + assert_range('T', ']', &[('t', 'z'), ('T', ']')]); + } + + #[test] + fn cross_both() { + assert_range('X', 'c', &[('X', 'c'), ('x', 'z'), ('A', 'C')]); + } + + #[test] + fn all_letters() { + assert_range('+', '|', &[('+', '|')]); + } + + #[test] + fn oob_all_letters() { + assert_range('#', 'é', &[('#', 'é')]); + } + + #[test] + fn oob_from_uppercase() { + assert_range('Q', 'é', &[('A', 'é')]); + } + + #[test] + fn oob_from_lowercase() { + assert_range('q', 'é', &[('q', 'é'), ('Q', 'Z')]); + } + + #[test] + fn oob_no_letters() { + assert_range('|', 'é', &[('|', 'é')]); + } + } +} -- cgit v1.2.3