diff options
Diffstat (limited to 'vendor/regex-syntax/src/ast/parse.rs')
| -rw-r--r-- | vendor/regex-syntax/src/ast/parse.rs | 6377 |
1 files changed, 0 insertions, 6377 deletions
diff --git a/vendor/regex-syntax/src/ast/parse.rs b/vendor/regex-syntax/src/ast/parse.rs deleted file mode 100644 index 0c2a3526..00000000 --- a/vendor/regex-syntax/src/ast/parse.rs +++ /dev/null @@ -1,6377 +0,0 @@ -/*! -This module provides a regular expression parser. -*/ - -use core::{ - borrow::Borrow, - cell::{Cell, RefCell}, - mem, -}; - -use alloc::{ - boxed::Box, - string::{String, ToString}, - vec, - vec::Vec, -}; - -use crate::{ - ast::{self, Ast, Position, Span}, - either::Either, - is_escapeable_character, is_meta_character, -}; - -type Result<T> = core::result::Result<T, ast::Error>; - -/// A primitive is an expression with no sub-expressions. This includes -/// literals, assertions and non-set character classes. This representation -/// is used as intermediate state in the parser. -/// -/// This does not include ASCII character classes, since they can only appear -/// within a set character class. -#[derive(Clone, Debug, Eq, PartialEq)] -enum Primitive { - Literal(ast::Literal), - Assertion(ast::Assertion), - Dot(Span), - Perl(ast::ClassPerl), - Unicode(ast::ClassUnicode), -} - -impl Primitive { - /// Return the span of this primitive. - fn span(&self) -> &Span { - match *self { - Primitive::Literal(ref x) => &x.span, - Primitive::Assertion(ref x) => &x.span, - Primitive::Dot(ref span) => span, - Primitive::Perl(ref x) => &x.span, - Primitive::Unicode(ref x) => &x.span, - } - } - - /// Convert this primitive into a proper AST. - fn into_ast(self) -> Ast { - match self { - Primitive::Literal(lit) => Ast::literal(lit), - Primitive::Assertion(assert) => Ast::assertion(assert), - Primitive::Dot(span) => Ast::dot(span), - Primitive::Perl(cls) => Ast::class_perl(cls), - Primitive::Unicode(cls) => Ast::class_unicode(cls), - } - } - - /// Convert this primitive into an item in a character class. - /// - /// If this primitive is not a legal item (i.e., an assertion or a dot), - /// then return an error. 
- fn into_class_set_item<P: Borrow<Parser>>( - self, - p: &ParserI<'_, P>, - ) -> Result<ast::ClassSetItem> { - use self::Primitive::*; - use crate::ast::ClassSetItem; - - match self { - Literal(lit) => Ok(ClassSetItem::Literal(lit)), - Perl(cls) => Ok(ClassSetItem::Perl(cls)), - Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), - x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), - } - } - - /// Convert this primitive into a literal in a character class. In - /// particular, literals are the only valid items that can appear in - /// ranges. - /// - /// If this primitive is not a legal item (i.e., a class, assertion or a - /// dot), then return an error. - fn into_class_literal<P: Borrow<Parser>>( - self, - p: &ParserI<'_, P>, - ) -> Result<ast::Literal> { - use self::Primitive::*; - - match self { - Literal(lit) => Ok(lit), - x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), - } - } -} - -/// Returns true if the given character is a hexadecimal digit. -fn is_hex(c: char) -> bool { - ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') -} - -/// Returns true if the given character is a valid in a capture group name. -/// -/// If `first` is true, then `c` is treated as the first character in the -/// group name (which must be alphabetic or underscore). -fn is_capture_char(c: char, first: bool) -> bool { - if first { - c == '_' || c.is_alphabetic() - } else { - c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() - } -} - -/// A builder for a regular expression parser. -/// -/// This builder permits modifying configuration options for the parser. -#[derive(Clone, Debug)] -pub struct ParserBuilder { - ignore_whitespace: bool, - nest_limit: u32, - octal: bool, - empty_min_range: bool, -} - -impl Default for ParserBuilder { - fn default() -> ParserBuilder { - ParserBuilder::new() - } -} - -impl ParserBuilder { - /// Create a new parser builder with a default configuration. 
- pub fn new() -> ParserBuilder { - ParserBuilder { - ignore_whitespace: false, - nest_limit: 250, - octal: false, - empty_min_range: false, - } - } - - /// Build a parser from this configuration with the given pattern. - pub fn build(&self) -> Parser { - Parser { - pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), - capture_index: Cell::new(0), - nest_limit: self.nest_limit, - octal: self.octal, - empty_min_range: self.empty_min_range, - initial_ignore_whitespace: self.ignore_whitespace, - ignore_whitespace: Cell::new(self.ignore_whitespace), - comments: RefCell::new(vec![]), - stack_group: RefCell::new(vec![]), - stack_class: RefCell::new(vec![]), - capture_names: RefCell::new(vec![]), - scratch: RefCell::new(String::new()), - } - } - - /// Set the nesting limit for this parser. - /// - /// The nesting limit controls how deep the abstract syntax tree is allowed - /// to be. If the AST exceeds the given limit (e.g., with too many nested - /// groups), then an error is returned by the parser. - /// - /// The purpose of this limit is to act as a heuristic to prevent stack - /// overflow for consumers that do structural induction on an `Ast` using - /// explicit recursion. While this crate never does this (instead using - /// constant stack space and moving the call stack to the heap), other - /// crates may. - /// - /// This limit is not checked until the entire AST is parsed. Therefore, - /// if callers want to put a limit on the amount of heap space used, then - /// they should impose a limit on the length, in bytes, of the concrete - /// pattern string. In particular, this is viable since this parser - /// implementation will limit itself to heap space proportional to the - /// length of the pattern string. - /// - /// Note that a nest limit of `0` will return a nest limit error for most - /// patterns but not all. 
For example, a nest limit of `0` permits `a` but - /// not `ab`, since `ab` requires a concatenation, which results in a nest - /// depth of `1`. In general, a nest limit is not something that manifests - /// in an obvious way in the concrete syntax, therefore, it should not be - /// used in a granular way. - pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { - self.nest_limit = limit; - self - } - - /// Whether to support octal syntax or not. - /// - /// Octal syntax is a little-known way of uttering Unicode codepoints in - /// a regular expression. For example, `a`, `\x61`, `\u0061` and - /// `\141` are all equivalent regular expressions, where the last example - /// shows octal syntax. - /// - /// While supporting octal syntax isn't in and of itself a problem, it does - /// make good error messages harder. That is, in PCRE based regex engines, - /// syntax like `\0` invokes a backreference, which is explicitly - /// unsupported in Rust's regex engine. However, many users expect it to - /// be supported. Therefore, when octal support is disabled, the error - /// message will explicitly mention that backreferences aren't supported. - /// - /// Octal syntax is disabled by default. - pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { - self.octal = yes; - self - } - - /// Enable verbose mode in the regular expression. - /// - /// When enabled, verbose mode permits insignificant whitespace in many - /// places in the regular expression, as well as comments. Comments are - /// started using `#` and continue until the end of the line. - /// - /// By default, this is disabled. It may be selectively enabled in the - /// regular expression by using the `x` flag regardless of this setting. - pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { - self.ignore_whitespace = yes; - self - } - - /// Allow using `{,n}` as an equivalent to `{0,n}`. - /// - /// When enabled, the parser accepts `{,n}` as valid syntax for `{0,n}`. 
- /// Most regular expression engines don't support the `{,n}` syntax, but - /// some others do it, namely Python's `re` library. - /// - /// This is disabled by default. - pub fn empty_min_range(&mut self, yes: bool) -> &mut ParserBuilder { - self.empty_min_range = yes; - self - } -} - -/// A regular expression parser. -/// -/// This parses a string representation of a regular expression into an -/// abstract syntax tree. The size of the tree is proportional to the length -/// of the regular expression pattern. -/// -/// A `Parser` can be configured in more detail via a [`ParserBuilder`]. -#[derive(Clone, Debug)] -pub struct Parser { - /// The current position of the parser. - pos: Cell<Position>, - /// The current capture index. - capture_index: Cell<u32>, - /// The maximum number of open parens/brackets allowed. If the parser - /// exceeds this number, then an error is returned. - nest_limit: u32, - /// Whether to support octal syntax or not. When `false`, the parser will - /// return an error helpfully pointing out that backreferences are not - /// supported. - octal: bool, - /// The initial setting for `ignore_whitespace` as provided by - /// `ParserBuilder`. It is used when resetting the parser's state. - initial_ignore_whitespace: bool, - /// Whether the parser supports `{,n}` repetitions as an equivalent to - /// `{0,n}.` - empty_min_range: bool, - /// Whether whitespace should be ignored. When enabled, comments are - /// also permitted. - ignore_whitespace: Cell<bool>, - /// A list of comments, in order of appearance. - comments: RefCell<Vec<ast::Comment>>, - /// A stack of grouped sub-expressions, including alternations. - stack_group: RefCell<Vec<GroupState>>, - /// A stack of nested character classes. This is only non-empty when - /// parsing a class. - stack_class: RefCell<Vec<ClassState>>, - /// A sorted sequence of capture names. This is used to detect duplicate - /// capture names and report an error if one is detected. 
- capture_names: RefCell<Vec<ast::CaptureName>>, - /// A scratch buffer used in various places. Mostly this is used to - /// accumulate relevant characters from parts of a pattern. - scratch: RefCell<String>, -} - -/// ParserI is the internal parser implementation. -/// -/// We use this separate type so that we can carry the provided pattern string -/// along with us. In particular, a `Parser` internal state is not tied to any -/// one pattern, but `ParserI` is. -/// -/// This type also lets us use `ParserI<&Parser>` in production code while -/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes -/// work against the internal interface of the parser. -#[derive(Clone, Debug)] -struct ParserI<'s, P> { - /// The parser state/configuration. - parser: P, - /// The full regular expression provided by the user. - pattern: &'s str, -} - -/// GroupState represents a single stack frame while parsing nested groups -/// and alternations. Each frame records the state up to an opening parenthesis -/// or a alternating bracket `|`. -#[derive(Clone, Debug)] -enum GroupState { - /// This state is pushed whenever an opening group is found. - Group { - /// The concatenation immediately preceding the opening group. - concat: ast::Concat, - /// The group that has been opened. Its sub-AST is always empty. - group: ast::Group, - /// Whether this group has the `x` flag enabled or not. - ignore_whitespace: bool, - }, - /// This state is pushed whenever a new alternation branch is found. If - /// an alternation branch is found and this state is at the top of the - /// stack, then this state should be modified to include the new - /// alternation. - Alternation(ast::Alternation), -} - -/// ClassState represents a single stack frame while parsing character classes. -/// Each frame records the state up to an intersection, difference, symmetric -/// difference or nested class. 
-/// -/// Note that a parser's character class stack is only non-empty when parsing -/// a character class. In all other cases, it is empty. -#[derive(Clone, Debug)] -enum ClassState { - /// This state is pushed whenever an opening bracket is found. - Open { - /// The union of class items immediately preceding this class. - union: ast::ClassSetUnion, - /// The class that has been opened. Typically this just corresponds - /// to the `[`, but it can also include `[^` since `^` indicates - /// negation of the class. - set: ast::ClassBracketed, - }, - /// This state is pushed when a operator is seen. When popped, the stored - /// set becomes the left hand side of the operator. - Op { - /// The type of the operation, i.e., &&, -- or ~~. - kind: ast::ClassSetBinaryOpKind, - /// The left-hand side of the operator. - lhs: ast::ClassSet, - }, -} - -impl Parser { - /// Create a new parser with a default configuration. - /// - /// The parser can be run with either the `parse` or `parse_with_comments` - /// methods. The parse methods return an abstract syntax tree. - /// - /// To set configuration options on the parser, use [`ParserBuilder`]. - pub fn new() -> Parser { - ParserBuilder::new().build() - } - - /// Parse the regular expression into an abstract syntax tree. - pub fn parse(&mut self, pattern: &str) -> Result<Ast> { - ParserI::new(self, pattern).parse() - } - - /// Parse the regular expression and return an abstract syntax tree with - /// all of the comments found in the pattern. - pub fn parse_with_comments( - &mut self, - pattern: &str, - ) -> Result<ast::WithComments> { - ParserI::new(self, pattern).parse_with_comments() - } - - /// Reset the internal state of a parser. - /// - /// This is called at the beginning of every parse. This prevents the - /// parser from running with inconsistent state (say, if a previous - /// invocation returned an error and the parser is reused). 
- fn reset(&self) { - // These settings should be in line with the construction - // in `ParserBuilder::build`. - self.pos.set(Position { offset: 0, line: 1, column: 1 }); - self.ignore_whitespace.set(self.initial_ignore_whitespace); - self.comments.borrow_mut().clear(); - self.stack_group.borrow_mut().clear(); - self.stack_class.borrow_mut().clear(); - } -} - -impl<'s, P: Borrow<Parser>> ParserI<'s, P> { - /// Build an internal parser from a parser configuration and a pattern. - fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { - ParserI { parser, pattern } - } - - /// Return a reference to the parser state. - fn parser(&self) -> &Parser { - self.parser.borrow() - } - - /// Return a reference to the pattern being parsed. - fn pattern(&self) -> &str { - self.pattern - } - - /// Create a new error with the given span and error type. - fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { - ast::Error { kind, pattern: self.pattern().to_string(), span } - } - - /// Return the current offset of the parser. - /// - /// The offset starts at `0` from the beginning of the regular expression - /// pattern string. - fn offset(&self) -> usize { - self.parser().pos.get().offset - } - - /// Return the current line number of the parser. - /// - /// The line number starts at `1`. - fn line(&self) -> usize { - self.parser().pos.get().line - } - - /// Return the current column of the parser. - /// - /// The column number starts at `1` and is reset whenever a `\n` is seen. - fn column(&self) -> usize { - self.parser().pos.get().column - } - - /// Return the next capturing index. Each subsequent call increments the - /// internal index. - /// - /// The span given should correspond to the location of the opening - /// parenthesis. - /// - /// If the capture limit is exceeded, then an error is returned. 
- fn next_capture_index(&self, span: Span) -> Result<u32> { - let current = self.parser().capture_index.get(); - let i = current.checked_add(1).ok_or_else(|| { - self.error(span, ast::ErrorKind::CaptureLimitExceeded) - })?; - self.parser().capture_index.set(i); - Ok(i) - } - - /// Adds the given capture name to this parser. If this capture name has - /// already been used, then an error is returned. - fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { - let mut names = self.parser().capture_names.borrow_mut(); - match names - .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) - { - Err(i) => { - names.insert(i, cap.clone()); - Ok(()) - } - Ok(i) => Err(self.error( - cap.span, - ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, - )), - } - } - - /// Return whether the parser should ignore whitespace or not. - fn ignore_whitespace(&self) -> bool { - self.parser().ignore_whitespace.get() - } - - /// Return the character at the current position of the parser. - /// - /// This panics if the current position does not point to a valid char. - fn char(&self) -> char { - self.char_at(self.offset()) - } - - /// Return the character at the given position. - /// - /// This panics if the given position does not point to a valid char. - fn char_at(&self, i: usize) -> char { - self.pattern()[i..] - .chars() - .next() - .unwrap_or_else(|| panic!("expected char at offset {}", i)) - } - - /// Bump the parser to the next Unicode scalar value. - /// - /// If the end of the input has been reached, then `false` is returned. 
- fn bump(&self) -> bool { - if self.is_eof() { - return false; - } - let Position { mut offset, mut line, mut column } = self.pos(); - if self.char() == '\n' { - line = line.checked_add(1).unwrap(); - column = 1; - } else { - column = column.checked_add(1).unwrap(); - } - offset += self.char().len_utf8(); - self.parser().pos.set(Position { offset, line, column }); - self.pattern()[self.offset()..].chars().next().is_some() - } - - /// If the substring starting at the current position of the parser has - /// the given prefix, then bump the parser to the character immediately - /// following the prefix and return true. Otherwise, don't bump the parser - /// and return false. - fn bump_if(&self, prefix: &str) -> bool { - if self.pattern()[self.offset()..].starts_with(prefix) { - for _ in 0..prefix.chars().count() { - self.bump(); - } - true - } else { - false - } - } - - /// Returns true if and only if the parser is positioned at a look-around - /// prefix. The conditions under which this returns true must always - /// correspond to a regular expression that would otherwise be consider - /// invalid. - /// - /// This should only be called immediately after parsing the opening of - /// a group or a set of flags. - fn is_lookaround_prefix(&self) -> bool { - self.bump_if("?=") - || self.bump_if("?!") - || self.bump_if("?<=") - || self.bump_if("?<!") - } - - /// Bump the parser, and if the `x` flag is enabled, bump through any - /// subsequent spaces. Return true if and only if the parser is not at - /// EOF. - fn bump_and_bump_space(&self) -> bool { - if !self.bump() { - return false; - } - self.bump_space(); - !self.is_eof() - } - - /// If the `x` flag is enabled (i.e., whitespace insensitivity with - /// comments), then this will advance the parser through all whitespace - /// and comments to the next non-whitespace non-comment byte. - /// - /// If the `x` flag is disabled, then this is a no-op. 
- /// - /// This should be used selectively throughout the parser where - /// arbitrary whitespace is permitted when the `x` flag is enabled. For - /// example, `{ 5 , 6}` is equivalent to `{5,6}`. - fn bump_space(&self) { - if !self.ignore_whitespace() { - return; - } - while !self.is_eof() { - if self.char().is_whitespace() { - self.bump(); - } else if self.char() == '#' { - let start = self.pos(); - let mut comment_text = String::new(); - self.bump(); - while !self.is_eof() { - let c = self.char(); - self.bump(); - if c == '\n' { - break; - } - comment_text.push(c); - } - let comment = ast::Comment { - span: Span::new(start, self.pos()), - comment: comment_text, - }; - self.parser().comments.borrow_mut().push(comment); - } else { - break; - } - } - } - - /// Peek at the next character in the input without advancing the parser. - /// - /// If the input has been exhausted, then this returns `None`. - fn peek(&self) -> Option<char> { - if self.is_eof() { - return None; - } - self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() - } - - /// Like peek, but will ignore spaces when the parser is in whitespace - /// insensitive mode. - fn peek_space(&self) -> Option<char> { - if !self.ignore_whitespace() { - return self.peek(); - } - if self.is_eof() { - return None; - } - let mut start = self.offset() + self.char().len_utf8(); - let mut in_comment = false; - for (i, c) in self.pattern()[start..].char_indices() { - if c.is_whitespace() { - continue; - } else if !in_comment && c == '#' { - in_comment = true; - } else if in_comment && c == '\n' { - in_comment = false; - } else { - start += i; - break; - } - } - self.pattern()[start..].chars().next() - } - - /// Returns true if the next call to `bump` would return false. - fn is_eof(&self) -> bool { - self.offset() == self.pattern().len() - } - - /// Return the current position of the parser, which includes the offset, - /// line and column. 
- fn pos(&self) -> Position { - self.parser().pos.get() - } - - /// Create a span at the current position of the parser. Both the start - /// and end of the span are set. - fn span(&self) -> Span { - Span::splat(self.pos()) - } - - /// Create a span that covers the current character. - fn span_char(&self) -> Span { - let mut next = Position { - offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), - line: self.line(), - column: self.column().checked_add(1).unwrap(), - }; - if self.char() == '\n' { - next.line += 1; - next.column = 1; - } - Span::new(self.pos(), next) - } - - /// Parse and push a single alternation on to the parser's internal stack. - /// If the top of the stack already has an alternation, then add to that - /// instead of pushing a new one. - /// - /// The concatenation given corresponds to a single alternation branch. - /// The concatenation returned starts the next branch and is empty. - /// - /// This assumes the parser is currently positioned at `|` and will advance - /// the parser to the character following `|`. - #[inline(never)] - fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> { - assert_eq!(self.char(), '|'); - concat.span.end = self.pos(); - self.push_or_add_alternation(concat); - self.bump(); - Ok(ast::Concat { span: self.span(), asts: vec![] }) - } - - /// Pushes or adds the given branch of an alternation to the parser's - /// internal stack of state. - fn push_or_add_alternation(&self, concat: ast::Concat) { - use self::GroupState::*; - - let mut stack = self.parser().stack_group.borrow_mut(); - if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { - alts.asts.push(concat.into_ast()); - return; - } - stack.push(Alternation(ast::Alternation { - span: Span::new(concat.span.start, self.pos()), - asts: vec![concat.into_ast()], - })); - } - - /// Parse and push a group AST (and its parent concatenation) on to the - /// parser's internal stack. 
Return a fresh concatenation corresponding - /// to the group's sub-AST. - /// - /// If a set of flags was found (with no group), then the concatenation - /// is returned with that set of flags added. - /// - /// This assumes that the parser is currently positioned on the opening - /// parenthesis. It advances the parser to the character at the start - /// of the sub-expression (or adjoining expression). - /// - /// If there was a problem parsing the start of the group, then an error - /// is returned. - #[inline(never)] - fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> { - assert_eq!(self.char(), '('); - match self.parse_group()? { - Either::Left(set) => { - let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); - if let Some(v) = ignore { - self.parser().ignore_whitespace.set(v); - } - - concat.asts.push(Ast::flags(set)); - Ok(concat) - } - Either::Right(group) => { - let old_ignore_whitespace = self.ignore_whitespace(); - let new_ignore_whitespace = group - .flags() - .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) - .unwrap_or(old_ignore_whitespace); - self.parser().stack_group.borrow_mut().push( - GroupState::Group { - concat, - group, - ignore_whitespace: old_ignore_whitespace, - }, - ); - self.parser().ignore_whitespace.set(new_ignore_whitespace); - Ok(ast::Concat { span: self.span(), asts: vec![] }) - } - } - } - - /// Pop a group AST from the parser's internal stack and set the group's - /// AST to the given concatenation. Return the concatenation containing - /// the group. - /// - /// This assumes that the parser is currently positioned on the closing - /// parenthesis and advances the parser to the character following the `)`. - /// - /// If no such group could be popped, then an unopened group error is - /// returned. 
- #[inline(never)] - fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> { - use self::GroupState::*; - - assert_eq!(self.char(), ')'); - let mut stack = self.parser().stack_group.borrow_mut(); - let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack - .pop() - { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, None) - } - Some(Alternation(alt)) => match stack.pop() { - Some(Group { concat, group, ignore_whitespace }) => { - (concat, group, ignore_whitespace, Some(alt)) - } - None | Some(Alternation(_)) => { - return Err(self.error( - self.span_char(), - ast::ErrorKind::GroupUnopened, - )); - } - }, - None => { - return Err(self - .error(self.span_char(), ast::ErrorKind::GroupUnopened)); - } - }; - self.parser().ignore_whitespace.set(ignore_whitespace); - group_concat.span.end = self.pos(); - self.bump(); - group.span.end = self.pos(); - match alt { - Some(mut alt) => { - alt.span.end = group_concat.span.end; - alt.asts.push(group_concat.into_ast()); - group.ast = Box::new(alt.into_ast()); - } - None => { - group.ast = Box::new(group_concat.into_ast()); - } - } - prior_concat.asts.push(Ast::group(group)); - Ok(prior_concat) - } - - /// Pop the last state from the parser's internal stack, if it exists, and - /// add the given concatenation to it. There either must be no state or a - /// single alternation item on the stack. Any other scenario produces an - /// error. - /// - /// This assumes that the parser has advanced to the end. - #[inline(never)] - fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> { - concat.span.end = self.pos(); - let mut stack = self.parser().stack_group.borrow_mut(); - let ast = match stack.pop() { - None => Ok(concat.into_ast()), - Some(GroupState::Alternation(mut alt)) => { - alt.span.end = self.pos(); - alt.asts.push(concat.into_ast()); - Ok(Ast::alternation(alt)) - } - Some(GroupState::Group { group, .. 
}) => { - return Err( - self.error(group.span, ast::ErrorKind::GroupUnclosed) - ); - } - }; - // If we try to pop again, there should be nothing. - match stack.pop() { - None => ast, - Some(GroupState::Alternation(_)) => { - // This unreachable is unfortunate. This case can't happen - // because the only way we can be here is if there were two - // `GroupState::Alternation`s adjacent in the parser's stack, - // which we guarantee to never happen because we never push a - // `GroupState::Alternation` if one is already at the top of - // the stack. - unreachable!() - } - Some(GroupState::Group { group, .. }) => { - Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) - } - } - } - - /// Parse the opening of a character class and push the current class - /// parsing context onto the parser's stack. This assumes that the parser - /// is positioned at an opening `[`. The given union should correspond to - /// the union of set items built up before seeing the `[`. - /// - /// If there was a problem parsing the opening of the class, then an error - /// is returned. Otherwise, a new union of set items for the class is - /// returned (which may be populated with either a `]` or a `-`). - #[inline(never)] - fn push_class_open( - &self, - parent_union: ast::ClassSetUnion, - ) -> Result<ast::ClassSetUnion> { - assert_eq!(self.char(), '['); - - let (nested_set, nested_union) = self.parse_set_class_open()?; - self.parser() - .stack_class - .borrow_mut() - .push(ClassState::Open { union: parent_union, set: nested_set }); - Ok(nested_union) - } - - /// Parse the end of a character class set and pop the character class - /// parser stack. The union given corresponds to the last union built - /// before seeing the closing `]`. The union returned corresponds to the - /// parent character class set with the nested class added to it. - /// - /// This assumes that the parser is positioned at a `]` and will advance - /// the parser to the byte immediately following the `]`. 
- /// - /// If the stack is empty after popping, then this returns the final - /// "top-level" character class AST (where a "top-level" character class - /// is one that is not nested inside any other character class). - /// - /// If there is no corresponding opening bracket on the parser's stack, - /// then an error is returned. - #[inline(never)] - fn pop_class( - &self, - nested_union: ast::ClassSetUnion, - ) -> Result<Either<ast::ClassSetUnion, ast::ClassBracketed>> { - assert_eq!(self.char(), ']'); - - let item = ast::ClassSet::Item(nested_union.into_item()); - let prevset = self.pop_class_op(item); - let mut stack = self.parser().stack_class.borrow_mut(); - match stack.pop() { - None => { - // We can never observe an empty stack: - // - // 1) We are guaranteed to start with a non-empty stack since - // the character class parser is only initiated when it sees - // a `[`. - // 2) If we ever observe an empty stack while popping after - // seeing a `]`, then we signal the character class parser - // to terminate. - panic!("unexpected empty character class stack") - } - Some(ClassState::Op { .. }) => { - // This panic is unfortunate, but this case is impossible - // since we already popped the Op state if one exists above. - // Namely, every push to the class parser stack is guarded by - // whether an existing Op is already on the top of the stack. - // If it is, the existing Op is modified. That is, the stack - // can never have consecutive Op states. - panic!("unexpected ClassState::Op") - } - Some(ClassState::Open { mut union, mut set }) => { - self.bump(); - set.span.end = self.pos(); - set.kind = prevset; - if stack.is_empty() { - Ok(Either::Right(set)) - } else { - union.push(ast::ClassSetItem::Bracketed(Box::new(set))); - Ok(Either::Left(union)) - } - } - } - } - - /// Return an "unclosed class" error whose span points to the most - /// recently opened class. - /// - /// This should only be called while parsing a character class. 
- #[inline(never)] - fn unclosed_class_error(&self) -> ast::Error { - for state in self.parser().stack_class.borrow().iter().rev() { - if let ClassState::Open { ref set, .. } = *state { - return self.error(set.span, ast::ErrorKind::ClassUnclosed); - } - } - // We are guaranteed to have a non-empty stack with at least - // one open bracket, so we should never get here. - panic!("no open character class found") - } - - /// Push the current set of class items on to the class parser's stack as - /// the left hand side of the given operator. - /// - /// A fresh set union is returned, which should be used to build the right - /// hand side of this operator. - #[inline(never)] - fn push_class_op( - &self, - next_kind: ast::ClassSetBinaryOpKind, - next_union: ast::ClassSetUnion, - ) -> ast::ClassSetUnion { - let item = ast::ClassSet::Item(next_union.into_item()); - let new_lhs = self.pop_class_op(item); - self.parser() - .stack_class - .borrow_mut() - .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); - ast::ClassSetUnion { span: self.span(), items: vec![] } - } - - /// Pop a character class set from the character class parser stack. If the - /// top of the stack is just an item (not an operation), then return the - /// given set unchanged. If the top of the stack is an operation, then the - /// given set will be used as the rhs of the operation on the top of the - /// stack. In that case, the binary operation is returned as a set. - #[inline(never)] - fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { - let mut stack = self.parser().stack_class.borrow_mut(); - let (kind, lhs) = match stack.pop() { - Some(ClassState::Op { kind, lhs }) => (kind, lhs), - Some(state @ ClassState::Open { .. 
}) => { - stack.push(state); - return rhs; - } - None => unreachable!(), - }; - let span = Span::new(lhs.span().start, rhs.span().end); - ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { - span, - kind, - lhs: Box::new(lhs), - rhs: Box::new(rhs), - }) - } -} - -impl<'s, P: Borrow<Parser>> ParserI<'s, P> { - /// Parse the regular expression into an abstract syntax tree. - fn parse(&self) -> Result<Ast> { - self.parse_with_comments().map(|astc| astc.ast) - } - - /// Parse the regular expression and return an abstract syntax tree with - /// all of the comments found in the pattern. - fn parse_with_comments(&self) -> Result<ast::WithComments> { - assert_eq!(self.offset(), 0, "parser can only be used once"); - self.parser().reset(); - let mut concat = ast::Concat { span: self.span(), asts: vec![] }; - loop { - self.bump_space(); - if self.is_eof() { - break; - } - match self.char() { - '(' => concat = self.push_group(concat)?, - ')' => concat = self.pop_group(concat)?, - '|' => concat = self.push_alternate(concat)?, - '[' => { - let class = self.parse_set_class()?; - concat.asts.push(Ast::class_bracketed(class)); - } - '?' => { - concat = self.parse_uncounted_repetition( - concat, - ast::RepetitionKind::ZeroOrOne, - )?; - } - '*' => { - concat = self.parse_uncounted_repetition( - concat, - ast::RepetitionKind::ZeroOrMore, - )?; - } - '+' => { - concat = self.parse_uncounted_repetition( - concat, - ast::RepetitionKind::OneOrMore, - )?; - } - '{' => { - concat = self.parse_counted_repetition(concat)?; - } - _ => concat.asts.push(self.parse_primitive()?.into_ast()), - } - } - let ast = self.pop_group_end(concat)?; - NestLimiter::new(self).check(&ast)?; - Ok(ast::WithComments { - ast, - comments: mem::replace( - &mut *self.parser().comments.borrow_mut(), - vec![], - ), - }) - } - - /// Parses an uncounted repetition operation. An uncounted repetition - /// operator includes ?, * and +, but does not include the {m,n} syntax. 
- /// The given `kind` should correspond to the operator observed by the - /// caller. - /// - /// This assumes that the parser is currently positioned at the repetition - /// operator and advances the parser to the first character after the - /// operator. (Note that the operator may include a single additional `?`, - /// which makes the operator ungreedy.) - /// - /// The caller should include the concatenation that is being built. The - /// concatenation returned includes the repetition operator applied to the - /// last expression in the given concatenation. - #[inline(never)] - fn parse_uncounted_repetition( - &self, - mut concat: ast::Concat, - kind: ast::RepetitionKind, - ) -> Result<ast::Concat> { - assert!( - self.char() == '?' || self.char() == '*' || self.char() == '+' - ); - let op_start = self.pos(); - let ast = match concat.asts.pop() { - Some(ast) => ast, - None => { - return Err( - self.error(self.span(), ast::ErrorKind::RepetitionMissing) - ) - } - }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { - return Err( - self.error(self.span(), ast::ErrorKind::RepetitionMissing) - ) - } - _ => {} - } - let mut greedy = true; - if self.bump() && self.char() == '?' { - greedy = false; - self.bump(); - } - concat.asts.push(Ast::repetition(ast::Repetition { - span: ast.span().with_end(self.pos()), - op: ast::RepetitionOp { - span: Span::new(op_start, self.pos()), - kind, - }, - greedy, - ast: Box::new(ast), - })); - Ok(concat) - } - - /// Parses a counted repetition operation. A counted repetition operator - /// corresponds to the {m,n} syntax, and does not include the ?, * or + - /// operators. - /// - /// This assumes that the parser is currently positioned at the opening `{` - /// and advances the parser to the first character after the operator. - /// (Note that the operator may include a single additional `?`, which - /// makes the operator ungreedy.) - /// - /// The caller should include the concatenation that is being built. 
The - /// concatenation returned includes the repetition operator applied to the - /// last expression in the given concatenation. - #[inline(never)] - fn parse_counted_repetition( - &self, - mut concat: ast::Concat, - ) -> Result<ast::Concat> { - assert!(self.char() == '{'); - let start = self.pos(); - let ast = match concat.asts.pop() { - Some(ast) => ast, - None => { - return Err( - self.error(self.span(), ast::ErrorKind::RepetitionMissing) - ) - } - }; - match ast { - Ast::Empty(_) | Ast::Flags(_) => { - return Err( - self.error(self.span(), ast::ErrorKind::RepetitionMissing) - ) - } - _ => {} - } - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::RepetitionCountUnclosed, - )); - } - let count_start = specialize_err( - self.parse_decimal(), - ast::ErrorKind::DecimalEmpty, - ast::ErrorKind::RepetitionCountDecimalEmpty, - ); - if self.is_eof() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::RepetitionCountUnclosed, - )); - } - let range = if self.char() == ',' { - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::RepetitionCountUnclosed, - )); - } - if self.char() != '}' { - let count_start = match count_start { - Ok(c) => c, - Err(err) - if err.kind - == ast::ErrorKind::RepetitionCountDecimalEmpty => - { - if self.parser().empty_min_range { - 0 - } else { - return Err(err); - } - } - err => err?, - }; - let count_end = specialize_err( - self.parse_decimal(), - ast::ErrorKind::DecimalEmpty, - ast::ErrorKind::RepetitionCountDecimalEmpty, - )?; - ast::RepetitionRange::Bounded(count_start, count_end) - } else { - ast::RepetitionRange::AtLeast(count_start?) - } - } else { - ast::RepetitionRange::Exactly(count_start?) 
- }; - - if self.is_eof() || self.char() != '}' { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::RepetitionCountUnclosed, - )); - } - - let mut greedy = true; - if self.bump_and_bump_space() && self.char() == '?' { - greedy = false; - self.bump(); - } - - let op_span = Span::new(start, self.pos()); - if !range.is_valid() { - return Err( - self.error(op_span, ast::ErrorKind::RepetitionCountInvalid) - ); - } - concat.asts.push(Ast::repetition(ast::Repetition { - span: ast.span().with_end(self.pos()), - op: ast::RepetitionOp { - span: op_span, - kind: ast::RepetitionKind::Range(range), - }, - greedy, - ast: Box::new(ast), - })); - Ok(concat) - } - - /// Parse a group (which contains a sub-expression) or a set of flags. - /// - /// If a group was found, then it is returned with an empty AST. If a set - /// of flags is found, then that set is returned. - /// - /// The parser should be positioned at the opening parenthesis. - /// - /// This advances the parser to the character before the start of the - /// sub-expression (in the case of a group) or to the closing parenthesis - /// immediately following the set of flags. - /// - /// # Errors - /// - /// If flags are given and incorrectly specified, then a corresponding - /// error is returned. - /// - /// If a capture name is given and it is incorrectly specified, then a - /// corresponding error is returned. 
- #[inline(never)] - fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> { - assert_eq!(self.char(), '('); - let open_span = self.span_char(); - self.bump(); - self.bump_space(); - if self.is_lookaround_prefix() { - return Err(self.error( - Span::new(open_span.start, self.span().end), - ast::ErrorKind::UnsupportedLookAround, - )); - } - let inner_span = self.span(); - let mut starts_with_p = true; - if self.bump_if("?P<") || { - starts_with_p = false; - self.bump_if("?<") - } { - let capture_index = self.next_capture_index(open_span)?; - let name = self.parse_capture_name(capture_index)?; - Ok(Either::Right(ast::Group { - span: open_span, - kind: ast::GroupKind::CaptureName { starts_with_p, name }, - ast: Box::new(Ast::empty(self.span())), - })) - } else if self.bump_if("?") { - if self.is_eof() { - return Err( - self.error(open_span, ast::ErrorKind::GroupUnclosed) - ); - } - let flags = self.parse_flags()?; - let char_end = self.char(); - self.bump(); - if char_end == ')' { - // We don't allow empty flags, e.g., `(?)`. We instead - // interpret it as a repetition operator missing its argument. - if flags.items.is_empty() { - return Err(self.error( - inner_span, - ast::ErrorKind::RepetitionMissing, - )); - } - Ok(Either::Left(ast::SetFlags { - span: Span { end: self.pos(), ..open_span }, - flags, - })) - } else { - assert_eq!(char_end, ':'); - Ok(Either::Right(ast::Group { - span: open_span, - kind: ast::GroupKind::NonCapturing(flags), - ast: Box::new(Ast::empty(self.span())), - })) - } - } else { - let capture_index = self.next_capture_index(open_span)?; - Ok(Either::Right(ast::Group { - span: open_span, - kind: ast::GroupKind::CaptureIndex(capture_index), - ast: Box::new(Ast::empty(self.span())), - })) - } - } - - /// Parses a capture group name. Assumes that the parser is positioned at - /// the first character in the name following the opening `<` (and may - /// possibly be EOF). 
This advances the parser to the first character - /// following the closing `>`. - /// - /// The caller must provide the capture index of the group for this name. - #[inline(never)] - fn parse_capture_name( - &self, - capture_index: u32, - ) -> Result<ast::CaptureName> { - if self.is_eof() { - return Err(self - .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); - } - let start = self.pos(); - loop { - if self.char() == '>' { - break; - } - if !is_capture_char(self.char(), self.pos() == start) { - return Err(self.error( - self.span_char(), - ast::ErrorKind::GroupNameInvalid, - )); - } - if !self.bump() { - break; - } - } - let end = self.pos(); - if self.is_eof() { - return Err(self - .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof)); - } - assert_eq!(self.char(), '>'); - self.bump(); - let name = &self.pattern()[start.offset..end.offset]; - if name.is_empty() { - return Err(self.error( - Span::new(start, start), - ast::ErrorKind::GroupNameEmpty, - )); - } - let capname = ast::CaptureName { - span: Span::new(start, end), - name: name.to_string(), - index: capture_index, - }; - self.add_capture_name(&capname)?; - Ok(capname) - } - - /// Parse a sequence of flags starting at the current character. - /// - /// This advances the parser to the character immediately following the - /// flags, which is guaranteed to be either `:` or `)`. - /// - /// # Errors - /// - /// If any flags are duplicated, then an error is returned. - /// - /// If the negation operator is used more than once, then an error is - /// returned. - /// - /// If no flags could be found or if the negation operation is not followed - /// by any flags, then an error is returned. 
- #[inline(never)] - fn parse_flags(&self) -> Result<ast::Flags> { - let mut flags = ast::Flags { span: self.span(), items: vec![] }; - let mut last_was_negation = None; - while self.char() != ':' && self.char() != ')' { - if self.char() == '-' { - last_was_negation = Some(self.span_char()); - let item = ast::FlagsItem { - span: self.span_char(), - kind: ast::FlagsItemKind::Negation, - }; - if let Some(i) = flags.add_item(item) { - return Err(self.error( - self.span_char(), - ast::ErrorKind::FlagRepeatedNegation { - original: flags.items[i].span, - }, - )); - } - } else { - last_was_negation = None; - let item = ast::FlagsItem { - span: self.span_char(), - kind: ast::FlagsItemKind::Flag(self.parse_flag()?), - }; - if let Some(i) = flags.add_item(item) { - return Err(self.error( - self.span_char(), - ast::ErrorKind::FlagDuplicate { - original: flags.items[i].span, - }, - )); - } - } - if !self.bump() { - return Err( - self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof) - ); - } - } - if let Some(span) = last_was_negation { - return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation)); - } - flags.span.end = self.pos(); - Ok(flags) - } - - /// Parse the current character as a flag. Do not advance the parser. - /// - /// # Errors - /// - /// If the flag is not recognized, then an error is returned. - #[inline(never)] - fn parse_flag(&self) -> Result<ast::Flag> { - match self.char() { - 'i' => Ok(ast::Flag::CaseInsensitive), - 'm' => Ok(ast::Flag::MultiLine), - 's' => Ok(ast::Flag::DotMatchesNewLine), - 'U' => Ok(ast::Flag::SwapGreed), - 'u' => Ok(ast::Flag::Unicode), - 'R' => Ok(ast::Flag::CRLF), - 'x' => Ok(ast::Flag::IgnoreWhitespace), - _ => { - Err(self - .error(self.span_char(), ast::ErrorKind::FlagUnrecognized)) - } - } - } - - /// Parse a primitive AST. e.g., A literal, non-set character class or - /// assertion. - /// - /// This assumes that the parser expects a primitive at the current - /// location. 
i.e., All other non-primitive cases have been handled. - /// For example, if the parser's position is at `|`, then `|` will be - /// treated as a literal (e.g., inside a character class). - /// - /// This advances the parser to the first character immediately following - /// the primitive. - fn parse_primitive(&self) -> Result<Primitive> { - match self.char() { - '\\' => self.parse_escape(), - '.' => { - let ast = Primitive::Dot(self.span_char()); - self.bump(); - Ok(ast) - } - '^' => { - let ast = Primitive::Assertion(ast::Assertion { - span: self.span_char(), - kind: ast::AssertionKind::StartLine, - }); - self.bump(); - Ok(ast) - } - '$' => { - let ast = Primitive::Assertion(ast::Assertion { - span: self.span_char(), - kind: ast::AssertionKind::EndLine, - }); - self.bump(); - Ok(ast) - } - c => { - let ast = Primitive::Literal(ast::Literal { - span: self.span_char(), - kind: ast::LiteralKind::Verbatim, - c, - }); - self.bump(); - Ok(ast) - } - } - } - - /// Parse an escape sequence as a primitive AST. - /// - /// This assumes the parser is positioned at the start of the escape - /// sequence, i.e., `\`. It advances the parser to the first position - /// immediately following the escape sequence. - #[inline(never)] - fn parse_escape(&self) -> Result<Primitive> { - assert_eq!(self.char(), '\\'); - let start = self.pos(); - if !self.bump() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::EscapeUnexpectedEof, - )); - } - let c = self.char(); - // Put some of the more complicated routines into helpers. 
- match c { - '0'..='7' => { - if !self.parser().octal { - return Err(self.error( - Span::new(start, self.span_char().end), - ast::ErrorKind::UnsupportedBackreference, - )); - } - let mut lit = self.parse_octal(); - lit.span.start = start; - return Ok(Primitive::Literal(lit)); - } - '8'..='9' if !self.parser().octal => { - return Err(self.error( - Span::new(start, self.span_char().end), - ast::ErrorKind::UnsupportedBackreference, - )); - } - 'x' | 'u' | 'U' => { - let mut lit = self.parse_hex()?; - lit.span.start = start; - return Ok(Primitive::Literal(lit)); - } - 'p' | 'P' => { - let mut cls = self.parse_unicode_class()?; - cls.span.start = start; - return Ok(Primitive::Unicode(cls)); - } - 'd' | 's' | 'w' | 'D' | 'S' | 'W' => { - let mut cls = self.parse_perl_class(); - cls.span.start = start; - return Ok(Primitive::Perl(cls)); - } - _ => {} - } - - // Handle all of the one letter sequences inline. - self.bump(); - let span = Span::new(start, self.pos()); - if is_meta_character(c) { - return Ok(Primitive::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Meta, - c, - })); - } - if is_escapeable_character(c) { - return Ok(Primitive::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Superfluous, - c, - })); - } - let special = |kind, c| { - Ok(Primitive::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Special(kind), - c, - })) - }; - match c { - 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'), - 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'), - 't' => special(ast::SpecialLiteralKind::Tab, '\t'), - 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'), - 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'), - 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'), - 'A' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::StartText, - })), - 'z' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::EndText, - })), - 'b' => { - let mut wb = ast::Assertion { 
- span, - kind: ast::AssertionKind::WordBoundary, - }; - // After a \b, we "try" to parse things like \b{start} for - // special word boundary assertions. - if !self.is_eof() && self.char() == '{' { - if let Some(kind) = - self.maybe_parse_special_word_boundary(start)? - { - wb.kind = kind; - wb.span.end = self.pos(); - } - } - Ok(Primitive::Assertion(wb)) - } - 'B' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::NotWordBoundary, - })), - '<' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundaryStartAngle, - })), - '>' => Ok(Primitive::Assertion(ast::Assertion { - span, - kind: ast::AssertionKind::WordBoundaryEndAngle, - })), - _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)), - } - } - - /// Attempt to parse a specialty word boundary. That is, `\b{start}`, - /// `\b{end}`, `\b{start-half}` or `\b{end-half}`. - /// - /// This is similar to `maybe_parse_ascii_class` in that, in most cases, - /// if it fails it will just return `None` with no error. This is done - /// because `\b{5}` is a valid expression and we want to let that be parsed - /// by the existing counted repetition parsing code. (I thought about just - /// invoking the counted repetition code from here, but it seemed a little - /// ham-fisted.) - /// - /// Unlike `maybe_parse_ascii_class` though, this can return an error. - /// Namely, if we definitely know it isn't a counted repetition, then we - /// return an error specific to the specialty word boundaries. - /// - /// This assumes the parser is positioned at a `{` immediately following - /// a `\b`. When `None` is returned, the parser is returned to the position - /// at which it started: pointing at a `{`. - /// - /// The position given should correspond to the start of the `\b`. 
- fn maybe_parse_special_word_boundary( - &self, - wb_start: Position, - ) -> Result<Option<ast::AssertionKind>> { - assert_eq!(self.char(), '{'); - - let is_valid_char = |c| match c { - 'A'..='Z' | 'a'..='z' | '-' => true, - _ => false, - }; - let start = self.pos(); - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(wb_start, self.pos()), - ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, - )); - } - let start_contents = self.pos(); - // This is one of the critical bits: if the first non-whitespace - // character isn't in [-A-Za-z] (i.e., this can't be a special word - // boundary), then we bail and let the counted repetition parser deal - // with this. - if !is_valid_char(self.char()) { - self.parser().pos.set(start); - return Ok(None); - } - - // Now collect up our chars until we see a '}'. - let mut scratch = self.parser().scratch.borrow_mut(); - scratch.clear(); - while !self.is_eof() && is_valid_char(self.char()) { - scratch.push(self.char()); - self.bump_and_bump_space(); - } - if self.is_eof() || self.char() != '}' { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::SpecialWordBoundaryUnclosed, - )); - } - let end = self.pos(); - self.bump(); - let kind = match scratch.as_str() { - "start" => ast::AssertionKind::WordBoundaryStart, - "end" => ast::AssertionKind::WordBoundaryEnd, - "start-half" => ast::AssertionKind::WordBoundaryStartHalf, - "end-half" => ast::AssertionKind::WordBoundaryEndHalf, - _ => { - return Err(self.error( - Span::new(start_contents, end), - ast::ErrorKind::SpecialWordBoundaryUnrecognized, - )) - } - }; - Ok(Some(kind)) - } - - /// Parse an octal representation of a Unicode codepoint up to 3 digits - /// long. This expects the parser to be positioned at the first octal - /// digit and advances the parser to the first character immediately - /// following the octal number. This also assumes that parsing octal - /// escapes is enabled. 
- /// - /// Assuming the preconditions are met, this routine can never fail. - #[inline(never)] - fn parse_octal(&self) -> ast::Literal { - assert!(self.parser().octal); - assert!('0' <= self.char() && self.char() <= '7'); - let start = self.pos(); - // Parse up to two more digits. - while self.bump() - && '0' <= self.char() - && self.char() <= '7' - && self.pos().offset - start.offset <= 2 - {} - let end = self.pos(); - let octal = &self.pattern()[start.offset..end.offset]; - // Parsing the octal should never fail since the above guarantees a - // valid number. - let codepoint = - u32::from_str_radix(octal, 8).expect("valid octal number"); - // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no - // invalid Unicode scalar values. - let c = char::from_u32(codepoint).expect("Unicode scalar value"); - ast::Literal { - span: Span::new(start, end), - kind: ast::LiteralKind::Octal, - c, - } - } - - /// Parse a hex representation of a Unicode codepoint. This handles both - /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to - /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to - /// the first character immediately following the hexadecimal literal. - #[inline(never)] - fn parse_hex(&self) -> Result<ast::Literal> { - assert!( - self.char() == 'x' || self.char() == 'u' || self.char() == 'U' - ); - - let hex_kind = match self.char() { - 'x' => ast::HexLiteralKind::X, - 'u' => ast::HexLiteralKind::UnicodeShort, - _ => ast::HexLiteralKind::UnicodeLong, - }; - if !self.bump_and_bump_space() { - return Err( - self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) - ); - } - if self.char() == '{' { - self.parse_hex_brace(hex_kind) - } else { - self.parse_hex_digits(hex_kind) - } - } - - /// Parse an N-digit hex representation of a Unicode codepoint. 
This - /// expects the parser to be positioned at the first digit and will advance - /// the parser to the first character immediately following the escape - /// sequence. - /// - /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`) - /// or 8 (for `\UNNNNNNNN`). - #[inline(never)] - fn parse_hex_digits( - &self, - kind: ast::HexLiteralKind, - ) -> Result<ast::Literal> { - let mut scratch = self.parser().scratch.borrow_mut(); - scratch.clear(); - - let start = self.pos(); - for i in 0..kind.digits() { - if i > 0 && !self.bump_and_bump_space() { - return Err(self - .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); - } - if !is_hex(self.char()) { - return Err(self.error( - self.span_char(), - ast::ErrorKind::EscapeHexInvalidDigit, - )); - } - scratch.push(self.char()); - } - // The final bump just moves the parser past the literal, which may - // be EOF. - self.bump_and_bump_space(); - let end = self.pos(); - let hex = scratch.as_str(); - match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { - None => Err(self.error( - Span::new(start, end), - ast::ErrorKind::EscapeHexInvalid, - )), - Some(c) => Ok(ast::Literal { - span: Span::new(start, end), - kind: ast::LiteralKind::HexFixed(kind), - c, - }), - } - } - - /// Parse a hex representation of any Unicode scalar value. This expects - /// the parser to be positioned at the opening brace `{` and will advance - /// the parser to the first character following the closing brace `}`. 
- #[inline(never)] - fn parse_hex_brace( - &self, - kind: ast::HexLiteralKind, - ) -> Result<ast::Literal> { - let mut scratch = self.parser().scratch.borrow_mut(); - scratch.clear(); - - let brace_pos = self.pos(); - let start = self.span_char().end; - while self.bump_and_bump_space() && self.char() != '}' { - if !is_hex(self.char()) { - return Err(self.error( - self.span_char(), - ast::ErrorKind::EscapeHexInvalidDigit, - )); - } - scratch.push(self.char()); - } - if self.is_eof() { - return Err(self.error( - Span::new(brace_pos, self.pos()), - ast::ErrorKind::EscapeUnexpectedEof, - )); - } - let end = self.pos(); - let hex = scratch.as_str(); - assert_eq!(self.char(), '}'); - self.bump_and_bump_space(); - - if hex.is_empty() { - return Err(self.error( - Span::new(brace_pos, self.pos()), - ast::ErrorKind::EscapeHexEmpty, - )); - } - match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) { - None => Err(self.error( - Span::new(start, end), - ast::ErrorKind::EscapeHexInvalid, - )), - Some(c) => Ok(ast::Literal { - span: Span::new(start, self.pos()), - kind: ast::LiteralKind::HexBrace(kind), - c, - }), - } - } - - /// Parse a decimal number into a u32 while trimming leading and trailing - /// whitespace. - /// - /// This expects the parser to be positioned at the first position where - /// a decimal digit could occur. This will advance the parser to the byte - /// immediately following the last contiguous decimal digit. - /// - /// If no decimal digit could be found or if there was a problem parsing - /// the complete set of digits into a u32, then an error is returned. 
- fn parse_decimal(&self) -> Result<u32> { - let mut scratch = self.parser().scratch.borrow_mut(); - scratch.clear(); - - while !self.is_eof() && self.char().is_whitespace() { - self.bump(); - } - let start = self.pos(); - while !self.is_eof() && '0' <= self.char() && self.char() <= '9' { - scratch.push(self.char()); - self.bump_and_bump_space(); - } - let span = Span::new(start, self.pos()); - while !self.is_eof() && self.char().is_whitespace() { - self.bump_and_bump_space(); - } - let digits = scratch.as_str(); - if digits.is_empty() { - return Err(self.error(span, ast::ErrorKind::DecimalEmpty)); - } - match u32::from_str_radix(digits, 10).ok() { - Some(n) => Ok(n), - None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)), - } - } - - /// Parse a standard character class consisting primarily of characters or - /// character ranges, but can also contain nested character classes of - /// any type (sans `.`). - /// - /// This assumes the parser is positioned at the opening `[`. If parsing - /// is successful, then the parser is advanced to the position immediately - /// following the closing `]`. - #[inline(never)] - fn parse_set_class(&self) -> Result<ast::ClassBracketed> { - assert_eq!(self.char(), '['); - - let mut union = - ast::ClassSetUnion { span: self.span(), items: vec![] }; - loop { - self.bump_space(); - if self.is_eof() { - return Err(self.unclosed_class_error()); - } - match self.char() { - '[' => { - // If we've already parsed the opening bracket, then - // attempt to treat this as the beginning of an ASCII - // class. If ASCII class parsing fails, then the parser - // backs up to `[`. - if !self.parser().stack_class.borrow().is_empty() { - if let Some(cls) = self.maybe_parse_ascii_class() { - union.push(ast::ClassSetItem::Ascii(cls)); - continue; - } - } - union = self.push_class_open(union)?; - } - ']' => match self.pop_class(union)? 
{ - Either::Left(nested_union) => { - union = nested_union; - } - Either::Right(class) => return Ok(class), - }, - '&' if self.peek() == Some('&') => { - assert!(self.bump_if("&&")); - union = self.push_class_op( - ast::ClassSetBinaryOpKind::Intersection, - union, - ); - } - '-' if self.peek() == Some('-') => { - assert!(self.bump_if("--")); - union = self.push_class_op( - ast::ClassSetBinaryOpKind::Difference, - union, - ); - } - '~' if self.peek() == Some('~') => { - assert!(self.bump_if("~~")); - union = self.push_class_op( - ast::ClassSetBinaryOpKind::SymmetricDifference, - union, - ); - } - _ => { - union.push(self.parse_set_class_range()?); - } - } - } - } - - /// Parse a single primitive item in a character class set. The item to - /// be parsed can either be one of a simple literal character, a range - /// between two simple literal characters or a "primitive" character - /// class like \w or \p{Greek}. - /// - /// If an invalid escape is found, or if a character class is found where - /// a simple literal is expected (e.g., in a range), then an error is - /// returned. - #[inline(never)] - fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> { - let prim1 = self.parse_set_class_item()?; - self.bump_space(); - if self.is_eof() { - return Err(self.unclosed_class_error()); - } - // If the next char isn't a `-`, then we don't have a range. - // There are two exceptions. If the char after a `-` is a `]`, then - // `-` is interpreted as a literal `-`. Alternatively, if the char - // after a `-` is a `-`, then `--` corresponds to a "difference" - // operation. - if self.char() != '-' - || self.peek_space() == Some(']') - || self.peek_space() == Some('-') - { - return prim1.into_class_set_item(self); - } - // OK, now we're parsing a range, so bump past the `-` and parse the - // second half of the range. 
- if !self.bump_and_bump_space() { - return Err(self.unclosed_class_error()); - } - let prim2 = self.parse_set_class_item()?; - let range = ast::ClassSetRange { - span: Span::new(prim1.span().start, prim2.span().end), - start: prim1.into_class_literal(self)?, - end: prim2.into_class_literal(self)?, - }; - if !range.is_valid() { - return Err( - self.error(range.span, ast::ErrorKind::ClassRangeInvalid) - ); - } - Ok(ast::ClassSetItem::Range(range)) - } - - /// Parse a single item in a character class as a primitive, where the - /// primitive either consists of a verbatim literal or a single escape - /// sequence. - /// - /// This assumes the parser is positioned at the beginning of a primitive, - /// and advances the parser to the first position after the primitive if - /// successful. - /// - /// Note that it is the caller's responsibility to report an error if an - /// illegal primitive was parsed. - #[inline(never)] - fn parse_set_class_item(&self) -> Result<Primitive> { - if self.char() == '\\' { - self.parse_escape() - } else { - let x = Primitive::Literal(ast::Literal { - span: self.span_char(), - kind: ast::LiteralKind::Verbatim, - c: self.char(), - }); - self.bump(); - Ok(x) - } - } - - /// Parses the opening of a character class set. This includes the opening - /// bracket along with `^` if present to indicate negation. This also - /// starts parsing the opening set of unioned items if applicable, since - /// there are special rules applied to certain characters in the opening - /// of a character class. For example, `[^]]` is the class of all - /// characters not equal to `]`. (`]` would need to be escaped in any other - /// position.) Similarly for `-`. - /// - /// In all cases, the op inside the returned `ast::ClassBracketed` is an - /// empty union. This empty union should be replaced with the actual item - /// when it is popped from the parser's stack. 
- /// - /// This assumes the parser is positioned at the opening `[` and advances - /// the parser to the first non-special byte of the character class. - /// - /// An error is returned if EOF is found. - #[inline(never)] - fn parse_set_class_open( - &self, - ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> { - assert_eq!(self.char(), '['); - let start = self.pos(); - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::ClassUnclosed, - )); - } - - let negated = if self.char() != '^' { - false - } else { - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::ClassUnclosed, - )); - } - true - }; - // Accept any number of `-` as literal `-`. - let mut union = - ast::ClassSetUnion { span: self.span(), items: vec![] }; - while self.char() == '-' { - union.push(ast::ClassSetItem::Literal(ast::Literal { - span: self.span_char(), - kind: ast::LiteralKind::Verbatim, - c: '-', - })); - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, start), - ast::ErrorKind::ClassUnclosed, - )); - } - } - // If `]` is the *first* char in a set, then interpret it as a literal - // `]`. That is, an empty class is impossible to write. - if union.items.is_empty() && self.char() == ']' { - union.push(ast::ClassSetItem::Literal(ast::Literal { - span: self.span_char(), - kind: ast::LiteralKind::Verbatim, - c: ']', - })); - if !self.bump_and_bump_space() { - return Err(self.error( - Span::new(start, self.pos()), - ast::ErrorKind::ClassUnclosed, - )); - } - } - let set = ast::ClassBracketed { - span: Span::new(start, self.pos()), - negated, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: Span::new(union.span.start, union.span.start), - items: vec![], - }), - }; - Ok((set, union)) - } - - /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`. - /// - /// This assumes the parser is positioned at the opening `[`. 
- /// - /// If no valid ASCII character class could be found, then this does not - /// advance the parser and `None` is returned. Otherwise, the parser is - /// advanced to the first byte following the closing `]` and the - /// corresponding ASCII class is returned. - #[inline(never)] - fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> { - // ASCII character classes are interesting from a parsing perspective - // because parsing cannot fail with any interesting error. For example, - // in order to use an ASCII character class, it must be enclosed in - // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think - // of it as "ASCII character classes have the syntax `[:NAME:]` which - // can only appear within character brackets." This means that things - // like `[[:lower:]A]` are legal constructs. - // - // However, if one types an incorrect ASCII character class, e.g., - // `[[:loower:]]`, then we treat that as a normal nested character - // class containing the characters `:elorw`. One might argue that we - // should return an error instead since the repeated colons give away - // the intent to write an ASCII class. But what if the user typed - // `[[:lower]]` instead? How can we tell that was intended to be an - // ASCII class and not just a normal nested class? - // - // Reasonable people can probably disagree over this, but for better - // or worse, we implement semantics that never fails at the expense - // of better failure modes. - assert_eq!(self.char(), '['); - // If parsing fails, then we back up the parser to this starting point. 
- let start = self.pos(); - let mut negated = false; - if !self.bump() || self.char() != ':' { - self.parser().pos.set(start); - return None; - } - if !self.bump() { - self.parser().pos.set(start); - return None; - } - if self.char() == '^' { - negated = true; - if !self.bump() { - self.parser().pos.set(start); - return None; - } - } - let name_start = self.offset(); - while self.char() != ':' && self.bump() {} - if self.is_eof() { - self.parser().pos.set(start); - return None; - } - let name = &self.pattern()[name_start..self.offset()]; - if !self.bump_if(":]") { - self.parser().pos.set(start); - return None; - } - let kind = match ast::ClassAsciiKind::from_name(name) { - Some(kind) => kind, - None => { - self.parser().pos.set(start); - return None; - } - }; - Some(ast::ClassAscii { - span: Span::new(start, self.pos()), - kind, - negated, - }) - } - - /// Parse a Unicode class in either the single character notation, `\pN` - /// or the multi-character bracketed notation, `\p{Greek}`. This assumes - /// the parser is positioned at the `p` (or `P` for negation) and will - /// advance the parser to the character immediately following the class. - /// - /// Note that this does not check whether the class name is valid or not. 
- #[inline(never)] - fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> { - assert!(self.char() == 'p' || self.char() == 'P'); - - let mut scratch = self.parser().scratch.borrow_mut(); - scratch.clear(); - - let negated = self.char() == 'P'; - if !self.bump_and_bump_space() { - return Err( - self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof) - ); - } - let (start, kind) = if self.char() == '{' { - let start = self.span_char().end; - while self.bump_and_bump_space() && self.char() != '}' { - scratch.push(self.char()); - } - if self.is_eof() { - return Err(self - .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)); - } - assert_eq!(self.char(), '}'); - self.bump(); - - let name = scratch.as_str(); - if let Some(i) = name.find("!=") { - ( - start, - ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::NotEqual, - name: name[..i].to_string(), - value: name[i + 2..].to_string(), - }, - ) - } else if let Some(i) = name.find(':') { - ( - start, - ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Colon, - name: name[..i].to_string(), - value: name[i + 1..].to_string(), - }, - ) - } else if let Some(i) = name.find('=') { - ( - start, - ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Equal, - name: name[..i].to_string(), - value: name[i + 1..].to_string(), - }, - ) - } else { - (start, ast::ClassUnicodeKind::Named(name.to_string())) - } - } else { - let start = self.pos(); - let c = self.char(); - if c == '\\' { - return Err(self.error( - self.span_char(), - ast::ErrorKind::UnicodeClassInvalid, - )); - } - self.bump_and_bump_space(); - let kind = ast::ClassUnicodeKind::OneLetter(c); - (start, kind) - }; - Ok(ast::ClassUnicode { - span: Span::new(start, self.pos()), - negated, - kind, - }) - } - - /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the - /// parser is currently at a valid character class name and will be - /// advanced to the character immediately following the class. 
#[inline(never)]
fn parse_perl_class(&self) -> ast::ClassPerl {
    // Record the class letter and its span before stepping past it.
    let letter = self.char();
    let span = self.span_char();
    self.bump();
    // The lowercase letter selects the class; uppercase negates it.
    let (negated, kind) = match letter {
        'd' | 'D' => (letter == 'D', ast::ClassPerlKind::Digit),
        's' | 'S' => (letter == 'S', ast::ClassPerlKind::Space),
        'w' | 'W' => (letter == 'W', ast::ClassPerlKind::Word),
        other => panic!("expected valid Perl class but got '{}'", other),
    };
    ast::ClassPerl { span, kind, negated }
}
// Closes the `impl` block that holds the parser methods above.
}

/// Walks a fully parsed `Ast` and reports an error once its nesting depth
/// goes beyond the parser's configured nest limit.
#[derive(Debug)]
struct NestLimiter<'p, 's, P> {
    /// The parser whose nest limit is being enforced.
    p: &'p ParserI<'s, P>,
    /// How deep the walk currently is.
    depth: u32,
}

impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
    /// Create a limiter for the given parser, starting at depth zero.
    fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
        NestLimiter { depth: 0, p }
    }

    /// Visit `ast` with this limiter, failing if the limit is exceeded.
    #[inline(never)]
    fn check(self, ast: &Ast) -> Result<()> {
        ast::visit(ast, self)
    }

    /// Go one level deeper. Returns `NestLimitExceeded` (reported at
    /// `span`) if the counter would overflow or pass the parser's limit;
    /// in either case the stored depth is left untouched.
    fn increment_depth(&mut self, span: &Span) -> Result<()> {
        let deeper = match self.depth.checked_add(1) {
            Some(deeper) => deeper,
            None => {
                return Err(self.p.error(
                    span.clone(),
                    ast::ErrorKind::NestLimitExceeded(u32::MAX),
                ));
            }
        };
        let limit = self.p.parser().nest_limit;
        if deeper <= limit {
            self.depth = deeper;
            Ok(())
        } else {
            Err(self
                .p
                .error(span.clone(), ast::ErrorKind::NestLimitExceeded(limit)))
        }
    }

    /// Come back up one level.
    fn decrement_depth(&mut self) {
        // A correct visitor pairs every decrement with an earlier
        // increment, so this should never drop below 0.
        let shallower = self.depth.checked_sub(1).unwrap();
        self.depth = shallower;
    }
}

impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
    type Output = ();
    type Err = ast::Error;

    fn finish(self) -> Result<()> {
        Ok(())
    }

    fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
        match ast {
            // Base cases add no nesting.
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::ClassUnicode(_)
            | Ast::ClassPerl(_) => Ok(()),
            Ast::ClassBracketed(x) => self.increment_depth(&x.span),
            Ast::Repetition(x) => self.increment_depth(&x.span),
            Ast::Group(x) => self.increment_depth(&x.span),
            Ast::Alternation(x) => self.increment_depth(&x.span),
            Ast::Concat(x) => self.increment_depth(&x.span),
        }
    }

    fn visit_post(&mut self, ast: &Ast) -> Result<()> {
        match ast {
            // Undo the increment made for the same node in `visit_pre`.
            Ast::ClassBracketed(_)
            | Ast::Repetition(_)
            | Ast::Group(_)
            | Ast::Alternation(_)
            | Ast::Concat(_) => self.decrement_depth(),
            // Base cases never incremented, so there is nothing to undo.
            Ast::Empty(_)
            | Ast::Flags(_)
            | Ast::Literal(_)
            | Ast::Dot(_)
            | Ast::Assertion(_)
            | Ast::ClassUnicode(_)
            | Ast::ClassPerl(_) => {}
        }
        Ok(())
    }

    fn visit_class_set_item_pre(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        match ast {
            // Base cases add no nesting.
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => Ok(()),
            ast::ClassSetItem::Bracketed(x) => self.increment_depth(&x.span),
            ast::ClassSetItem::Union(x) => self.increment_depth(&x.span),
        }
    }

    fn visit_class_set_item_post(
        &mut self,
        ast: &ast::ClassSetItem,
    ) -> Result<()> {
        match ast {
            ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
                self.decrement_depth()
            }
            // Base cases never incremented, so there is nothing to undo.
            ast::ClassSetItem::Empty(_)
            | ast::ClassSetItem::Literal(_)
            | ast::ClassSetItem::Range(_)
            | ast::ClassSetItem::Ascii(_)
            | ast::ClassSetItem::Unicode(_)
            | ast::ClassSetItem::Perl(_) => {}
        }
        Ok(())
    }

    fn visit_class_set_binary_op_pre(
        &mut self,
        ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        // Every binary set operation counts as one level of nesting.
        let op_span = &ast.span;
        self.increment_depth(op_span)
    }

    fn visit_class_set_binary_op_post(
        &mut self,
        _ast: &ast::ClassSetBinaryOp,
    ) -> Result<()> {
        self.decrement_depth();
        Ok(())
    }
}

/// Swap the kind of an error for a more specific one.
///
/// If `result` is an error whose kind equals `from`, an error with kind
/// `to` and the same pattern and span is returned in its place. Any other
/// value, success or error, is passed through as is. This is used to give
/// clearer error messages when possible.
fn specialize_err<T>(
    result: Result<T>,
    from: ast::ErrorKind,
    to: ast::ErrorKind,
) -> Result<T> {
    result.map_err(|e| {
        if e.kind == from {
            ast::Error { kind: to, pattern: e.pattern, span: e.span }
        } else {
            e
        }
    })
}

#[cfg(test)]
mod tests {
    use core::ops::Range;

    use alloc::format;

    use super::*;

    // Our own assert_eq, which has slightly better formatting (but honestly
    // still kind of crappy).
    macro_rules! assert_eq {
        ($left:expr, $right:expr) => {{
            match (&$left, &$right) {
                (left_val, right_val) => {
                    if !(*left_val == *right_val) {
                        panic!(
                            "assertion failed: `(left == right)`\n\n\
                             left: `{:?}`\nright: `{:?}`\n\n",
                            left_val, right_val
                        )
                    }
                }
            }
        }};
    }

    // We create these errors to compare with real ast::Errors in the tests.
- // We define equality between TestError and ast::Error to disregard the - // pattern string in ast::Error, which is annoying to provide in tests. - #[derive(Clone, Debug)] - struct TestError { - span: Span, - kind: ast::ErrorKind, - } - - impl PartialEq<ast::Error> for TestError { - fn eq(&self, other: &ast::Error) -> bool { - self.span == other.span && self.kind == other.kind - } - } - - impl PartialEq<TestError> for ast::Error { - fn eq(&self, other: &TestError) -> bool { - self.span == other.span && self.kind == other.kind - } - } - - fn s(str: &str) -> String { - str.to_string() - } - - fn parser(pattern: &str) -> ParserI<'_, Parser> { - ParserI::new(Parser::new(), pattern) - } - - fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { - let parser = ParserBuilder::new().octal(true).build(); - ParserI::new(parser, pattern) - } - - fn parser_empty_min_range(pattern: &str) -> ParserI<'_, Parser> { - let parser = ParserBuilder::new().empty_min_range(true).build(); - ParserI::new(parser, pattern) - } - - fn parser_nest_limit( - pattern: &str, - nest_limit: u32, - ) -> ParserI<'_, Parser> { - let p = ParserBuilder::new().nest_limit(nest_limit).build(); - ParserI::new(p, pattern) - } - - fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { - let p = ParserBuilder::new().ignore_whitespace(true).build(); - ParserI::new(p, pattern) - } - - /// Short alias for creating a new span. - fn nspan(start: Position, end: Position) -> Span { - Span::new(start, end) - } - - /// Short alias for creating a new position. - fn npos(offset: usize, line: usize, column: usize) -> Position { - Position::new(offset, line, column) - } - - /// Create a new span from the given offset range. This assumes a single - /// line and sets the columns based on the offsets. i.e., This only works - /// out of the box for ASCII, which is fine for most tests. 
- fn span(range: Range<usize>) -> Span { - let start = Position::new(range.start, 1, range.start + 1); - let end = Position::new(range.end, 1, range.end + 1); - Span::new(start, end) - } - - /// Create a new span for the corresponding byte range in the given string. - fn span_range(subject: &str, range: Range<usize>) -> Span { - let start = Position { - offset: range.start, - line: 1 + subject[..range.start].matches('\n').count(), - column: 1 + subject[..range.start] - .chars() - .rev() - .position(|c| c == '\n') - .unwrap_or(subject[..range.start].chars().count()), - }; - let end = Position { - offset: range.end, - line: 1 + subject[..range.end].matches('\n').count(), - column: 1 + subject[..range.end] - .chars() - .rev() - .position(|c| c == '\n') - .unwrap_or(subject[..range.end].chars().count()), - }; - Span::new(start, end) - } - - /// Create a verbatim literal starting at the given position. - fn lit(c: char, start: usize) -> Ast { - lit_with(c, span(start..start + c.len_utf8())) - } - - /// Create a meta literal starting at the given position. - fn meta_lit(c: char, span: Span) -> Ast { - Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) - } - - /// Create a verbatim literal with the given span. - fn lit_with(c: char, span: Span) -> Ast { - Ast::literal(ast::Literal { - span, - kind: ast::LiteralKind::Verbatim, - c, - }) - } - - /// Create a concatenation with the given range. - fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast { - concat_with(span(range), asts) - } - - /// Create a concatenation with the given span. - fn concat_with(span: Span, asts: Vec<Ast>) -> Ast { - Ast::concat(ast::Concat { span, asts }) - } - - /// Create an alternation with the given span. - fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast { - Ast::alternation(ast::Alternation { span: span(range), asts }) - } - - /// Create a capturing group with the given span. 
- fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast { - Ast::group(ast::Group { - span: span(range), - kind: ast::GroupKind::CaptureIndex(index), - ast: Box::new(ast), - }) - } - - /// Create an ast::SetFlags. - /// - /// The given pattern should be the full pattern string. The range given - /// should correspond to the byte offsets where the flag set occurs. - /// - /// If negated is true, then the set is interpreted as beginning with a - /// negation. - fn flag_set( - pat: &str, - range: Range<usize>, - flag: ast::Flag, - negated: bool, - ) -> Ast { - let mut items = vec![ast::FlagsItem { - span: span_range(pat, (range.end - 2)..(range.end - 1)), - kind: ast::FlagsItemKind::Flag(flag), - }]; - if negated { - items.insert( - 0, - ast::FlagsItem { - span: span_range(pat, (range.start + 2)..(range.end - 2)), - kind: ast::FlagsItemKind::Negation, - }, - ); - } - Ast::flags(ast::SetFlags { - span: span_range(pat, range.clone()), - flags: ast::Flags { - span: span_range(pat, (range.start + 2)..(range.end - 1)), - items, - }, - }) - } - - #[test] - fn parse_nest_limit() { - // A nest limit of 0 still allows some types of regexes. - assert_eq!( - parser_nest_limit("", 0).parse(), - Ok(Ast::empty(span(0..0))) - ); - assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); - - // Test repetition operations, which require one level of nesting. 
- assert_eq!( - parser_nest_limit("a+", 0).parse().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::NestLimitExceeded(0), - } - ); - assert_eq!( - parser_nest_limit("a+", 1).parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::OneOrMore, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser_nest_limit("(a)+", 1).parse().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::NestLimitExceeded(1), - } - ); - assert_eq!( - parser_nest_limit("a+*", 1).parse().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::NestLimitExceeded(1), - } - ); - assert_eq!( - parser_nest_limit("a+*", 2).parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..3), - op: ast::RepetitionOp { - span: span(2..3), - kind: ast::RepetitionKind::ZeroOrMore, - }, - greedy: true, - ast: Box::new(Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::OneOrMore, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })), - })) - ); - - // Test concatenations. A concatenation requires one level of nesting. - assert_eq!( - parser_nest_limit("ab", 0).parse().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::NestLimitExceeded(0), - } - ); - assert_eq!( - parser_nest_limit("ab", 1).parse(), - Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])) - ); - assert_eq!( - parser_nest_limit("abc", 1).parse(), - Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])) - ); - - // Test alternations. An alternation requires one level of nesting. 
- assert_eq!( - parser_nest_limit("a|b", 0).parse().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::NestLimitExceeded(0), - } - ); - assert_eq!( - parser_nest_limit("a|b", 1).parse(), - Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])) - ); - assert_eq!( - parser_nest_limit("a|b|c", 1).parse(), - Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])) - ); - - // Test character classes. Classes form their own mini-recursive - // syntax! - assert_eq!( - parser_nest_limit("[a]", 0).parse().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::NestLimitExceeded(0), - } - ); - assert_eq!( - parser_nest_limit("[a]", 1).parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..3), - negated: false, - kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( - ast::Literal { - span: span(1..2), - kind: ast::LiteralKind::Verbatim, - c: 'a', - } - )), - })) - ); - assert_eq!( - parser_nest_limit("[ab]", 1).parse().unwrap_err(), - TestError { - span: span(1..3), - kind: ast::ErrorKind::NestLimitExceeded(1), - } - ); - assert_eq!( - parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), - TestError { - span: span(3..7), - kind: ast::ErrorKind::NestLimitExceeded(2), - } - ); - assert_eq!( - parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), - TestError { - span: span(4..6), - kind: ast::ErrorKind::NestLimitExceeded(3), - } - ); - assert_eq!( - parser_nest_limit("[a--b]", 1).parse().unwrap_err(), - TestError { - span: span(1..5), - kind: ast::ErrorKind::NestLimitExceeded(1), - } - ); - assert_eq!( - parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), - TestError { - span: span(4..6), - kind: ast::ErrorKind::NestLimitExceeded(2), - } - ); - } - - #[test] - fn parse_comments() { - let pat = "(?x) -# This is comment 1. -foo # This is comment 2. - # This is comment 3. 
-bar -# This is comment 4."; - let astc = parser(pat).parse_with_comments().unwrap(); - assert_eq!( - astc.ast, - concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - lit_with('f', span_range(pat, 26..27)), - lit_with('o', span_range(pat, 27..28)), - lit_with('o', span_range(pat, 28..29)), - lit_with('b', span_range(pat, 74..75)), - lit_with('a', span_range(pat, 75..76)), - lit_with('r', span_range(pat, 76..77)), - ] - ) - ); - assert_eq!( - astc.comments, - vec![ - ast::Comment { - span: span_range(pat, 5..26), - comment: s(" This is comment 1."), - }, - ast::Comment { - span: span_range(pat, 30..51), - comment: s(" This is comment 2."), - }, - ast::Comment { - span: span_range(pat, 53..74), - comment: s(" This is comment 3."), - }, - ast::Comment { - span: span_range(pat, 78..98), - comment: s(" This is comment 4."), - }, - ] - ); - } - - #[test] - fn parse_holistic() { - assert_eq!(parser("]").parse(), Ok(lit(']', 0))); - assert_eq!( - parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), - Ok(concat( - 0..36, - vec![ - meta_lit('\\', span(0..2)), - meta_lit('.', span(2..4)), - meta_lit('+', span(4..6)), - meta_lit('*', span(6..8)), - meta_lit('?', span(8..10)), - meta_lit('(', span(10..12)), - meta_lit(')', span(12..14)), - meta_lit('|', span(14..16)), - meta_lit('[', span(16..18)), - meta_lit(']', span(18..20)), - meta_lit('{', span(20..22)), - meta_lit('}', span(22..24)), - meta_lit('^', span(24..26)), - meta_lit('$', span(26..28)), - meta_lit('#', span(28..30)), - meta_lit('&', span(30..32)), - meta_lit('-', span(32..34)), - meta_lit('~', span(34..36)), - ] - )) - ); - } - - #[test] - fn parse_ignore_whitespace() { - // Test that basic whitespace insensitivity works. 
- let pat = "(?x)a b"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - nspan(npos(0, 1, 1), npos(7, 1, 8)), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), - lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), - ] - )) - ); - - // Test that we can toggle whitespace insensitivity. - let pat = "(?x)a b(?-x)a b"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - nspan(npos(0, 1, 1), npos(15, 1, 16)), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), - lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), - flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), - lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), - lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), - lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), - ] - )) - ); - - // Test that nesting whitespace insensitive flags works. - let pat = "a (?x:a )a "; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..11), - vec![ - lit_with('a', span_range(pat, 0..1)), - lit_with(' ', span_range(pat, 1..2)), - Ast::group(ast::Group { - span: span_range(pat, 2..9), - kind: ast::GroupKind::NonCapturing(ast::Flags { - span: span_range(pat, 4..5), - items: vec![ast::FlagsItem { - span: span_range(pat, 4..5), - kind: ast::FlagsItemKind::Flag( - ast::Flag::IgnoreWhitespace - ), - },], - }), - ast: Box::new(lit_with('a', span_range(pat, 6..7))), - }), - lit_with('a', span_range(pat, 9..10)), - lit_with(' ', span_range(pat, 10..11)), - ] - )) - ); - - // Test that whitespace after an opening paren is insignificant. 
- let pat = "(?x)( ?P<foo> a )"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::group(ast::Group { - span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: span_range(pat, 9..12), - name: s("foo"), - index: 1, - } - }, - ast: Box::new(lit_with('a', span_range(pat, 14..15))), - }), - ] - )) - ); - let pat = "(?x)( a )"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::group(ast::Group { - span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(lit_with('a', span_range(pat, 7..8))), - }), - ] - )) - ); - let pat = "(?x)( ?: a )"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::group(ast::Group { - span: span_range(pat, 4..pat.len()), - kind: ast::GroupKind::NonCapturing(ast::Flags { - span: span_range(pat, 8..8), - items: vec![], - }), - ast: Box::new(lit_with('a', span_range(pat, 11..12))), - }), - ] - )) - ); - let pat = r"(?x)\x { 53 }"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::literal(ast::Literal { - span: span(4..13), - kind: ast::LiteralKind::HexBrace( - ast::HexLiteralKind::X - ), - c: 'S', - }), - ] - )) - ); - - // Test that whitespace after an escape is OK. 
- let pat = r"(?x)\ "; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), - Ast::literal(ast::Literal { - span: span_range(pat, 4..6), - kind: ast::LiteralKind::Superfluous, - c: ' ', - }), - ] - )) - ); - } - - #[test] - fn parse_newlines() { - let pat = ".\n."; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..3), - vec![ - Ast::dot(span_range(pat, 0..1)), - lit_with('\n', span_range(pat, 1..2)), - Ast::dot(span_range(pat, 2..3)), - ] - )) - ); - - let pat = "foobar\nbaz\nquux\n"; - assert_eq!( - parser(pat).parse(), - Ok(concat_with( - span_range(pat, 0..pat.len()), - vec![ - lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), - lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), - lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), - lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), - lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), - lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), - lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), - lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), - lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), - lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), - lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), - lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), - lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), - lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), - lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), - lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), - ] - )) - ); - } - - #[test] - fn parse_uncounted_repetition() { - assert_eq!( - parser(r"a*").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::ZeroOrMore, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a+").parse(), - Ok(Ast::repetition(ast::Repetition { - span: 
span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::OneOrMore, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - - assert_eq!( - parser(r"a?").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a??").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..3), - op: ast::RepetitionOp { - span: span(1..3), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: false, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a?").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a?b").parse(), - Ok(concat( - 0..3, - vec![ - Ast::repetition(ast::Repetition { - span: span(0..2), - op: ast::RepetitionOp { - span: span(1..2), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: true, - ast: Box::new(lit('a', 0)), - }), - lit('b', 2), - ] - )) - ); - assert_eq!( - parser(r"a??b").parse(), - Ok(concat( - 0..4, - vec![ - Ast::repetition(ast::Repetition { - span: span(0..3), - op: ast::RepetitionOp { - span: span(1..3), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: false, - ast: Box::new(lit('a', 0)), - }), - lit('b', 3), - ] - )) - ); - assert_eq!( - parser(r"ab?").parse(), - Ok(concat( - 0..3, - vec![ - lit('a', 0), - Ast::repetition(ast::Repetition { - span: span(1..3), - op: ast::RepetitionOp { - span: span(2..3), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: true, - ast: Box::new(lit('b', 1)), - }), - ] - )) - ); - assert_eq!( - parser(r"(ab)?").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..5), - op: ast::RepetitionOp { - span: span(4..5), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: 
true, - ast: Box::new(group( - 0..4, - 1, - concat(1..3, vec![lit('a', 1), lit('b', 2),]) - )), - })) - ); - assert_eq!( - parser(r"|a?").parse(), - Ok(alt( - 0..3, - vec![ - Ast::empty(span(0..0)), - Ast::repetition(ast::Repetition { - span: span(1..3), - op: ast::RepetitionOp { - span: span(2..3), - kind: ast::RepetitionKind::ZeroOrOne, - }, - greedy: true, - ast: Box::new(lit('a', 1)), - }), - ] - )) - ); - - assert_eq!( - parser(r"*").parse().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"(?i)*").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"(*)").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"(?:?)").parse().unwrap_err(), - TestError { - span: span(3..3), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"+").parse().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"?").parse().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"(?)").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"|*").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"|+").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"|?").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - } - - #[test] - fn parse_counted_repetition() { - assert_eq!( - parser(r"a{5}").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..4), - op: ast::RepetitionOp { - span: span(1..4), - 
kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Exactly(5) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a{5,}").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..5), - op: ast::RepetitionOp { - span: span(1..5), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::AtLeast(5) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a{5,9}").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..6), - op: ast::RepetitionOp { - span: span(1..6), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Bounded(5, 9) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a{5}?").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..5), - op: ast::RepetitionOp { - span: span(1..5), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Exactly(5) - ), - }, - greedy: false, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"ab{5}").parse(), - Ok(concat( - 0..5, - vec![ - lit('a', 0), - Ast::repetition(ast::Repetition { - span: span(1..5), - op: ast::RepetitionOp { - span: span(2..5), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Exactly(5) - ), - }, - greedy: true, - ast: Box::new(lit('b', 1)), - }), - ] - )) - ); - assert_eq!( - parser(r"ab{5}c").parse(), - Ok(concat( - 0..6, - vec![ - lit('a', 0), - Ast::repetition(ast::Repetition { - span: span(1..5), - op: ast::RepetitionOp { - span: span(2..5), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Exactly(5) - ), - }, - greedy: true, - ast: Box::new(lit('b', 1)), - }), - lit('c', 5), - ] - )) - ); - - assert_eq!( - parser(r"a{ 5 }").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..6), - op: ast::RepetitionOp { - span: span(1..6), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Exactly(5) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"a{ 5 
, 9 }").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..10), - op: ast::RepetitionOp { - span: span(1..10), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Bounded(5, 9) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser_empty_min_range(r"a{,9}").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..5), - op: ast::RepetitionOp { - span: span(1..5), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Bounded(0, 9) - ), - }, - greedy: true, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser_ignore_whitespace(r"a{5,9} ?").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..8), - op: ast::RepetitionOp { - span: span(1..8), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Bounded(5, 9) - ), - }, - greedy: false, - ast: Box::new(lit('a', 0)), - })) - ); - assert_eq!( - parser(r"\b{5,9}").parse(), - Ok(Ast::repetition(ast::Repetition { - span: span(0..7), - op: ast::RepetitionOp { - span: span(2..7), - kind: ast::RepetitionKind::Range( - ast::RepetitionRange::Bounded(5, 9) - ), - }, - greedy: true, - ast: Box::new(Ast::assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::WordBoundary, - })), - })) - ); - - assert_eq!( - parser(r"(?i){0}").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"(?m){1,1}").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"a{]}").parse().unwrap_err(), - TestError { - span: span(2..2), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, - } - ); - assert_eq!( - parser(r"a{1,]}").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, - } - ); - assert_eq!( - parser(r"a{").parse().unwrap_err(), - TestError { - span: span(1..2), - kind: ast::ErrorKind::RepetitionCountUnclosed, - } - ); - 
assert_eq!( - parser(r"a{}").parse().unwrap_err(), - TestError { - span: span(2..2), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, - } - ); - assert_eq!( - parser(r"a{a").parse().unwrap_err(), - TestError { - span: span(2..2), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, - } - ); - assert_eq!( - parser(r"a{9999999999}").parse().unwrap_err(), - TestError { - span: span(2..12), - kind: ast::ErrorKind::DecimalInvalid, - } - ); - assert_eq!( - parser(r"a{9").parse().unwrap_err(), - TestError { - span: span(1..3), - kind: ast::ErrorKind::RepetitionCountUnclosed, - } - ); - assert_eq!( - parser(r"a{9,a").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::RepetitionCountDecimalEmpty, - } - ); - assert_eq!( - parser(r"a{9,9999999999}").parse().unwrap_err(), - TestError { - span: span(4..14), - kind: ast::ErrorKind::DecimalInvalid, - } - ); - assert_eq!( - parser(r"a{9,").parse().unwrap_err(), - TestError { - span: span(1..4), - kind: ast::ErrorKind::RepetitionCountUnclosed, - } - ); - assert_eq!( - parser(r"a{9,11").parse().unwrap_err(), - TestError { - span: span(1..6), - kind: ast::ErrorKind::RepetitionCountUnclosed, - } - ); - assert_eq!( - parser(r"a{2,1}").parse().unwrap_err(), - TestError { - span: span(1..6), - kind: ast::ErrorKind::RepetitionCountInvalid, - } - ); - assert_eq!( - parser(r"{5}").parse().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - assert_eq!( - parser(r"|{5}").parse().unwrap_err(), - TestError { - span: span(1..1), - kind: ast::ErrorKind::RepetitionMissing, - } - ); - } - - #[test] - fn parse_alternate() { - assert_eq!( - parser(r"a|b").parse(), - Ok(Ast::alternation(ast::Alternation { - span: span(0..3), - asts: vec![lit('a', 0), lit('b', 2)], - })) - ); - assert_eq!( - parser(r"(a|b)").parse(), - Ok(group( - 0..5, - 1, - Ast::alternation(ast::Alternation { - span: span(1..4), - asts: vec![lit('a', 1), lit('b', 3)], - }) - )) - ); - - assert_eq!( - 
parser(r"a|b|c").parse(), - Ok(Ast::alternation(ast::Alternation { - span: span(0..5), - asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], - })) - ); - assert_eq!( - parser(r"ax|by|cz").parse(), - Ok(Ast::alternation(ast::Alternation { - span: span(0..8), - asts: vec![ - concat(0..2, vec![lit('a', 0), lit('x', 1)]), - concat(3..5, vec![lit('b', 3), lit('y', 4)]), - concat(6..8, vec![lit('c', 6), lit('z', 7)]), - ], - })) - ); - assert_eq!( - parser(r"(ax|by|cz)").parse(), - Ok(group( - 0..10, - 1, - Ast::alternation(ast::Alternation { - span: span(1..9), - asts: vec![ - concat(1..3, vec![lit('a', 1), lit('x', 2)]), - concat(4..6, vec![lit('b', 4), lit('y', 5)]), - concat(7..9, vec![lit('c', 7), lit('z', 8)]), - ], - }) - )) - ); - assert_eq!( - parser(r"(ax|(by|(cz)))").parse(), - Ok(group( - 0..14, - 1, - alt( - 1..13, - vec![ - concat(1..3, vec![lit('a', 1), lit('x', 2)]), - group( - 4..13, - 2, - alt( - 5..12, - vec![ - concat( - 5..7, - vec![lit('b', 5), lit('y', 6)] - ), - group( - 8..12, - 3, - concat( - 9..11, - vec![lit('c', 9), lit('z', 10),] - ) - ), - ] - ) - ), - ] - ) - )) - ); - - assert_eq!( - parser(r"|").parse(), - Ok(alt( - 0..1, - vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] - )) - ); - assert_eq!( - parser(r"||").parse(), - Ok(alt( - 0..2, - vec![ - Ast::empty(span(0..0)), - Ast::empty(span(1..1)), - Ast::empty(span(2..2)), - ] - )) - ); - assert_eq!( - parser(r"a|").parse(), - Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) - ); - assert_eq!( - parser(r"|a").parse(), - Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) - ); - - assert_eq!( - parser(r"(|)").parse(), - Ok(group( - 0..3, - 1, - alt( - 1..2, - vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] - ) - )) - ); - assert_eq!( - parser(r"(a|)").parse(), - Ok(group( - 0..4, - 1, - alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) - )) - ); - assert_eq!( - parser(r"(|a)").parse(), - Ok(group( - 0..4, - 1, - alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 
2),]) - )) - ); - - assert_eq!( - parser(r"a|b)").parse().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::GroupUnopened, - } - ); - assert_eq!( - parser(r"(a|b").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnclosed, - } - ); - } - - #[test] - fn parse_unsupported_lookaround() { - assert_eq!( - parser(r"(?=a)").parse().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, - } - ); - assert_eq!( - parser(r"(?!a)").parse().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::UnsupportedLookAround, - } - ); - assert_eq!( - parser(r"(?<=a)").parse().unwrap_err(), - TestError { - span: span(0..4), - kind: ast::ErrorKind::UnsupportedLookAround, - } - ); - assert_eq!( - parser(r"(?<!a)").parse().unwrap_err(), - TestError { - span: span(0..4), - kind: ast::ErrorKind::UnsupportedLookAround, - } - ); - } - - #[test] - fn parse_group() { - assert_eq!( - parser("(?i)").parse(), - Ok(Ast::flags(ast::SetFlags { - span: span(0..4), - flags: ast::Flags { - span: span(2..3), - items: vec![ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }], - }, - })) - ); - assert_eq!( - parser("(?iU)").parse(), - Ok(Ast::flags(ast::SetFlags { - span: span(0..5), - flags: ast::Flags { - span: span(2..4), - items: vec![ - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Flag( - ast::Flag::SwapGreed - ), - }, - ], - }, - })) - ); - assert_eq!( - parser("(?i-U)").parse(), - Ok(Ast::flags(ast::SetFlags { - span: span(0..6), - flags: ast::Flags { - span: span(2..5), - items: vec![ - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Negation, - }, - ast::FlagsItem { - span: 
span(4..5), - kind: ast::FlagsItemKind::Flag( - ast::Flag::SwapGreed - ), - }, - ], - }, - })) - ); - - assert_eq!( - parser("()").parse(), - Ok(Ast::group(ast::Group { - span: span(0..2), - kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::empty(span(1..1))), - })) - ); - assert_eq!( - parser("(a)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..3), - kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(lit('a', 1)), - })) - ); - assert_eq!( - parser("(())").parse(), - Ok(Ast::group(ast::Group { - span: span(0..4), - kind: ast::GroupKind::CaptureIndex(1), - ast: Box::new(Ast::group(ast::Group { - span: span(1..3), - kind: ast::GroupKind::CaptureIndex(2), - ast: Box::new(Ast::empty(span(2..2))), - })), - })) - ); - - assert_eq!( - parser("(?:a)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..5), - kind: ast::GroupKind::NonCapturing(ast::Flags { - span: span(2..2), - items: vec![], - }), - ast: Box::new(lit('a', 3)), - })) - ); - - assert_eq!( - parser("(?i:a)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..6), - kind: ast::GroupKind::NonCapturing(ast::Flags { - span: span(2..3), - items: vec![ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - },], - }), - ast: Box::new(lit('a', 4)), - })) - ); - assert_eq!( - parser("(?i-U:a)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..8), - kind: ast::GroupKind::NonCapturing(ast::Flags { - span: span(2..5), - items: vec![ - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Negation, - }, - ast::FlagsItem { - span: span(4..5), - kind: ast::FlagsItemKind::Flag( - ast::Flag::SwapGreed - ), - }, - ], - }), - ast: Box::new(lit('a', 6)), - })) - ); - - assert_eq!( - parser("(").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnclosed, - } - ); - assert_eq!( - 
parser("(?").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnclosed, - } - ); - assert_eq!( - parser("(?P").parse().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::FlagUnrecognized, - } - ); - assert_eq!( - parser("(?P<").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::GroupNameUnexpectedEof, - } - ); - assert_eq!( - parser("(a").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnclosed, - } - ); - assert_eq!( - parser("(()").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnclosed, - } - ); - assert_eq!( - parser(")").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::GroupUnopened, - } - ); - assert_eq!( - parser("a)").parse().unwrap_err(), - TestError { - span: span(1..2), - kind: ast::ErrorKind::GroupUnopened, - } - ); - } - - #[test] - fn parse_capture_name() { - assert_eq!( - parser("(?<a>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..7), - kind: ast::GroupKind::CaptureName { - starts_with_p: false, - name: ast::CaptureName { - span: span(3..4), - name: s("a"), - index: 1, - } - }, - ast: Box::new(lit('z', 5)), - })) - ); - assert_eq!( - parser("(?P<a>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..8), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: span(4..5), - name: s("a"), - index: 1, - } - }, - ast: Box::new(lit('z', 6)), - })) - ); - assert_eq!( - parser("(?P<abc>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..10), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: span(4..7), - name: s("abc"), - index: 1, - } - }, - ast: Box::new(lit('z', 8)), - })) - ); - - assert_eq!( - parser("(?P<a_1>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..10), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName 
{ - span: span(4..7), - name: s("a_1"), - index: 1, - } - }, - ast: Box::new(lit('z', 8)), - })) - ); - - assert_eq!( - parser("(?P<a.1>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..10), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: span(4..7), - name: s("a.1"), - index: 1, - } - }, - ast: Box::new(lit('z', 8)), - })) - ); - - assert_eq!( - parser("(?P<a[1]>z)").parse(), - Ok(Ast::group(ast::Group { - span: span(0..11), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: span(4..8), - name: s("a[1]"), - index: 1, - } - }, - ast: Box::new(lit('z', 9)), - })) - ); - - assert_eq!( - parser("(?P<a¾>)").parse(), - Ok(Ast::group(ast::Group { - span: Span::new( - Position::new(0, 1, 1), - Position::new(9, 1, 9), - ), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: Span::new( - Position::new(4, 1, 5), - Position::new(7, 1, 7), - ), - name: s("a¾"), - index: 1, - } - }, - ast: Box::new(Ast::empty(Span::new( - Position::new(8, 1, 8), - Position::new(8, 1, 8), - ))), - })) - ); - assert_eq!( - parser("(?P<名字>)").parse(), - Ok(Ast::group(ast::Group { - span: Span::new( - Position::new(0, 1, 1), - Position::new(12, 1, 9), - ), - kind: ast::GroupKind::CaptureName { - starts_with_p: true, - name: ast::CaptureName { - span: Span::new( - Position::new(4, 1, 5), - Position::new(10, 1, 7), - ), - name: s("名字"), - index: 1, - } - }, - ast: Box::new(Ast::empty(Span::new( - Position::new(11, 1, 8), - Position::new(11, 1, 8), - ))), - })) - ); - - assert_eq!( - parser("(?P<").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::GroupNameUnexpectedEof, - } - ); - assert_eq!( - parser("(?P<>z)").parse().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::GroupNameEmpty, - } - ); - assert_eq!( - parser("(?P<a").parse().unwrap_err(), - TestError { - span: span(5..5), - kind: 
ast::ErrorKind::GroupNameUnexpectedEof, - } - ); - assert_eq!( - parser("(?P<ab").parse().unwrap_err(), - TestError { - span: span(6..6), - kind: ast::ErrorKind::GroupNameUnexpectedEof, - } - ); - assert_eq!( - parser("(?P<0a").parse().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<~").parse().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<abc~").parse().unwrap_err(), - TestError { - span: span(7..8), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(), - TestError { - span: span(12..13), - kind: ast::ErrorKind::GroupNameDuplicate { - original: span(4..5), - }, - } - ); - assert_eq!( - parser("(?P<5>)").parse().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<5a>)").parse().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<¾>)").parse().unwrap_err(), - TestError { - span: Span::new( - Position::new(4, 1, 5), - Position::new(6, 1, 6), - ), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<¾a>)").parse().unwrap_err(), - TestError { - span: Span::new( - Position::new(4, 1, 5), - Position::new(6, 1, 6), - ), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<☃>)").parse().unwrap_err(), - TestError { - span: Span::new( - Position::new(4, 1, 5), - Position::new(7, 1, 6), - ), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - assert_eq!( - parser("(?P<a☃>)").parse().unwrap_err(), - TestError { - span: Span::new( - Position::new(5, 1, 6), - Position::new(8, 1, 7), - ), - kind: ast::ErrorKind::GroupNameInvalid, - } - ); - } - - #[test] - fn parse_flags() { - assert_eq!( - parser("i:").parse_flags(), - Ok(ast::Flags { - span: span(0..1), - 
items: vec![ast::FlagsItem { - span: span(0..1), - kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), - }], - }) - ); - assert_eq!( - parser("i)").parse_flags(), - Ok(ast::Flags { - span: span(0..1), - items: vec![ast::FlagsItem { - span: span(0..1), - kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), - }], - }) - ); - - assert_eq!( - parser("isU:").parse_flags(), - Ok(ast::Flags { - span: span(0..3), - items: vec![ - ast::FlagsItem { - span: span(0..1), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(1..2), - kind: ast::FlagsItemKind::Flag( - ast::Flag::DotMatchesNewLine - ), - }, - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), - }, - ], - }) - ); - - assert_eq!( - parser("-isU:").parse_flags(), - Ok(ast::Flags { - span: span(0..4), - items: vec![ - ast::FlagsItem { - span: span(0..1), - kind: ast::FlagsItemKind::Negation, - }, - ast::FlagsItem { - span: span(1..2), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::DotMatchesNewLine - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), - }, - ], - }) - ); - assert_eq!( - parser("i-sU:").parse_flags(), - Ok(ast::Flags { - span: span(0..4), - items: vec![ - ast::FlagsItem { - span: span(0..1), - kind: ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(1..2), - kind: ast::FlagsItemKind::Negation, - }, - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::DotMatchesNewLine - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), - }, - ], - }) - ); - assert_eq!( - parser("i-sR:").parse_flags(), - Ok(ast::Flags { - span: span(0..4), - items: vec![ - ast::FlagsItem { - span: span(0..1), - kind: 
ast::FlagsItemKind::Flag( - ast::Flag::CaseInsensitive - ), - }, - ast::FlagsItem { - span: span(1..2), - kind: ast::FlagsItemKind::Negation, - }, - ast::FlagsItem { - span: span(2..3), - kind: ast::FlagsItemKind::Flag( - ast::Flag::DotMatchesNewLine - ), - }, - ast::FlagsItem { - span: span(3..4), - kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), - }, - ], - }) - ); - - assert_eq!( - parser("isU").parse_flags().unwrap_err(), - TestError { - span: span(3..3), - kind: ast::ErrorKind::FlagUnexpectedEof, - } - ); - assert_eq!( - parser("isUa:").parse_flags().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::FlagUnrecognized, - } - ); - assert_eq!( - parser("isUi:").parse_flags().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, - } - ); - assert_eq!( - parser("i-sU-i:").parse_flags().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::FlagRepeatedNegation { - original: span(1..2), - }, - } - ); - assert_eq!( - parser("-)").parse_flags().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::FlagDanglingNegation, - } - ); - assert_eq!( - parser("i-)").parse_flags().unwrap_err(), - TestError { - span: span(1..2), - kind: ast::ErrorKind::FlagDanglingNegation, - } - ); - assert_eq!( - parser("iU-)").parse_flags().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::FlagDanglingNegation, - } - ); - } - - #[test] - fn parse_flag() { - assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive)); - assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); - assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); - assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); - assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); - assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); - assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); - - assert_eq!( - 
parser("a").parse_flag().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::FlagUnrecognized, - } - ); - assert_eq!( - parser("☃").parse_flag().unwrap_err(), - TestError { - span: span_range("☃", 0..3), - kind: ast::ErrorKind::FlagUnrecognized, - } - ); - } - - #[test] - fn parse_primitive_non_escape() { - assert_eq!( - parser(r".").parse_primitive(), - Ok(Primitive::Dot(span(0..1))) - ); - assert_eq!( - parser(r"^").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..1), - kind: ast::AssertionKind::StartLine, - })) - ); - assert_eq!( - parser(r"$").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..1), - kind: ast::AssertionKind::EndLine, - })) - ); - - assert_eq!( - parser(r"a").parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..1), - kind: ast::LiteralKind::Verbatim, - c: 'a', - })) - ); - assert_eq!( - parser(r"|").parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..1), - kind: ast::LiteralKind::Verbatim, - c: '|', - })) - ); - assert_eq!( - parser(r"☃").parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span_range("☃", 0..3), - kind: ast::LiteralKind::Verbatim, - c: '☃', - })) - ); - } - - #[test] - fn parse_escape() { - assert_eq!( - parser(r"\|").parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..2), - kind: ast::LiteralKind::Meta, - c: '|', - })) - ); - let specials = &[ - (r"\a", '\x07', ast::SpecialLiteralKind::Bell), - (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), - (r"\t", '\t', ast::SpecialLiteralKind::Tab), - (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), - (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), - (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab), - ]; - for &(pat, c, ref kind) in specials { - assert_eq!( - parser(pat).parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..2), - kind: ast::LiteralKind::Special(kind.clone()), - c, - })) - ); - } - 
assert_eq!( - parser(r"\A").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::StartText, - })) - ); - assert_eq!( - parser(r"\z").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::EndText, - })) - ); - assert_eq!( - parser(r"\b").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::WordBoundary, - })) - ); - assert_eq!( - parser(r"\b{start}").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..9), - kind: ast::AssertionKind::WordBoundaryStart, - })) - ); - assert_eq!( - parser(r"\b{end}").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..7), - kind: ast::AssertionKind::WordBoundaryEnd, - })) - ); - assert_eq!( - parser(r"\b{start-half}").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..14), - kind: ast::AssertionKind::WordBoundaryStartHalf, - })) - ); - assert_eq!( - parser(r"\b{end-half}").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..12), - kind: ast::AssertionKind::WordBoundaryEndHalf, - })) - ); - assert_eq!( - parser(r"\<").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::WordBoundaryStartAngle, - })) - ); - assert_eq!( - parser(r"\>").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::WordBoundaryEndAngle, - })) - ); - assert_eq!( - parser(r"\B").parse_primitive(), - Ok(Primitive::Assertion(ast::Assertion { - span: span(0..2), - kind: ast::AssertionKind::NotWordBoundary, - })) - ); - - // We also support superfluous escapes in most cases now too. 
- for c in ['!', '@', '%', '"', '\'', '/', ' '] { - let pat = format!(r"\{}", c); - assert_eq!( - parser(&pat).parse_primitive(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..2), - kind: ast::LiteralKind::Superfluous, - c, - })) - ); - } - - // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This - // gives flexibility for future evolution. - assert_eq!( - parser(r"\e").parse_escape().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); - assert_eq!( - parser(r"\y").parse_escape().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); - - // Starting a special word boundary without any non-whitespace chars - // after the brace makes it ambiguous whether the user meant to write - // a counted repetition (probably not?) or an actual special word - // boundary assertion. - assert_eq!( - parser(r"\b{").parse_escape().unwrap_err(), - TestError { - span: span(0..3), - kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, - } - ); - assert_eq!( - parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), - TestError { - span: span(0..4), - kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, - } - ); - // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, - // and thus causes the parser to treat it as a counted repetition. - assert_eq!( - parser(r"\b{ ").parse().unwrap_err(), - TestError { - span: span(2..4), - kind: ast::ErrorKind::RepetitionCountUnclosed, - } - ); - // In this case, we got some valid chars that makes it look like the - // user is writing one of the special word boundary assertions, but - // we forget to close the brace. 
- assert_eq!( - parser(r"\b{foo").parse_escape().unwrap_err(), - TestError { - span: span(2..6), - kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, - } - ); - // We get the same error as above, except it is provoked by seeing a - // char that we know is invalid before seeing a closing brace. - assert_eq!( - parser(r"\b{foo!}").parse_escape().unwrap_err(), - TestError { - span: span(2..6), - kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, - } - ); - // And this one occurs when, syntactically, everything looks okay, but - // we don't use a valid spelling of a word boundary assertion. - assert_eq!( - parser(r"\b{foo}").parse_escape().unwrap_err(), - TestError { - span: span(3..6), - kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, - } - ); - - // An unfinished escape is illegal. - assert_eq!( - parser(r"\").parse_escape().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - } - - #[test] - fn parse_unsupported_backreference() { - assert_eq!( - parser(r"\0").parse_escape().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::UnsupportedBackreference, - } - ); - assert_eq!( - parser(r"\9").parse_escape().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::UnsupportedBackreference, - } - ); - } - - #[test] - fn parse_octal() { - for i in 0..511 { - let pat = format!(r"\{:o}", i); - assert_eq!( - parser_octal(&pat).parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..pat.len()), - kind: ast::LiteralKind::Octal, - c: char::from_u32(i).unwrap(), - })) - ); - } - assert_eq!( - parser_octal(r"\778").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..3), - kind: ast::LiteralKind::Octal, - c: '?', - })) - ); - assert_eq!( - parser_octal(r"\7777").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..4), - kind: ast::LiteralKind::Octal, - c: '\u{01FF}', - })) - ); - assert_eq!( - parser_octal(r"\778").parse(), - 
Ok(Ast::concat(ast::Concat { - span: span(0..4), - asts: vec![ - Ast::literal(ast::Literal { - span: span(0..3), - kind: ast::LiteralKind::Octal, - c: '?', - }), - Ast::literal(ast::Literal { - span: span(3..4), - kind: ast::LiteralKind::Verbatim, - c: '8', - }), - ], - })) - ); - assert_eq!( - parser_octal(r"\7777").parse(), - Ok(Ast::concat(ast::Concat { - span: span(0..5), - asts: vec![ - Ast::literal(ast::Literal { - span: span(0..4), - kind: ast::LiteralKind::Octal, - c: '\u{01FF}', - }), - Ast::literal(ast::Literal { - span: span(4..5), - kind: ast::LiteralKind::Verbatim, - c: '7', - }), - ], - })) - ); - - assert_eq!( - parser_octal(r"\8").parse_escape().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::EscapeUnrecognized, - } - ); - } - - #[test] - fn parse_hex_two() { - for i in 0..256 { - let pat = format!(r"\x{:02x}", i); - assert_eq!( - parser(&pat).parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..pat.len()), - kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), - c: char::from_u32(i).unwrap(), - })) - ); - } - - assert_eq!( - parser(r"\xF").parse_escape().unwrap_err(), - TestError { - span: span(3..3), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\xG").parse_escape().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\xFG").parse_escape().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - } - - #[test] - fn parse_hex_four() { - for i in 0..65536 { - let c = match char::from_u32(i) { - None => continue, - Some(c) => c, - }; - let pat = format!(r"\u{:04x}", i); - assert_eq!( - parser(&pat).parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..pat.len()), - kind: ast::LiteralKind::HexFixed( - ast::HexLiteralKind::UnicodeShort - ), - c, - })) - ); - } - - assert_eq!( - parser(r"\uF").parse_escape().unwrap_err(), - TestError { 
- span: span(3..3), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\uG").parse_escape().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\uFG").parse_escape().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\uFFG").parse_escape().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\uFFFG").parse_escape().unwrap_err(), - TestError { - span: span(5..6), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\uD800").parse_escape().unwrap_err(), - TestError { - span: span(2..6), - kind: ast::ErrorKind::EscapeHexInvalid, - } - ); - } - - #[test] - fn parse_hex_eight() { - for i in 0..65536 { - let c = match char::from_u32(i) { - None => continue, - Some(c) => c, - }; - let pat = format!(r"\U{:08x}", i); - assert_eq!( - parser(&pat).parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..pat.len()), - kind: ast::LiteralKind::HexFixed( - ast::HexLiteralKind::UnicodeLong - ), - c, - })) - ); - } - - assert_eq!( - parser(r"\UF").parse_escape().unwrap_err(), - TestError { - span: span(3..3), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\UG").parse_escape().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFG").parse_escape().unwrap_err(), - TestError { - span: span(3..4), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFFG").parse_escape().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFFFG").parse_escape().unwrap_err(), - TestError { - span: span(5..6), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - 
parser(r"\UFFFFG").parse_escape().unwrap_err(), - TestError { - span: span(6..7), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFFFFFG").parse_escape().unwrap_err(), - TestError { - span: span(7..8), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFFFFFFG").parse_escape().unwrap_err(), - TestError { - span: span(8..9), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), - TestError { - span: span(9..10), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - } - - #[test] - fn parse_hex_brace() { - assert_eq!( - parser(r"\u{26c4}").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..8), - kind: ast::LiteralKind::HexBrace( - ast::HexLiteralKind::UnicodeShort - ), - c: '⛄', - })) - ); - assert_eq!( - parser(r"\U{26c4}").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..8), - kind: ast::LiteralKind::HexBrace( - ast::HexLiteralKind::UnicodeLong - ), - c: '⛄', - })) - ); - assert_eq!( - parser(r"\x{26c4}").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..8), - kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), - c: '⛄', - })) - ); - assert_eq!( - parser(r"\x{26C4}").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..8), - kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), - c: '⛄', - })) - ); - assert_eq!( - parser(r"\x{10fFfF}").parse_escape(), - Ok(Primitive::Literal(ast::Literal { - span: span(0..10), - kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), - c: '\u{10FFFF}', - })) - ); - - assert_eq!( - parser(r"\x").parse_escape().unwrap_err(), - TestError { - span: span(2..2), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\x{").parse_escape().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - 
parser(r"\x{FF").parse_escape().unwrap_err(), - TestError { - span: span(2..5), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\x{}").parse_escape().unwrap_err(), - TestError { - span: span(2..4), - kind: ast::ErrorKind::EscapeHexEmpty, - } - ); - assert_eq!( - parser(r"\x{FGF}").parse_escape().unwrap_err(), - TestError { - span: span(4..5), - kind: ast::ErrorKind::EscapeHexInvalidDigit, - } - ); - assert_eq!( - parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), - TestError { - span: span(3..9), - kind: ast::ErrorKind::EscapeHexInvalid, - } - ); - assert_eq!( - parser(r"\x{D800}").parse_escape().unwrap_err(), - TestError { - span: span(3..7), - kind: ast::ErrorKind::EscapeHexInvalid, - } - ); - assert_eq!( - parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), - TestError { - span: span(3..12), - kind: ast::ErrorKind::EscapeHexInvalid, - } - ); - } - - #[test] - fn parse_decimal() { - assert_eq!(parser("123").parse_decimal(), Ok(123)); - assert_eq!(parser("0").parse_decimal(), Ok(0)); - assert_eq!(parser("01").parse_decimal(), Ok(1)); - - assert_eq!( - parser("-1").parse_decimal().unwrap_err(), - TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } - ); - assert_eq!( - parser("").parse_decimal().unwrap_err(), - TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } - ); - assert_eq!( - parser("9999999999").parse_decimal().unwrap_err(), - TestError { - span: span(0..10), - kind: ast::ErrorKind::DecimalInvalid, - } - ); - } - - #[test] - fn parse_set_class() { - fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet { - ast::ClassSet::union(ast::ClassSetUnion { span, items }) - } - - fn intersection( - span: Span, - lhs: ast::ClassSet, - rhs: ast::ClassSet, - ) -> ast::ClassSet { - ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { - span, - kind: ast::ClassSetBinaryOpKind::Intersection, - lhs: Box::new(lhs), - rhs: Box::new(rhs), - }) - } - - fn difference( - span: Span, - lhs: ast::ClassSet, - 
rhs: ast::ClassSet, - ) -> ast::ClassSet { - ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { - span, - kind: ast::ClassSetBinaryOpKind::Difference, - lhs: Box::new(lhs), - rhs: Box::new(rhs), - }) - } - - fn symdifference( - span: Span, - lhs: ast::ClassSet, - rhs: ast::ClassSet, - ) -> ast::ClassSet { - ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { - span, - kind: ast::ClassSetBinaryOpKind::SymmetricDifference, - lhs: Box::new(lhs), - rhs: Box::new(rhs), - }) - } - - fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { - ast::ClassSet::Item(item) - } - - fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { - ast::ClassSetItem::Ascii(cls) - } - - fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { - ast::ClassSetItem::Unicode(cls) - } - - fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { - ast::ClassSetItem::Perl(cls) - } - - fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { - ast::ClassSetItem::Bracketed(Box::new(cls)) - } - - fn lit(span: Span, c: char) -> ast::ClassSetItem { - ast::ClassSetItem::Literal(ast::Literal { - span, - kind: ast::LiteralKind::Verbatim, - c, - }) - } - - fn empty(span: Span) -> ast::ClassSetItem { - ast::ClassSetItem::Empty(span) - } - - fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { - let pos1 = Position { - offset: span.start.offset + start.len_utf8(), - column: span.start.column + 1, - ..span.start - }; - let pos2 = Position { - offset: span.end.offset - end.len_utf8(), - column: span.end.column - 1, - ..span.end - }; - ast::ClassSetItem::Range(ast::ClassSetRange { - span, - start: ast::Literal { - span: Span { end: pos1, ..span }, - kind: ast::LiteralKind::Verbatim, - c: start, - }, - end: ast::Literal { - span: Span { start: pos2, ..span }, - kind: ast::LiteralKind::Verbatim, - c: end, - }, - }) - } - - fn alnum(span: Span, negated: bool) -> ast::ClassAscii { - ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated } - } - - fn lower(span: Span, negated: 
bool) -> ast::ClassAscii { - ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated } - } - - assert_eq!( - parser("[[:alnum:]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..11), - negated: false, - kind: itemset(item_ascii(alnum(span(1..10), false))), - })) - ); - assert_eq!( - parser("[[[:alnum:]]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..13), - negated: false, - kind: itemset(item_bracket(ast::ClassBracketed { - span: span(1..12), - negated: false, - kind: itemset(item_ascii(alnum(span(2..11), false))), - })), - })) - ); - assert_eq!( - parser("[[:alnum:]&&[:lower:]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..22), - negated: false, - kind: intersection( - span(1..21), - itemset(item_ascii(alnum(span(1..10), false))), - itemset(item_ascii(lower(span(12..21), false))), - ), - })) - ); - assert_eq!( - parser("[[:alnum:]--[:lower:]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..22), - negated: false, - kind: difference( - span(1..21), - itemset(item_ascii(alnum(span(1..10), false))), - itemset(item_ascii(lower(span(12..21), false))), - ), - })) - ); - assert_eq!( - parser("[[:alnum:]~~[:lower:]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..22), - negated: false, - kind: symdifference( - span(1..21), - itemset(item_ascii(alnum(span(1..10), false))), - itemset(item_ascii(lower(span(12..21), false))), - ), - })) - ); - - assert_eq!( - parser("[a]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..3), - negated: false, - kind: itemset(lit(span(1..2), 'a')), - })) - ); - assert_eq!( - parser(r"[a\]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..5), - negated: false, - kind: union( - span(1..4), - vec![ - lit(span(1..2), 'a'), - ast::ClassSetItem::Literal(ast::Literal { - span: span(2..4), - kind: ast::LiteralKind::Meta, - c: ']', - }), - ] - ), - })) - ); - 
assert_eq!( - parser(r"[a\-z]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..6), - negated: false, - kind: union( - span(1..5), - vec![ - lit(span(1..2), 'a'), - ast::ClassSetItem::Literal(ast::Literal { - span: span(2..4), - kind: ast::LiteralKind::Meta, - c: '-', - }), - lit(span(4..5), 'z'), - ] - ), - })) - ); - assert_eq!( - parser("[ab]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: union( - span(1..3), - vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] - ), - })) - ); - assert_eq!( - parser("[a-]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: union( - span(1..3), - vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] - ), - })) - ); - assert_eq!( - parser("[-a]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: union( - span(1..3), - vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] - ), - })) - ); - assert_eq!( - parser(r"[\pL]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..5), - negated: false, - kind: itemset(item_unicode(ast::ClassUnicode { - span: span(1..4), - negated: false, - kind: ast::ClassUnicodeKind::OneLetter('L'), - })), - })) - ); - assert_eq!( - parser(r"[\w]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: itemset(item_perl(ast::ClassPerl { - span: span(1..3), - kind: ast::ClassPerlKind::Word, - negated: false, - })), - })) - ); - assert_eq!( - parser(r"[a\wz]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..6), - negated: false, - kind: union( - span(1..5), - vec![ - lit(span(1..2), 'a'), - item_perl(ast::ClassPerl { - span: span(2..4), - kind: ast::ClassPerlKind::Word, - negated: false, - }), - lit(span(4..5), 'z'), - ] - ), - })) - ); - - assert_eq!( - parser("[a-z]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: 
span(0..5), - negated: false, - kind: itemset(range(span(1..4), 'a', 'z')), - })) - ); - assert_eq!( - parser("[a-cx-z]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..8), - negated: false, - kind: union( - span(1..7), - vec![ - range(span(1..4), 'a', 'c'), - range(span(4..7), 'x', 'z'), - ] - ), - })) - ); - assert_eq!( - parser(r"[\w&&a-cx-z]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..12), - negated: false, - kind: intersection( - span(1..11), - itemset(item_perl(ast::ClassPerl { - span: span(1..3), - kind: ast::ClassPerlKind::Word, - negated: false, - })), - union( - span(5..11), - vec![ - range(span(5..8), 'a', 'c'), - range(span(8..11), 'x', 'z'), - ] - ), - ), - })) - ); - assert_eq!( - parser(r"[a-cx-z&&\w]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..12), - negated: false, - kind: intersection( - span(1..11), - union( - span(1..7), - vec![ - range(span(1..4), 'a', 'c'), - range(span(4..7), 'x', 'z'), - ] - ), - itemset(item_perl(ast::ClassPerl { - span: span(9..11), - kind: ast::ClassPerlKind::Word, - negated: false, - })), - ), - })) - ); - assert_eq!( - parser(r"[a--b--c]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..9), - negated: false, - kind: difference( - span(1..8), - difference( - span(1..5), - itemset(lit(span(1..2), 'a')), - itemset(lit(span(4..5), 'b')), - ), - itemset(lit(span(7..8), 'c')), - ), - })) - ); - assert_eq!( - parser(r"[a~~b~~c]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..9), - negated: false, - kind: symdifference( - span(1..8), - symdifference( - span(1..5), - itemset(lit(span(1..2), 'a')), - itemset(lit(span(4..5), 'b')), - ), - itemset(lit(span(7..8), 'c')), - ), - })) - ); - assert_eq!( - parser(r"[\^&&^]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..7), - negated: false, - kind: intersection( - span(1..6), - 
itemset(ast::ClassSetItem::Literal(ast::Literal { - span: span(1..3), - kind: ast::LiteralKind::Meta, - c: '^', - })), - itemset(lit(span(5..6), '^')), - ), - })) - ); - assert_eq!( - parser(r"[\&&&&]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..7), - negated: false, - kind: intersection( - span(1..6), - itemset(ast::ClassSetItem::Literal(ast::Literal { - span: span(1..3), - kind: ast::LiteralKind::Meta, - c: '&', - })), - itemset(lit(span(5..6), '&')), - ), - })) - ); - assert_eq!( - parser(r"[&&&&]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..6), - negated: false, - kind: intersection( - span(1..5), - intersection( - span(1..3), - itemset(empty(span(1..1))), - itemset(empty(span(3..3))), - ), - itemset(empty(span(5..5))), - ), - })) - ); - - let pat = "[☃-⛄]"; - assert_eq!( - parser(pat).parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span_range(pat, 0..9), - negated: false, - kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { - span: span_range(pat, 1..8), - start: ast::Literal { - span: span_range(pat, 1..4), - kind: ast::LiteralKind::Verbatim, - c: '☃', - }, - end: ast::Literal { - span: span_range(pat, 5..8), - kind: ast::LiteralKind::Verbatim, - c: '⛄', - }, - })), - })) - ); - - assert_eq!( - parser(r"[]]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..3), - negated: false, - kind: itemset(lit(span(1..2), ']')), - })) - ); - assert_eq!( - parser(r"[]\[]").parse(), - Ok(Ast::class_bracketed(ast::ClassBracketed { - span: span(0..5), - negated: false, - kind: union( - span(1..4), - vec![ - lit(span(1..2), ']'), - ast::ClassSetItem::Literal(ast::Literal { - span: span(2..4), - kind: ast::LiteralKind::Meta, - c: '[', - }), - ] - ), - })) - ); - assert_eq!( - parser(r"[\[]]").parse(), - Ok(concat( - 0..5, - vec![ - Ast::class_bracketed(ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: itemset(ast::ClassSetItem::Literal( - ast::Literal 
{ - span: span(1..3), - kind: ast::LiteralKind::Meta, - c: '[', - } - )), - }), - Ast::literal(ast::Literal { - span: span(4..5), - kind: ast::LiteralKind::Verbatim, - c: ']', - }), - ] - )) - ); - - assert_eq!( - parser("[").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[[").parse().unwrap_err(), - TestError { - span: span(1..2), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[[-]").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[[[:alnum:]").parse().unwrap_err(), - TestError { - span: span(1..2), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser(r"[\b]").parse().unwrap_err(), - TestError { - span: span(1..3), - kind: ast::ErrorKind::ClassEscapeInvalid, - } - ); - assert_eq!( - parser(r"[\w-a]").parse().unwrap_err(), - TestError { - span: span(1..3), - kind: ast::ErrorKind::ClassRangeLiteral, - } - ); - assert_eq!( - parser(r"[a-\w]").parse().unwrap_err(), - TestError { - span: span(3..5), - kind: ast::ErrorKind::ClassRangeLiteral, - } - ); - assert_eq!( - parser(r"[z-a]").parse().unwrap_err(), - TestError { - span: span(1..4), - kind: ast::ErrorKind::ClassRangeInvalid, - } - ); - - assert_eq!( - parser_ignore_whitespace("[a ").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser_ignore_whitespace("[a- ").parse().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - } - - #[test] - fn parse_set_class_open() { - assert_eq!(parser("[a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..1), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(1..1), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; - Ok((set, union)) - }); - assert_eq!( - 
parser_ignore_whitespace("[ a]").parse_set_class_open(), - { - let set = ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(4..4), - items: vec![], - }), - }; - let union = - ast::ClassSetUnion { span: span(4..4), items: vec![] }; - Ok((set, union)) - } - ); - assert_eq!(parser("[^a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..2), - negated: true, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(2..2), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; - Ok((set, union)) - }); - assert_eq!( - parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), - { - let set = ast::ClassBracketed { - span: span(0..4), - negated: true, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(4..4), - items: vec![], - }), - }; - let union = - ast::ClassSetUnion { span: span(4..4), items: vec![] }; - Ok((set, union)) - } - ); - assert_eq!(parser("[-a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..2), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(1..1), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(1..2), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(1..2), - kind: ast::LiteralKind::Verbatim, - c: '-', - })], - }; - Ok((set, union)) - }); - assert_eq!( - parser_ignore_whitespace("[ - a]").parse_set_class_open(), - { - let set = ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(2..2), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(2..3), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: '-', - })], - }; - Ok((set, union)) - } - ); - assert_eq!(parser("[^-a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: 
span(0..3), - negated: true, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(2..2), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(2..3), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: '-', - })], - }; - Ok((set, union)) - }); - assert_eq!(parser("[--a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..3), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(1..1), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(1..3), - items: vec![ - ast::ClassSetItem::Literal(ast::Literal { - span: span(1..2), - kind: ast::LiteralKind::Verbatim, - c: '-', - }), - ast::ClassSetItem::Literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: '-', - }), - ], - }; - Ok((set, union)) - }); - assert_eq!(parser("[]a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..2), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(1..1), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(1..2), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(1..2), - kind: ast::LiteralKind::Verbatim, - c: ']', - })], - }; - Ok((set, union)) - }); - assert_eq!( - parser_ignore_whitespace("[ ] a]").parse_set_class_open(), - { - let set = ast::ClassBracketed { - span: span(0..4), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(2..2), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(2..3), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: ']', - })], - }; - Ok((set, union)) - } - ); - assert_eq!(parser("[^]a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..3), - negated: true, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: 
span(2..2), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(2..3), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: ']', - })], - }; - Ok((set, union)) - }); - assert_eq!(parser("[-]a]").parse_set_class_open(), { - let set = ast::ClassBracketed { - span: span(0..2), - negated: false, - kind: ast::ClassSet::union(ast::ClassSetUnion { - span: span(1..1), - items: vec![], - }), - }; - let union = ast::ClassSetUnion { - span: span(1..2), - items: vec![ast::ClassSetItem::Literal(ast::Literal { - span: span(1..2), - kind: ast::LiteralKind::Verbatim, - c: '-', - })], - }; - Ok((set, union)) - }); - - assert_eq!( - parser("[").parse_set_class_open().unwrap_err(), - TestError { - span: span(0..1), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser_ignore_whitespace("[ ") - .parse_set_class_open() - .unwrap_err(), - TestError { - span: span(0..5), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[^").parse_set_class_open().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[]").parse_set_class_open().unwrap_err(), - TestError { - span: span(0..2), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[-").parse_set_class_open().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - assert_eq!( - parser("[--").parse_set_class_open().unwrap_err(), - TestError { - span: span(0..0), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - - // See: https://github.com/rust-lang/regex/issues/792 - assert_eq!( - parser("(?x)[-#]").parse_with_comments().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::ClassUnclosed, - } - ); - } - - #[test] - fn maybe_parse_ascii_class() { - assert_eq!( - parser(r"[:alnum:]").maybe_parse_ascii_class(), - Some(ast::ClassAscii { - span: span(0..9), - kind: 
ast::ClassAsciiKind::Alnum, - negated: false, - }) - ); - assert_eq!( - parser(r"[:alnum:]A").maybe_parse_ascii_class(), - Some(ast::ClassAscii { - span: span(0..9), - kind: ast::ClassAsciiKind::Alnum, - negated: false, - }) - ); - assert_eq!( - parser(r"[:^alnum:]").maybe_parse_ascii_class(), - Some(ast::ClassAscii { - span: span(0..10), - kind: ast::ClassAsciiKind::Alnum, - negated: true, - }) - ); - - let p = parser(r"[:"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - - let p = parser(r"[:^"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - - let p = parser(r"[^:alnum:]"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - - let p = parser(r"[:alnnum:]"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - - let p = parser(r"[:alnum]"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - - let p = parser(r"[:alnum:"); - assert_eq!(p.maybe_parse_ascii_class(), None); - assert_eq!(p.offset(), 0); - } - - #[test] - fn parse_unicode_class() { - assert_eq!( - parser(r"\pN").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..3), - negated: false, - kind: ast::ClassUnicodeKind::OneLetter('N'), - })) - ); - assert_eq!( - parser(r"\PN").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..3), - negated: true, - kind: ast::ClassUnicodeKind::OneLetter('N'), - })) - ); - assert_eq!( - parser(r"\p{N}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..5), - negated: false, - kind: ast::ClassUnicodeKind::Named(s("N")), - })) - ); - assert_eq!( - parser(r"\P{N}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..5), - negated: true, - kind: ast::ClassUnicodeKind::Named(s("N")), - })) - ); - assert_eq!( - parser(r"\p{Greek}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..9), - negated: false, - kind: 
ast::ClassUnicodeKind::Named(s("Greek")), - })) - ); - - assert_eq!( - parser(r"\p{scx:Katakana}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..16), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Colon, - name: s("scx"), - value: s("Katakana"), - }, - })) - ); - assert_eq!( - parser(r"\p{scx=Katakana}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..16), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Equal, - name: s("scx"), - value: s("Katakana"), - }, - })) - ); - assert_eq!( - parser(r"\p{scx!=Katakana}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..17), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::NotEqual, - name: s("scx"), - value: s("Katakana"), - }, - })) - ); - - assert_eq!( - parser(r"\p{:}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..5), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Colon, - name: s(""), - value: s(""), - }, - })) - ); - assert_eq!( - parser(r"\p{=}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..5), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::Equal, - name: s(""), - value: s(""), - }, - })) - ); - assert_eq!( - parser(r"\p{!=}").parse_escape(), - Ok(Primitive::Unicode(ast::ClassUnicode { - span: span(0..6), - negated: false, - kind: ast::ClassUnicodeKind::NamedValue { - op: ast::ClassUnicodeOpKind::NotEqual, - name: s(""), - value: s(""), - }, - })) - ); - - assert_eq!( - parser(r"\p").parse_escape().unwrap_err(), - TestError { - span: span(2..2), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\p{").parse_escape().unwrap_err(), - TestError { - span: span(3..3), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - 
assert_eq!( - parser(r"\p{N").parse_escape().unwrap_err(), - TestError { - span: span(4..4), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - assert_eq!( - parser(r"\p{Greek").parse_escape().unwrap_err(), - TestError { - span: span(8..8), - kind: ast::ErrorKind::EscapeUnexpectedEof, - } - ); - - assert_eq!( - parser(r"\pNz").parse(), - Ok(Ast::concat(ast::Concat { - span: span(0..4), - asts: vec![ - Ast::class_unicode(ast::ClassUnicode { - span: span(0..3), - negated: false, - kind: ast::ClassUnicodeKind::OneLetter('N'), - }), - Ast::literal(ast::Literal { - span: span(3..4), - kind: ast::LiteralKind::Verbatim, - c: 'z', - }), - ], - })) - ); - assert_eq!( - parser(r"\p{Greek}z").parse(), - Ok(Ast::concat(ast::Concat { - span: span(0..10), - asts: vec![ - Ast::class_unicode(ast::ClassUnicode { - span: span(0..9), - negated: false, - kind: ast::ClassUnicodeKind::Named(s("Greek")), - }), - Ast::literal(ast::Literal { - span: span(9..10), - kind: ast::LiteralKind::Verbatim, - c: 'z', - }), - ], - })) - ); - assert_eq!( - parser(r"\p\{").parse().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::UnicodeClassInvalid, - } - ); - assert_eq!( - parser(r"\P\{").parse().unwrap_err(), - TestError { - span: span(2..3), - kind: ast::ErrorKind::UnicodeClassInvalid, - } - ); - } - - #[test] - fn parse_perl_class() { - assert_eq!( - parser(r"\d").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Digit, - negated: false, - })) - ); - assert_eq!( - parser(r"\D").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Digit, - negated: true, - })) - ); - assert_eq!( - parser(r"\s").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Space, - negated: false, - })) - ); - assert_eq!( - parser(r"\S").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Space, - 
negated: true, - })) - ); - assert_eq!( - parser(r"\w").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Word, - negated: false, - })) - ); - assert_eq!( - parser(r"\W").parse_escape(), - Ok(Primitive::Perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Word, - negated: true, - })) - ); - - assert_eq!( - parser(r"\d").parse(), - Ok(Ast::class_perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Digit, - negated: false, - })) - ); - assert_eq!( - parser(r"\dz").parse(), - Ok(Ast::concat(ast::Concat { - span: span(0..3), - asts: vec![ - Ast::class_perl(ast::ClassPerl { - span: span(0..2), - kind: ast::ClassPerlKind::Digit, - negated: false, - }), - Ast::literal(ast::Literal { - span: span(2..3), - kind: ast::LiteralKind::Verbatim, - c: 'z', - }), - ], - })) - ); - } - - // This tests a bug fix where the nest limit checker wasn't decrementing - // its depth during post-traversal, which causes long regexes to trip - // the default limit too aggressively. - #[test] - fn regression_454_nest_too_big() { - let pattern = r#" - 2(?: - [45]\d{3}| - 7(?: - 1[0-267]| - 2[0-289]| - 3[0-29]| - 4[01]| - 5[1-3]| - 6[013]| - 7[0178]| - 91 - )| - 8(?: - 0[125]| - [139][1-6]| - 2[0157-9]| - 41| - 6[1-35]| - 7[1-5]| - 8[1-8]| - 90 - )| - 9(?: - 0[0-2]| - 1[0-4]| - 2[568]| - 3[3-6]| - 5[5-7]| - 6[0167]| - 7[15]| - 8[0146-9] - ) - )\d{4} - "#; - assert!(parser_nest_limit(pattern, 50).parse().is_ok()); - } - - // This tests that we treat a trailing `-` in a character class as a - // literal `-` even when whitespace mode is enabled and there is whitespace - // after the trailing `-`. 
- #[test] - fn regression_455_trailing_dash_ignore_whitespace() { - assert!(parser("(?x)[ / - ]").parse().is_ok()); - assert!(parser("(?x)[ a - ]").parse().is_ok()); - assert!(parser( - "(?x)[ - a - - ] - " - ) - .parse() - .is_ok()); - assert!(parser( - "(?x)[ - a # wat - - ] - " - ) - .parse() - .is_ok()); - - assert!(parser("(?x)[ / -").parse().is_err()); - assert!(parser("(?x)[ / - ").parse().is_err()); - assert!(parser( - "(?x)[ - / - - " - ) - .parse() - .is_err()); - assert!(parser( - "(?x)[ - / - # wat - " - ) - .parse() - .is_err()); - } -} |
