diff options
Diffstat (limited to 'vendor/rustc_lexer/src/lib.rs')
| -rw-r--r-- | vendor/rustc_lexer/src/lib.rs | 562 |
1 files changed, 562 insertions, 0 deletions
diff --git a/vendor/rustc_lexer/src/lib.rs b/vendor/rustc_lexer/src/lib.rs new file mode 100644 index 00000000..30a5175d --- /dev/null +++ b/vendor/rustc_lexer/src/lib.rs @@ -0,0 +1,562 @@ +// We want to be able to build this crate with a stable compiler, so no +// `#![feature]` attributes should be added. + +mod cursor; +pub mod unescape; + +use crate::cursor::{Cursor, EOF_CHAR}; + +pub struct Token { + pub kind: TokenKind, + pub len: usize, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TokenKind { + LineComment, + BlockComment { terminated: bool }, + Whitespace, + Ident, + RawIdent, + Literal { kind: LiteralKind, suffix_start: usize }, + Lifetime { starts_with_number: bool }, + Semi, + Comma, + Dot, + OpenParen, + CloseParen, + OpenBrace, + CloseBrace, + OpenBracket, + CloseBracket, + At, + Pound, + Tilde, + Question, + Colon, + Dollar, + Eq, + Not, + Lt, + Gt, + Minus, + And, + Or, + Plus, + Star, + Slash, + Caret, + Percent, + Unknown, +} +use self::TokenKind::*; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + Int { base: Base, empty_int: bool }, + Float { base: Base, empty_exponent: bool }, + Char { terminated: bool }, + Byte { terminated: bool }, + Str { terminated: bool }, + ByteStr { terminated: bool }, + RawStr { n_hashes: usize, started: bool, terminated: bool }, + RawByteStr { n_hashes: usize, started: bool, terminated: bool }, +} +use self::LiteralKind::*; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + Binary, + Octal, + Hexadecimal, + Decimal, +} + +impl Token { + fn new(kind: TokenKind, len: usize) -> Token { + Token { kind, len } + } +} + +pub fn strip_shebang(input: &str) -> Option<usize> { + debug_assert!(!input.is_empty()); + if !input.starts_with("#!") || input.starts_with("#![") { + return None; + } + Some(input.find('\n').unwrap_or(input.len())) +} + +pub fn first_token(input: &str) -> Token { + debug_assert!(!input.is_empty()); + Cursor::new(input).advance_token() +} + +pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { + std::iter::from_fn(move || { + if input.is_empty() { + return None; + } + let token = first_token(input); + input = &input[token.len..]; + Some(token) + }) +} + +// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these +// classes. + +/// True if `c` is considered a whitespace according to Rust language definition. +pub fn is_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + match c { + // Usual ASCII suspects + | '\u{0009}' // \t + | '\u{000A}' // \n + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + | '\u{000D}' // \r + | '\u{0020}' // space + + // NEXT LINE from latin1 + | '\u{0085}' + + // Bidi markers + | '\u{200E}' // LEFT-TO-RIGHT MARK + | '\u{200F}' // RIGHT-TO-LEFT MARK + + // Dedicated whitespace characters from Unicode + | '\u{2028}' // LINE SEPARATOR + | '\u{2029}' // PARAGRAPH SEPARATOR + => true, + _ => false, + } +} + +/// True if `c` is valid as a first character of an identifier. +pub fn is_id_start(c: char) -> bool { + // This is XID_Start OR '_' (which formally is not a XID_Start). + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) +} + +/// True if `c` is valid as a non-first character of an identifier. +pub fn is_id_continue(c: char) -> bool { + // This is exactly XID_Continue. + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) +} + + +impl Cursor<'_> { + fn advance_token(&mut self) -> Token { + let first_char = self.bump().unwrap(); + let token_kind = match first_char { + '/' => match self.nth_char(0) { + '/' => self.line_comment(), + '*' => self.block_comment(), + _ => Slash, + }, + c if is_whitespace(c) => self.whitespace(), + 'r' => match (self.nth_char(0), self.nth_char(1)) { + ('#', c1) if is_id_start(c1) => self.raw_ident(), + ('#', _) | ('"', _) => { + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + 'b' => match (self.nth_char(0), self.nth_char(1)) { + ('\'', _) => { + self.bump(); + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Byte { terminated }; + Literal { kind, suffix_start } + } + ('"', _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = ByteStr { terminated }; + Literal { kind, suffix_start } + } + ('r', '"') | ('r', '#') => { + self.bump(); + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawByteStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + c if is_id_start(c) => self.ident(), + c @ '0'..='9' => { + let literal_kind = self.number(c); + let suffix_start = self.len_consumed(); + self.eat_literal_suffix(); + TokenKind::Literal { kind: literal_kind, suffix_start } + } + ';' => Semi, + ',' => Comma, + '.' => Dot, + '(' => OpenParen, + ')' => CloseParen, + '{' => OpenBrace, + '}' => CloseBrace, + '[' => OpenBracket, + ']' => CloseBracket, + '@' => At, + '#' => Pound, + '~' => Tilde, + '?' => Question, + ':' => Colon, + '$' => Dollar, + '=' => Eq, + '!' => Not, + '<' => Lt, + '>' => Gt, + '-' => Minus, + '&' => And, + '|' => Or, + '+' => Plus, + '*' => Star, + '^' => Caret, + '%' => Percent, + '\'' => self.lifetime_or_char(), + '"' => { + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Str { terminated }; + Literal { kind, suffix_start } + } + _ => Unknown, + }; + Token::new(token_kind, self.len_consumed()) + } + + fn line_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '/'); + self.bump(); + loop { + match self.nth_char(0) { + '\n' => break, + EOF_CHAR if self.is_eof() => break, + _ => { + self.bump(); + } + } + } + LineComment + } + + fn block_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '*'); + self.bump(); + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.nth_char(0) == '*' => { + self.bump(); + depth += 1; + } + '*' if self.nth_char(0) == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + break; + } + } + _ => (), + } + } + + BlockComment { terminated: depth == 0 } + } + + fn whitespace(&mut self) -> TokenKind { + debug_assert!(is_whitespace(self.prev())); + while is_whitespace(self.nth_char(0)) { + self.bump(); + } + Whitespace + } + + fn raw_ident(&mut self) -> TokenKind { + debug_assert!( + self.prev() == 'r' + && self.nth_char(0) == '#' + && is_id_start(self.nth_char(1)) + ); + self.bump(); + self.bump(); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + RawIdent + } + + fn ident(&mut self) -> TokenKind { + debug_assert!(is_id_start(self.prev())); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + Ident + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + debug_assert!('0' <= self.prev() && self.prev() <= '9'); + let mut base = Base::Decimal; + if first_digit == '0' { + let has_digits = match self.nth_char(0) { + 'b' => { + base = Base::Binary; + self.bump(); + self.eat_decimal_digits() + } + 'o' => { + base = Base::Octal; + self.bump(); + self.eat_decimal_digits() + } + 'x' => { + base = Base::Hexadecimal; + self.bump(); + self.eat_hexadecimal_digits() + } + '0'..='9' | '_' | '.' | 'e' | 'E' => { + self.eat_decimal_digits(); + true + } + // just a 0 + _ => return Int { base, empty_int: false }, + }; + if !has_digits { + return Int { base, empty_int: true }; + } + } else { + self.eat_decimal_digits(); + }; + + match self.nth_char(0) { + // Don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + '.' if self.nth_char(1) != '.' + && !is_id_start(self.nth_char(1)) => + { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.nth_char(0).is_digit(10) { + self.eat_decimal_digits(); + match self.nth_char(0) { + 'e' | 'E' => { + self.bump(); + empty_exponent = self.float_exponent().is_err() + } + _ => (), + } + } + Float { base, empty_exponent } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = self.float_exponent().is_err(); + Float { base, empty_exponent } + } + _ => Int { base, empty_int: false }, + } + } + + fn lifetime_or_char(&mut self) -> TokenKind { + debug_assert!(self.prev() == '\''); + let mut starts_with_number = false; + if (is_id_start(self.nth_char(0)) + || self.nth_char(0).is_digit(10) && { + starts_with_number = true; + true + }) + && self.nth_char(1) != '\'' + { + self.bump(); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + + return if self.nth_char(0) == '\'' { + self.bump(); + let kind = Char { terminated: true }; + Literal { kind, suffix_start: self.len_consumed() } + } else { + Lifetime { starts_with_number } + }; + } + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Char { terminated }; + return Literal { kind, suffix_start }; + } + + fn single_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '\''); + // parse `'''` as a single char literal + if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { + self.bump(); + } + let mut first = true; + loop { + match self.nth_char(0) { + '/' if !first => break, + '\n' if self.nth_char(1) != '\'' => break, + EOF_CHAR if self.is_eof() => break, + '\'' => { + self.bump(); + return true; + } + '\\' => { + self.bump(); + self.bump(); + } + _ => { + self.bump(); + } + } + first = false; + } + false + } + + fn double_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '"'); + loop { + match self.nth_char(0) { + '"' => { + self.bump(); + return true; + } + EOF_CHAR if self.is_eof() => return false, + '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => { + self.bump(); + } + _ => (), + } + self.bump(); + } + } + + fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { + debug_assert!(self.prev() == 'r'); + let n_hashes = { + let mut acc: usize = 0; + loop { + match self.bump() { + Some('#') => acc += 1, + Some('"') => break acc, + None | Some(_) => return (acc, false, false), + } + } + }; + + loop { + match self.bump() { + Some('"') => { + let mut acc = n_hashes; + while self.nth_char(0) == '#' && acc > 0 { + self.bump(); + acc -= 1; + } + if acc == 0 { + return (n_hashes, true, true); + } + } + Some(_) => (), + None => return (n_hashes, true, false), + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn float_exponent(&mut self) -> Result<(), ()> { + debug_assert!(self.prev() == 'e' || self.prev() == 'E'); + if self.nth_char(0) == '-' || self.nth_char(0) == '+' { + self.bump(); + } + if self.eat_decimal_digits() { Ok(()) } else { Err(()) } + } + + fn eat_literal_suffix(&mut self) { + if !is_id_start(self.nth_char(0)) { + return; + } + self.bump(); + + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + } +} |
