summaryrefslogtreecommitdiff
path: root/vendor/rustc_lexer/src/lib.rs
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/rustc_lexer/src/lib.rs')
-rw-r--r--vendor/rustc_lexer/src/lib.rs562
1 files changed, 562 insertions, 0 deletions
diff --git a/vendor/rustc_lexer/src/lib.rs b/vendor/rustc_lexer/src/lib.rs
new file mode 100644
index 00000000..30a5175d
--- /dev/null
+++ b/vendor/rustc_lexer/src/lib.rs
@@ -0,0 +1,562 @@
+// We want to be able to build this crate with a stable compiler, so no
+// `#![feature]` attributes should be added.
+
+mod cursor;
+pub mod unescape;
+
+use crate::cursor::{Cursor, EOF_CHAR};
+
+pub struct Token {
+ pub kind: TokenKind,
+ pub len: usize,
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum TokenKind {
+ LineComment,
+ BlockComment { terminated: bool },
+ Whitespace,
+ Ident,
+ RawIdent,
+ Literal { kind: LiteralKind, suffix_start: usize },
+ Lifetime { starts_with_number: bool },
+ Semi,
+ Comma,
+ Dot,
+ OpenParen,
+ CloseParen,
+ OpenBrace,
+ CloseBrace,
+ OpenBracket,
+ CloseBracket,
+ At,
+ Pound,
+ Tilde,
+ Question,
+ Colon,
+ Dollar,
+ Eq,
+ Not,
+ Lt,
+ Gt,
+ Minus,
+ And,
+ Or,
+ Plus,
+ Star,
+ Slash,
+ Caret,
+ Percent,
+ Unknown,
+}
+use self::TokenKind::*;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum LiteralKind {
+ Int { base: Base, empty_int: bool },
+ Float { base: Base, empty_exponent: bool },
+ Char { terminated: bool },
+ Byte { terminated: bool },
+ Str { terminated: bool },
+ ByteStr { terminated: bool },
+ RawStr { n_hashes: usize, started: bool, terminated: bool },
+ RawByteStr { n_hashes: usize, started: bool, terminated: bool },
+}
+use self::LiteralKind::*;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
+pub enum Base {
+ Binary,
+ Octal,
+ Hexadecimal,
+ Decimal,
+}
+
+impl Token {
+ fn new(kind: TokenKind, len: usize) -> Token {
+ Token { kind, len }
+ }
+}
+
+pub fn strip_shebang(input: &str) -> Option<usize> {
+ debug_assert!(!input.is_empty());
+ if !input.starts_with("#!") || input.starts_with("#![") {
+ return None;
+ }
+ Some(input.find('\n').unwrap_or(input.len()))
+}
+
+pub fn first_token(input: &str) -> Token {
+ debug_assert!(!input.is_empty());
+ Cursor::new(input).advance_token()
+}
+
+pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
+ std::iter::from_fn(move || {
+ if input.is_empty() {
+ return None;
+ }
+ let token = first_token(input);
+ input = &input[token.len..];
+ Some(token)
+ })
+}
+
+// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these
+// classes.
+
+/// True if `c` is considered a whitespace according to Rust language definition.
+pub fn is_whitespace(c: char) -> bool {
+ // This is Pattern_White_Space.
+ //
+ // Note that this set is stable (ie, it doesn't change with different
+ // Unicode versions), so it's ok to just hard-code the values.
+
+ match c {
+ // Usual ASCII suspects
+ | '\u{0009}' // \t
+ | '\u{000A}' // \n
+ | '\u{000B}' // vertical tab
+ | '\u{000C}' // form feed
+ | '\u{000D}' // \r
+ | '\u{0020}' // space
+
+ // NEXT LINE from latin1
+ | '\u{0085}'
+
+ // Bidi markers
+ | '\u{200E}' // LEFT-TO-RIGHT MARK
+ | '\u{200F}' // RIGHT-TO-LEFT MARK
+
+ // Dedicated whitespace characters from Unicode
+ | '\u{2028}' // LINE SEPARATOR
+ | '\u{2029}' // PARAGRAPH SEPARATOR
+ => true,
+ _ => false,
+ }
+}
+
+/// True if `c` is valid as a first character of an identifier.
+pub fn is_id_start(c: char) -> bool {
+ // This is XID_Start OR '_' (which formally is not a XID_Start).
+ // We also add fast-path for ascii idents
+ ('a' <= c && c <= 'z')
+ || ('A' <= c && c <= 'Z')
+ || c == '_'
+ || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
+}
+
+/// True if `c` is valid as a non-first character of an identifier.
+pub fn is_id_continue(c: char) -> bool {
+ // This is exactly XID_Continue.
+ // We also add fast-path for ascii idents
+ ('a' <= c && c <= 'z')
+ || ('A' <= c && c <= 'Z')
+ || ('0' <= c && c <= '9')
+ || c == '_'
+ || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
+}
+
+
+impl Cursor<'_> {
+ fn advance_token(&mut self) -> Token {
+ let first_char = self.bump().unwrap();
+ let token_kind = match first_char {
+ '/' => match self.nth_char(0) {
+ '/' => self.line_comment(),
+ '*' => self.block_comment(),
+ _ => Slash,
+ },
+ c if is_whitespace(c) => self.whitespace(),
+ 'r' => match (self.nth_char(0), self.nth_char(1)) {
+ ('#', c1) if is_id_start(c1) => self.raw_ident(),
+ ('#', _) | ('"', _) => {
+ let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = RawStr { n_hashes, started, terminated };
+ Literal { kind, suffix_start }
+ }
+ _ => self.ident(),
+ },
+ 'b' => match (self.nth_char(0), self.nth_char(1)) {
+ ('\'', _) => {
+ self.bump();
+ let terminated = self.single_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = Byte { terminated };
+ Literal { kind, suffix_start }
+ }
+ ('"', _) => {
+ self.bump();
+ let terminated = self.double_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = ByteStr { terminated };
+ Literal { kind, suffix_start }
+ }
+ ('r', '"') | ('r', '#') => {
+ self.bump();
+ let (n_hashes, started, terminated) = self.raw_double_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = RawByteStr { n_hashes, started, terminated };
+ Literal { kind, suffix_start }
+ }
+ _ => self.ident(),
+ },
+ c if is_id_start(c) => self.ident(),
+ c @ '0'..='9' => {
+ let literal_kind = self.number(c);
+ let suffix_start = self.len_consumed();
+ self.eat_literal_suffix();
+ TokenKind::Literal { kind: literal_kind, suffix_start }
+ }
+ ';' => Semi,
+ ',' => Comma,
+ '.' => Dot,
+ '(' => OpenParen,
+ ')' => CloseParen,
+ '{' => OpenBrace,
+ '}' => CloseBrace,
+ '[' => OpenBracket,
+ ']' => CloseBracket,
+ '@' => At,
+ '#' => Pound,
+ '~' => Tilde,
+ '?' => Question,
+ ':' => Colon,
+ '$' => Dollar,
+ '=' => Eq,
+ '!' => Not,
+ '<' => Lt,
+ '>' => Gt,
+ '-' => Minus,
+ '&' => And,
+ '|' => Or,
+ '+' => Plus,
+ '*' => Star,
+ '^' => Caret,
+ '%' => Percent,
+ '\'' => self.lifetime_or_char(),
+ '"' => {
+ let terminated = self.double_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = Str { terminated };
+ Literal { kind, suffix_start }
+ }
+ _ => Unknown,
+ };
+ Token::new(token_kind, self.len_consumed())
+ }
+
+ fn line_comment(&mut self) -> TokenKind {
+ debug_assert!(self.prev() == '/' && self.nth_char(0) == '/');
+ self.bump();
+ loop {
+ match self.nth_char(0) {
+ '\n' => break,
+ EOF_CHAR if self.is_eof() => break,
+ _ => {
+ self.bump();
+ }
+ }
+ }
+ LineComment
+ }
+
+ fn block_comment(&mut self) -> TokenKind {
+ debug_assert!(self.prev() == '/' && self.nth_char(0) == '*');
+ self.bump();
+ let mut depth = 1usize;
+ while let Some(c) = self.bump() {
+ match c {
+ '/' if self.nth_char(0) == '*' => {
+ self.bump();
+ depth += 1;
+ }
+ '*' if self.nth_char(0) == '/' => {
+ self.bump();
+ depth -= 1;
+ if depth == 0 {
+ break;
+ }
+ }
+ _ => (),
+ }
+ }
+
+ BlockComment { terminated: depth == 0 }
+ }
+
+ fn whitespace(&mut self) -> TokenKind {
+ debug_assert!(is_whitespace(self.prev()));
+ while is_whitespace(self.nth_char(0)) {
+ self.bump();
+ }
+ Whitespace
+ }
+
+ fn raw_ident(&mut self) -> TokenKind {
+ debug_assert!(
+ self.prev() == 'r'
+ && self.nth_char(0) == '#'
+ && is_id_start(self.nth_char(1))
+ );
+ self.bump();
+ self.bump();
+ while is_id_continue(self.nth_char(0)) {
+ self.bump();
+ }
+ RawIdent
+ }
+
+ fn ident(&mut self) -> TokenKind {
+ debug_assert!(is_id_start(self.prev()));
+ while is_id_continue(self.nth_char(0)) {
+ self.bump();
+ }
+ Ident
+ }
+
+ fn number(&mut self, first_digit: char) -> LiteralKind {
+ debug_assert!('0' <= self.prev() && self.prev() <= '9');
+ let mut base = Base::Decimal;
+ if first_digit == '0' {
+ let has_digits = match self.nth_char(0) {
+ 'b' => {
+ base = Base::Binary;
+ self.bump();
+ self.eat_decimal_digits()
+ }
+ 'o' => {
+ base = Base::Octal;
+ self.bump();
+ self.eat_decimal_digits()
+ }
+ 'x' => {
+ base = Base::Hexadecimal;
+ self.bump();
+ self.eat_hexadecimal_digits()
+ }
+ '0'..='9' | '_' | '.' | 'e' | 'E' => {
+ self.eat_decimal_digits();
+ true
+ }
+ // just a 0
+ _ => return Int { base, empty_int: false },
+ };
+ if !has_digits {
+ return Int { base, empty_int: true };
+ }
+ } else {
+ self.eat_decimal_digits();
+ };
+
+ match self.nth_char(0) {
+ // Don't be greedy if this is actually an
+ // integer literal followed by field/method access or a range pattern
+ // (`0..2` and `12.foo()`)
+ '.' if self.nth_char(1) != '.'
+ && !is_id_start(self.nth_char(1)) =>
+ {
+ // might have stuff after the ., and if it does, it needs to start
+ // with a number
+ self.bump();
+ let mut empty_exponent = false;
+ if self.nth_char(0).is_digit(10) {
+ self.eat_decimal_digits();
+ match self.nth_char(0) {
+ 'e' | 'E' => {
+ self.bump();
+ empty_exponent = self.float_exponent().is_err()
+ }
+ _ => (),
+ }
+ }
+ Float { base, empty_exponent }
+ }
+ 'e' | 'E' => {
+ self.bump();
+ let empty_exponent = self.float_exponent().is_err();
+ Float { base, empty_exponent }
+ }
+ _ => Int { base, empty_int: false },
+ }
+ }
+
+ fn lifetime_or_char(&mut self) -> TokenKind {
+ debug_assert!(self.prev() == '\'');
+ let mut starts_with_number = false;
+ if (is_id_start(self.nth_char(0))
+ || self.nth_char(0).is_digit(10) && {
+ starts_with_number = true;
+ true
+ })
+ && self.nth_char(1) != '\''
+ {
+ self.bump();
+ while is_id_continue(self.nth_char(0)) {
+ self.bump();
+ }
+
+ return if self.nth_char(0) == '\'' {
+ self.bump();
+ let kind = Char { terminated: true };
+ Literal { kind, suffix_start: self.len_consumed() }
+ } else {
+ Lifetime { starts_with_number }
+ };
+ }
+ let terminated = self.single_quoted_string();
+ let suffix_start = self.len_consumed();
+ if terminated {
+ self.eat_literal_suffix();
+ }
+ let kind = Char { terminated };
+ return Literal { kind, suffix_start };
+ }
+
+ fn single_quoted_string(&mut self) -> bool {
+ debug_assert!(self.prev() == '\'');
+ // parse `'''` as a single char literal
+ if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' {
+ self.bump();
+ }
+ let mut first = true;
+ loop {
+ match self.nth_char(0) {
+ '/' if !first => break,
+ '\n' if self.nth_char(1) != '\'' => break,
+ EOF_CHAR if self.is_eof() => break,
+ '\'' => {
+ self.bump();
+ return true;
+ }
+ '\\' => {
+ self.bump();
+ self.bump();
+ }
+ _ => {
+ self.bump();
+ }
+ }
+ first = false;
+ }
+ false
+ }
+
+ fn double_quoted_string(&mut self) -> bool {
+ debug_assert!(self.prev() == '"');
+ loop {
+ match self.nth_char(0) {
+ '"' => {
+ self.bump();
+ return true;
+ }
+ EOF_CHAR if self.is_eof() => return false,
+ '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => {
+ self.bump();
+ }
+ _ => (),
+ }
+ self.bump();
+ }
+ }
+
+ fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) {
+ debug_assert!(self.prev() == 'r');
+ let n_hashes = {
+ let mut acc: usize = 0;
+ loop {
+ match self.bump() {
+ Some('#') => acc += 1,
+ Some('"') => break acc,
+ None | Some(_) => return (acc, false, false),
+ }
+ }
+ };
+
+ loop {
+ match self.bump() {
+ Some('"') => {
+ let mut acc = n_hashes;
+ while self.nth_char(0) == '#' && acc > 0 {
+ self.bump();
+ acc -= 1;
+ }
+ if acc == 0 {
+ return (n_hashes, true, true);
+ }
+ }
+ Some(_) => (),
+ None => return (n_hashes, true, false),
+ }
+ }
+ }
+
+ fn eat_decimal_digits(&mut self) -> bool {
+ let mut has_digits = false;
+ loop {
+ match self.nth_char(0) {
+ '_' => {
+ self.bump();
+ }
+ '0'..='9' => {
+ has_digits = true;
+ self.bump();
+ }
+ _ => break,
+ }
+ }
+ has_digits
+ }
+
+ fn eat_hexadecimal_digits(&mut self) -> bool {
+ let mut has_digits = false;
+ loop {
+ match self.nth_char(0) {
+ '_' => {
+ self.bump();
+ }
+ '0'..='9' | 'a'..='f' | 'A'..='F' => {
+ has_digits = true;
+ self.bump();
+ }
+ _ => break,
+ }
+ }
+ has_digits
+ }
+
+ fn float_exponent(&mut self) -> Result<(), ()> {
+ debug_assert!(self.prev() == 'e' || self.prev() == 'E');
+ if self.nth_char(0) == '-' || self.nth_char(0) == '+' {
+ self.bump();
+ }
+ if self.eat_decimal_digits() { Ok(()) } else { Err(()) }
+ }
+
+ fn eat_literal_suffix(&mut self) {
+ if !is_id_start(self.nth_char(0)) {
+ return;
+ }
+ self.bump();
+
+ while is_id_continue(self.nth_char(0)) {
+ self.bump();
+ }
+ }
+}