diff options
Diffstat (limited to 'vendor/rustc_lexer')
| -rw-r--r-- | vendor/rustc_lexer/.cargo-checksum.json | 1 | ||||
| -rw-r--r-- | vendor/rustc_lexer/Cargo.toml | 26 | ||||
| -rw-r--r-- | vendor/rustc_lexer/src/cursor.rs | 57 | ||||
| -rw-r--r-- | vendor/rustc_lexer/src/lib.rs | 562 | ||||
| -rw-r--r-- | vendor/rustc_lexer/src/unescape.rs | 305 | ||||
| -rw-r--r-- | vendor/rustc_lexer/src/unescape/tests.rs | 271 |
6 files changed, 1222 insertions, 0 deletions
diff --git a/vendor/rustc_lexer/.cargo-checksum.json b/vendor/rustc_lexer/.cargo-checksum.json new file mode 100644 index 00000000..04e82318 --- /dev/null +++ b/vendor/rustc_lexer/.cargo-checksum.json @@ -0,0 +1 @@ +{"files":{"Cargo.toml":"9b7e100fce6f31d499bdb9c6eb7acccb19d8d561634e4ae7112cff3d536afa7c","src/cursor.rs":"8c442ec1f16870c013f170f8a4967a011c86ac5a75cefc23127726b36a848f51","src/lib.rs":"e75190c347bd8574d145e120cf3f56abe83f86639e85c80d7b4f84dab540d0b8","src/unescape.rs":"a782961eacfa5daf4f8cbd18a82fc33cd9fae2caca6363e5bedbc78acd8f1ae1","src/unescape/tests.rs":"4def6c86f7a63a50a8740dc11617671ac3934d72d0df16b98a4d328105eac711"},"package":"c86aae0c77166108c01305ee1a36a1e77289d7dc6ca0a3cd91ff4992de2d16a5"}
\ No newline at end of file diff --git a/vendor/rustc_lexer/Cargo.toml b/vendor/rustc_lexer/Cargo.toml new file mode 100644 index 00000000..aa487d44 --- /dev/null +++ b/vendor/rustc_lexer/Cargo.toml @@ -0,0 +1,26 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies +# +# If you believe there's an error in this file please file an +# issue against the rust-lang/cargo repository. If you're +# editing this file be aware that the upstream Cargo.toml +# will likely look very different (and much more reasonable) + +[package] +edition = "2018" +name = "rustc_lexer" +version = "0.1.0" +authors = ["The Rust Project Developers"] +description = "Rust lexer used by rustc. No stability guarantees are provided.\n" +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/rust/" + +[lib] +name = "rustc_lexer" +doctest = false +[dependencies.unicode-xid] +version = "0.2.0" diff --git a/vendor/rustc_lexer/src/cursor.rs b/vendor/rustc_lexer/src/cursor.rs new file mode 100644 index 00000000..5831159c --- /dev/null +++ b/vendor/rustc_lexer/src/cursor.rs @@ -0,0 +1,57 @@ +use std::str::Chars; + +pub(crate) struct Cursor<'a> { + initial_len: usize, + chars: Chars<'a>, + #[cfg(debug_assertions)] + prev: char, +} + +pub(crate) const EOF_CHAR: char = '\0'; + +impl<'a> Cursor<'a> { + pub(crate) fn new(input: &'a str) -> Cursor<'a> { + Cursor { + initial_len: input.len(), + chars: input.chars(), + #[cfg(debug_assertions)] + prev: EOF_CHAR, + } + } + /// For debug assertions only + pub(crate) fn prev(&self) -> char { + #[cfg(debug_assertions)] + { + self.prev + } + + #[cfg(not(debug_assertions))] + { + '\0' + } + } + pub(crate) fn nth_char(&self, n: usize) -> char { + self.chars().nth(n).unwrap_or(EOF_CHAR) + } + pub(crate) fn is_eof(&self) -> bool { + self.chars.as_str().is_empty() + } + pub(crate) fn len_consumed(&self) -> usize { + self.initial_len - self.chars.as_str().len() + } + /// Returns an iterator over the remaining characters. + fn chars(&self) -> Chars<'a> { + self.chars.clone() + } + /// Moves to the next character. + pub(crate) fn bump(&mut self) -> Option<char> { + let c = self.chars.next()?; + + #[cfg(debug_assertions)] + { + self.prev = c; + } + + Some(c) + } +} diff --git a/vendor/rustc_lexer/src/lib.rs b/vendor/rustc_lexer/src/lib.rs new file mode 100644 index 00000000..30a5175d --- /dev/null +++ b/vendor/rustc_lexer/src/lib.rs @@ -0,0 +1,562 @@ +// We want to be able to build this crate with a stable compiler, so no +// `#![feature]` attributes should be added. + +mod cursor; +pub mod unescape; + +use crate::cursor::{Cursor, EOF_CHAR}; + +pub struct Token { + pub kind: TokenKind, + pub len: usize, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum TokenKind { + LineComment, + BlockComment { terminated: bool }, + Whitespace, + Ident, + RawIdent, + Literal { kind: LiteralKind, suffix_start: usize }, + Lifetime { starts_with_number: bool }, + Semi, + Comma, + Dot, + OpenParen, + CloseParen, + OpenBrace, + CloseBrace, + OpenBracket, + CloseBracket, + At, + Pound, + Tilde, + Question, + Colon, + Dollar, + Eq, + Not, + Lt, + Gt, + Minus, + And, + Or, + Plus, + Star, + Slash, + Caret, + Percent, + Unknown, +} +use self::TokenKind::*; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum LiteralKind { + Int { base: Base, empty_int: bool }, + Float { base: Base, empty_exponent: bool }, + Char { terminated: bool }, + Byte { terminated: bool }, + Str { terminated: bool }, + ByteStr { terminated: bool }, + RawStr { n_hashes: usize, started: bool, terminated: bool }, + RawByteStr { n_hashes: usize, started: bool, terminated: bool }, +} +use self::LiteralKind::*; + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Base { + Binary, + Octal, + Hexadecimal, + Decimal, +} + +impl Token { + fn new(kind: TokenKind, len: usize) -> Token { + Token { kind, len } + } +} + +pub fn strip_shebang(input: &str) -> Option<usize> { + debug_assert!(!input.is_empty()); + if !input.starts_with("#!") || input.starts_with("#![") { + return None; + } + Some(input.find('\n').unwrap_or(input.len())) +} + +pub fn first_token(input: &str) -> Token { + debug_assert!(!input.is_empty()); + Cursor::new(input).advance_token() +} + +pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ { + std::iter::from_fn(move || { + if input.is_empty() { + return None; + } + let token = first_token(input); + input = &input[token.len..]; + Some(token) + }) +} + +// See [UAX #31](http://unicode.org/reports/tr31) for definitions of these +// classes. + +/// True if `c` is considered a whitespace according to Rust language definition. +pub fn is_whitespace(c: char) -> bool { + // This is Pattern_White_Space. + // + // Note that this set is stable (ie, it doesn't change with different + // Unicode versions), so it's ok to just hard-code the values. + + match c { + // Usual ASCII suspects + | '\u{0009}' // \t + | '\u{000A}' // \n + | '\u{000B}' // vertical tab + | '\u{000C}' // form feed + | '\u{000D}' // \r + | '\u{0020}' // space + + // NEXT LINE from latin1 + | '\u{0085}' + + // Bidi markers + | '\u{200E}' // LEFT-TO-RIGHT MARK + | '\u{200F}' // RIGHT-TO-LEFT MARK + + // Dedicated whitespace characters from Unicode + | '\u{2028}' // LINE SEPARATOR + | '\u{2029}' // PARAGRAPH SEPARATOR + => true, + _ => false, + } +} + +/// True if `c` is valid as a first character of an identifier. +pub fn is_id_start(c: char) -> bool { + // This is XID_Start OR '_' (which formally is not a XID_Start). + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c)) +} + +/// True if `c` is valid as a non-first character of an identifier. +pub fn is_id_continue(c: char) -> bool { + // This is exactly XID_Continue. + // We also add fast-path for ascii idents + ('a' <= c && c <= 'z') + || ('A' <= c && c <= 'Z') + || ('0' <= c && c <= '9') + || c == '_' + || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c)) +} + + +impl Cursor<'_> { + fn advance_token(&mut self) -> Token { + let first_char = self.bump().unwrap(); + let token_kind = match first_char { + '/' => match self.nth_char(0) { + '/' => self.line_comment(), + '*' => self.block_comment(), + _ => Slash, + }, + c if is_whitespace(c) => self.whitespace(), + 'r' => match (self.nth_char(0), self.nth_char(1)) { + ('#', c1) if is_id_start(c1) => self.raw_ident(), + ('#', _) | ('"', _) => { + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + 'b' => match (self.nth_char(0), self.nth_char(1)) { + ('\'', _) => { + self.bump(); + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Byte { terminated }; + Literal { kind, suffix_start } + } + ('"', _) => { + self.bump(); + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = ByteStr { terminated }; + Literal { kind, suffix_start } + } + ('r', '"') | ('r', '#') => { + self.bump(); + let (n_hashes, started, terminated) = self.raw_double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = RawByteStr { n_hashes, started, terminated }; + Literal { kind, suffix_start } + } + _ => self.ident(), + }, + c if is_id_start(c) => self.ident(), + c @ '0'..='9' => { + let literal_kind = self.number(c); + let suffix_start = self.len_consumed(); + self.eat_literal_suffix(); + TokenKind::Literal { kind: literal_kind, suffix_start } + } + ';' => Semi, + ',' => Comma, + '.' => Dot, + '(' => OpenParen, + ')' => CloseParen, + '{' => OpenBrace, + '}' => CloseBrace, + '[' => OpenBracket, + ']' => CloseBracket, + '@' => At, + '#' => Pound, + '~' => Tilde, + '?' => Question, + ':' => Colon, + '$' => Dollar, + '=' => Eq, + '!' => Not, + '<' => Lt, + '>' => Gt, + '-' => Minus, + '&' => And, + '|' => Or, + '+' => Plus, + '*' => Star, + '^' => Caret, + '%' => Percent, + '\'' => self.lifetime_or_char(), + '"' => { + let terminated = self.double_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Str { terminated }; + Literal { kind, suffix_start } + } + _ => Unknown, + }; + Token::new(token_kind, self.len_consumed()) + } + + fn line_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '/'); + self.bump(); + loop { + match self.nth_char(0) { + '\n' => break, + EOF_CHAR if self.is_eof() => break, + _ => { + self.bump(); + } + } + } + LineComment + } + + fn block_comment(&mut self) -> TokenKind { + debug_assert!(self.prev() == '/' && self.nth_char(0) == '*'); + self.bump(); + let mut depth = 1usize; + while let Some(c) = self.bump() { + match c { + '/' if self.nth_char(0) == '*' => { + self.bump(); + depth += 1; + } + '*' if self.nth_char(0) == '/' => { + self.bump(); + depth -= 1; + if depth == 0 { + break; + } + } + _ => (), + } + } + + BlockComment { terminated: depth == 0 } + } + + fn whitespace(&mut self) -> TokenKind { + debug_assert!(is_whitespace(self.prev())); + while is_whitespace(self.nth_char(0)) { + self.bump(); + } + Whitespace + } + + fn raw_ident(&mut self) -> TokenKind { + debug_assert!( + self.prev() == 'r' + && self.nth_char(0) == '#' + && is_id_start(self.nth_char(1)) + ); + self.bump(); + self.bump(); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + RawIdent + } + + fn ident(&mut self) -> TokenKind { + debug_assert!(is_id_start(self.prev())); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + Ident + } + + fn number(&mut self, first_digit: char) -> LiteralKind { + debug_assert!('0' <= self.prev() && self.prev() <= '9'); + let mut base = Base::Decimal; + if first_digit == '0' { + let has_digits = match self.nth_char(0) { + 'b' => { + base = Base::Binary; + self.bump(); + self.eat_decimal_digits() + } + 'o' => { + base = Base::Octal; + self.bump(); + self.eat_decimal_digits() + } + 'x' => { + base = Base::Hexadecimal; + self.bump(); + self.eat_hexadecimal_digits() + } + '0'..='9' | '_' | '.' | 'e' | 'E' => { + self.eat_decimal_digits(); + true + } + // just a 0 + _ => return Int { base, empty_int: false }, + }; + if !has_digits { + return Int { base, empty_int: true }; + } + } else { + self.eat_decimal_digits(); + }; + + match self.nth_char(0) { + // Don't be greedy if this is actually an + // integer literal followed by field/method access or a range pattern + // (`0..2` and `12.foo()`) + '.' if self.nth_char(1) != '.' + && !is_id_start(self.nth_char(1)) => + { + // might have stuff after the ., and if it does, it needs to start + // with a number + self.bump(); + let mut empty_exponent = false; + if self.nth_char(0).is_digit(10) { + self.eat_decimal_digits(); + match self.nth_char(0) { + 'e' | 'E' => { + self.bump(); + empty_exponent = self.float_exponent().is_err() + } + _ => (), + } + } + Float { base, empty_exponent } + } + 'e' | 'E' => { + self.bump(); + let empty_exponent = self.float_exponent().is_err(); + Float { base, empty_exponent } + } + _ => Int { base, empty_int: false }, + } + } + + fn lifetime_or_char(&mut self) -> TokenKind { + debug_assert!(self.prev() == '\''); + let mut starts_with_number = false; + if (is_id_start(self.nth_char(0)) + || self.nth_char(0).is_digit(10) && { + starts_with_number = true; + true + }) + && self.nth_char(1) != '\'' + { + self.bump(); + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + + return if self.nth_char(0) == '\'' { + self.bump(); + let kind = Char { terminated: true }; + Literal { kind, suffix_start: self.len_consumed() } + } else { + Lifetime { starts_with_number } + }; + } + let terminated = self.single_quoted_string(); + let suffix_start = self.len_consumed(); + if terminated { + self.eat_literal_suffix(); + } + let kind = Char { terminated }; + return Literal { kind, suffix_start }; + } + + fn single_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '\''); + // parse `'''` as a single char literal + if self.nth_char(0) == '\'' && self.nth_char(1) == '\'' { + self.bump(); + } + let mut first = true; + loop { + match self.nth_char(0) { + '/' if !first => break, + '\n' if self.nth_char(1) != '\'' => break, + EOF_CHAR if self.is_eof() => break, + '\'' => { + self.bump(); + return true; + } + '\\' => { + self.bump(); + self.bump(); + } + _ => { + self.bump(); + } + } + first = false; + } + false + } + + fn double_quoted_string(&mut self) -> bool { + debug_assert!(self.prev() == '"'); + loop { + match self.nth_char(0) { + '"' => { + self.bump(); + return true; + } + EOF_CHAR if self.is_eof() => return false, + '\\' if self.nth_char(1) == '\\' || self.nth_char(1) == '"' => { + self.bump(); + } + _ => (), + } + self.bump(); + } + } + + fn raw_double_quoted_string(&mut self) -> (usize, bool, bool) { + debug_assert!(self.prev() == 'r'); + let n_hashes = { + let mut acc: usize = 0; + loop { + match self.bump() { + Some('#') => acc += 1, + Some('"') => break acc, + None | Some(_) => return (acc, false, false), + } + } + }; + + loop { + match self.bump() { + Some('"') => { + let mut acc = n_hashes; + while self.nth_char(0) == '#' && acc > 0 { + self.bump(); + acc -= 1; + } + if acc == 0 { + return (n_hashes, true, true); + } + } + Some(_) => (), + None => return (n_hashes, true, false), + } + } + } + + fn eat_decimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn eat_hexadecimal_digits(&mut self) -> bool { + let mut has_digits = false; + loop { + match self.nth_char(0) { + '_' => { + self.bump(); + } + '0'..='9' | 'a'..='f' | 'A'..='F' => { + has_digits = true; + self.bump(); + } + _ => break, + } + } + has_digits + } + + fn float_exponent(&mut self) -> Result<(), ()> { + debug_assert!(self.prev() == 'e' || self.prev() == 'E'); + if self.nth_char(0) == '-' || self.nth_char(0) == '+' { + self.bump(); + } + if self.eat_decimal_digits() { Ok(()) } else { Err(()) } + } + + fn eat_literal_suffix(&mut self) { + if !is_id_start(self.nth_char(0)) { + return; + } + self.bump(); + + while is_id_continue(self.nth_char(0)) { + self.bump(); + } + } +} diff --git a/vendor/rustc_lexer/src/unescape.rs b/vendor/rustc_lexer/src/unescape.rs new file mode 100644 index 00000000..c709b752 --- /dev/null +++ b/vendor/rustc_lexer/src/unescape.rs @@ -0,0 +1,305 @@ +//! Utilities for validating string and char literals and turning them into +//! values they represent. + +use std::str::Chars; +use std::ops::Range; + +#[cfg(test)] +mod tests; + +#[derive(Debug, PartialEq, Eq)] +pub enum EscapeError { + ZeroChars, + MoreThanOneChar, + + LoneSlash, + InvalidEscape, + BareCarriageReturn, + BareCarriageReturnInRawString, + EscapeOnlyChar, + + TooShortHexEscape, + InvalidCharInHexEscape, + OutOfRangeHexEscape, + + NoBraceInUnicodeEscape, + InvalidCharInUnicodeEscape, + EmptyUnicodeEscape, + UnclosedUnicodeEscape, + LeadingUnderscoreUnicodeEscape, + OverlongUnicodeEscape, + LoneSurrogateUnicodeEscape, + OutOfRangeUnicodeEscape, + + UnicodeEscapeInByte, + NonAsciiCharInByte, + NonAsciiCharInByteString, +} + +/// Takes a contents of a char literal (without quotes), and returns an +/// unescaped char or an error +pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. +pub fn unescape_str<F>(literal_text: &str, callback: &mut F) +where + F: FnMut(Range<usize>, Result<char, EscapeError>), +{ + unescape_str_or_byte_str(literal_text, Mode::Str, callback) +} + +pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> { + let mut chars = literal_text.chars(); + unescape_char_or_byte(&mut chars, Mode::Byte) + .map(byte_from_char) + .map_err(|err| (literal_text.len() - chars.as_str().len(), err)) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. +pub fn unescape_byte_str<F>(literal_text: &str, callback: &mut F) +where + F: FnMut(Range<usize>, Result<u8, EscapeError>), +{ + unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub fn unescape_raw_str<F>(literal_text: &str, callback: &mut F) +where + F: FnMut(Range<usize>, Result<char, EscapeError>), +{ + unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +pub fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F) +where + F: FnMut(Range<usize>, Result<u8, EscapeError>), +{ + unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| { + callback(range, char.map(byte_from_char)) + }) +} + +#[derive(Debug, Clone, Copy)] +pub enum Mode { + Char, + Str, + Byte, + ByteStr, +} + +impl Mode { + pub fn in_single_quotes(self) -> bool { + match self { + Mode::Char | Mode::Byte => true, + Mode::Str | Mode::ByteStr => false, + } + } + + pub fn in_double_quotes(self) -> bool { + !self.in_single_quotes() + } + + pub fn is_bytes(self) -> bool { + match self { + Mode::Byte | Mode::ByteStr => true, + Mode::Char | Mode::Str => false, + } + } +} + + +fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { + if first_char != '\\' { + return match first_char { + '\t' | '\n' => Err(EscapeError::EscapeOnlyChar), + '\r' => Err(EscapeError::BareCarriageReturn), + '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar), + '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar), + _ => { + if mode.is_bytes() && !first_char.is_ascii() { + return Err(EscapeError::NonAsciiCharInByte); + } + Ok(first_char) + } + }; + } + + let second_char = chars.next().ok_or(EscapeError::LoneSlash)?; + + let res = match second_char { + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', + + 'x' => { + let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; + let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; + + let value = hi * 16 + lo; + + if !mode.is_bytes() && !is_ascii(value) { + return Err(EscapeError::OutOfRangeHexEscape); + } + let value = value as u8; + + value as char + } + + 'u' => { + if chars.next() != Some('{') { + return Err(EscapeError::NoBraceInUnicodeEscape); + } + + let mut n_digits = 1; + let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? { + '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape), + '}' => return Err(EscapeError::EmptyUnicodeEscape), + c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?, + }; + + loop { + match chars.next() { + None => return Err(EscapeError::UnclosedUnicodeEscape), + Some('_') => continue, + Some('}') => { + if n_digits > 6 { + return Err(EscapeError::OverlongUnicodeEscape); + } + if mode.is_bytes() { + return Err(EscapeError::UnicodeEscapeInByte); + } + + break std::char::from_u32(value).ok_or_else(|| { + if value > 0x10FFFF { + EscapeError::OutOfRangeUnicodeEscape + } else { + EscapeError::LoneSurrogateUnicodeEscape + } + })?; + } + Some(c) => { + let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?; + n_digits += 1; + if n_digits > 6 { + continue; + } + let digit = digit as u32; + value = value * 16 + digit; + } + }; + } + } + _ => return Err(EscapeError::InvalidEscape), + }; + Ok(res) +} + +fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> { + let first_char = chars.next().ok_or(EscapeError::ZeroChars)?; + let res = scan_escape(first_char, chars, mode)?; + if chars.next().is_some() { + return Err(EscapeError::MoreThanOneChar); + } + Ok(res) +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of escaped characters or errors. +fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range<usize>, Result<char, EscapeError>), +{ + assert!(mode.in_double_quotes()); + let initial_len = src.len(); + let mut chars = src.chars(); + while let Some(first_char) = chars.next() { + let start = initial_len - chars.as_str().len() - first_char.len_utf8(); + + let unescaped_char = match first_char { + '\\' => { + let second_char = chars.clone().next(); + match second_char { + Some('\n') => { + skip_ascii_whitespace(&mut chars); + continue; + } + _ => scan_escape(first_char, &mut chars, mode), + } + } + '\n' => Ok('\n'), + '\t' => Ok('\t'), + _ => scan_escape(first_char, &mut chars, mode), + }; + let end = initial_len - chars.as_str().len(); + callback(start..end, unescaped_char); + } + + fn skip_ascii_whitespace(chars: &mut Chars<'_>) { + let str = chars.as_str(); + let first_non_space = str + .bytes() + .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r') + .unwrap_or(str.len()); + *chars = str[first_non_space..].chars() + } +} + +/// Takes a contents of a string literal (without quotes) and produces a +/// sequence of characters or errors. +/// NOTE: Raw strings do not perform any explicit character escaping, here we +/// only translate CRLF to LF and produce errors on bare CR. +fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F) +where + F: FnMut(Range<usize>, Result<char, EscapeError>), +{ + assert!(mode.in_double_quotes()); + let initial_len = literal_text.len(); + + let mut chars = literal_text.chars(); + while let Some(curr) = chars.next() { + let start = initial_len - chars.as_str().len() - curr.len_utf8(); + + let result = match curr { + '\r' => Err(EscapeError::BareCarriageReturnInRawString), + c if mode.is_bytes() && !c.is_ascii() => + Err(EscapeError::NonAsciiCharInByteString), + c => Ok(c), + }; + let end = initial_len - chars.as_str().len(); + + callback(start..end, result); + } +} + +fn byte_from_char(c: char) -> u8 { + let res = c as u32; + assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)"); + res as u8 +} + +fn is_ascii(x: u32) -> bool { + x <= 0x7F +} diff --git a/vendor/rustc_lexer/src/unescape/tests.rs b/vendor/rustc_lexer/src/unescape/tests.rs new file mode 100644 index 00000000..e7b1ff64 --- /dev/null +++ b/vendor/rustc_lexer/src/unescape/tests.rs @@ -0,0 +1,271 @@ +use super::*; + +#[test] +fn test_unescape_char_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + check(r"\u{0}x", EscapeError::MoreThanOneChar); + check(r"\u{1F63b}}", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + check("\\\r", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + check(r"\xff", EscapeError::OutOfRangeHexEscape); + check(r"\xFF", EscapeError::OutOfRangeHexEscape); + check(r"\x80", EscapeError::OutOfRangeHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape); + + check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape); + + check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape); + check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape); +} + +#[test] +fn test_unescape_char_good() { + fn check(literal_text: &str, expected_char: char) { + let actual_result = unescape_char(literal_text); + assert_eq!(actual_result, Ok(expected_char)); + } + + check("a", 'a'); + check("ы", 'ы'); + check("🦀", '🦀'); + + check(r#"\""#, '"'); + check(r"\n", '\n'); + check(r"\r", '\r'); + check(r"\t", '\t'); + check(r"\\", '\\'); + check(r"\'", '\''); + check(r"\0", '\0'); + + check(r"\x00", '\0'); + check(r"\x5a", 'Z'); + check(r"\x5A", 'Z'); + check(r"\x7f", 127 as char); + + check(r"\u{0}", '\0'); + check(r"\u{000000}", '\0'); + check(r"\u{41}", 'A'); + check(r"\u{0041}", 'A'); + check(r"\u{00_41}", 'A'); + check(r"\u{4__1__}", 'A'); + check(r"\u{1F63b}", '😻'); +} + +#[test] +fn test_unescape_str_good() { + fn check(literal_text: &str, expected: &str) { + let mut buf = Ok(String::with_capacity(literal_text.len())); + unescape_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", "foo"); + check("", ""); + check(" \t\n", " \t\n"); + + check("hello \\\n world", "hello world"); + check("thread's", "thread's") +} + +#[test] +fn test_unescape_byte_bad() { + fn check(literal_text: &str, expected_error: EscapeError) { + let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err); + assert_eq!(actual_result, Err(expected_error)); + } + + check("", EscapeError::ZeroChars); + check(r"\", EscapeError::LoneSlash); + + check("\n", EscapeError::EscapeOnlyChar); + check("\t", EscapeError::EscapeOnlyChar); + check("'", EscapeError::EscapeOnlyChar); + check("\r", EscapeError::BareCarriageReturn); + + check("spam", EscapeError::MoreThanOneChar); + check(r"\x0ff", EscapeError::MoreThanOneChar); + check(r#"\"a"#, EscapeError::MoreThanOneChar); + check(r"\na", EscapeError::MoreThanOneChar); + check(r"\ra", EscapeError::MoreThanOneChar); + check(r"\ta", EscapeError::MoreThanOneChar); + check(r"\\a", EscapeError::MoreThanOneChar); + check(r"\'a", EscapeError::MoreThanOneChar); + check(r"\0a", EscapeError::MoreThanOneChar); + + check(r"\v", EscapeError::InvalidEscape); + check(r"\💩", EscapeError::InvalidEscape); + check(r"\●", EscapeError::InvalidEscape); + + check(r"\x", EscapeError::TooShortHexEscape); + check(r"\x0", EscapeError::TooShortHexEscape); + check(r"\xa", EscapeError::TooShortHexEscape); + check(r"\xf", EscapeError::TooShortHexEscape); + check(r"\xx", EscapeError::InvalidCharInHexEscape); + check(r"\xы", EscapeError::InvalidCharInHexEscape); + check(r"\x🦀", EscapeError::InvalidCharInHexEscape); + check(r"\xtt", EscapeError::InvalidCharInHexEscape); + + check(r"\u", EscapeError::NoBraceInUnicodeEscape); + check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape); + check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape); + check(r"\u{", EscapeError::UnclosedUnicodeEscape); + check(r"\u{0000", EscapeError::UnclosedUnicodeEscape); + check(r"\u{}", EscapeError::EmptyUnicodeEscape); + check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape); + check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape); + + check("ы", EscapeError::NonAsciiCharInByte); + check("🦀", EscapeError::NonAsciiCharInByte); + + check(r"\u{0}", EscapeError::UnicodeEscapeInByte); + check(r"\u{000000}", EscapeError::UnicodeEscapeInByte); + check(r"\u{41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0041}", EscapeError::UnicodeEscapeInByte); + check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte); + check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte); + check(r"\u{0}x", EscapeError::UnicodeEscapeInByte); + check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte); + check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte); + check(r"\u{D800}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte); + check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte); +} + +#[test] +fn test_unescape_byte_good() { + fn check(literal_text: &str, expected_byte: u8) { + let actual_result = unescape_byte(literal_text); + assert_eq!(actual_result, Ok(expected_byte)); + } + + check("a", b'a'); + + check(r#"\""#, b'"'); + check(r"\n", b'\n'); + check(r"\r", b'\r'); + check(r"\t", b'\t'); + check(r"\\", b'\\'); + check(r"\'", b'\''); + check(r"\0", b'\0'); + + check(r"\x00", b'\0'); + check(r"\x5a", b'Z'); + check(r"\x5A", b'Z'); + check(r"\x7f", 127); + check(r"\x80", 128); + check(r"\xff", 255); + check(r"\xFF", 255); +} + +#[test] +fn test_unescape_byte_str_good() { + fn check(literal_text: &str, expected: &[u8]) { + let mut buf = Ok(Vec::with_capacity(literal_text.len())); + unescape_byte_str(literal_text, &mut |range, c| { + if let Ok(b) = &mut buf { + match c { + Ok(c) => b.push(c), + Err(e) => buf = Err((range, e)), + } + } + }); + let buf = buf.as_ref().map(|it| it.as_ref()); + assert_eq!(buf, Ok(expected)) + } + + check("foo", b"foo"); + check("", b""); + check(" \t\n", b" \t\n"); + + check("hello \\\n world", b"hello world"); + check("thread's", b"thread's") +} + +#[test] +fn test_unescape_raw_str() { + fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_raw_str(literal, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]); +} + +#[test] +fn test_unescape_raw_byte_str() { + fn check(literal: &str, expected: &[(Range<usize>, Result<u8, EscapeError>)]) { + let mut unescaped = Vec::with_capacity(literal.len()); + unescape_raw_byte_str(literal, &mut |range, res| unescaped.push((range, res))); + assert_eq!(unescaped, expected); + } + + check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]); + check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]); + check( + "🦀a", + &[(0..4, Err(EscapeError::NonAsciiCharInByteString)), (4..5, Ok(byte_from_char('a')))], + ); +} |
