diff options
Diffstat (limited to 'vendor/logos-codegen/src')
25 files changed, 5187 insertions, 0 deletions
diff --git a/vendor/logos-codegen/src/error.rs b/vendor/logos-codegen/src/error.rs new file mode 100644 index 00000000..64f93c01 --- /dev/null +++ b/vendor/logos-codegen/src/error.rs @@ -0,0 +1,110 @@ +use std::fmt; + +use beef::lean::Cow; +use proc_macro2::{Span, TokenStream}; +use quote::quote; +use quote::{quote_spanned, ToTokens, TokenStreamExt}; + +pub type Result<T> = std::result::Result<T, Error>; + +#[derive(Default)] +pub struct Errors { + collected: Vec<SpannedError>, +} + +impl Errors { + pub fn err<M>(&mut self, message: M, span: Span) -> &mut Self + where + M: Into<Cow<'static, str>>, + { + self.collected.push(SpannedError { + message: message.into(), + span, + }); + + self + } + + pub fn render(self) -> Option<TokenStream> { + let errors = self.collected; + + match errors.len() { + 0 => None, + _ => Some(quote! { + fn _logos_derive_compile_errors() { + #(#errors)* + } + }), + } + } +} + +pub struct Error(Cow<'static, str>); + +#[derive(Debug)] +pub struct SpannedError { + message: Cow<'static, str>, + span: Span, +} + +impl Error { + pub fn new<M>(message: M) -> Self + where + M: Into<Cow<'static, str>>, + { + Error(message.into()) + } + + pub fn span(self, span: Span) -> SpannedError { + SpannedError { + message: self.0, + span, + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.0.fmt(f) + } +} + +impl fmt::Debug for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(self, f) + } +} + +impl From<regex_syntax::Error> for Error { + fn from(err: regex_syntax::Error) -> Error { + Error(err.to_string().into()) + } +} + +impl From<&'static str> for Error { + fn from(err: &'static str) -> Error { + Error(err.into()) + } +} + +impl From<String> for Error { + fn from(err: String) -> Error { + Error(err.into()) + } +} + +impl From<Error> for Cow<'static, str> { + fn from(err: Error) -> Self { + err.0 + } +} + +impl ToTokens for SpannedError { + fn to_tokens(&self, 
tokens: &mut TokenStream) { + let message = &*self.message; + + tokens.append_all(quote_spanned!(self.span => { + compile_error!(#message) + })) + } +} diff --git a/vendor/logos-codegen/src/generator/context.rs b/vendor/logos-codegen/src/generator/context.rs new file mode 100644 index 00000000..dc52f594 --- /dev/null +++ b/vendor/logos-codegen/src/generator/context.rs @@ -0,0 +1,128 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::Generator; +use crate::graph::NodeId; + +/// This struct keeps track of bytes available to be read without +/// bounds checking across the tree. +/// +/// For example, a branch that matches 4 bytes followed by a fork +/// with smallest branch containing of 2 bytes can do a bounds check +/// for 6 bytes ahead, and leave the remaining 2 byte array (fixed size) +/// to be handled by the fork, avoiding bound checks there. +#[derive(Default, Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct Context { + /// Amount of bytes that haven't been bumped yet but should + /// before a new read is performed + at: usize, + /// Number of bytes available without bound checks + available: usize, + /// Whether or not the Lexer has been bumped at least by 1 byte + bumped: bool, + /// Node to backtrack to to in case an explicit match has failed. + /// If `None` will instead produce an error token. 
+ backtrack: Option<NodeId>, +} + +impl Context { + pub fn can_backtrack(&self) -> bool { + self.backtrack.is_some() + } + + pub fn switch(&mut self, miss: Option<NodeId>) -> Option<TokenStream> { + self.backtrack = Some(miss?); + self.bump() + } + + pub const fn advance(self, n: usize) -> Self { + Context { + at: self.at + n, + ..self + } + } + + pub fn bump(&mut self) -> Option<TokenStream> { + match self.at { + 0 => None, + n => { + let tokens = quote!(lex.bump_unchecked(#n);); + self.at = 0; + self.available = 0; + self.bumped = true; + Some(tokens) + } + } + } + + pub fn remainder(&self) -> usize { + self.available.saturating_sub(self.at) + } + + pub fn read_byte(&mut self) -> TokenStream { + let at = self.at; + + self.advance(1); + + #[cfg(not(feature = "forbid_unsafe"))] + { + quote!(unsafe { lex.read_byte_unchecked(#at) }) + } + + #[cfg(feature = "forbid_unsafe")] + { + quote!(lex.read_byte(#at)) + } + } + + pub fn read(&mut self, len: usize) -> TokenStream { + self.available = len; + + match (self.at, len) { + (0, 0) => quote!(lex.read::<u8>()), + (a, 0) => quote!(lex.read_at::<u8>(#a)), + (0, l) => quote!(lex.read::<&[u8; #l]>()), + (a, l) => quote!(lex.read_at::<&[u8; #l]>(#a)), + } + } + + pub fn wipe(&mut self) { + self.available = 0; + } + + const fn backtrack(self) -> Self { + Context { + at: 0, + available: 0, + bumped: self.bumped, + backtrack: None, + } + } + + pub fn miss(mut self, miss: Option<NodeId>, gen: &mut Generator) -> TokenStream { + self.wipe(); + match (miss, self.backtrack) { + (Some(id), _) => gen.goto(id, self).clone(), + (_, Some(id)) => gen.goto(id, self.backtrack()).clone(), + _ if self.bumped => quote!(lex.error()), + _ => quote!(_error(lex)), + } + } + + pub fn write_suffix(&self, buf: &mut String) { + use std::fmt::Write; + + if self.at > 0 { + let _ = write!(buf, "_at{}", self.at); + } + if self.available > 0 { + let _ = write!(buf, "_with{}", self.available); + } + if let Some(id) = self.backtrack { + let _ = write!(buf, 
"_ctx{}", id); + } + if self.bumped { + buf.push_str("_x"); + } + } +} diff --git a/vendor/logos-codegen/src/generator/fork.rs b/vendor/logos-codegen/src/generator/fork.rs new file mode 100644 index 00000000..11d4eab2 --- /dev/null +++ b/vendor/logos-codegen/src/generator/fork.rs @@ -0,0 +1,216 @@ +use std::cmp::max; + +use fnv::FnvHashMap as Map; +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::graph::{Fork, NodeId, Range}; +use crate::util::ToIdent; + +type Targets = Map<NodeId, Vec<Range>>; + +impl Generator<'_> { + pub fn generate_fork(&mut self, this: NodeId, fork: &Fork, mut ctx: Context) -> TokenStream { + let mut targets: Targets = Map::default(); + + for (range, then) in fork.branches() { + targets.entry(then).or_default().push(range); + } + let loops_to_self = self.meta[this].loop_entry_from.contains(&this); + + match targets.len() { + 1 if loops_to_self => return self.generate_fast_loop(fork, ctx), + 0..=2 => (), + _ => return self.generate_fork_jump_table(this, fork, targets, ctx), + } + let miss = ctx.miss(fork.miss, self); + let end = self.fork_end(this, &miss); + let (byte, read) = self.fork_read(this, end, &mut ctx); + let branches = targets.into_iter().map(|(id, ranges)| { + let next = self.goto(id, ctx.advance(1)); + + match *ranges { + [range] => { + quote!(#range => #next,) + } + [a, b] if a.is_byte() && b.is_byte() => { + quote!(#a | #b => #next,) + } + _ => { + let test = self.generate_test(ranges).clone(); + let next = self.goto(id, ctx.advance(1)); + + quote!(byte if #test(byte) => #next,) + } + } + }); + + quote! 
{ + #read + + match #byte { + #(#branches)* + _ => #miss, + } + } + } + + fn generate_fork_jump_table( + &mut self, + this: NodeId, + fork: &Fork, + targets: Targets, + mut ctx: Context, + ) -> TokenStream { + let miss = ctx.miss(fork.miss, self); + let end = self.fork_end(this, &miss); + let (byte, read) = self.fork_read(this, end, &mut ctx); + + let mut table: [u8; 256] = [0; 256]; + let mut jumps = vec!["__".to_ident()]; + + let branches = targets + .into_iter() + .enumerate() + .map(|(idx, (id, ranges))| { + let idx = (idx as u8) + 1; + let next = self.goto(id, ctx.advance(1)); + jumps.push(format!("J{}", id).to_ident()); + + for byte in ranges.into_iter().flatten() { + table[byte as usize] = idx; + } + let jump = jumps.last().unwrap(); + + quote!(Jump::#jump => #next,) + }) + .collect::<TokenStream>(); + + let may_error = table.iter().any(|&idx| idx == 0); + + let jumps = jumps.as_slice(); + let table = table.iter().copied().map(|idx| &jumps[idx as usize]); + + let jumps = if may_error { jumps } else { &jumps[1..] }; + let error_branch = if may_error { + Some(quote!(Jump::__ => #miss)) + } else { + None + }; + + quote! { + enum Jump { + #(#jumps,)* + } + + const LUT: [Jump; 256] = { + use Jump::*; + + [#(#table),*] + }; + + #read + + match LUT[#byte as usize] { + #branches + #error_branch + } + } + } + + fn fork_end(&self, this: NodeId, miss: &TokenStream) -> TokenStream { + if this == self.root { + quote!(_end(lex)) + } else { + miss.clone() + } + } + + fn fork_read( + &self, + this: NodeId, + end: TokenStream, + ctx: &mut Context, + ) -> (TokenStream, TokenStream) { + let min_read = self.meta[this].min_read; + + if ctx.remainder() >= max(min_read, 1) { + let read = ctx.read_byte(); + + return (quote!(byte), quote!(let byte = #read;)); + } + + match min_read { + 0 | 1 => { + let read = ctx.read(0); + + ( + quote!(byte), + quote! 
{ + let byte = match #read { + Some(byte) => byte, + None => return #end, + }; + }, + ) + } + len => { + let read = ctx.read(len); + + ( + quote!(arr[0]), + quote! { + let arr = match #read { + Some(arr) => arr, + None => return #end, + }; + }, + ) + } + } + } + + fn generate_fast_loop(&mut self, fork: &Fork, ctx: Context) -> TokenStream { + let miss = ctx.miss(fork.miss, self); + let ranges = fork.branches().map(|(range, _)| range).collect::<Vec<_>>(); + let test = self.generate_test(ranges); + + quote! { + _fast_loop!(lex, #test, #miss); + } + } + + pub fn fast_loop_macro() -> TokenStream { + quote! { + macro_rules! _fast_loop { + ($lex:ident, $test:ident, $miss:expr) => { + // Do one bounds check for multiple bytes till EOF + while let Some(arr) = $lex.read::<&[u8; 16]>() { + if $test(arr[0]) { if $test(arr[1]) { if $test(arr[2]) { if $test(arr[3]) { + if $test(arr[4]) { if $test(arr[5]) { if $test(arr[6]) { if $test(arr[7]) { + if $test(arr[8]) { if $test(arr[9]) { if $test(arr[10]) { if $test(arr[11]) { + if $test(arr[12]) { if $test(arr[13]) { if $test(arr[14]) { if $test(arr[15]) { + + $lex.bump_unchecked(16); continue; } $lex.bump_unchecked(15); return $miss; } + $lex.bump_unchecked(14); return $miss; } $lex.bump_unchecked(13); return $miss; } + $lex.bump_unchecked(12); return $miss; } $lex.bump_unchecked(11); return $miss; } + $lex.bump_unchecked(10); return $miss; } $lex.bump_unchecked(9); return $miss; } + $lex.bump_unchecked(8); return $miss; } $lex.bump_unchecked(7); return $miss; } + $lex.bump_unchecked(6); return $miss; } $lex.bump_unchecked(5); return $miss; } + $lex.bump_unchecked(4); return $miss; } $lex.bump_unchecked(3); return $miss; } + $lex.bump_unchecked(2); return $miss; } $lex.bump_unchecked(1); return $miss; } + + return $miss; + } + + while $lex.test($test) { + $lex.bump_unchecked(1); + } + + $miss + }; + } + } + } +} diff --git a/vendor/logos-codegen/src/generator/leaf.rs b/vendor/logos-codegen/src/generator/leaf.rs new file mode 100644 
index 00000000..0841a4ff --- /dev/null +++ b/vendor/logos-codegen/src/generator/leaf.rs @@ -0,0 +1,67 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::leaf::{Callback, Leaf}; +use crate::util::MaybeVoid; + +impl Generator<'_> { + pub fn generate_leaf(&mut self, leaf: &Leaf, mut ctx: Context) -> TokenStream { + let bump = ctx.bump(); + + let ident = &leaf.ident; + let name = self.name; + let this = self.this; + let ty = &leaf.field; + + let constructor = match leaf.field { + MaybeVoid::Some(_) => quote!(#name::#ident), + MaybeVoid::Void => quote!(|()| #name::#ident), + }; + + match &leaf.callback { + Some(Callback::Label(callback)) => quote! { + #bump + #callback(lex).construct(#constructor, lex); + }, + Some(Callback::Inline(inline)) => { + let arg = &inline.arg; + let body = &inline.body; + + #[cfg(not(rust_1_82))] + let ret = quote!(impl CallbackResult<'s, #ty, #this>); + + #[cfg(rust_1_82)] + let ret = quote!(impl CallbackResult<'s, #ty, #this> + use<'s>); + + quote! { + #bump + + #[inline] + fn callback<'s>(#arg: &mut Lexer<'s>) -> #ret { + #body + } + + callback(lex).construct(#constructor, lex); + } + } + Some(Callback::Skip(_)) => { + quote! { + #bump + + lex.trivia(); + #name::lex(lex); + } + } + None if matches!(leaf.field, MaybeVoid::Void) => quote! { + #bump + lex.set(Ok(#name::#ident)); + }, + None => quote! 
{ + #bump + let token = #name::#ident(lex.slice()); + lex.set(Ok(token)); + }, + } + } +} diff --git a/vendor/logos-codegen/src/generator/mod.rs b/vendor/logos-codegen/src/generator/mod.rs new file mode 100644 index 00000000..1b8bf8bf --- /dev/null +++ b/vendor/logos-codegen/src/generator/mod.rs @@ -0,0 +1,268 @@ +use fnv::{FnvHashMap as Map, FnvHashSet as Set}; +use proc_macro2::TokenStream; +use quote::{quote, ToTokens, TokenStreamExt}; +use syn::Ident; + +use crate::graph::{Graph, Meta, Node, NodeId, Range}; +use crate::leaf::Leaf; +use crate::util::ToIdent; + +mod context; +mod fork; +mod leaf; +mod rope; +mod tables; + +use self::context::Context; +use self::tables::TableStack; + +pub struct Generator<'a> { + /// Name of the type we are implementing the `Logos` trait for + name: &'a Ident, + /// Name of the type with any generics it might need + this: &'a TokenStream, + /// Id to the root node + root: NodeId, + /// Reference to the graph with all of the nodes + graph: &'a Graph<Leaf<'a>>, + /// Meta data collected for the nodes + meta: Meta, + /// Buffer with functions growing during generation + rendered: TokenStream, + /// Set of functions that have already been rendered + fns: Set<(NodeId, Context)>, + /// Function name identifiers + idents: Map<(NodeId, Context), Ident>, + /// Local function calls. Note: a call might change its context, + /// so we can't use `idents` for this purpose. 
+ gotos: Map<(NodeId, Context), TokenStream>, + /// Identifiers for helper functions matching a byte to a given + /// set of ranges + tests: Map<Vec<Range>, Ident>, + /// Related to above, table stack manages tables that need to be + tables: TableStack, +} + +impl<'a> Generator<'a> { + pub fn new( + name: &'a Ident, + this: &'a TokenStream, + root: NodeId, + graph: &'a Graph<Leaf>, + ) -> Self { + let rendered = Self::fast_loop_macro(); + let meta = Meta::analyze(root, graph); + + Generator { + name, + this, + root, + graph, + meta, + rendered, + fns: Set::default(), + idents: Map::default(), + gotos: Map::default(), + tests: Map::default(), + tables: TableStack::new(), + } + } + + pub fn generate(mut self) -> TokenStream { + let root = self.goto(self.root, Context::default()).clone(); + let rendered = &self.rendered; + let tables = &self.tables; + + quote! { + #tables + #rendered + #root + } + } + + fn generate_fn(&mut self, id: NodeId, ctx: Context) { + if self.fns.contains(&(id, ctx)) { + return; + } + self.fns.insert((id, ctx)); + + let body = match &self.graph[id] { + Node::Fork(fork) => self.generate_fork(id, fork, ctx), + Node::Rope(rope) => self.generate_rope(rope, ctx), + Node::Leaf(leaf) => self.generate_leaf(leaf, ctx), + }; + let ident = self.generate_ident(id, ctx); + let out = quote! { + #[inline] + fn #ident<'s>(lex: &mut Lexer<'s>) { + #body + } + }; + + self.rendered.append_all(out); + } + + fn goto(&mut self, id: NodeId, mut ctx: Context) -> &TokenStream { + let key = (id, ctx); + + // Allow contains_key + insert because self.generate_ident borrows a mutable ref to self + // too. 
+ #[allow(clippy::map_entry)] + if !self.gotos.contains_key(&key) { + let meta = &self.meta[id]; + let enters_loop = !meta.loop_entry_from.is_empty(); + + let bump = if enters_loop || !ctx.can_backtrack() { + ctx.switch(self.graph[id].miss()) + } else { + None + }; + + let bump = match (bump, enters_loop, meta.min_read) { + (Some(t), _, _) => Some(t), + (None, true, _) => ctx.bump(), + (None, false, 0) => ctx.bump(), + (None, false, _) => None, + }; + + if meta.min_read == 0 || ctx.remainder() < meta.min_read { + ctx.wipe(); + } + + let ident = self.generate_ident(id, ctx); + let mut call_site = quote!(#ident(lex)); + + if let Some(bump) = bump { + call_site = quote!({ + #bump + #call_site + }); + } + self.gotos.insert(key, call_site); + self.generate_fn(id, ctx); + } + &self.gotos[&key] + } + + fn generate_ident(&mut self, id: NodeId, ctx: Context) -> &Ident { + self.idents.entry((id, ctx)).or_insert_with(|| { + let mut ident = format!("goto{}", id); + + ctx.write_suffix(&mut ident); + + ident.to_ident() + }) + } + + /// Returns an identifier to a function that matches a byte to any + /// of the provided ranges. This will generate either a simple + /// match expression, or use a lookup table internally. + fn generate_test(&mut self, ranges: Vec<Range>) -> &Ident { + if !self.tests.contains_key(&ranges) { + let idx = self.tests.len(); + let ident = format!("pattern{}", idx).to_ident(); + + let lo = ranges.first().unwrap().start; + let hi = ranges.last().unwrap().end; + + let body = match ranges.len() { + 0..=2 => { + quote! 
{ + match byte { + #(#ranges)|* => true, + _ => false, + } + } + } + _ if hi - lo < 64 => { + let mut offset = hi.saturating_sub(63); + + while offset.count_ones() > 1 && lo - offset > 0 { + offset += 1; + } + + let mut table = 0u64; + + for byte in ranges.iter().flat_map(|range| *range) { + if byte - offset >= 64 { + panic!("{:#?} {} {} {}", ranges, hi, lo, offset); + } + table |= 1 << (byte - offset); + } + + let search = match offset { + 0 => quote!(byte), + _ => quote!(byte.wrapping_sub(#offset)), + }; + + quote! { + const LUT: u64 = #table; + + match 1u64.checked_shl(#search as u32) { + Some(shift) => LUT & shift != 0, + None => false, + } + } + } + _ => { + let mut view = self.tables.view(); + + for byte in ranges.iter().flat_map(|range| *range) { + view.flag(byte); + } + + let mask = view.mask(); + let lut = view.ident(); + + quote! { + #lut[byte as usize] & #mask > 0 + } + } + }; + self.rendered.append_all(quote! { + #[inline] + fn #ident(byte: u8) -> bool { + #body + } + }); + self.tests.insert(ranges.clone(), ident); + } + &self.tests[&ranges] + } +} + +macro_rules! match_quote { + ($source:expr; $($byte:tt,)* ) => {match $source { + $( $byte => quote!($byte), )* + byte => quote!(#byte), + }} +} + +fn byte_to_tokens(byte: u8) -> TokenStream { + match_quote! 
{ + byte; + b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', + b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', + b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', + b'U', b'V', b'W', b'X', b'Y', b'Z', + b'!', b'@', b'#', b'$', b'%', b'^', b'&', b'*', b'(', b')', + b'{', b'}', b'[', b']', b'<', b'>', b'-', b'=', b'_', b'+', + b':', b';', b',', b'.', b'/', b'?', b'|', b'"', b'\'', b'\\', + } +} + +impl ToTokens for Range { + fn to_tokens(&self, tokens: &mut TokenStream) { + let Range { start, end } = self; + + tokens.append_all(byte_to_tokens(*start)); + + if start != end { + tokens.append_all(quote!(..=)); + tokens.append_all(byte_to_tokens(*end)); + } + } +} diff --git a/vendor/logos-codegen/src/generator/rope.rs b/vendor/logos-codegen/src/generator/rope.rs new file mode 100644 index 00000000..ae3b07ad --- /dev/null +++ b/vendor/logos-codegen/src/generator/rope.rs @@ -0,0 +1,39 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::graph::Rope; + +impl Generator<'_> { + pub fn generate_rope(&mut self, rope: &Rope, mut ctx: Context) -> TokenStream { + let miss = ctx.miss(rope.miss.first(), self); + let read = ctx.read(rope.pattern.len()); + let then = self.goto(rope.then, ctx.advance(rope.pattern.len())); + + let pat = match rope.pattern.to_bytes() { + Some(bytes) => byte_slice_literal(&bytes), + None => { + let ranges = rope.pattern.iter(); + + quote!([#(#ranges),*]) + } + }; + + quote! 
{ + match #read { + Some(#pat) => #then, + _ => #miss, + } + } + } +} + +fn byte_slice_literal(bytes: &[u8]) -> TokenStream { + if bytes.iter().any(|&b| !(0x20..0x7F).contains(&b)) { + return quote!(&[#(#bytes),*]); + } + + let slice = std::str::from_utf8(bytes).unwrap(); + + syn::parse_str(&format!("b{:?}", slice)).unwrap() +} diff --git a/vendor/logos-codegen/src/generator/tables.rs b/vendor/logos-codegen/src/generator/tables.rs new file mode 100644 index 00000000..f1e53273 --- /dev/null +++ b/vendor/logos-codegen/src/generator/tables.rs @@ -0,0 +1,77 @@ +use crate::util::ToIdent; +use proc_macro2::{Literal, TokenStream}; +use quote::{quote, ToTokens}; +use syn::Ident; + +pub struct TableStack { + tables: Vec<(Ident, [u8; 256])>, + shift: u8, +} + +pub struct TableView<'a> { + ident: &'a Ident, + table: &'a mut [u8; 256], + mask: u8, +} + +impl TableStack { + pub fn new() -> Self { + TableStack { + tables: vec![("COMPACT_TABLE_0".to_ident(), [0; 256])], + shift: 0, + } + } + + pub fn view(&mut self) -> TableView { + let mask = if self.shift < 8 { + // Reusing existing table with a shifted mask + let mask = 1u8 << self.shift; + + self.shift += 1; + + mask + } else { + // Need to create a new table + let ident = format!("COMPACT_TABLE_{}", self.tables.len()).to_ident(); + + self.tables.push((ident, [0; 256])); + self.shift = 1; + + 1 + }; + + let (ref ident, ref mut table) = self.tables.last_mut().unwrap(); + + TableView { ident, table, mask } + } +} + +impl<'a> TableView<'a> { + pub fn ident(&self) -> &'a Ident { + self.ident + } + + pub fn flag(&mut self, byte: u8) { + self.table[byte as usize] |= self.mask; + } + + pub fn mask(&self) -> Literal { + Literal::u8_unsuffixed(self.mask) + } +} + +impl ToTokens for TableStack { + fn to_tokens(&self, out: &mut TokenStream) { + if self.shift == 0 { + return; + } + + for (ident, table) in self.tables.iter() { + let bytes = table.iter().copied().map(Literal::u8_unsuffixed); + + out.extend(quote! 
{ + static #ident: [u8; 256] = [#(#bytes),*]; + }); + } + } +} diff --git a/vendor/logos-codegen/src/graph/fork.rs b/vendor/logos-codegen/src/graph/fork.rs new file mode 100644 index 00000000..6b59836b --- /dev/null +++ b/vendor/logos-codegen/src/graph/fork.rs @@ -0,0 +1,267 @@ +use crate::graph::{Disambiguate, Graph, NodeId, Range}; + +#[derive(Clone)] +pub struct Fork { + /// LUT matching byte -> node id + lut: Box<[Option<NodeId>; 256]>, + /// State to go to if no arms are matching + pub miss: Option<NodeId>, +} + +impl Fork { + pub fn new() -> Self { + Fork { + lut: Box::new([None; 256]), + miss: None, + } + } + + pub fn miss<M>(mut self, miss: M) -> Self + where + M: Into<Option<NodeId>>, + { + self.miss = miss.into(); + self + } + + pub fn add_branch<R, T>(&mut self, range: R, then: NodeId, graph: &mut Graph<T>) + where + R: Into<Range>, + T: Disambiguate, + { + for byte in range.into() { + match &mut self.lut[byte as usize] { + Some(other) if *other != then => { + *other = graph.merge(*other, then); + } + opt => *opt = Some(then), + } + } + } + + // TODO: Add result with a printable error + pub fn merge<T>(&mut self, other: Fork, graph: &mut Graph<T>) + where + T: Disambiguate, + { + self.miss = match (self.miss, other.miss) { + (None, None) => None, + (Some(id), None) | (None, Some(id)) => Some(id), + (Some(a), Some(b)) => Some(graph.merge(a, b)), + }; + + for (left, right) in self.lut.iter_mut().zip(other.lut.iter()) { + *left = match (*left, *right) { + (None, None) => continue, + (Some(id), None) | (None, Some(id)) => Some(id), + (Some(a), Some(b)) => Some(graph.merge(a, b)), + } + } + } + + pub fn branches(&self) -> ForkIter<'_> { + ForkIter { + offset: 0, + lut: &self.lut, + } + } + + /// Checks if all bytes in the `range` have a branch on this + /// fork, and those branches are resolve to the same `NodeId`. 
+ pub fn contains<R>(&self, range: R) -> Option<NodeId> + where + R: Into<Range>, + { + let mut range = range.into(); + let byte = range.next()?; + let first = self.lut[byte as usize]?; + + for byte in range { + if first != self.lut[byte as usize]? { + return None; + } + } + + Some(first) + } + + pub fn branch<R>(mut self, range: R, then: NodeId) -> Self + where + R: Into<Range>, + { + for byte in range.into() { + match &mut self.lut[byte as usize] { + Some(other) if *other != then => { + panic!("Overlapping branches"); + } + opt => *opt = Some(then), + } + } + self + } + + pub fn shake<T>(&self, graph: &Graph<T>, filter: &mut [bool]) { + if let Some(id) = self.miss { + if !filter[id.get()] { + filter[id.get()] = true; + graph[id].shake(graph, filter); + } + } + + for (_, id) in self.branches() { + if !filter[id.get()] { + filter[id.get()] = true; + graph[id].shake(graph, filter); + } + } + } +} + +pub struct ForkIter<'a> { + offset: usize, + lut: &'a [Option<NodeId>; 256], +} + +impl Iterator for ForkIter<'_> { + type Item = (Range, NodeId); + + fn next(&mut self) -> Option<Self::Item> { + // Consume empty slots + self.offset += self.lut[self.offset..] + .iter() + .take_while(|next| next.is_none()) + .count(); + + let then = self.lut.get(self.offset).copied().flatten()?; + let start = self.offset; + + // Consume all slots with same NodeId target + self.offset += self.lut[self.offset..] 
+ .iter() + .take_while(|next| **next == Some(then)) + .count(); + + Some(( + Range { + start: start as u8, + end: (self.offset - 1) as u8, + }, + then, + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::graph::Node; + use pretty_assertions::assert_eq; + + #[test] + fn fork_iter() { + let mut buf = [None; 256]; + + for byte in b'4'..=b'7' { + buf[byte as usize] = Some(NodeId::new(1)); + } + for byte in b'a'..=b'd' { + buf[byte as usize] = Some(NodeId::new(2)); + } + + let iter = ForkIter { + offset: 0, + lut: &buf, + }; + + assert_eq!( + &[ + ( + Range { + start: b'4', + end: b'7' + }, + NodeId::new(1) + ), + ( + Range { + start: b'a', + end: b'd' + }, + NodeId::new(2) + ), + ], + &*iter.collect::<Vec<_>>(), + ); + } + + #[test] + fn merge_no_conflict() { + let mut graph = Graph::new(); + + let leaf1 = graph.push(Node::Leaf("FOO")); + let leaf2 = graph.push(Node::Leaf("BAR")); + + let mut fork = Fork::new().branch(b'1', leaf1); + + fork.merge(Fork::new().branch(b'2', leaf2), &mut graph); + + assert_eq!(fork, Fork::new().branch(b'1', leaf1).branch(b'2', leaf2)); + } + + #[test] + fn merge_miss_right() { + let mut graph = Graph::new(); + + let leaf1 = graph.push(Node::Leaf("FOO")); + let leaf2 = graph.push(Node::Leaf("BAR")); + + let mut fork = Fork::new().branch(b'1', leaf1); + + fork.merge(Fork::new().miss(leaf2), &mut graph); + + assert_eq!(fork, Fork::new().branch(b'1', leaf1).miss(leaf2)); + } + + #[test] + fn merge_miss_left() { + let mut graph = Graph::new(); + + let leaf1 = graph.push(Node::Leaf("FOO")); + let leaf2 = graph.push(Node::Leaf("BAR")); + + let mut fork = Fork::new().miss(leaf1); + + fork.merge(Fork::new().branch(b'2', leaf2), &mut graph); + + assert_eq!(fork, Fork::new().branch(b'2', leaf2).miss(leaf1)); + } + + #[test] + fn contains_byte() { + let fork = Fork::new().branch('a'..='z', NodeId::new(42)); + + assert_eq!(fork.contains(b't'), Some(NodeId::new(42))); + } + + #[test] + fn contains_range() { + let fork = Fork::new() + 
.branch('a'..='m', NodeId::new(42)) + .branch('n'..='z', NodeId::new(42)); + + assert_eq!(fork.contains('i'..='r'), Some(NodeId::new(42))); + assert_eq!(fork.contains('a'..='z'), Some(NodeId::new(42))); + } + + #[test] + fn contains_different_ranges() { + let fork = Fork::new() + .branch('a'..='m', NodeId::new(42)) + .branch('n'..='z', NodeId::new(47)); + + assert_eq!(fork.contains('i'..='r'), None); + assert_eq!(fork.contains('a'..='z'), None); + assert_eq!(fork.contains('d'..='f'), Some(NodeId::new(42))); + assert_eq!(fork.contains('n'..='p'), Some(NodeId::new(47))); + } +} diff --git a/vendor/logos-codegen/src/graph/impls.rs b/vendor/logos-codegen/src/graph/impls.rs new file mode 100644 index 00000000..dc97bdf1 --- /dev/null +++ b/vendor/logos-codegen/src/graph/impls.rs @@ -0,0 +1,220 @@ +use std::fmt::{self, Debug, Display}; +use std::hash::{Hash, Hasher}; + +use crate::graph::{Fork, Graph, Node, NodeId, Range, Rope}; + +impl<T> From<Fork> for Node<T> { + fn from(fork: Fork) -> Self { + Node::Fork(fork) + } +} +impl<T> From<Rope> for Node<T> { + fn from(rope: Rope) -> Self { + Node::Rope(rope) + } +} + +fn is_ascii(byte: u8) -> bool { + (0x20..0x7F).contains(&byte) +} + +impl Hash for Fork { + fn hash<H: Hasher>(&self, state: &mut H) { + for branch in self.branches() { + branch.hash(state); + } + self.miss.hash(state); + } +} + +impl<T> Hash for Node<T> { + fn hash<H: Hasher>(&self, state: &mut H) { + match self { + Node::Rope(rope) => { + b"ROPE".hash(state); + rope.hash(state); + } + Node::Fork(fork) => { + b"FORK".hash(state); + fork.hash(state); + } + Node::Leaf(_) => b"LEAF".hash(state), + } + } +} + +impl Debug for NodeId { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + Debug::fmt(&self.0, f) + } +} + +/// We don't need debug impls in release builds +// #[cfg(test)] +mod debug { + use super::*; + use crate::graph::rope::Miss; + use crate::graph::Disambiguate; + use std::cmp::{Ord, Ordering}; + + impl Disambiguate for &str { + fn cmp(left: 
&&str, right: &&str) -> Ordering { + Ord::cmp(left, right) + } + } + + impl Debug for Range { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let Range { start, end } = *self; + + if start != end || !is_ascii(start) { + f.write_str("[")?; + } + match is_ascii(start) { + true => write!(f, "{}", start as char), + false => write!(f, "{:02X}", start), + }?; + if start != end { + match is_ascii(end) { + true => write!(f, "-{}]", end as char), + false => write!(f, "-{:02X}]", end), + }?; + } else if !is_ascii(start) { + f.write_str("]")?; + } + Ok(()) + } + } + + impl Display for Range { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + <Range as Debug>::fmt(self, f) + } + } + + impl<T: Debug> Debug for Graph<T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let entries = self + .nodes() + .iter() + .enumerate() + .filter_map(|(i, n)| n.as_ref().map(|n| (i, n))); + + f.debug_map().entries(entries).finish() + } + } + + struct Arm<T, U>(T, U); + + impl<T, U> Debug for Arm<T, U> + where + T: Display, + U: Display, + { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{} ⇒ {}", self.0, self.1) + } + } + + impl Debug for Fork { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut list = f.debug_set(); + + for (range, then) in self.branches() { + list.entry(&Arm(range, then)); + } + if let Some(id) = self.miss { + list.entry(&Arm('_', id)); + } + + list.finish() + } + } + + impl Display for Miss { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Miss::First(id) => Display::fmt(id, f), + Miss::Any(id) => write!(f, "{}*", id), + Miss::None => f.write_str("n/a"), + } + } + } + + impl Debug for Rope { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + use std::fmt::Write; + + let mut rope = String::with_capacity(self.pattern.len()); + for range in self.pattern.iter() { + write!(rope, "{}", range)?; + } + + match self.miss.is_none() { + false => { + let mut list = f.debug_list(); + 
+ list.entry(&Arm(rope, self.then)); + list.entry(&Arm('_', self.miss)); + + list.finish() + } + true => Arm(rope, self.then).fmt(f), + } + } + } + + impl PartialEq for Fork { + fn eq(&self, other: &Self) -> bool { + self.miss == other.miss && self.branches().eq(other.branches()) + } + } + + impl<T: Debug> Debug for Node<T> { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Node::Fork(fork) => fork.fmt(f), + Node::Rope(rope) => rope.fmt(f), + Node::Leaf(leaf) => leaf.fmt(f), + } + } + } + + use std::ops::RangeInclusive; + + impl From<RangeInclusive<u8>> for Range { + fn from(range: RangeInclusive<u8>) -> Range { + Range { + start: *range.start(), + end: *range.end(), + } + } + } + + impl From<RangeInclusive<char>> for Range { + fn from(range: RangeInclusive<char>) -> Range { + Range { + start: *range.start() as u8, + end: *range.end() as u8, + } + } + } + + impl<T> PartialEq<Rope> for Node<T> { + fn eq(&self, other: &Rope) -> bool { + match self { + Node::Rope(rope) => rope == other, + _ => false, + } + } + } + + impl<T> PartialEq<Fork> for Node<T> { + fn eq(&self, other: &Fork) -> bool { + match self { + Node::Fork(fork) => fork == other, + _ => false, + } + } + } +} diff --git a/vendor/logos-codegen/src/graph/meta.rs b/vendor/logos-codegen/src/graph/meta.rs new file mode 100644 index 00000000..757ced09 --- /dev/null +++ b/vendor/logos-codegen/src/graph/meta.rs @@ -0,0 +1,174 @@ +use std::cmp::min; +use std::collections::BTreeMap; +use std::ops::{Index, IndexMut}; + +use crate::graph::{Graph, Node, NodeId}; + +#[derive(Debug)] +pub struct Meta { + map: BTreeMap<NodeId, MetaItem>, +} + +#[derive(Debug, Default)] +pub struct MetaItem { + /// Number of references to this node + pub refcount: usize, + /// Minimum number of bytes that ought to be read for this + /// node to find a match + pub min_read: usize, + /// Marks whether or not this node leads to a loop entry node. 
    pub is_loop_init: bool,
    /// Ids of other nodes that point to this node while this
    /// node is on a stack (creating a loop)
    pub loop_entry_from: Vec<NodeId>,
}

impl Index<NodeId> for Meta {
    type Output = MetaItem;

    /// Panics if no `MetaItem` has been recorded for `id` yet.
    fn index(&self, id: NodeId) -> &MetaItem {
        &self.map[&id]
    }
}

impl IndexMut<NodeId> for Meta {
    /// Creates a default `MetaItem` for `id` on first mutable access.
    fn index_mut(&mut self, id: NodeId) -> &mut MetaItem {
        self.map.entry(id).or_default()
    }
}

impl MetaItem {
    /// Record `id` as a loop entry source, keeping `loop_entry_from`
    /// sorted and free of duplicates via binary search.
    fn loop_entry(&mut self, id: NodeId) {
        if let Err(idx) = self.loop_entry_from.binary_search(&id) {
            self.loop_entry_from.insert(idx, id);
        }
    }
}

impl Meta {
    /// Build the `Meta` table for everything reachable from `root`.
    pub fn analyze<T>(root: NodeId, graph: &Graph<T>) -> Self {
        let mut meta = Meta {
            map: Default::default(),
        };

        meta.first_pass(root, root, graph, &mut Vec::new());

        meta
    }

    /// Depth-first walk filling in `refcount`, `min_read` and the loop
    /// bookkeeping for `this` and all nodes reachable from it.
    ///
    /// `parent` is the node we arrived from; `stack` holds the ids
    /// currently being visited, so encountering one of them again
    /// means a loop has been closed.
    pub fn first_pass<T>(
        &mut self,
        this: NodeId,
        parent: NodeId,
        graph: &Graph<T>,
        stack: &mut Vec<NodeId>,
    ) -> &MetaItem {
        let meta = &mut self[this];
        let is_done = meta.refcount > 0;

        meta.refcount += 1;

        if stack.contains(&this) {
            // `this` is still being visited further up the stack:
            // `parent` closes a loop back into it.
            meta.loop_entry(parent);
            self[parent].is_loop_init = true;
        }
        if is_done {
            // Already analyzed on a previous visit; only the refcount
            // and loop bookkeeping above needed updating.
            return &self[this];
        }

        stack.push(this);

        let mut min_read;

        match &graph[this] {
            Node::Fork(fork) => {
                min_read = usize::MAX;
                for (_, id) in fork.branches() {
                    let meta = self.first_pass(id, this, graph, stack);

                    if meta.is_loop_init {
                        // Branch enters a loop: only the single byte
                        // consumed by this fork is guaranteed.
                        min_read = 1;
                    } else {
                        min_read = min(min_read, meta.min_read + 1);
                    }
                }
                if let Some(id) = fork.miss {
                    let meta = self.first_pass(id, this, graph, stack);

                    if meta.is_loop_init {
                        min_read = 0;
                    } else {
                        min_read = min(min_read, meta.min_read);
                    }
                }
                if min_read == usize::MAX {
                    min_read = 0;
                }
            }
            Node::Rope(rope) => {
                min_read = rope.pattern.len();
                let meta = self.first_pass(rope.then, this, graph, stack);

                if !meta.is_loop_init {
                    min_read += meta.min_read;
                }

                if let Some(id) = rope.miss.first() {
                    let meta = self.first_pass(id, this,
graph, stack); + + if meta.is_loop_init { + min_read = 0; + } else { + min_read = min(min_read, meta.min_read); + } + } + } + Node::Leaf(_) => min_read = 0, + } + + stack.pop(); + + let meta = &mut self[this]; + meta.min_read = min_read; + let second_pass = meta.loop_entry_from.clone(); + + for id in second_pass { + self.meta_second_pass(id, graph); + } + + &self[this] + } + + fn meta_second_pass<T>(&mut self, id: NodeId, graph: &Graph<T>) { + let mut min_read; + + match &graph[id] { + Node::Fork(fork) => { + min_read = usize::MAX; + for (_, id) in fork.branches() { + let meta = &self[id]; + + if meta.is_loop_init { + min_read = 1; + } else { + min_read = min(min_read, meta.min_read + 1); + } + } + if min_read == usize::MAX { + min_read = 0; + } + } + Node::Rope(rope) => { + min_read = rope.pattern.len(); + let meta = &self[rope.then]; + + if !meta.is_loop_init { + min_read += meta.min_read; + } + } + Node::Leaf(_) => unreachable!(), + } + + self[id].min_read = min_read; + } +} diff --git a/vendor/logos-codegen/src/graph/mod.rs b/vendor/logos-codegen/src/graph/mod.rs new file mode 100644 index 00000000..6d218e8c --- /dev/null +++ b/vendor/logos-codegen/src/graph/mod.rs @@ -0,0 +1,566 @@ +use std::cmp::Ordering; +use std::collections::btree_map::Entry; +use std::collections::BTreeMap as Map; +use std::hash::{Hash, Hasher}; +use std::num::NonZeroU32; +use std::ops::Index; + +use fnv::FnvHasher; + +mod fork; +mod impls; +mod meta; +mod range; +mod regex; +mod rope; + +pub use self::fork::Fork; +pub use self::meta::Meta; +pub use self::range::Range; +pub use self::rope::Rope; + +/// Disambiguation error during the attempt to merge two leaf +/// nodes with the same priority +#[derive(Debug)] +pub struct DisambiguationError(pub NodeId, pub NodeId); + +pub struct Graph<Leaf> { + /// Internal storage of all allocated nodes. Once a node is + /// put here, it should never be mutated. 
    nodes: Vec<Option<Node<Leaf>>>,
    /// When merging two nodes into a new node, we store the two
    /// entry keys and the result, so that we don't merge the same
    /// two nodes multiple times.
    ///
    /// Most of the time the entry we want to find will be the last
    /// one that has been inserted, so we can use a vec with reverse
    /// order search to get O(1) searches much faster than any *Map.
    ///
    /// NOTE(review): the field is a `Map` now; the vec-with-reverse-
    /// search note above appears stale — confirm against history.
    merges: Map<Merge, NodeId>,
    /// Another map used for accounting. Before `.push`ing a new node
    /// onto the graph (inserts are exempt), we hash it and find if
    /// an identical(!) node has been created before.
    hashes: Map<u64, NodeId>,
    /// Instead of handling errors over return types, opt to collect
    /// them internally.
    errors: Vec<DisambiguationError>,
    /// Deferred merges. When attempting to merge a node with an
    /// empty reserved slot, the merging operation is deferred until
    /// the reserved slot is populated. This is a stack that keeps track
    /// of all such deferred merges.
    deferred: Vec<DeferredMerge>,
}

/// Trait to be implemented on `Leaf` nodes in order to disambiguate
/// between them.
pub trait Disambiguate {
    fn cmp(left: &Self, right: &Self) -> Ordering;
}

/// Id of a Node in the graph. `NodeId` can be referencing an empty
/// slot that is going to be populated later in time.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct NodeId(NonZeroU32);

impl Hash for NodeId {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Always use little-endian byte order for hashing to avoid
        // different code generation on big-endian platforms due to
        // iteration over a HashMap,
        // see https://github.com/maciejhirsz/logos/issues/427.
        state.write(&self.0.get().to_le_bytes())
    }
}

impl NodeId {
    /// Raw index of this id within `Graph::nodes`.
    fn get(self) -> usize {
        self.0.get() as usize
    }

    /// Create an id from a raw slot index. Panics via `expect` when
    /// the index, cast to `u32`, is 0 (slot 0 is the empty sentinel).
    /// NOTE(review): `n as u32` truncates on 64-bit targets; only a
    /// resulting 0 trips the `expect` — confirm indices stay in range.
    fn new(n: usize) -> NodeId {
        NodeId(NonZeroU32::new(n as u32).expect("Invalid NodeId"))
    }
}

/// Unique reserved `NodeId` that is guaranteed to point to an
/// empty allocated slot in the graph. It's safe to create multiple
/// `NodeId` copies of `ReservedId`, however API users should never
/// be able to clone a `ReservedId`, or create a new one from `NodeId`.
///
/// `ReservedId` is consumed once passed into `Graph::insert`.
#[derive(Debug)]
pub struct ReservedId(NodeId);

impl ReservedId {
    /// The `NodeId` of the reserved slot.
    pub fn get(&self) -> NodeId {
        self.0
    }
}

/// Merge key used to lookup whether two nodes have been previously
/// merged, so we can avoid duplicating merges, potentially running into
/// loops that blow out the stack.
///
/// `Merge::new(a, b)` should always equal to `Merge::new(b, a)` to ensure
/// that node merges are symmetric.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug)]
struct Merge(NodeId, NodeId);

impl Merge {
    /// Construct a key with the smaller id first, making the pair
    /// order-independent.
    fn new(a: NodeId, b: NodeId) -> Self {
        if a < b {
            Merge(a, b)
        } else {
            Merge(b, a)
        }
    }
}

/// When attempting to merge two nodes, one of which was not yet created,
/// we can record such attempt, and execute the merge later on when the
/// `awaiting` has been `insert`ed into the graph.
#[derive(Debug)]
pub struct DeferredMerge {
    /// Id of the empty reserved slot this merge is waiting on.
    awaiting: NodeId,
    /// The already-populated node to merge `awaiting` with.
    with: NodeId,
    /// Reserved slot that will receive the merge result.
    into: ReservedId,
}

impl<Leaf> Graph<Leaf> {
    pub fn new() -> Self {
        Graph {
            // Start with an empty slot so we can start
            // counting NodeIds from 1 and use NonZero
            // optimizations
            nodes: vec![None],
            merges: Map::new(),
            hashes: Map::new(),
            errors: Vec::new(),
            deferred: Vec::new(),
        }
    }

    /// Disambiguation errors collected while merging leaves.
    pub fn errors(&self) -> &[DisambiguationError] {
        &self.errors
    }

    /// Id the next pushed/reserved node will get.
    fn next_id(&self) -> NodeId {
        NodeId::new(self.nodes.len())
    }

    /// Reserve an empty slot for a node on the graph and return an
    /// id for it. `ReservedId` cannot be cloned, and must be consumed
    /// by calling `insert` on the graph.
    pub fn reserve(&mut self) -> ReservedId {
        let id = self.next_id();

        self.nodes.push(None);

        ReservedId(id)
    }

    /// Insert a node at a given, previously reserved id. Returns the
    /// inserted `NodeId`.
    pub fn insert<N>(&mut self, reserved: ReservedId, node: N) -> NodeId
    where
        N: Into<Node<Leaf>>,
        Leaf: Disambiguate,
    {
        let id = reserved.get();

        self.nodes[id.get()] = Some(node.into());

        let mut awaiting = Vec::new();

        // Partition out all `DeferredMerge`s that can be completed
        // now that this `ReservedId` has a `Node` inserted into it.
        // Iterating in reverse keeps the `remove` indices valid.
        for idx in (0..self.deferred.len()).rev() {
            if self.deferred[idx].awaiting == id {
                awaiting.push(self.deferred.remove(idx));
            }
        }

        // Complete deferred merges. We've collected them from the back,
        // so we must iterate through them from the back as well to restore
        // proper order of merges in case there is some cascading going on.
        for DeferredMerge {
            awaiting,
            with,
            into,
        } in awaiting.into_iter().rev()
        {
            self.merge_unchecked(awaiting, with, into);
        }

        id
    }

    /// Push a node onto the graph and get an id to it. If an identical
    /// node has already been pushed on the graph, it will return the id
    /// of that node instead.
    pub fn push<B>(&mut self, node: B) -> NodeId
    where
        B: Into<Node<Leaf>>,
    {
        let node = node.into();

        // Leaves bypass hash-deduplication; they are pushed straight in.
        if let Node::Leaf(_) = node {
            return self.push_unchecked(node);
        }

        let mut hasher = FnvHasher::default();
        node.hash(&mut hasher);

        let next_id = self.next_id();

        match self.hashes.entry(hasher.finish()) {
            Entry::Occupied(occupied) => {
                let id = *occupied.get();

                // Same hash *and* equal contents: reuse the old node.
                // On a pure hash collision we fall through and push a
                // fresh node below.
                if self[id].eq(&node) {
                    return id;
                }
            }
            Entry::Vacant(vacant) => {
                vacant.insert(next_id);
            }
        }

        self.push_unchecked(node)
    }

    /// Append `node` to storage without hash-deduplication.
    fn push_unchecked(&mut self, node: Node<Leaf>) -> NodeId {
        let id = self.next_id();

        self.nodes.push(Some(node));

        id
    }

    /// If nodes `a` and `b` have been already merged, return the
    /// `NodeId` of the node they have been merged into.
    fn find_merged(&self, a: NodeId, b: NodeId) -> Option<NodeId> {
        let probe = Merge::new(a, b);

        self.merges.get(&probe).copied()
    }

    /// Mark that nodes `a` and `b` have been merged into `product`.
    ///
    /// This will also mark merging `a` and `product`, as well as
    /// `b` and `product` into `product`, since those are symmetric
    /// operations.
    ///
    /// This is necessary to break out asymmetric merge loops.
    fn set_merged(&mut self, a: NodeId, b: NodeId, product: NodeId) {
        self.merges.insert(Merge::new(a, b), product);
        self.merges.insert(Merge::new(a, product), product);
        self.merges.insert(Merge::new(b, product), product);
    }

    /// Merge the nodes at id `a` and `b`, returning a new id.
    pub fn merge(&mut self, a: NodeId, b: NodeId) -> NodeId
    where
        Leaf: Disambiguate,
    {
        if a == b {
            return a;
        }

        // If the id pair is already merged (or is being merged), just return the id
        if let Some(id) = self.find_merged(a, b) {
            return id;
        }

        match (self.get(a), self.get(b)) {
            (None, None) => {
                panic!(
                    "Merging two reserved nodes! This is a bug, please report it:\n\
                    \n\
                    https://github.com/maciejhirsz/logos/issues"
                );
            }
            (None, Some(_)) => {
                // `a` is an empty reserved slot: defer the merge until
                // it gets populated by `insert`.
                let reserved = self.reserve();
                let id = reserved.get();
                self.deferred.push(DeferredMerge {
                    awaiting: a,
                    with: b,
                    into: reserved,
                });
                self.set_merged(a, b, id);

                return id;
            }
            (Some(_), None) => {
                // Mirror image of the arm above: `b` is the empty slot.
                let reserved = self.reserve();
                let id = reserved.get();
                self.deferred.push(DeferredMerge {
                    awaiting: b,
                    with: a,
                    into: reserved,
                });
                self.set_merged(a, b, id);

                return id;
            }
            (Some(Node::Leaf(left)), Some(Node::Leaf(right))) => {
                // Two terminal states: keep whichever leaf
                // `Disambiguate::cmp` ranks higher; a tie is ambiguous
                // and is recorded as an error.
                return match Disambiguate::cmp(left, right) {
                    Ordering::Less => b,
                    Ordering::Greater => a,
                    Ordering::Equal => {
                        self.errors.push(DisambiguationError(a, b));

                        a
                    }
                };
            }
            _ => (),
        }

        // Reserve the id for the merge and save it. Since the graph can contain loops,
        // this prevents us from trying to merge the same id pair in a loop, blowing up
        // the stack.
        let reserved = self.reserve();
        self.set_merged(a, b, reserved.get());

        self.merge_unchecked(a, b, reserved)
    }

    /// Unchecked merge of `a` and `b`. This fn assumes that `a` and `b` are
    /// not pointing to empty slots.
    fn merge_unchecked(&mut self, a: NodeId, b: NodeId, reserved: ReservedId) -> NodeId
    where
        Leaf: Disambiguate,
    {
        // Try a rope-preserving merge first if either side is a rope.
        let merged_rope = match (self.get(a), self.get(b)) {
            (Some(Node::Rope(rope)), _) => {
                let rope = rope.clone();

                self.merge_rope(rope, b)
            }
            (_, Some(Node::Rope(rope))) => {
                let rope = rope.clone();

                self.merge_rope(rope, a)
            }
            _ => None,
        };

        if let Some(rope) = merged_rope {
            return self.insert(reserved, rope);
        }

        // Otherwise merge both sides as forks.
        let mut fork = self.fork_off(a);
        fork.merge(self.fork_off(b), self);

        let mut stack = vec![reserved.get()];

        // Flatten the fork: keep folding its miss target into the fork
        // until we hit a leaf, an empty slot, or an id already seen on
        // `stack` (which would mean a loop).
        while let Some(miss) = fork.miss {
            if stack.contains(&miss) {
                break;
            }
            stack.push(miss);

            let other = match self.get(miss) {
                Some(Node::Fork(other)) => other.clone(),
                Some(Node::Rope(other)) => other.clone().into_fork(self),
                _ => break,
            };
            // Don't fold in a node whose own miss points at a still
            // unpopulated (reserved) slot.
            match other.miss {
                Some(id) if self.get(id).is_none() => break,
                _ => (),
            }
            fork.miss = None;
            fork.merge(other, self);
        }

        self.insert(reserved, fork)
    }

    /// Try to merge `rope` with the node at `other`, returning the
    /// combined `Rope` on success, or `None` when the pair can only
    /// be merged as forks.
    fn merge_rope(&mut self, rope: Rope, other: NodeId) -> Option<Rope>
    where
        Leaf: Disambiguate,
    {
        match self.get(other) {
            Some(Node::Fork(fork)) if rope.miss.is_none() => {
                // Count how many consecutive ranges in this rope would
                // branch into the fork that results in a loop.
+ // + // e.g.: for rope "foobar" and a looping fork [a-z]: 6 + let count = rope + .pattern + .iter() + .take_while(|range| fork.contains(**range) == Some(other)) + .count(); + + let mut rope = rope.split_at(count, self)?.miss_any(other); + + rope.then = self.merge(rope.then, other); + + Some(rope) + } + Some(Node::Rope(other)) => { + let (prefix, miss) = rope.prefix(other)?; + + let (a, b) = (rope, other.clone()); + + let a = a.remainder(prefix.len(), self); + let b = b.remainder(prefix.len(), self); + + let rope = Rope::new(prefix, self.merge(a, b)).miss(miss); + + Some(rope) + } + Some(Node::Leaf(_)) | None => { + if rope.miss.is_none() { + Some(rope.miss(other)) + } else { + None + } + } + _ => None, + } + } + + pub fn fork_off(&mut self, id: NodeId) -> Fork + where + Leaf: Disambiguate, + { + match self.get(id) { + Some(Node::Fork(fork)) => fork.clone(), + Some(Node::Rope(rope)) => rope.clone().into_fork(self), + Some(Node::Leaf(_)) | None => Fork::new().miss(id), + } + } + + pub fn nodes(&self) -> &[Option<Node<Leaf>>] { + &self.nodes + } + + /// Find all nodes that have no references and remove them. + pub fn shake(&mut self, root: NodeId) { + let mut filter = vec![false; self.nodes.len()]; + + filter[root.get()] = true; + + self[root].shake(self, &mut filter); + + for (id, referenced) in filter.into_iter().enumerate() { + if !referenced { + self.nodes[id] = None; + } + } + } + + pub fn get(&self, id: NodeId) -> Option<&Node<Leaf>> { + self.nodes.get(id.get())?.as_ref() + } +} + +impl<Leaf> Index<NodeId> for Graph<Leaf> { + type Output = Node<Leaf>; + + fn index(&self, id: NodeId) -> &Node<Leaf> { + self.get(id).expect( + "Indexing into an empty node. 
This is a bug, please report it at:\n\ + \n\ + https://github.com/maciejhirsz/logos/issues", + ) + } +} + +impl std::fmt::Display for NodeId { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + std::fmt::Display::fmt(&self.0, f) + } +} + +#[cfg_attr(test, derive(PartialEq))] +pub enum Node<Leaf> { + /// Fork node, can lead to more than one state + Fork(Fork), + /// Rope node, can lead to one state on match, one state on miss + Rope(Rope), + /// Leaf node, terminal state + Leaf(Leaf), +} + +impl<Leaf> Node<Leaf> { + pub fn miss(&self) -> Option<NodeId> { + match self { + Node::Rope(rope) => rope.miss.first(), + Node::Fork(fork) => fork.miss, + Node::Leaf(_) => None, + } + } + + fn eq(&self, other: &Node<Leaf>) -> bool { + match (self, other) { + (Node::Fork(a), Node::Fork(b)) => a == b, + (Node::Rope(a), Node::Rope(b)) => a == b, + _ => false, + } + } + + fn shake(&self, graph: &Graph<Leaf>, filter: &mut [bool]) { + match self { + Node::Fork(fork) => fork.shake(graph, filter), + Node::Rope(rope) => rope.shake(graph, filter), + Node::Leaf(_) => (), + } + } + + pub fn unwrap_leaf(&self) -> &Leaf { + match self { + Node::Fork(_) => panic!("Internal Error: called unwrap_leaf on a fork"), + Node::Rope(_) => panic!("Internal Error: called unwrap_leaf on a rope"), + Node::Leaf(leaf) => leaf, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn leaf_stack_size() { + use std::mem::size_of; + + const WORD: usize = size_of::<usize>(); + const NODE: usize = size_of::<Node<()>>(); + + assert!(NODE <= 6 * WORD, "Size of Node<()> is {} bytes!", NODE); + } + + #[test] + fn create_a_loop() { + let mut graph = Graph::new(); + + let token = graph.push(Node::Leaf("IDENT")); + let id = graph.reserve(); + let fork = Fork::new().branch('a'..='z', id.get()).miss(token); + let root = graph.insert(id, fork); + + assert_eq!(graph[token], Node::Leaf("IDENT")); + assert_eq!(graph[root], Fork::new().branch('a'..='z', 
root).miss(token),); + } + + #[test] + fn fork_off() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = graph.push(Rope::new("rope", leaf)); + let fork = graph.push(Fork::new().branch(b'!', leaf)); + + assert_eq!(graph.fork_off(leaf), Fork::new().miss(leaf)); + assert_eq!( + graph.fork_off(rope), + Fork::new().branch(b'r', NodeId::new(graph.nodes.len() - 1)) + ); + assert_eq!(graph.fork_off(fork), Fork::new().branch(b'!', leaf)); + } +} diff --git a/vendor/logos-codegen/src/graph/range.rs b/vendor/logos-codegen/src/graph/range.rs new file mode 100644 index 00000000..a4d23d3b --- /dev/null +++ b/vendor/logos-codegen/src/graph/range.rs @@ -0,0 +1,144 @@ +use regex_syntax::hir::ClassBytesRange; +use regex_syntax::hir::ClassUnicodeRange; +use regex_syntax::utf8::Utf8Range; + +use std::cmp::{Ord, Ordering}; + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct Range { + pub start: u8, + pub end: u8, +} + +impl Range { + pub fn as_byte(&self) -> Option<u8> { + if self.is_byte() { + Some(self.start) + } else { + None + } + } + + pub fn is_byte(&self) -> bool { + self.start == self.end + } +} + +impl From<u8> for Range { + fn from(byte: u8) -> Range { + Range { + start: byte, + end: byte, + } + } +} + +impl From<&u8> for Range { + fn from(byte: &u8) -> Range { + Range::from(*byte) + } +} + +impl Iterator for Range { + type Item = u8; + + fn next(&mut self) -> Option<u8> { + match self.start.cmp(&self.end) { + std::cmp::Ordering::Less => { + let res = self.start; + self.start += 1; + + Some(res) + } + std::cmp::Ordering::Equal => { + let res = self.start; + + // Necessary so that range 0xFF-0xFF doesn't loop forever + self.start = 0xFF; + self.end = 0x00; + + Some(res) + } + std::cmp::Ordering::Greater => None, + } + } +} + +impl PartialOrd for Range { + fn partial_cmp(&self, other: &Range) -> Option<Ordering> { + Some(self.cmp(other)) + } +} + +impl Ord for Range { + fn cmp(&self, other: &Self) -> Ordering { + 
self.start.cmp(&other.start) + } +} + +impl From<Utf8Range> for Range { + fn from(r: Utf8Range) -> Range { + Range { + start: r.start, + end: r.end, + } + } +} + +impl From<ClassUnicodeRange> for Range { + fn from(r: ClassUnicodeRange) -> Range { + let start = r.start() as u32; + let end = r.end() as u32; + + if start >= 128 || end >= 128 && end != 0x0010FFFF { + panic!("Casting non-ascii ClassUnicodeRange to Range") + } + + Range { + start: start as u8, + end: end as u8, + } + } +} + +impl From<ClassBytesRange> for Range { + fn from(r: ClassBytesRange) -> Range { + Range { + start: r.start(), + end: r.end(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn range_iter_one() { + let byte = Range::from(b'!'); + let collected = byte.take(1000).collect::<Vec<_>>(); + + assert_eq!(b"!", &collected[..]); + } + + #[test] + fn range_iter_few() { + let byte = Range { + start: b'a', + end: b'd', + }; + let collected = byte.take(1000).collect::<Vec<_>>(); + + assert_eq!(b"abcd", &collected[..]); + } + + #[test] + fn range_iter_bounds() { + let byte = Range::from(0xFA..=0xFF); + + let collected = byte.take(1000).collect::<Vec<_>>(); + + assert_eq!(b"\xFA\xFB\xFC\xFD\xFE\xFF", &collected[..]); + } +} diff --git a/vendor/logos-codegen/src/graph/regex.rs b/vendor/logos-codegen/src/graph/regex.rs new file mode 100644 index 00000000..d55c490f --- /dev/null +++ b/vendor/logos-codegen/src/graph/regex.rs @@ -0,0 +1,295 @@ +use std::fmt::Debug; + +use regex_syntax::utf8::Utf8Sequences; + +use crate::graph::{Disambiguate, Fork, Graph, Node, NodeId, Range, ReservedId, Rope}; +use crate::mir::{Class, ClassUnicode, Literal, Mir}; + +impl<Leaf: Disambiguate + Debug> Graph<Leaf> { + pub fn regex(&mut self, mir: Mir, then: NodeId) -> NodeId { + self.parse_mir(&mir, then, None, None, false) + } + + fn parse_mir( + &mut self, + mir: &Mir, + then: NodeId, + miss: Option<NodeId>, + reserved: Option<ReservedId>, + repeated: bool, + ) -> NodeId { + match mir { + Mir::Empty 
=> then, + Mir::Loop(mir) => { + let reserved_first = reserved.unwrap_or_else(|| self.reserve()); + + let (new_then, new_miss); + if let Some(old_miss) = miss { + // We have to separate the first iteration from the other iterations, + // because the `old_miss` path must only be taken if we miss the first + // iteration. + let reserved_next = self.reserve(); + new_then = self.parse_mir( + mir, + reserved_next.get(), + Some(then), + Some(reserved_next), + true, + ); + new_miss = self.merge(old_miss, then); + } else { + new_then = reserved_first.get(); + new_miss = then; + } + + self.parse_mir(mir, new_then, Some(new_miss), Some(reserved_first), true) + } + Mir::Maybe(mir) => { + let miss = match miss { + Some(id) => self.merge(id, then), + None => then, + }; + + self.parse_mir(mir, then, Some(miss), reserved, true) + } + Mir::Alternation(alternation) => { + let mut fork = Fork::new().miss(miss); + + for mir in alternation { + let id = self.parse_mir(mir, then, None, None, repeated); + let alt = self.fork_off(id); + + fork.merge(alt, self); + } + + self.insert_or_push(reserved, fork) + } + Mir::Literal(literal) => { + let pattern = literal.0.to_vec(); + + self.insert_or_push(reserved, Rope::new(pattern, then).miss(miss)) + } + Mir::Concat(concat) => { + // Take an initial guess at the capacity - estimates a little worse than an average case + // scenario by assuming every concat element is singular but has a full code-point unicode literal. + // The only way to get the actual size of the Vec is if every sub-concat node is added up. 
+ let mut ropebuf: Vec<Range> = Vec::with_capacity(concat.len() * 4); + let mut then = then; + + let mut handle_bytes = |graph: &mut Self, mir: &Mir, then: &mut NodeId| match mir { + Mir::Literal(Literal(bytes)) => { + ropebuf.extend(bytes.iter().rev().map(Into::<Range>::into)); + true + } + Mir::Class(Class::Unicode(class)) if is_one_ascii(class, repeated) => { + ropebuf.push(class.ranges()[0].into()); + true + } + Mir::Class(Class::Bytes(class)) if class.ranges().len() == 1 => { + ropebuf.push(class.ranges()[0].into()); + true + } + _ => { + if !ropebuf.is_empty() { + let rope = + Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), *then); + + *then = graph.push(rope); + ropebuf = Vec::with_capacity(concat.len() * 4); + } + false + } + }; + + for mir in concat[1..].iter().rev() { + if !handle_bytes(self, mir, &mut then) { + then = self.parse_mir(mir, then, None, None, false); + } + } + + let first_mir = &concat[0]; + if handle_bytes(self, first_mir, &mut then) { + let rope = Rope::new(ropebuf.iter().cloned().rev().collect::<Vec<_>>(), then) + .miss(miss); + self.insert_or_push(reserved, rope) + } else { + self.parse_mir(first_mir, then, miss, reserved, false) + } + } + Mir::Class(Class::Unicode(class)) if !is_ascii(class, repeated) => { + let mut ropes = class + .iter() + .flat_map(|range| Utf8Sequences::new(range.start(), range.end())) + .map(|sequence| Rope::new(sequence.as_slice(), then)) + .collect::<Vec<_>>(); + + if ropes.len() == 1 { + let rope = ropes.remove(0); + + return self.insert_or_push(reserved, rope.miss(miss)); + } + + let mut root = Fork::new().miss(miss); + + for rope in ropes { + let fork = rope.into_fork(self); + root.merge(fork, self); + } + + self.insert_or_push(reserved, root) + } + Mir::Class(class) => { + let mut fork = Fork::new().miss(miss); + + let class: Vec<Range> = match class { + Class::Unicode(u) => u.iter().copied().map(Into::into).collect(), + Class::Bytes(b) => b.iter().copied().map(Into::into).collect(), + }; + + for 
range in class { + fork.add_branch(range, then, self); + } + + self.insert_or_push(reserved, fork) + } + } + } + + fn insert_or_push<N>(&mut self, id: Option<ReservedId>, node: N) -> NodeId + where + N: Into<Node<Leaf>>, + { + match id { + Some(id) => self.insert(id, node), + None => self.push(node), + } + } +} + +/// Return whether current class unicode is ascii. +/// +/// Because unicode ranges are iterated in increasing order, +/// it is only necessary to check the last range. +/// +/// If the check is performed in a repetition, +/// a fast path is used by checking if end of range is 0x0010_FFFF. +fn is_ascii(class: &ClassUnicode, repeated: bool) -> bool { + class.iter().last().map_or(true, |range| { + let start = range.start() as u32; + let end = range.end() as u32; + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) + }) +} + +/// Return whether current class unicode is ascii and only contains +/// one range. +/// +/// See [`is_ascii`] function for more details. +fn is_one_ascii(class: &ClassUnicode, repeated: bool) -> bool { + if class.ranges().len() != 1 { + return false; + } + + let range = &class.ranges()[0]; + let start = range.start() as u32; + let end = range.end() as u32; + end < 128 || (repeated && start < 128 && end == 0x0010_FFFF) +} + +#[cfg(test)] +mod tests { + use std::num::NonZeroU32; + + use super::*; + use crate::graph::Node; + use pretty_assertions::assert_eq; + + #[test] + fn rope() { + let mut graph = Graph::new(); + + let mir = Mir::utf8("foobar").unwrap(); + + assert_eq!(mir.priority(), 12); + + let leaf = graph.push(Node::Leaf("LEAF")); + let id = graph.regex(mir, leaf); + + assert_eq!(graph[id], Node::Rope(Rope::new("foobar", leaf)),) + } + + #[test] + fn alternation() { + let mut graph = Graph::new(); + + let mir = Mir::utf8("a|b").unwrap(); + + assert_eq!(mir.priority(), 2); + + let leaf = graph.push(Node::Leaf("LEAF")); + let id = graph.regex(mir, leaf); + + assert_eq!( + graph[id], + Node::Fork(Fork::new().branch(b'a', 
leaf).branch(b'b', leaf)), + ); + } + + #[test] + fn repeat() { + let mut graph = Graph::new(); + + let mir = Mir::utf8("[a-z]*").unwrap(); + + assert_eq!(mir.priority(), 0); + + let leaf = graph.push(Node::Leaf("LEAF")); + let id = graph.regex(mir, leaf); + + assert_eq!( + graph[id], + Node::Fork( + Fork::new() + .branch('a'..='z', id) // goto self == loop + .miss(leaf) + ), + ); + } + + #[test] + fn maybe() { + let mut graph = Graph::new(); + + let mir = Mir::utf8("[a-z]?").unwrap(); + + assert_eq!(mir.priority(), 0); + + let leaf = graph.push(Node::Leaf("LEAF")); + let id = graph.regex(mir, leaf); + + assert_eq!( + graph[id], + Node::Fork(Fork::new().branch('a'..='z', leaf).miss(leaf)), + ); + } + + #[test] + fn long_concat_389() { + let mut graph = Graph::new(); + + let mir = Mir::utf8("abcdefghijklmnopqrstuvwxyz*").unwrap(); + + assert_eq!(mir.priority(), 50); + + let leaf = graph.push(Node::Leaf("LEAF")); + let id = graph.regex(mir, leaf); + let sub_id = NodeId(NonZeroU32::new(2).unwrap()); + + assert_eq!( + graph[id], + Node::Rope(Rope::new("abcdefghijklmnopqrstuvwxy", sub_id)) + ); + + assert_eq!(graph[sub_id], Node::Rope(Rope::new("z", sub_id).miss(leaf))) + } +} diff --git a/vendor/logos-codegen/src/graph/rope.rs b/vendor/logos-codegen/src/graph/rope.rs new file mode 100644 index 00000000..6e563fac --- /dev/null +++ b/vendor/logos-codegen/src/graph/rope.rs @@ -0,0 +1,330 @@ +use std::ops::Deref; + +use crate::graph::{Disambiguate, Fork, Graph, NodeId, Range}; + +#[derive(PartialEq, Clone, Hash)] +pub struct Rope { + pub pattern: Pattern, + pub then: NodeId, + pub miss: Miss, +} + +#[derive(PartialEq, Clone, Hash)] +pub struct Pattern(pub Vec<Range>); + +impl Deref for Pattern { + type Target = [Range]; + + fn deref(&self) -> &[Range] { + &self.0 + } +} + +/// Because Ropes could potentially fail a match mid-pattern, +/// a regular `Option` is not sufficient here. 
#[derive(PartialEq, Clone, Copy, Hash)]
pub enum Miss {
    /// Same as Option::None, error on fail
    None,
    /// Jump to id if first byte does not match, fail on partial match
    First(NodeId),
    /// Jump to id on partial or empty match
    Any(NodeId),
}

impl Miss {
    /// `true` only for `Miss::None`.
    pub fn is_none(&self) -> bool {
        matches!(self, Miss::None)
    }

    /// Miss target applying to the first byte, if any.
    pub fn first(self) -> Option<NodeId> {
        match self {
            Miss::First(id) | Miss::Any(id) => Some(id),
            _ => None,
        }
    }

    /// Like `first`, but consumes a `First` (resetting it to `None`),
    /// while leaving an `Any` in place.
    pub fn take_first(&mut self) -> Option<NodeId> {
        match *self {
            Miss::First(id) => {
                *self = Miss::None;

                Some(id)
            }
            Miss::Any(id) => Some(id),
            Miss::None => None,
        }
    }
}

impl From<Option<NodeId>> for Miss {
    fn from(miss: Option<NodeId>) -> Self {
        match miss {
            Some(id) => Miss::First(id),
            None => Miss::None,
        }
    }
}

impl From<NodeId> for Miss {
    fn from(id: NodeId) -> Self {
        Miss::First(id)
    }
}

impl Rope {
    /// Create a rope matching `pattern`, continuing to `then`, with
    /// no miss target.
    pub fn new<P>(pattern: P, then: NodeId) -> Self
    where
        P: Into<Pattern>,
    {
        Rope {
            pattern: pattern.into(),
            then,
            miss: Miss::None,
        }
    }

    /// Builder-style setter for the miss target.
    pub fn miss<M>(mut self, miss: M) -> Self
    where
        M: Into<Miss>,
    {
        self.miss = miss.into();
        self
    }

    /// Builder-style setter that always uses `Miss::Any`.
    pub fn miss_any(mut self, miss: NodeId) -> Self {
        self.miss = Miss::Any(miss);
        self
    }

    /// Convert this rope into a single-branch `Fork` on its first
    /// range; the remaining pattern (if any) is pushed as a new rope.
    pub fn into_fork<T>(mut self, graph: &mut Graph<T>) -> Fork
    where
        T: Disambiguate,
    {
        let first = self.pattern.0.remove(0);
        let miss = self.miss.take_first();

        // The new fork will lead to a new rope,
        // or the old target if no new rope was created
        let then = match self.pattern.len() {
            0 => self.then,
            _ => graph.push(self),
        };

        Fork::new().branch(first, then).miss(miss)
    }

    /// Longest common prefix of the two patterns, combined with a
    /// merged miss target. Returns `None` when there is no common
    /// prefix, or when both ropes already define a miss.
    pub fn prefix(&self, other: &Self) -> Option<(Pattern, Miss)> {
        let count = self
            .pattern
            .iter()
            .zip(other.pattern.iter())
            .take_while(|(a, b)| a == b)
            .count();

        let pattern = match count {
            0 => return None,
            n => self.pattern[..n].into(),
        };
        let miss = match (self.miss,
                         other.miss) {
            (Miss::None, miss) => miss,
            (miss, Miss::None) => miss,
            // Both ropes define a miss: they can't share a prefix node.
            _ => return None,
        };

        Some((pattern, miss))
    }

    /// Split this rope after `at` ranges. The tail is pushed onto the
    /// graph and becomes the new `then`. Returns `None` for `at == 0`,
    /// and `self` unchanged when `at` covers the whole pattern.
    pub fn split_at<T>(mut self, at: usize, graph: &mut Graph<T>) -> Option<Rope>
    where
        T: Disambiguate,
    {
        match at {
            0 => return None,
            n if n == self.pattern.len() => return Some(self),
            _ => (),
        }

        let (this, next) = self.pattern.split_at(at);

        // `Miss::Any` covers partial matches, so it carries over to
        // the tail; a `Miss::First` stays with the head only.
        let next_miss = match self.miss {
            Miss::Any(_) => self.miss,
            _ => Miss::None,
        };

        let next = graph.push(Rope {
            pattern: next.into(),
            miss: next_miss,
            then: self.then,
        });

        self.pattern = this.into();
        self.then = next;

        Some(self)
    }

    /// Drop the first `at` ranges and push the remainder onto the
    /// graph; returns `then` directly when nothing remains.
    pub fn remainder<T>(mut self, at: usize, graph: &mut Graph<T>) -> NodeId
    where
        T: Disambiguate,
    {
        self.pattern = self.pattern[at..].into();

        match self.pattern.len() {
            0 => self.then,
            _ => graph.push(self),
        }
    }

    /// Mark this rope's targets as reachable in `filter` (used when
    /// shaking unreferenced nodes out of the graph).
    pub fn shake<T>(&self, graph: &Graph<T>, filter: &mut [bool]) {
        if let Some(id) = self.miss.first() {
            if !filter[id.get()] {
                filter[id.get()] = true;
                graph[id].shake(graph, filter);
            }
        }

        if !filter[self.then.get()] {
            filter[self.then.get()] = true;
            graph[self.then].shake(graph, filter);
        }
    }
}

impl Pattern {
    /// Collapse the pattern to raw bytes; `None` if any range spans
    /// more than a single byte.
    pub fn to_bytes(&self) -> Option<Vec<u8>> {
        let mut out = Vec::with_capacity(self.len());

        for range in self.iter() {
            out.push(range.as_byte()?);
        }

        Some(out)
    }
}

impl<T> From<&[T]> for Pattern
where
    T: Into<Range> + Copy,
{
    fn from(slice: &[T]) -> Self {
        Pattern(slice.iter().copied().map(Into::into).collect())
    }
}

impl<T> From<Vec<T>> for Pattern
where
    T: Into<Range>,
{
    fn from(vec: Vec<T>) -> Self {
        Pattern(vec.into_iter().map(Into::into).collect())
    }
}

impl From<&str> for Pattern {
    fn from(slice: &str) -> Self {
        slice.as_bytes().into()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::Node;
    use pretty_assertions::assert_eq;

    #[test]
    fn into_fork() {
        let mut graph =
Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = Rope::new("foobar", leaf); + + let fork = rope.into_fork(&mut graph); + + assert_eq!(leaf, NodeId::new(1)); + assert_eq!(fork, Fork::new().branch(b'f', NodeId::new(2))); + assert_eq!(graph[NodeId::new(2)], Rope::new("oobar", leaf)); + } + + #[test] + fn into_fork_one_byte() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = Rope::new("!", leaf); + + let fork = rope.into_fork(&mut graph); + + assert_eq!(leaf, NodeId::new(1)); + assert_eq!(fork, Fork::new().branch(b'!', NodeId::new(1))); + } + + #[test] + fn into_fork_miss_any() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = Rope::new("42", leaf).miss_any(NodeId::new(42)); + + let fork = rope.into_fork(&mut graph); + + assert_eq!(leaf, NodeId::new(1)); + assert_eq!( + fork, + Fork::new() + .branch(b'4', NodeId::new(2)) + .miss(NodeId::new(42)) + ); + assert_eq!( + graph[NodeId::new(2)], + Rope::new("2", leaf).miss_any(NodeId::new(42)) + ); + } + + #[test] + fn into_fork_miss_first() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = Rope::new("42", leaf).miss(Miss::First(NodeId::new(42))); + + let fork = rope.into_fork(&mut graph); + + assert_eq!(leaf, NodeId::new(1)); + assert_eq!( + fork, + Fork::new() + .branch(b'4', NodeId::new(2)) + .miss(NodeId::new(42)) + ); + assert_eq!(graph[NodeId::new(2)], Rope::new("2", leaf)); + } + + #[test] + fn split_at() { + let mut graph = Graph::new(); + + let leaf = graph.push(Node::Leaf("LEAF")); + let rope = Rope::new("foobar", leaf); + + assert_eq!(rope.clone().split_at(6, &mut graph).unwrap(), rope); + + let split = rope.split_at(3, &mut graph).unwrap(); + let expected_id = NodeId::new(leaf.get() + 1); + + assert_eq!(split, Rope::new("foo", expected_id)); + assert_eq!(graph[expected_id], Rope::new("bar", leaf)); + } + + #[test] + fn pattern_to_bytes() { + let pat = 
Pattern::from("foobar"); + + assert_eq!(pat.to_bytes().unwrap(), b"foobar"); + + let ranges = Pattern::from(vec![0..=0, 42..=42, b'{'..=b'}']); + + assert_eq!(ranges.to_bytes(), None); + } +} diff --git a/vendor/logos-codegen/src/leaf.rs b/vendor/logos-codegen/src/leaf.rs new file mode 100644 index 00000000..5e0b810e --- /dev/null +++ b/vendor/logos-codegen/src/leaf.rs @@ -0,0 +1,118 @@ +use std::cmp::{Ord, Ordering}; +use std::fmt::{self, Debug, Display}; + +use proc_macro2::{Span, TokenStream}; +use syn::{spanned::Spanned, Ident}; + +use crate::graph::{Disambiguate, Node}; +use crate::util::MaybeVoid; + +#[derive(Clone)] +pub struct Leaf<'t> { + pub ident: Option<&'t Ident>, + pub span: Span, + pub priority: usize, + pub field: MaybeVoid, + pub callback: Option<Callback>, +} + +#[derive(Clone)] +pub enum Callback { + Label(TokenStream), + Inline(Box<InlineCallback>), + Skip(Span), +} + +#[derive(Clone)] +pub struct InlineCallback { + pub arg: Ident, + pub body: TokenStream, + pub span: Span, +} + +impl From<InlineCallback> for Callback { + fn from(inline: InlineCallback) -> Callback { + Callback::Inline(Box::new(inline)) + } +} + +impl Callback { + pub fn span(&self) -> Span { + match self { + Callback::Label(tokens) => tokens.span(), + Callback::Inline(inline) => inline.span, + Callback::Skip(span) => *span, + } + } +} + +impl<'t> Leaf<'t> { + pub fn new(ident: &'t Ident, span: Span) -> Self { + Leaf { + ident: Some(ident), + span, + priority: 0, + field: MaybeVoid::Void, + callback: None, + } + } + + pub fn new_skip(span: Span) -> Self { + Leaf { + ident: None, + span, + priority: 0, + field: MaybeVoid::Void, + callback: Some(Callback::Skip(span)), + } + } + + pub fn callback(mut self, callback: Option<Callback>) -> Self { + self.callback = callback; + self + } + + pub fn field(mut self, field: MaybeVoid) -> Self { + self.field = field; + self + } + + pub fn priority(mut self, priority: usize) -> Self { + self.priority = priority; + self + } +} + +impl 
Disambiguate for Leaf<'_> {
    // Leaves are ranked purely by their `priority` field; the graph uses this
    // ordering to pick a winner when two token definitions match the same input.
    fn cmp(left: &Leaf, right: &Leaf) -> Ordering {
        Ord::cmp(&left.priority, &right.priority)
    }
}

impl<'t> From<Leaf<'t>> for Node<Leaf<'t>> {
    // Convenience conversion so a `Leaf` can be pushed into the graph directly.
    fn from(leaf: Leaf<'t>) -> Self {
        Node::Leaf(leaf)
    }
}

impl Debug for Leaf<'_> {
    // Debug format: `::<Display form>` followed by a short tag describing the
    // attached callback, if any (label, inline closure, or skip marker).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "::{}", self)?;

        match self.callback {
            Some(Callback::Label(ref label)) => write!(f, " ({})", label),
            Some(Callback::Inline(_)) => f.write_str(" (<inline>)"),
            Some(Callback::Skip(_)) => f.write_str(" (<skip>)"),
            None => Ok(()),
        }
    }
}

impl Display for Leaf<'_> {
    // Display format: the variant identifier, or `<skip>` for skip leaves,
    // which are constructed without an ident (see `Leaf::new_skip`).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self.ident {
            Some(ident) => Display::fmt(ident, f),
            None => f.write_str("<skip>"),
        }
    }
}
diff --git a/vendor/logos-codegen/src/lib.rs b/vendor/logos-codegen/src/lib.rs
new file mode 100644
index 00000000..2b2d3db2
--- /dev/null
+++ b/vendor/logos-codegen/src/lib.rs
@@ -0,0 +1,391 @@
//! <img src="https://raw.githubusercontent.com/maciejhirsz/logos/master/logos.svg?sanitize=true" alt="Logos logo" width="250" align="right">
//!
//! # Logos
//!
//! This is a `#[derive]` macro crate, [for documentation go to main crate](https://docs.rs/logos).

// The `quote!` macro requires deep recursion.
+#![recursion_limit = "196"] +#![doc(html_logo_url = "https://maciej.codes/kosz/logos.png")] + +mod error; +mod generator; +#[cfg(not(feature = "fuzzing"))] +mod graph; +#[cfg(feature = "fuzzing")] +pub mod graph; +mod leaf; +#[cfg(not(feature = "fuzzing"))] +mod mir; +#[cfg(feature = "fuzzing")] +pub mod mir; +mod parser; +mod util; + +#[macro_use] +#[allow(missing_docs)] +mod macros; + +use generator::Generator; +use graph::{DisambiguationError, Fork, Graph, Rope}; +use leaf::Leaf; +use parser::{IgnoreFlags, Mode, Parser}; +use quote::ToTokens; +use util::MaybeVoid; + +use proc_macro2::{Delimiter, TokenStream, TokenTree}; +use quote::quote; +use syn::parse_quote; +use syn::spanned::Spanned; +use syn::{Fields, ItemEnum}; + +const LOGOS_ATTR: &str = "logos"; +const ERROR_ATTR: &str = "error"; +const TOKEN_ATTR: &str = "token"; +const REGEX_ATTR: &str = "regex"; + +/// Generate a `Logos` implementation for the given struct, provided as a stream of rust tokens. +pub fn generate(input: TokenStream) -> TokenStream { + debug!("Reading input token streams"); + + let mut item: ItemEnum = syn::parse2(input).expect("Logos can be only be derived for enums"); + + let name = &item.ident; + + let mut parser = Parser::default(); + + for param in item.generics.params { + parser.parse_generic(param); + } + + for attr in &mut item.attrs { + parser.try_parse_logos(attr); + } + + let mut ropes = Vec::new(); + let mut regex_ids = Vec::new(); + let mut graph = Graph::new(); + + { + let errors = &mut parser.errors; + + for literal in &parser.skips { + match literal.to_mir(&parser.subpatterns, IgnoreFlags::Empty, errors) { + Ok(mir) => { + let then = graph.push(Leaf::new_skip(literal.span()).priority(mir.priority())); + let id = graph.regex(mir, then); + + regex_ids.push(id); + } + Err(err) => { + errors.err(err, literal.span()); + } + } + } + } + + debug!("Iterating through enum variants"); + + for variant in &mut item.variants { + let field = match &mut variant.fields { + Fields::Unit 
=> MaybeVoid::Void, + Fields::Unnamed(fields) => { + if fields.unnamed.len() != 1 { + parser.err( + format!( + "Logos currently only supports variants with one field, found {}", + fields.unnamed.len(), + ), + fields.span(), + ); + } + + let ty = &mut fields + .unnamed + .first_mut() + .expect("Already checked len; qed") + .ty; + let ty = parser.get_type(ty); + + MaybeVoid::Some(ty) + } + Fields::Named(fields) => { + parser.err("Logos doesn't support named fields yet.", fields.span()); + + MaybeVoid::Void + } + }; + + // Lazy leaf constructor to avoid cloning + let var_ident = &variant.ident; + let leaf = move |span| Leaf::new(var_ident, span).field(field.clone()); + + for attr in &mut variant.attrs { + let attr_name = match attr.path().get_ident() { + Some(ident) => ident.to_string(), + None => continue, + }; + + match attr_name.as_str() { + ERROR_ATTR => { + // TODO: Remove in future versions + parser.err( + "\ + Since 0.13 Logos no longer requires the #[error] variant.\n\ + \n\ + For help with migration see release notes: \ + https://github.com/maciejhirsz/logos/releases\ + ", + attr.span(), + ); + } + TOKEN_ATTR => { + let definition = match parser.parse_definition(attr) { + Some(definition) => definition, + None => { + parser.err("Expected #[token(...)]", attr.span()); + continue; + } + }; + + if definition.ignore_flags.is_empty() { + let bytes = definition.literal.to_bytes(); + let then = graph.push( + leaf(definition.literal.span()) + .priority(definition.priority.unwrap_or(bytes.len() * 2)) + .callback(definition.callback), + ); + + ropes.push(Rope::new(bytes, then)); + } else { + let mir = definition + .literal + .escape_regex() + .to_mir( + &Default::default(), + definition.ignore_flags, + &mut parser.errors, + ) + .expect("The literal should be perfectly valid regex"); + + let then = graph.push( + leaf(definition.literal.span()) + .priority(definition.priority.unwrap_or_else(|| mir.priority())) + .callback(definition.callback), + ); + let id = 
graph.regex(mir, then); + + regex_ids.push(id); + } + } + REGEX_ATTR => { + let definition = match parser.parse_definition(attr) { + Some(definition) => definition, + None => { + parser.err("Expected #[regex(...)]", attr.span()); + continue; + } + }; + let mir = match definition.literal.to_mir( + &parser.subpatterns, + definition.ignore_flags, + &mut parser.errors, + ) { + Ok(mir) => mir, + Err(err) => { + parser.err(err, definition.literal.span()); + continue; + } + }; + + let then = graph.push( + leaf(definition.literal.span()) + .priority(definition.priority.unwrap_or_else(|| mir.priority())) + .callback(definition.callback), + ); + let id = graph.regex(mir, then); + + regex_ids.push(id); + } + _ => (), + } + } + } + + let mut root = Fork::new(); + + debug!("Parsing additional options (extras, source, ...)"); + + let error_type = parser.error_type.take(); + let extras = parser.extras.take(); + let source = parser + .source + .take() + .map(strip_wrapping_parens) + .unwrap_or(match parser.mode { + Mode::Utf8 => quote!(str), + Mode::Binary => quote!([u8]), + }); + let logos_path = parser + .logos_path + .take() + .unwrap_or_else(|| parse_quote!(::logos)); + + let generics = parser.generics(); + let this = quote!(#name #generics); + + let impl_logos = |body| { + quote! 
{ + impl<'s> #logos_path::Logos<'s> for #this { + type Error = #error_type; + + type Extras = #extras; + + type Source = #source; + + fn lex(lex: &mut #logos_path::Lexer<'s, Self>) { + #body + } + } + } + }; + + for id in regex_ids { + let fork = graph.fork_off(id); + + root.merge(fork, &mut graph); + } + for rope in ropes { + root.merge(rope.into_fork(&mut graph), &mut graph); + } + while let Some(id) = root.miss.take() { + let fork = graph.fork_off(id); + + if fork.branches().next().is_some() { + root.merge(fork, &mut graph); + } else { + break; + } + } + + debug!("Checking if any two tokens have the same priority"); + + for &DisambiguationError(a, b) in graph.errors() { + let a = graph[a].unwrap_leaf(); + let b = graph[b].unwrap_leaf(); + let disambiguate = a.priority + 1; + + let mut err = |a: &Leaf, b: &Leaf| { + parser.err( + format!( + "\ + A definition of variant `{a}` can match the same input as another definition of variant `{b}`.\n\ + \n\ + hint: Consider giving one definition a higher priority: \ + #[regex(..., priority = {disambiguate})]\ + ", + ), + a.span + ); + }; + + err(a, b); + err(b, a); + } + + if let Some(errors) = parser.errors.render() { + return impl_logos(errors); + } + + let root = graph.push(root); + + graph.shake(root); + + debug!("Generating code from graph:\n{graph:#?}"); + + let generator = Generator::new(name, &this, root, &graph); + + let body = generator.generate(); + impl_logos(quote! { + use #logos_path::internal::{LexerInternal, CallbackResult}; + + type Lexer<'s> = #logos_path::Lexer<'s, #this>; + + fn _end<'s>(lex: &mut Lexer<'s>) { + lex.end() + } + + fn _error<'s>(lex: &mut Lexer<'s>) { + lex.bump_unchecked(1); + + lex.error(); + } + + #body + }) +} + +/// Strip all logos attributes from the given struct, allowing it to be used in code without `logos-derive` present. 
+pub fn strip_attributes(input: TokenStream) -> TokenStream { + let mut item: ItemEnum = syn::parse2(input).expect("Logos can be only be derived for enums"); + + strip_attrs_from_vec(&mut item.attrs); + + for attr in &mut item.attrs { + if let syn::Meta::List(meta) = &mut attr.meta { + if meta.path.is_ident("derive") { + let mut tokens = + std::mem::replace(&mut meta.tokens, TokenStream::new()).into_iter(); + + while let Some(TokenTree::Ident(ident)) = tokens.next() { + let punct = tokens.next(); + + if ident == "Logos" { + continue; + } + + meta.tokens.extend([TokenTree::Ident(ident)]); + meta.tokens.extend(punct); + } + } + } + } + + for variant in &mut item.variants { + strip_attrs_from_vec(&mut variant.attrs); + for field in &mut variant.fields { + strip_attrs_from_vec(&mut field.attrs); + } + } + + item.to_token_stream() +} + +fn strip_attrs_from_vec(attrs: &mut Vec<syn::Attribute>) { + attrs.retain(|attr| !is_logos_attr(attr)) +} + +fn is_logos_attr(attr: &syn::Attribute) -> bool { + attr.path().is_ident(LOGOS_ATTR) + || attr.path().is_ident(TOKEN_ATTR) + || attr.path().is_ident(REGEX_ATTR) +} + +fn strip_wrapping_parens(t: TokenStream) -> TokenStream { + let tts: Vec<TokenTree> = t.into_iter().collect(); + + if tts.len() != 1 { + tts.into_iter().collect() + } else { + match tts.into_iter().next().unwrap() { + TokenTree::Group(g) => { + if g.delimiter() == Delimiter::Parenthesis { + g.stream() + } else { + core::iter::once(TokenTree::Group(g)).collect() + } + } + tt => core::iter::once(tt).collect(), + } + } +} diff --git a/vendor/logos-codegen/src/macros.rs b/vendor/logos-codegen/src/macros.rs new file mode 100644 index 00000000..148c6f6a --- /dev/null +++ b/vendor/logos-codegen/src/macros.rs @@ -0,0 +1,12 @@ +#[cfg(feature = "debug")] +macro_rules! debug { + ($($arg:tt)*) => { + eprint!("[{}:{}:{}] ", file!(), line!(), column!()); + eprintln!($($arg)*) + } +} + +#[cfg(not(feature = "debug"))] +macro_rules! 
debug { + ($($arg:tt)*) => {}; +} diff --git a/vendor/logos-codegen/src/mir.rs b/vendor/logos-codegen/src/mir.rs new file mode 100644 index 00000000..e07c5b78 --- /dev/null +++ b/vendor/logos-codegen/src/mir.rs @@ -0,0 +1,235 @@ +use std::convert::TryFrom; + +use lazy_static::lazy_static; +use regex_syntax::hir::{Dot, Hir, HirKind}; +use regex_syntax::ParserBuilder; + +pub use regex_syntax::hir::{Class, ClassUnicode, Literal}; + +use crate::error::{Error, Result}; + +lazy_static! { + static ref DOT_UTF8: Hir = Hir::dot(Dot::AnyChar); + static ref DOT_BYTES: Hir = Hir::dot(Dot::AnyByte); +} + +/// Middle Intermediate Representation of the regex, built from +/// `regex_syntax`'s `Hir`. The goal here is to strip and canonicalize +/// the tree, so that we don't have to do transformations later on the +/// graph, with the potential of running into looping references. +#[derive(Clone, Debug)] +pub enum Mir { + Empty, + Loop(Box<Mir>), + Maybe(Box<Mir>), + Concat(Vec<Mir>), + Alternation(Vec<Mir>), + Class(Class), + Literal(Literal), +} + +impl Mir { + pub fn utf8(source: &str) -> Result<Mir> { + Mir::try_from(ParserBuilder::new().build().parse(source)?) 
+ } + + pub fn utf8_ignore_case(source: &str) -> Result<Mir> { + Mir::try_from( + ParserBuilder::new() + .case_insensitive(true) + .build() + .parse(source)?, + ) + } + + pub fn binary(source: &str) -> Result<Mir> { + Mir::try_from( + ParserBuilder::new() + .utf8(false) + .unicode(false) + .build() + .parse(source)?, + ) + } + + pub fn binary_ignore_case(source: &str) -> Result<Mir> { + Mir::try_from( + ParserBuilder::new() + .utf8(false) + .unicode(false) + .case_insensitive(true) + .build() + .parse(source)?, + ) + } + + pub fn priority(&self) -> usize { + match self { + Mir::Empty | Mir::Loop(_) | Mir::Maybe(_) => 0, + Mir::Concat(concat) => concat.iter().map(Mir::priority).sum(), + Mir::Alternation(alt) => alt.iter().map(Mir::priority).min().unwrap_or(0), + Mir::Class(_) => 2, + Mir::Literal(lit) => match std::str::from_utf8(&lit.0) { + Ok(s) => 2 * s.chars().count(), + Err(_) => 2 * lit.0.len(), + }, + } + } +} + +impl TryFrom<Hir> for Mir { + type Error = Error; + + fn try_from(hir: Hir) -> Result<Mir> { + match hir.into_kind() { + HirKind::Empty => Ok(Mir::Empty), + HirKind::Concat(concat) => { + let mut out = Vec::with_capacity(concat.len()); + + fn extend(mir: Mir, out: &mut Vec<Mir>) { + match mir { + Mir::Concat(nested) => { + for child in nested { + extend(child, out); + } + } + mir => out.push(mir), + } + } + + for hir in concat { + extend(Mir::try_from(hir)?, &mut out); + } + + Ok(Mir::Concat(out)) + } + HirKind::Alternation(alternation) => { + let alternation = alternation + .into_iter() + .map(Mir::try_from) + .collect::<Result<_>>()?; + + Ok(Mir::Alternation(alternation)) + } + HirKind::Literal(literal) => Ok(Mir::Literal(literal)), + HirKind::Class(class) => Ok(Mir::Class(class)), + HirKind::Repetition(repetition) => { + if !repetition.greedy { + return Err("#[regex]: non-greedy parsing is currently unsupported.".into()); + } + + let is_dot = if repetition.sub.properties().is_utf8() { + *repetition.sub == *DOT_UTF8 + } else { + *repetition.sub == 
*DOT_BYTES + }; + let mir = Mir::try_from(*repetition.sub)?; + + match (repetition.min, repetition.max) { + (0..=1, None) if is_dot => { + Err( + "#[regex]: \".+\" and \".*\" patterns will greedily consume \ + the entire source till the end as Logos does not allow \ + backtracking. If you are looking to match everything until \ + a specific character, you should use a negative character \ + class. E.g., use regex r\"'[^']*'\" to match anything in \ + between two quotes. Read more about that here: \ + https://github.com/maciejhirsz/logos/issues/302#issuecomment-1521342541." + .into() + ) + } + // 0 or 1 + (0, Some(1)) => Ok(Mir::Maybe(Box::new(mir))), + // 0 or more + (0, None) => Ok(Mir::Loop(Box::new(mir))), + // 1 or more + (1, None) => { + Ok(Mir::Concat(vec![mir.clone(), Mir::Loop(Box::new(mir))])) + } + // Exact {n} + (n, Some(m)) if m == n => { + let mut out = Vec::with_capacity(n as usize); + for _ in 0..n { + out.push(mir.clone()); + } + Ok(Mir::Concat(out)) + } + // At least {n,} + (n, None) => { + let mut out = Vec::with_capacity(n as usize); + for _ in 0..n { + out.push(mir.clone()); + } + out.push(Mir::Loop(Box::new(mir))); + Ok(Mir::Concat(out)) + } + // Bounded {n, m} + (n, Some(m)) => { + let mut out = Vec::with_capacity(m as usize); + for _ in 0..n { + out.push(mir.clone()); + } + for _ in n..m { + out.push(Mir::Maybe(Box::new(mir.clone()))); + } + Ok(Mir::Concat(out)) + } + } + } + HirKind::Capture(capture) => Mir::try_from(*capture.sub), + HirKind::Look(_) => { + Err("#[regex]: look-around assertions are currently unsupported.".into()) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::Mir; + + #[test] + fn priorities() { + let regexes = [ + ("a", 2), + ("à", 2), + ("京", 2), + ("Eté", 6), + ("Été", 6), + ("[a-z]+", 2), + ("a|b", 2), + ("a|[b-z]", 2), + ("(foo)+", 6), + ("foobar", 12), + ("(fooz|bar)+qux", 12), + ]; + + for (regex, expected) in regexes.iter() { + let mir = Mir::utf8(regex).unwrap(); + assert_eq!(mir.priority(), *expected, 
"Failed for regex \"{}\"", regex); + } + } + + #[test] + fn equivalent_patterns() { + let regexes = [ + ("a|b", "[a-b]"), + ("1|2|3", "[1-3]"), + ("1+", "[1]+"), + ("c*", "[c]*"), + ("aaa", "a{3}"), + ("a[a]{2}", "a{3}"), + ]; + + for (regex_left, regex_right) in regexes.iter() { + let mir_left = Mir::utf8(regex_left).unwrap(); + let mir_right = Mir::utf8(regex_right).unwrap(); + assert_eq!( + mir_left.priority(), + mir_right.priority(), + "Regexes \"{regex_left}\" and \"{regex_right}\" \ + are equivalent but have different priorities" + ); + } + } +} diff --git a/vendor/logos-codegen/src/parser/definition.rs b/vendor/logos-codegen/src/parser/definition.rs new file mode 100644 index 00000000..a876fb59 --- /dev/null +++ b/vendor/logos-codegen/src/parser/definition.rs @@ -0,0 +1,193 @@ +use proc_macro2::{Ident, Span}; +use syn::{spanned::Spanned, LitByteStr, LitStr}; + +use crate::error::{Errors, Result}; +use crate::leaf::Callback; +use crate::mir::Mir; +use crate::parser::nested::NestedValue; +use crate::parser::{IgnoreFlags, Parser, Subpatterns}; + +use super::ignore_flags::ascii_case::MakeAsciiCaseInsensitive; + +pub struct Definition { + pub literal: Literal, + pub priority: Option<usize>, + pub callback: Option<Callback>, + pub ignore_flags: IgnoreFlags, +} + +pub enum Literal { + Utf8(LitStr), + Bytes(LitByteStr), +} + +impl Definition { + pub fn new(literal: Literal) -> Self { + Definition { + literal, + priority: None, + callback: None, + ignore_flags: IgnoreFlags::Empty, + } + } + + pub fn named_attr(&mut self, name: Ident, value: NestedValue, parser: &mut Parser) { + match (name.to_string().as_str(), value) { + ("priority", NestedValue::Assign(tokens)) => { + let prio = match tokens.to_string().parse() { + Ok(prio) => prio, + Err(_) => { + parser.err("Expected an unsigned integer", tokens.span()); + return; + } + }; + + if self.priority.replace(prio).is_some() { + parser.err("Resetting previously set priority", tokens.span()); + } + } + ("priority", _) => 
{ + parser.err("Expected: priority = <integer>", name.span()); + } + ("callback", NestedValue::Assign(tokens)) => { + let span = tokens.span(); + let callback = match parser.parse_callback(tokens) { + Some(callback) => callback, + None => { + parser.err("Not a valid callback", span); + return; + } + }; + + if let Some(previous) = self.callback.replace(callback) { + parser + .err( + "Callback has been already set", + span.join(name.span()).unwrap(), + ) + .err("Previous callback set here", previous.span()); + } + } + ("callback", _) => { + parser.err("Expected: callback = ...", name.span()); + } + ("ignore", NestedValue::Group(tokens)) => { + self.ignore_flags.parse_group(name, tokens, parser); + } + ("ignore", _) => { + parser.err("Expected: ignore(<flag>, ...)", name.span()); + } + (unknown, _) => { + parser.err( + format!( + "\ + Unknown nested attribute: {}\n\ + \n\ + Expected one of: priority, callback\ + ", + unknown + ), + name.span(), + ); + } + } + } +} + +impl Literal { + pub fn to_bytes(&self) -> Vec<u8> { + match self { + Literal::Utf8(string) => string.value().into_bytes(), + Literal::Bytes(bytes) => bytes.value(), + } + } + + pub fn escape_regex(&self) -> Literal { + match self { + Literal::Utf8(string) => Literal::Utf8(LitStr::new( + regex_syntax::escape(&string.value()).as_str(), + self.span(), + )), + Literal::Bytes(bytes) => Literal::Bytes(LitByteStr::new( + regex_syntax::escape(&bytes_to_regex_string(bytes.value())).as_bytes(), + self.span(), + )), + } + } + + pub fn to_mir( + &self, + subpatterns: &Subpatterns, + ignore_flags: IgnoreFlags, + errors: &mut Errors, + ) -> Result<Mir> { + let value = subpatterns.fix(self, errors); + + if ignore_flags.contains(IgnoreFlags::IgnoreAsciiCase) { + match self { + Literal::Utf8(_) => { + Mir::utf8(&value).map(MakeAsciiCaseInsensitive::make_ascii_case_insensitive) + } + Literal::Bytes(_) => Mir::binary_ignore_case(&value), + } + } else if ignore_flags.contains(IgnoreFlags::IgnoreCase) { + match self { + 
Literal::Utf8(_) => Mir::utf8_ignore_case(&value), + Literal::Bytes(_) => Mir::binary_ignore_case(&value), + } + } else { + match self { + Literal::Utf8(_) => Mir::utf8(&value), + Literal::Bytes(_) => Mir::binary(&value), + } + } + } + + pub fn span(&self) -> Span { + match self { + Literal::Utf8(string) => string.span(), + Literal::Bytes(bytes) => bytes.span(), + } + } +} + +impl syn::parse::Parse for Literal { + fn parse(input: syn::parse::ParseStream) -> syn::Result<Self> { + let la = input.lookahead1(); + if la.peek(LitStr) { + Ok(Literal::Utf8(input.parse()?)) + } else if la.peek(LitByteStr) { + Ok(Literal::Bytes(input.parse()?)) + } else { + Err(la.error()) + } + } +} + +pub fn bytes_to_regex_string(bytes: Vec<u8>) -> String { + if bytes.is_ascii() { + unsafe { + // Unicode values are prohibited, so we can't use + // safe version of String::from_utf8 + // + // We can, however, construct a safe ASCII string + return String::from_utf8_unchecked(bytes); + } + } + + let mut string = String::with_capacity(bytes.len() * 2); + + for byte in bytes { + if byte < 0x80 { + string.push(byte as char); + } else { + static DIGITS: [u8; 16] = *b"0123456789abcdef"; + + string.push_str(r"\x"); + string.push(DIGITS[(byte / 16) as usize] as char); + string.push(DIGITS[(byte % 16) as usize] as char); + } + } + + string +} diff --git a/vendor/logos-codegen/src/parser/ignore_flags.rs b/vendor/logos-codegen/src/parser/ignore_flags.rs new file mode 100644 index 00000000..3a79d31b --- /dev/null +++ b/vendor/logos-codegen/src/parser/ignore_flags.rs @@ -0,0 +1,499 @@ +use std::ops::{BitAnd, BitOr}; + +use proc_macro2::{Ident, TokenStream, TokenTree}; + +use crate::parser::Parser; +use crate::util::is_punct; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct IgnoreFlags { + bits: u8, +} + +#[allow(non_upper_case_globals)] +impl IgnoreFlags { + pub const Empty: Self = Self::new(0x00); + pub const IgnoreCase: Self = Self::new(0x01); + pub const IgnoreAsciiCase: Self = Self::new(0x02); + 
    /// Creates a new set of flags from its raw bit representation.
    #[inline]
    pub const fn new(bits: u8) -> Self {
        Self { bits }
    }

    /// Enables a variant.
    #[inline]
    pub fn enable(&mut self, variant: Self) {
        self.bits |= variant.bits;
    }

    /// Checks if this `IgnoreFlags` contains *any* of the given variants.
    #[inline]
    pub fn contains(&self, variants: Self) -> bool {
        self.bits & variants.bits != 0
    }

    /// Returns `true` if no flags are set.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.bits == 0
    }

    /// Parses an identifier and enables the matching flag on `self`.
    ///
    /// Valid inputs (which produce `true`):
    /// * `"case"` (incompatible with `"ascii_case"`)
    /// * `"ascii_case"` (incompatible with `"case"`)
    ///
    /// On failure this function returns `false` and emits an error to
    /// the given `Parser`.
    fn parse_ident(&mut self, ident: Ident, parser: &mut Parser) -> bool {
        match ident.to_string().as_str() {
            "case" => {
                if self.contains(Self::IgnoreAsciiCase) {
                    parser.err(
                        "\
                        The flag \"case\" cannot be used along with \"ascii_case\"\
                        ",
                        ident.span(),
                    );
                    false
                } else {
                    self.enable(Self::IgnoreCase);
                    true
                }
            }
            "ascii_case" => {
                if self.contains(Self::IgnoreCase) {
                    parser.err(
                        "\
                        The flag \"ascii_case\" cannot be used along with \"case\"\
                        ",
                        ident.span(),
                    );
                    false
                } else {
                    self.enable(Self::IgnoreAsciiCase);
                    true
                }
            }
            unknown => {
                parser.err(
                    format!(
                        "\
                        Unknown flag: {}\n\
                        \n\
                        Expected one of: case, ascii_case\
                        ",
                        unknown
                    ),
                    ident.span(),
                );
                false
            }
        }
    }

    pub fn parse_group(&mut self, name: Ident, tokens: TokenStream, parser: &mut Parser) {
        // Little finite state machine to parse "<flag>(,<flag>)*,?"
+ + // FSM description for future maintenance + // 0: Initial state + // <flag> -> 1 + // _ -> error + // 1: A flag was found + // , -> 2 + // None -> done + // _ -> error + // 2: A comma was found (after a <flag>) + // <flag> -> 1 + // None -> done + // _ -> error + let mut state = 0u8; + + let mut tokens = tokens.into_iter(); + + loop { + state = match state { + 0 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + _ => { + parser.err( + "\ + Invalid ignore flag\n\ + \n\ + Expected one of: case, ascii_case\ + ", + name.span(), + ); + return; + } + }, + 1 => match tokens.next() { + Some(tt) if is_punct(&tt, ',') => 2, + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + 2 => match tokens.next() { + Some(TokenTree::Ident(ident)) => { + if self.parse_ident(ident, parser) { + 1 + } else { + return; + } + } + None => return, + Some(unexpected_tt) => { + parser.err( + format!( + "\ + Unexpected token: {:?}\ + ", + unexpected_tt.to_string(), + ), + unexpected_tt.span(), + ); + return; + } + }, + _ => unreachable!("Internal Error: invalid state ({})", state), + } + } + } +} + +impl BitOr for IgnoreFlags { + type Output = Self; + + fn bitor(self, other: Self) -> Self { + Self::new(self.bits | other.bits) + } +} + +impl BitAnd for IgnoreFlags { + type Output = Self; + + fn bitand(self, other: Self) -> Self { + Self::new(self.bits & other.bits) + } +} + +pub mod ascii_case { + use regex_syntax::hir; + + use crate::mir::Mir; + use crate::parser::Literal; + + macro_rules! 
    literal {
        ($byte:expr) => {
            hir::Literal(Box::new([$byte]))
        };
        (@char $c:expr) => {
            hir::Literal(
                $c.encode_utf8(&mut [0; 4])
                    .as_bytes()
                    .to_vec()
                    .into_boxed_slice(),
            )
        };
    }

    pub trait MakeAsciiCaseInsensitive {
        /// Creates an equivalent regular expression which ignores the letter
        /// casing of ASCII characters.
        fn make_ascii_case_insensitive(self) -> Mir;
    }

    impl MakeAsciiCaseInsensitive for u8 {
        // ASCII upper- and lowercase letters differ by 32 (0x20), hence the
        // `- 32` / `+ 32` below; non-letter bytes are passed through unchanged.
        fn make_ascii_case_insensitive(self) -> Mir {
            if self.is_ascii_lowercase() {
                Mir::Alternation(vec![
                    Mir::Literal(literal!(self - 32)),
                    Mir::Literal(literal!(self)),
                ])
            } else if self.is_ascii_uppercase() {
                Mir::Alternation(vec![
                    Mir::Literal(literal!(self)),
                    Mir::Literal(literal!(self + 32)),
                ])
            } else {
                Mir::Literal(literal!(self))
            }
        }
    }

    impl MakeAsciiCaseInsensitive for char {
        // Only ASCII characters get the case-insensitive treatment; any other
        // char is emitted as a plain UTF-8 literal.
        fn make_ascii_case_insensitive(self) -> Mir {
            if self.is_ascii() {
                (self as u8).make_ascii_case_insensitive()
            } else {
                Mir::Literal(literal!(@char self))
            }
        }
    }

    impl MakeAsciiCaseInsensitive for hir::Literal {
        // A literal is handled byte-by-byte, concatenating the per-byte
        // case-insensitive alternations.
        fn make_ascii_case_insensitive(self) -> Mir {
            Mir::Concat(
                self.0
                    .iter()
                    .map(|x| x.make_ascii_case_insensitive())
                    .collect(),
            )
        }
    }

    impl MakeAsciiCaseInsensitive for hir::ClassBytes {
        // Byte classes can rely on regex-syntax's own simple case folding.
        fn make_ascii_case_insensitive(mut self) -> Mir {
            self.case_fold_simple();
            Mir::Class(hir::Class::Bytes(self))
        }
    }

    impl MakeAsciiCaseInsensitive for hir::ClassUnicode {
        fn make_ascii_case_insensitive(mut self) -> Mir {
            use std::cmp;

            // Manual implementation so that case folding is only performed on
            // ASCII characters.
+ + let mut ranges = Vec::new(); + + for range in self.ranges() { + #[inline] + fn overlaps(st1: u8, end1: u8, st2: u8, end2: u8) -> bool { + (st2 <= st1 && st1 <= end2) || (st1 <= st2 && st2 <= end1) + } + + #[inline] + fn make_ascii(c: char) -> Option<u8> { + if c.is_ascii() { + Some(c as u8) + } else { + None + } + } + + match (make_ascii(range.start()), make_ascii(range.end())) { + (Some(start), Some(end)) => { + if overlaps(b'a', b'z', start, end) { + let lower = cmp::max(start, b'a'); + let upper = cmp::min(end, b'z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower - 32) as char, + (upper - 32) as char, + )) + } + + if overlaps(b'A', b'Z', start, end) { + let lower = cmp::max(start, b'A'); + let upper = cmp::min(end, b'Z'); + ranges.push(hir::ClassUnicodeRange::new( + (lower + 32) as char, + (upper + 32) as char, + )) + } + } + (Some(start), None) => { + if overlaps(b'a', b'z', start, b'z') { + let lower = cmp::max(start, b'a'); + ranges.push(hir::ClassUnicodeRange::new((lower - 32) as char, 'Z')) + } + + if overlaps(b'A', b'Z', start, b'Z') { + let lower = cmp::max(start, b'A'); + ranges.push(hir::ClassUnicodeRange::new((lower + 32) as char, 'Z')) + } + } + _ => (), + } + } + + self.union(&hir::ClassUnicode::new(ranges)); + + Mir::Class(hir::Class::Unicode(self)) + } + } + + impl MakeAsciiCaseInsensitive for hir::Class { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + hir::Class::Bytes(b) => b.make_ascii_case_insensitive(), + hir::Class::Unicode(u) => u.make_ascii_case_insensitive(), + } + } + } + + impl MakeAsciiCaseInsensitive for &Literal { + fn make_ascii_case_insensitive(self) -> Mir { + match self { + Literal::Bytes(bytes) => Mir::Concat( + bytes + .value() + .into_iter() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + Literal::Utf8(s) => Mir::Concat( + s.value() + .chars() + .map(|b| b.make_ascii_case_insensitive()) + .collect(), + ), + } + } + } + + impl MakeAsciiCaseInsensitive for Mir { + fn 
make_ascii_case_insensitive(self) -> Mir { + match self { + Mir::Empty => Mir::Empty, + Mir::Loop(l) => Mir::Loop(Box::new(l.make_ascii_case_insensitive())), + Mir::Maybe(m) => Mir::Maybe(Box::new(m.make_ascii_case_insensitive())), + Mir::Concat(c) => Mir::Concat( + c.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Alternation(a) => Mir::Alternation( + a.into_iter() + .map(|m| m.make_ascii_case_insensitive()) + .collect(), + ), + Mir::Class(c) => c.make_ascii_case_insensitive(), + Mir::Literal(l) => l.make_ascii_case_insensitive(), + } + } + } + + #[cfg(test)] + mod tests { + use super::MakeAsciiCaseInsensitive; + use crate::mir::{Class, Mir}; + use regex_syntax::hir::{ClassUnicode, ClassUnicodeRange}; + + fn assert_range(in_s: char, in_e: char, expected: &[(char, char)]) { + let range = ClassUnicodeRange::new(in_s, in_e); + let class = ClassUnicode::new(vec![range]); + + let expected = + ClassUnicode::new(expected.iter().map(|&(a, b)| ClassUnicodeRange::new(a, b))); + + if let Mir::Class(Class::Unicode(result)) = class.make_ascii_case_insensitive() { + assert_eq!(result, expected); + } else { + panic!("Not a unicode class"); + }; + } + + #[test] + fn no_letters_left() { + assert_range(' ', '+', &[(' ', '+')]); + } + + #[test] + fn no_letters_right() { + assert_range('{', '~', &[('{', '~')]); + } + + #[test] + fn no_letters_middle() { + assert_range('[', '`', &[('[', '`')]); + } + + #[test] + fn lowercase_left_edge() { + assert_range('a', 'd', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn lowercase_right_edge() { + assert_range('r', 'z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn lowercase_total() { + assert_range('a', 'z', &[('a', 'z'), ('A', 'Z')]); + } + + #[test] + fn uppercase_left_edge() { + assert_range('A', 'D', &[('a', 'd'), ('A', 'D')]); + } + + #[test] + fn uppercase_right_edge() { + assert_range('R', 'Z', &[('r', 'z'), ('R', 'Z')]); + } + + #[test] + fn uppercase_total() { + assert_range('A', 'Z', &[('a', 'z'), 
('A', 'Z')]); + } + + #[test] + fn lowercase_cross_left() { + assert_range('[', 'h', &[('[', 'h'), ('A', 'H')]); + } + + #[test] + fn lowercase_cross_right() { + assert_range('d', '}', &[('d', '}'), ('D', 'Z')]); + } + + #[test] + fn uppercase_cross_left() { + assert_range(';', 'H', &[(';', 'H'), ('a', 'h')]); + } + + #[test] + fn uppercase_cross_right() { + assert_range('T', ']', &[('t', 'z'), ('T', ']')]); + } + + #[test] + fn cross_both() { + assert_range('X', 'c', &[('X', 'c'), ('x', 'z'), ('A', 'C')]); + } + + #[test] + fn all_letters() { + assert_range('+', '|', &[('+', '|')]); + } + + #[test] + fn oob_all_letters() { + assert_range('#', 'é', &[('#', 'é')]); + } + + #[test] + fn oob_from_uppercase() { + assert_range('Q', 'é', &[('A', 'é')]); + } + + #[test] + fn oob_from_lowercase() { + assert_range('q', 'é', &[('q', 'é'), ('Q', 'Z')]); + } + + #[test] + fn oob_no_letters() { + assert_range('|', 'é', &[('|', 'é')]); + } + } +} diff --git a/vendor/logos-codegen/src/parser/mod.rs b/vendor/logos-codegen/src/parser/mod.rs new file mode 100644 index 00000000..3ad7202e --- /dev/null +++ b/vendor/logos-codegen/src/parser/mod.rs @@ -0,0 +1,331 @@ +use beef::lean::Cow; +use proc_macro2::{Span, TokenStream, TokenTree}; +use quote::quote; +use syn::spanned::Spanned; +use syn::{Attribute, GenericParam, Lit, Meta, Type}; + +use crate::error::Errors; +use crate::leaf::{Callback, InlineCallback}; +use crate::util::{expect_punct, MaybeVoid}; +use crate::LOGOS_ATTR; + +mod definition; +mod ignore_flags; +mod nested; +mod subpattern; +mod type_params; + +pub use self::definition::{Definition, Literal}; +pub use self::ignore_flags::IgnoreFlags; +use self::nested::{AttributeParser, Nested, NestedValue}; +pub use self::subpattern::Subpatterns; +use self::type_params::{replace_lifetime, traverse_type, TypeParams}; + +#[derive(Default)] +pub struct Parser { + pub errors: Errors, + pub mode: Mode, + pub source: Option<TokenStream>, + pub skips: Vec<Literal>, + pub extras: MaybeVoid, 
+ pub error_type: MaybeVoid, + pub subpatterns: Subpatterns, + pub logos_path: Option<TokenStream>, + types: TypeParams, +} + +#[derive(Default)] +pub enum Mode { + #[default] + Utf8, + Binary, +} + +impl Parser { + pub fn parse_generic(&mut self, param: GenericParam) { + match param { + GenericParam::Lifetime(lt) => { + self.types.explicit_lifetime(lt, &mut self.errors); + } + GenericParam::Type(ty) => { + self.types.add(ty.ident); + } + GenericParam::Const(c) => { + self.err("Logos doesn't support const generics.", c.span()); + } + } + } + + pub fn generics(&mut self) -> Option<TokenStream> { + self.types.generics(&mut self.errors) + } + + fn parse_attr(&mut self, attr: &mut Attribute) -> Option<AttributeParser> { + match &mut attr.meta { + Meta::List(list) => { + let tokens = std::mem::replace(&mut list.tokens, TokenStream::new()); + + Some(AttributeParser::new(tokens)) + } + _ => None, + } + } + + /// Try to parse the main `#[logos(...)]`, does nothing if + /// the attribute's name isn't `logos`. 
+ pub fn try_parse_logos(&mut self, attr: &mut Attribute) { + if !attr.path().is_ident(LOGOS_ATTR) { + return; + } + + let nested = match self.parse_attr(attr) { + Some(tokens) => tokens, + None => { + self.err("Expected #[logos(...)]", attr.span()); + return; + } + }; + + for nested in nested { + let (name, value) = match nested { + Nested::Named(name, value) => (name, value), + Nested::Unexpected(tokens) | Nested::Unnamed(tokens) => { + self.err("Invalid nested attribute", tokens.span()); + continue; + } + }; + + // IMPORTANT: Keep these sorted alphabetically for binary search down the line + #[allow(clippy::type_complexity)] + static NESTED_LOOKUP: &[(&str, fn(&mut Parser, Span, NestedValue))] = &[ + ("crate", |parser, span, value| match value { + NestedValue::Assign(logos_path) => parser.logos_path = Some(logos_path), + _ => { + parser.err("Expected: #[logos(crate = path::to::logos)]", span); + } + }), + ("error", |parser, span, value| match value { + NestedValue::Assign(value) => { + let span = value.span(); + + if let MaybeVoid::Some(previous) = parser.error_type.replace(value) { + parser + .err("Error type can be defined only once", span) + .err("Previous definition here", previous.span()); + } + } + _ => { + parser.err("Expected: #[logos(error = SomeType)]", span); + } + }), + ("extras", |parser, span, value| match value { + NestedValue::Assign(value) => { + let span = value.span(); + + if let MaybeVoid::Some(previous) = parser.extras.replace(value) { + parser + .err("Extras can be defined only once", span) + .err("Previous definition here", previous.span()); + } + } + _ => { + parser.err("Expected: #[logos(extras = SomeType)]", span); + } + }), + ("skip", |parser, span, value| match value { + NestedValue::Literal(lit) => { + if let Some(literal) = parser.parse_literal(Lit::new(lit)) { + parser.skips.push(literal); + } + } + _ => { + parser.err("Expected: #[logos(skip \"regex literal\")]", span); + } + }), + ("source", |parser, span, value| match value { + 
NestedValue::Assign(value) => { + let span = value.span(); + if let Some(previous) = parser.source.replace(value) { + parser + .err("Source can be defined only once", span) + .err("Previous definition here", previous.span()); + } + } + _ => { + parser.err("Expected: #[logos(source = SomeType)]", span); + } + }), + ("subpattern", |parser, span, value| match value { + NestedValue::KeywordAssign(name, value) => { + parser.subpatterns.add(name, value, &mut parser.errors); + } + _ => { + parser.err(r#"Expected: #[logos(subpattern name = r"regex")]"#, span); + } + }), + ("type", |parser, span, value| match value { + NestedValue::KeywordAssign(generic, ty) => { + parser.types.set(generic, ty, &mut parser.errors); + } + _ => { + parser.err("Expected: #[logos(type T = SomeType)]", span); + } + }), + ]; + + match NESTED_LOOKUP.binary_search_by_key(&name.to_string().as_str(), |(n, _)| n) { + Ok(idx) => NESTED_LOOKUP[idx].1(self, name.span(), value), + Err(_) => { + let mut err = format!( + "Unknown nested attribute #[logos({name})], expected one of: {}", + NESTED_LOOKUP[0].0 + ); + + for (allowed, _) in &NESTED_LOOKUP[1..] { + err.push_str(", "); + err.push_str(allowed); + } + + self.err(err, name.span()); + } + } + } + } + + pub fn parse_literal(&mut self, lit: Lit) -> Option<Literal> { + match lit { + Lit::Str(string) => Some(Literal::Utf8(string)), + Lit::ByteStr(bytes) => { + self.mode = Mode::Binary; + + Some(Literal::Bytes(bytes)) + } + _ => { + self.err("Expected a &str or &[u8] slice", lit.span()); + + None + } + } + } + + /// Parse attribute definition of a token: + /// + /// + `#[token(literal[, callback])]` + /// + `#[regex(literal[, callback])]` + pub fn parse_definition(&mut self, attr: &mut Attribute) -> Option<Definition> { + let mut nested = self.parse_attr(attr)?; + + let literal = match nested.parsed::<Lit>()? 
{ + Ok(lit) => self.parse_literal(lit)?, + Err(err) => { + self.err(err.to_string(), err.span()); + + return None; + } + }; + + let mut def = Definition::new(literal); + + for (position, next) in nested.enumerate() { + match next { + Nested::Unexpected(tokens) => { + self.err("Unexpected token in attribute", tokens.span()); + } + Nested::Unnamed(tokens) => match position { + 0 => def.callback = self.parse_callback(tokens), + _ => { + self.err( + "\ + Expected a named argument at this position\n\ + \n\ + hint: If you are trying to define a callback here use: callback = ...\ + ", + tokens.span(), + ); + } + }, + Nested::Named(name, value) => { + def.named_attr(name, value, self); + } + } + } + + Some(def) + } + + fn parse_callback(&mut self, tokens: TokenStream) -> Option<Callback> { + let span = tokens.span(); + let mut tokens = tokens.into_iter(); + + if let Some(tt) = expect_punct(tokens.next(), '|') { + let mut label = TokenStream::from(tt); + + label.extend(tokens); + + return Some(Callback::Label(label)); + } + + let first = tokens.next(); + let error = expect_punct(tokens.next(), '|'); + + let arg = match (error, first) { + (None, Some(TokenTree::Ident(arg))) => arg, + _ => { + self.err( + "Inline callbacks must use closure syntax with exactly one parameter", + span, + ); + return None; + } + }; + + let body = match tokens.next() { + Some(TokenTree::Group(group)) => group.stream(), + Some(first) => { + let mut body = TokenStream::from(first); + + body.extend(tokens); + body + } + None => { + self.err("Callback missing a body", span); + return None; + } + }; + + let inline = InlineCallback { arg, body, span }; + + Some(inline.into()) + } + + /// Checks if `ty` is a declared generic param, if so replaces it + /// with a concrete type defined using #[logos(type T = Type)] + /// + /// If no matching generic param is found, all lifetimes are fixed + /// to the source lifetime + pub fn get_type(&self, ty: &mut Type) -> TokenStream { + traverse_type(ty, &mut |ty| { + 
if let Type::Path(tp) = ty { + // Skip types that begin with `self::` + if tp.qself.is_none() { + // If `ty` is a generic type parameter, try to find + // its concrete type defined with #[logos(type T = Type)] + if let Some(substitute) = self.types.find(&tp.path) { + *ty = substitute; + } + } + } + // If `ty` is a concrete type, fix its lifetimes to 'source + replace_lifetime(ty); + }); + + quote!(#ty) + } + + pub fn err<M>(&mut self, message: M, span: Span) -> &mut Errors + where + M: Into<Cow<'static, str>>, + { + self.errors.err(message, span) + } +} diff --git a/vendor/logos-codegen/src/parser/nested.rs b/vendor/logos-codegen/src/parser/nested.rs new file mode 100644 index 00000000..44ecaeac --- /dev/null +++ b/vendor/logos-codegen/src/parser/nested.rs @@ -0,0 +1,146 @@ +use proc_macro2::token_stream::IntoIter as TokenIter; +use proc_macro2::{Ident, Literal, TokenStream, TokenTree}; +use quote::quote; + +use crate::util::{expect_punct, is_punct}; + +pub enum NestedValue { + /// `name = ...` + Assign(TokenStream), + /// `name "literal"` + Literal(Literal), + /// `name(...)` + Group(TokenStream), + /// `name ident = ...` + KeywordAssign(Ident, TokenStream), +} + +pub enum Nested { + /// Unnamed nested attribute, such as a string, + /// callback closure, or a lone ident/path + /// + /// Note: a lone ident will be Named with no value instead + Unnamed(TokenStream), + /// Named: name ... 
+ Named(Ident, NestedValue), + /// Unexpected token, + Unexpected(TokenStream), +} + +pub struct AttributeParser { + inner: TokenIter, +} + +pub struct Empty; + +impl From<Empty> for TokenStream { + fn from(_: Empty) -> TokenStream { + TokenStream::new() + } +} + +impl AttributeParser { + pub fn new(stream: TokenStream) -> Self { + AttributeParser { + inner: stream.into_iter(), + } + } + + pub fn parsed<T>(&mut self) -> Option<syn::Result<T>> + where + T: syn::parse::Parse, + { + let tokens = self.collect_tail(TokenStream::new()); + + if tokens.is_empty() { + return None; + } + + Some(syn::parse2(tokens)) + } + + fn next_tt(&mut self) -> Option<TokenTree> { + expect_punct(self.inner.next(), ',') + } + + fn collect_tail<T>(&mut self, first: T) -> TokenStream + where + T: Into<TokenStream>, + { + let mut out = first.into(); + + while let Some(tt) = self.next_tt() { + out.extend(Some(tt)); + } + + out + } + + fn parse_unnamed(&mut self, first: Ident, next: TokenTree) -> Nested { + let mut out = TokenStream::from(TokenTree::Ident(first)); + + out.extend(self.collect_tail(next)); + + Nested::Unnamed(out.into_iter().collect()) + } + + fn parse_assign(&mut self, name: Ident) -> Nested { + let value = self.collect_tail(Empty); + + Nested::Named(name, NestedValue::Assign(value)) + } + + fn parse_literal(&mut self, name: Ident, lit: Literal) -> Nested { + // TODO: Error if there are any tokens following + let _ = self.collect_tail(Empty); + + Nested::Named(name, NestedValue::Literal(lit)) + } + + fn parse_group(&mut self, name: Ident, group: TokenStream) -> Nested { + Nested::Named(name, NestedValue::Group(group)) + } + + fn parse_keyword(&mut self, keyword: Ident, name: Ident) -> Nested { + let error = expect_punct(self.next_tt(), '='); + + match error { + Some(error) => { + let error = self.collect_tail(error); + + Nested::Unexpected(error) + } + None => { + let value = self.collect_tail(Empty); + + Nested::Named(keyword, NestedValue::KeywordAssign(name, value)) + } + } + 
} +} + +impl Iterator for AttributeParser { + type Item = Nested; + + fn next(&mut self) -> Option<Nested> { + let first = self.inner.next()?; + + let name = match first { + TokenTree::Ident(ident) => ident, + tt => { + let stream = self.collect_tail(tt); + + return Some(Nested::Unnamed(stream.into_iter().collect())); + } + }; + + match self.next_tt() { + Some(tt) if is_punct(&tt, '=') => Some(self.parse_assign(name)), + Some(TokenTree::Literal(lit)) => Some(self.parse_literal(name, lit)), + Some(TokenTree::Group(group)) => Some(self.parse_group(name, group.stream())), + Some(TokenTree::Ident(next)) => Some(self.parse_keyword(name, next)), + Some(next) => Some(self.parse_unnamed(name, next)), + None => Some(Nested::Unnamed(quote!(#name))), + } + } +} diff --git a/vendor/logos-codegen/src/parser/subpattern.rs b/vendor/logos-codegen/src/parser/subpattern.rs new file mode 100644 index 00000000..eb620028 --- /dev/null +++ b/vendor/logos-codegen/src/parser/subpattern.rs @@ -0,0 +1,97 @@ +use proc_macro2::TokenStream; +use syn::Ident; + +use crate::error::Errors; +use crate::mir::Mir; +use crate::parser::definition::{bytes_to_regex_string, Literal}; + +#[derive(Default)] +pub struct Subpatterns { + map: Vec<(Ident, String)>, +} + +impl Subpatterns { + pub fn add(&mut self, param: Ident, pattern: TokenStream, errors: &mut Errors) { + let lit = match syn::parse2::<Literal>(pattern) { + Ok(lit) => lit, + Err(e) => { + errors.err(e.to_string(), e.span()); + return; + } + }; + + if let Some((name, _)) = self.map.iter().find(|(name, _)| *name == param) { + errors + .err(format!("{} can only be assigned once", param), param.span()) + .err("Previously assigned here", name.span()); + return; + } + + let fixed = self.fix(&lit, errors); + + // Validate the literal as proper regex. If it's not, emit an error. 
+ let mir = match &lit { + Literal::Utf8(_) => Mir::utf8(&fixed), + Literal::Bytes(_) => Mir::binary(&fixed), + }; + + if let Err(err) = mir { + errors.err(err, lit.span()); + }; + + self.map.push((param, fixed)); + } + + pub fn fix(&self, lit: &Literal, errors: &mut Errors) -> String { + let mut i = 0; + let mut pattern = match lit { + Literal::Utf8(s) => s.value(), + Literal::Bytes(b) => bytes_to_regex_string(b.value()), + }; + + while let Some(f) = pattern[i..].find("(?&") { + i += f; + pattern.replace_range(i..i + 3, "(?:"); + i += 3; + + let subref_end = if let Some(f) = pattern[i..].find(')') { + i + f + } else { + pattern.truncate(i); // truncate so latter error doesn't suppress + break; // regex-syntax will report the unclosed group + }; + + let name = &pattern[i..subref_end]; + let name = match syn::parse_str::<Ident>(name) { + Ok(name) => name, + Err(_) => { + errors.err( + format!("subpattern reference `{}` is not an identifier", name), + lit.span(), + ); + // we emitted the error; make something up and continue + pattern.replace_range(i..subref_end, "_"); + i += 2; + continue; + } + }; + + match self.map.iter().find(|(def, _)| *def == name) { + Some((_, subpattern)) => { + pattern.replace_range(i..subref_end, subpattern); + i += subpattern.len() + 1; + } + None => { + errors.err( + format!("subpattern reference `{}` has not been defined", name), + lit.span(), + ); + // leaving `(?:name)` is fine + i = subref_end + 1; + } + } + } + + pattern + } +} diff --git a/vendor/logos-codegen/src/parser/type_params.rs b/vendor/logos-codegen/src/parser/type_params.rs new file mode 100644 index 00000000..1be4948e --- /dev/null +++ b/vendor/logos-codegen/src/parser/type_params.rs @@ -0,0 +1,200 @@ +use proc_macro2::{Ident, Span, TokenStream}; +use quote::quote; +use syn::spanned::Spanned; +use syn::{Lifetime, LifetimeParam, Path, Type}; + +use crate::error::Errors; + +#[derive(Default)] +pub struct TypeParams { + lifetime: bool, + type_params: Vec<(Ident, 
Option<Type>)>, +} + +impl TypeParams { + pub fn explicit_lifetime(&mut self, lt: LifetimeParam, errors: &mut Errors) { + if self.lifetime { + let span = lt.span(); + + errors.err("Logos types can only have one lifetime can be set", span); + } + + self.lifetime = true; + } + + pub fn add(&mut self, param: Ident) { + self.type_params.push((param, None)); + } + + pub fn set(&mut self, param: Ident, ty: TokenStream, errors: &mut Errors) { + let ty = match syn::parse2::<Type>(ty) { + Ok(mut ty) => { + replace_lifetimes(&mut ty); + ty + } + Err(err) => { + errors.err(err.to_string(), err.span()); + return; + } + }; + + match self.type_params.iter_mut().find(|(name, _)| *name == param) { + Some((_, slot)) => { + if let Some(previous) = slot.replace(ty) { + errors + .err( + format!("{} can only have one type assigned to it", param), + param.span(), + ) + .err("Previously assigned here", previous.span()); + } + } + None => { + errors.err( + format!("{} is not a declared type parameter", param), + param.span(), + ); + } + } + } + + pub fn find(&self, path: &Path) -> Option<Type> { + for (ident, ty) in &self.type_params { + if path.is_ident(ident) { + return ty.clone(); + } + } + + None + } + + pub fn generics(&self, errors: &mut Errors) -> Option<TokenStream> { + if !self.lifetime && self.type_params.is_empty() { + return None; + } + + let mut generics = Vec::new(); + + if self.lifetime { + generics.push(quote!('s)); + } + + for (ty, replace) in self.type_params.iter() { + match replace { + Some(ty) => generics.push(quote!(#ty)), + None => { + errors.err( + format!( + "Generic type parameter without a concrete type\n\ + \n\ + Define a concrete type Logos can use: #[logos(type {} = Type)]", + ty, + ), + ty.span(), + ); + } + } + } + + if generics.is_empty() { + None + } else { + Some(quote!(<#(#generics),*>)) + } + } +} + +pub fn replace_lifetimes(ty: &mut Type) { + traverse_type(ty, &mut replace_lifetime) +} + +pub fn replace_lifetime(ty: &mut Type) { + use 
syn::{GenericArgument, PathArguments}; + + match ty { + Type::Path(p) => { + p.path + .segments + .iter_mut() + .filter_map(|segment| match &mut segment.arguments { + PathArguments::AngleBracketed(ab) => Some(ab), + _ => None, + }) + .flat_map(|ab| ab.args.iter_mut()) + .for_each(|arg| { + if let GenericArgument::Lifetime(lt) = arg { + *lt = Lifetime::new("'s", lt.span()); + } + }); + } + Type::Reference(r) => { + let span = match r.lifetime.take() { + Some(lt) => lt.span(), + None => Span::call_site(), + }; + + r.lifetime = Some(Lifetime::new("'s", span)); + } + _ => (), + } +} + +pub fn traverse_type(ty: &mut Type, f: &mut impl FnMut(&mut Type)) { + f(ty); + match ty { + Type::Array(array) => traverse_type(&mut array.elem, f), + Type::BareFn(bare_fn) => { + for input in &mut bare_fn.inputs { + traverse_type(&mut input.ty, f); + } + if let syn::ReturnType::Type(_, ty) = &mut bare_fn.output { + traverse_type(ty, f); + } + } + Type::Group(group) => traverse_type(&mut group.elem, f), + Type::Paren(paren) => traverse_type(&mut paren.elem, f), + Type::Path(path) => traverse_path(&mut path.path, f), + Type::Ptr(p) => traverse_type(&mut p.elem, f), + Type::Reference(r) => traverse_type(&mut r.elem, f), + Type::Slice(slice) => traverse_type(&mut slice.elem, f), + Type::TraitObject(object) => object.bounds.iter_mut().for_each(|bound| { + if let syn::TypeParamBound::Trait(trait_bound) = bound { + traverse_path(&mut trait_bound.path, f); + } + }), + Type::Tuple(tuple) => tuple + .elems + .iter_mut() + .for_each(|elem| traverse_type(elem, f)), + _ => (), + } +} + +fn traverse_path(path: &mut Path, f: &mut impl FnMut(&mut Type)) { + for segment in &mut path.segments { + match &mut segment.arguments { + syn::PathArguments::None => (), + syn::PathArguments::AngleBracketed(args) => { + for arg in &mut args.args { + match arg { + syn::GenericArgument::Type(ty) => { + traverse_type(ty, f); + } + syn::GenericArgument::AssocType(assoc) => { + traverse_type(&mut assoc.ty, f); + } + _ 
=> (), + } + } + } + syn::PathArguments::Parenthesized(args) => { + for arg in &mut args.inputs { + traverse_type(arg, f); + } + if let syn::ReturnType::Type(_, ty) = &mut args.output { + traverse_type(ty, f); + } + } + } + } +} diff --git a/vendor/logos-codegen/src/util.rs b/vendor/logos-codegen/src/util.rs new file mode 100644 index 00000000..156de035 --- /dev/null +++ b/vendor/logos-codegen/src/util.rs @@ -0,0 +1,64 @@ +use proc_macro2::{Spacing, Span, TokenStream, TokenTree}; +use quote::{quote, ToTokens}; +use syn::Ident; + +/// Analog to Option<TokenStream>, except when put into the quote! +/// macro, `MaybeVoid::Void` will produce `()` +#[derive(Clone, Default)] +pub enum MaybeVoid { + Some(TokenStream), + #[default] + Void, +} + +impl MaybeVoid { + pub fn replace(&mut self, stream: TokenStream) -> MaybeVoid { + std::mem::replace(self, MaybeVoid::Some(stream)) + } + + pub fn take(&mut self) -> MaybeVoid { + std::mem::replace(self, MaybeVoid::Void) + } +} + +impl ToTokens for MaybeVoid { + fn to_tokens(&self, out: &mut TokenStream) { + match self { + MaybeVoid::Some(stream) => out.extend(stream.clone()), + MaybeVoid::Void => out.extend(quote!(())), + } + } + + fn to_token_stream(&self) -> TokenStream { + match self { + MaybeVoid::Some(stream) => stream.clone(), + MaybeVoid::Void => quote!(()), + } + } + + fn into_token_stream(self) -> TokenStream { + match self { + MaybeVoid::Some(stream) => stream, + MaybeVoid::Void => quote!(()), + } + } +} + +pub fn is_punct(tt: &TokenTree, expect: char) -> bool { + matches!(tt, TokenTree::Punct(punct) if punct.as_char() == expect && punct.spacing() == Spacing::Alone) +} + +/// If supplied `tt` is a punct matching a char, returns `None`, else returns `tt` +pub fn expect_punct(tt: Option<TokenTree>, expect: char) -> Option<TokenTree> { + tt.filter(|tt| !is_punct(tt, expect)) +} + +pub trait ToIdent { + fn to_ident(&self) -> Ident; +} + +impl ToIdent for str { + fn to_ident(&self) -> Ident { + Ident::new(self, 
Span::call_site()) + } +} |
