diff options
Diffstat (limited to 'vendor/logos-codegen/src/generator')
| -rw-r--r-- | vendor/logos-codegen/src/generator/context.rs | 128 | ||||
| -rw-r--r-- | vendor/logos-codegen/src/generator/fork.rs | 216 | ||||
| -rw-r--r-- | vendor/logos-codegen/src/generator/leaf.rs | 67 | ||||
| -rw-r--r-- | vendor/logos-codegen/src/generator/mod.rs | 268 | ||||
| -rw-r--r-- | vendor/logos-codegen/src/generator/rope.rs | 39 | ||||
| -rw-r--r-- | vendor/logos-codegen/src/generator/tables.rs | 77 |
6 files changed, 795 insertions, 0 deletions
diff --git a/vendor/logos-codegen/src/generator/context.rs b/vendor/logos-codegen/src/generator/context.rs new file mode 100644 index 00000000..dc52f594 --- /dev/null +++ b/vendor/logos-codegen/src/generator/context.rs @@ -0,0 +1,128 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::Generator; +use crate::graph::NodeId; + +/// This struct keeps track of bytes available to be read without +/// bounds checking across the tree. +/// +/// For example, a branch that matches 4 bytes followed by a fork +/// with smallest branch containing of 2 bytes can do a bounds check +/// for 6 bytes ahead, and leave the remaining 2 byte array (fixed size) +/// to be handled by the fork, avoiding bound checks there. +#[derive(Default, Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub struct Context { + /// Amount of bytes that haven't been bumped yet but should + /// before a new read is performed + at: usize, + /// Number of bytes available without bound checks + available: usize, + /// Whether or not the Lexer has been bumped at least by 1 byte + bumped: bool, + /// Node to backtrack to to in case an explicit match has failed. + /// If `None` will instead produce an error token. + backtrack: Option<NodeId>, +} + +impl Context { + pub fn can_backtrack(&self) -> bool { + self.backtrack.is_some() + } + + pub fn switch(&mut self, miss: Option<NodeId>) -> Option<TokenStream> { + self.backtrack = Some(miss?); + self.bump() + } + + pub const fn advance(self, n: usize) -> Self { + Context { + at: self.at + n, + ..self + } + } + + pub fn bump(&mut self) -> Option<TokenStream> { + match self.at { + 0 => None, + n => { + let tokens = quote!(lex.bump_unchecked(#n);); + self.at = 0; + self.available = 0; + self.bumped = true; + Some(tokens) + } + } + } + + pub fn remainder(&self) -> usize { + self.available.saturating_sub(self.at) + } + + pub fn read_byte(&mut self) -> TokenStream { + let at = self.at; + + self.advance(1); + + #[cfg(not(feature = "forbid_unsafe"))] + { + quote!(unsafe { lex.read_byte_unchecked(#at) }) + } + + #[cfg(feature = "forbid_unsafe")] + { + quote!(lex.read_byte(#at)) + } + } + + pub fn read(&mut self, len: usize) -> TokenStream { + self.available = len; + + match (self.at, len) { + (0, 0) => quote!(lex.read::<u8>()), + (a, 0) => quote!(lex.read_at::<u8>(#a)), + (0, l) => quote!(lex.read::<&[u8; #l]>()), + (a, l) => quote!(lex.read_at::<&[u8; #l]>(#a)), + } + } + + pub fn wipe(&mut self) { + self.available = 0; + } + + const fn backtrack(self) -> Self { + Context { + at: 0, + available: 0, + bumped: self.bumped, + backtrack: None, + } + } + + pub fn miss(mut self, miss: Option<NodeId>, gen: &mut Generator) -> TokenStream { + self.wipe(); + match (miss, self.backtrack) { + (Some(id), _) => gen.goto(id, self).clone(), + (_, Some(id)) => gen.goto(id, self.backtrack()).clone(), + _ if self.bumped => quote!(lex.error()), + _ => quote!(_error(lex)), + } + } + + pub fn write_suffix(&self, buf: &mut String) { + use std::fmt::Write; + + if self.at > 0 { + let _ = write!(buf, "_at{}", self.at); + } + if self.available > 0 { + let _ = write!(buf, "_with{}", self.available); + } + if let Some(id) = self.backtrack { + let _ = write!(buf, "_ctx{}", id); + } + if self.bumped { + buf.push_str("_x"); + } + } +} diff --git a/vendor/logos-codegen/src/generator/fork.rs b/vendor/logos-codegen/src/generator/fork.rs new file mode 100644 index 00000000..11d4eab2 --- /dev/null +++ b/vendor/logos-codegen/src/generator/fork.rs @@ -0,0 +1,216 @@ +use std::cmp::max; + +use fnv::FnvHashMap as Map; +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::graph::{Fork, NodeId, Range}; +use crate::util::ToIdent; + +type Targets = Map<NodeId, Vec<Range>>; + +impl Generator<'_> { + pub fn generate_fork(&mut self, this: NodeId, fork: &Fork, mut ctx: Context) -> TokenStream { + let mut targets: Targets = Map::default(); + + for (range, then) in fork.branches() { + targets.entry(then).or_default().push(range); + } + let loops_to_self = self.meta[this].loop_entry_from.contains(&this); + + match targets.len() { + 1 if loops_to_self => return self.generate_fast_loop(fork, ctx), + 0..=2 => (), + _ => return self.generate_fork_jump_table(this, fork, targets, ctx), + } + let miss = ctx.miss(fork.miss, self); + let end = self.fork_end(this, &miss); + let (byte, read) = self.fork_read(this, end, &mut ctx); + let branches = targets.into_iter().map(|(id, ranges)| { + let next = self.goto(id, ctx.advance(1)); + + match *ranges { + [range] => { + quote!(#range => #next,) + } + [a, b] if a.is_byte() && b.is_byte() => { + quote!(#a | #b => #next,) + } + _ => { + let test = self.generate_test(ranges).clone(); + let next = self.goto(id, ctx.advance(1)); + + quote!(byte if #test(byte) => #next,) + } + } + }); + + quote! { + #read + + match #byte { + #(#branches)* + _ => #miss, + } + } + } + + fn generate_fork_jump_table( + &mut self, + this: NodeId, + fork: &Fork, + targets: Targets, + mut ctx: Context, + ) -> TokenStream { + let miss = ctx.miss(fork.miss, self); + let end = self.fork_end(this, &miss); + let (byte, read) = self.fork_read(this, end, &mut ctx); + + let mut table: [u8; 256] = [0; 256]; + let mut jumps = vec!["__".to_ident()]; + + let branches = targets + .into_iter() + .enumerate() + .map(|(idx, (id, ranges))| { + let idx = (idx as u8) + 1; + let next = self.goto(id, ctx.advance(1)); + jumps.push(format!("J{}", id).to_ident()); + + for byte in ranges.into_iter().flatten() { + table[byte as usize] = idx; + } + let jump = jumps.last().unwrap(); + + quote!(Jump::#jump => #next,) + }) + .collect::<TokenStream>(); + + let may_error = table.iter().any(|&idx| idx == 0); + + let jumps = jumps.as_slice(); + let table = table.iter().copied().map(|idx| &jumps[idx as usize]); + + let jumps = if may_error { jumps } else { &jumps[1..] }; + let error_branch = if may_error { + Some(quote!(Jump::__ => #miss)) + } else { + None + }; + + quote! { + enum Jump { + #(#jumps,)* + } + + const LUT: [Jump; 256] = { + use Jump::*; + + [#(#table),*] + }; + + #read + + match LUT[#byte as usize] { + #branches + #error_branch + } + } + } + + fn fork_end(&self, this: NodeId, miss: &TokenStream) -> TokenStream { + if this == self.root { + quote!(_end(lex)) + } else { + miss.clone() + } + } + + fn fork_read( + &self, + this: NodeId, + end: TokenStream, + ctx: &mut Context, + ) -> (TokenStream, TokenStream) { + let min_read = self.meta[this].min_read; + + if ctx.remainder() >= max(min_read, 1) { + let read = ctx.read_byte(); + + return (quote!(byte), quote!(let byte = #read;)); + } + + match min_read { + 0 | 1 => { + let read = ctx.read(0); + + ( + quote!(byte), + quote! { + let byte = match #read { + Some(byte) => byte, + None => return #end, + }; + }, + ) + } + len => { + let read = ctx.read(len); + + ( + quote!(arr[0]), + quote! { + let arr = match #read { + Some(arr) => arr, + None => return #end, + }; + }, + ) + } + } + } + + fn generate_fast_loop(&mut self, fork: &Fork, ctx: Context) -> TokenStream { + let miss = ctx.miss(fork.miss, self); + let ranges = fork.branches().map(|(range, _)| range).collect::<Vec<_>>(); + let test = self.generate_test(ranges); + + quote! { + _fast_loop!(lex, #test, #miss); + } + } + + pub fn fast_loop_macro() -> TokenStream { + quote! { + macro_rules! _fast_loop { + ($lex:ident, $test:ident, $miss:expr) => { + // Do one bounds check for multiple bytes till EOF + while let Some(arr) = $lex.read::<&[u8; 16]>() { + if $test(arr[0]) { if $test(arr[1]) { if $test(arr[2]) { if $test(arr[3]) { + if $test(arr[4]) { if $test(arr[5]) { if $test(arr[6]) { if $test(arr[7]) { + if $test(arr[8]) { if $test(arr[9]) { if $test(arr[10]) { if $test(arr[11]) { + if $test(arr[12]) { if $test(arr[13]) { if $test(arr[14]) { if $test(arr[15]) { + + $lex.bump_unchecked(16); continue; } $lex.bump_unchecked(15); return $miss; } + $lex.bump_unchecked(14); return $miss; } $lex.bump_unchecked(13); return $miss; } + $lex.bump_unchecked(12); return $miss; } $lex.bump_unchecked(11); return $miss; } + $lex.bump_unchecked(10); return $miss; } $lex.bump_unchecked(9); return $miss; } + $lex.bump_unchecked(8); return $miss; } $lex.bump_unchecked(7); return $miss; } + $lex.bump_unchecked(6); return $miss; } $lex.bump_unchecked(5); return $miss; } + $lex.bump_unchecked(4); return $miss; } $lex.bump_unchecked(3); return $miss; } + $lex.bump_unchecked(2); return $miss; } $lex.bump_unchecked(1); return $miss; } + + return $miss; + } + + while $lex.test($test) { + $lex.bump_unchecked(1); + } + + $miss + }; + } + } + } +} diff --git a/vendor/logos-codegen/src/generator/leaf.rs b/vendor/logos-codegen/src/generator/leaf.rs new file mode 100644 index 00000000..0841a4ff --- /dev/null +++ b/vendor/logos-codegen/src/generator/leaf.rs @@ -0,0 +1,67 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::leaf::{Callback, Leaf}; +use crate::util::MaybeVoid; + +impl Generator<'_> { + pub fn generate_leaf(&mut self, leaf: &Leaf, mut ctx: Context) -> TokenStream { + let bump = ctx.bump(); + + let ident = &leaf.ident; + let name = self.name; + let this = self.this; + let ty = &leaf.field; + + let constructor = match leaf.field { + MaybeVoid::Some(_) => quote!(#name::#ident), + MaybeVoid::Void => quote!(|()| #name::#ident), + }; + + match &leaf.callback { + Some(Callback::Label(callback)) => quote! { + #bump + #callback(lex).construct(#constructor, lex); + }, + Some(Callback::Inline(inline)) => { + let arg = &inline.arg; + let body = &inline.body; + + #[cfg(not(rust_1_82))] + let ret = quote!(impl CallbackResult<'s, #ty, #this>); + + #[cfg(rust_1_82)] + let ret = quote!(impl CallbackResult<'s, #ty, #this> + use<'s>); + + quote! { + #bump + + #[inline] + fn callback<'s>(#arg: &mut Lexer<'s>) -> #ret { + #body + } + + callback(lex).construct(#constructor, lex); + } + } + Some(Callback::Skip(_)) => { + quote! { + #bump + + lex.trivia(); + #name::lex(lex); + } + } + None if matches!(leaf.field, MaybeVoid::Void) => quote! { + #bump + lex.set(Ok(#name::#ident)); + }, + None => quote! { + #bump + let token = #name::#ident(lex.slice()); + lex.set(Ok(token)); + }, + } + } +} diff --git a/vendor/logos-codegen/src/generator/mod.rs b/vendor/logos-codegen/src/generator/mod.rs new file mode 100644 index 00000000..1b8bf8bf --- /dev/null +++ b/vendor/logos-codegen/src/generator/mod.rs @@ -0,0 +1,268 @@ +use fnv::{FnvHashMap as Map, FnvHashSet as Set}; +use proc_macro2::TokenStream; +use quote::{quote, ToTokens, TokenStreamExt}; +use syn::Ident; + +use crate::graph::{Graph, Meta, Node, NodeId, Range}; +use crate::leaf::Leaf; +use crate::util::ToIdent; + +mod context; +mod fork; +mod leaf; +mod rope; +mod tables; + +use self::context::Context; +use self::tables::TableStack; + +pub struct Generator<'a> { + /// Name of the type we are implementing the `Logos` trait for + name: &'a Ident, + /// Name of the type with any generics it might need + this: &'a TokenStream, + /// Id to the root node + root: NodeId, + /// Reference to the graph with all of the nodes + graph: &'a Graph<Leaf<'a>>, + /// Meta data collected for the nodes + meta: Meta, + /// Buffer with functions growing during generation + rendered: TokenStream, + /// Set of functions that have already been rendered + fns: Set<(NodeId, Context)>, + /// Function name identifiers + idents: Map<(NodeId, Context), Ident>, + /// Local function calls. Note: a call might change its context, + /// so we can't use `idents` for this purpose. + gotos: Map<(NodeId, Context), TokenStream>, + /// Identifiers for helper functions matching a byte to a given + /// set of ranges + tests: Map<Vec<Range>, Ident>, + /// Related to above, table stack manages tables that need to be + tables: TableStack, +} + +impl<'a> Generator<'a> { + pub fn new( + name: &'a Ident, + this: &'a TokenStream, + root: NodeId, + graph: &'a Graph<Leaf>, + ) -> Self { + let rendered = Self::fast_loop_macro(); + let meta = Meta::analyze(root, graph); + + Generator { + name, + this, + root, + graph, + meta, + rendered, + fns: Set::default(), + idents: Map::default(), + gotos: Map::default(), + tests: Map::default(), + tables: TableStack::new(), + } + } + + pub fn generate(mut self) -> TokenStream { + let root = self.goto(self.root, Context::default()).clone(); + let rendered = &self.rendered; + let tables = &self.tables; + + quote! { + #tables + #rendered + #root + } + } + + fn generate_fn(&mut self, id: NodeId, ctx: Context) { + if self.fns.contains(&(id, ctx)) { + return; + } + self.fns.insert((id, ctx)); + + let body = match &self.graph[id] { + Node::Fork(fork) => self.generate_fork(id, fork, ctx), + Node::Rope(rope) => self.generate_rope(rope, ctx), + Node::Leaf(leaf) => self.generate_leaf(leaf, ctx), + }; + let ident = self.generate_ident(id, ctx); + let out = quote! { + #[inline] + fn #ident<'s>(lex: &mut Lexer<'s>) { + #body + } + }; + + self.rendered.append_all(out); + } + + fn goto(&mut self, id: NodeId, mut ctx: Context) -> &TokenStream { + let key = (id, ctx); + + // Allow contains_key + insert because self.generate_ident borrows a mutable ref to self + // too. + #[allow(clippy::map_entry)] + if !self.gotos.contains_key(&key) { + let meta = &self.meta[id]; + let enters_loop = !meta.loop_entry_from.is_empty(); + + let bump = if enters_loop || !ctx.can_backtrack() { + ctx.switch(self.graph[id].miss()) + } else { + None + }; + + let bump = match (bump, enters_loop, meta.min_read) { + (Some(t), _, _) => Some(t), + (None, true, _) => ctx.bump(), + (None, false, 0) => ctx.bump(), + (None, false, _) => None, + }; + + if meta.min_read == 0 || ctx.remainder() < meta.min_read { + ctx.wipe(); + } + + let ident = self.generate_ident(id, ctx); + let mut call_site = quote!(#ident(lex)); + + if let Some(bump) = bump { + call_site = quote!({ + #bump + #call_site + }); + } + self.gotos.insert(key, call_site); + self.generate_fn(id, ctx); + } + &self.gotos[&key] + } + + fn generate_ident(&mut self, id: NodeId, ctx: Context) -> &Ident { + self.idents.entry((id, ctx)).or_insert_with(|| { + let mut ident = format!("goto{}", id); + + ctx.write_suffix(&mut ident); + + ident.to_ident() + }) + } + + /// Returns an identifier to a function that matches a byte to any + /// of the provided ranges. This will generate either a simple + /// match expression, or use a lookup table internally. + fn generate_test(&mut self, ranges: Vec<Range>) -> &Ident { + if !self.tests.contains_key(&ranges) { + let idx = self.tests.len(); + let ident = format!("pattern{}", idx).to_ident(); + + let lo = ranges.first().unwrap().start; + let hi = ranges.last().unwrap().end; + + let body = match ranges.len() { + 0..=2 => { + quote! { + match byte { + #(#ranges)|* => true, + _ => false, + } + } + } + _ if hi - lo < 64 => { + let mut offset = hi.saturating_sub(63); + + while offset.count_ones() > 1 && lo - offset > 0 { + offset += 1; + } + + let mut table = 0u64; + + for byte in ranges.iter().flat_map(|range| *range) { + if byte - offset >= 64 { + panic!("{:#?} {} {} {}", ranges, hi, lo, offset); + } + table |= 1 << (byte - offset); + } + + let search = match offset { + 0 => quote!(byte), + _ => quote!(byte.wrapping_sub(#offset)), + }; + + quote! { + const LUT: u64 = #table; + + match 1u64.checked_shl(#search as u32) { + Some(shift) => LUT & shift != 0, + None => false, + } + } + } + _ => { + let mut view = self.tables.view(); + + for byte in ranges.iter().flat_map(|range| *range) { + view.flag(byte); + } + + let mask = view.mask(); + let lut = view.ident(); + + quote! { + #lut[byte as usize] & #mask > 0 + } + } + }; + self.rendered.append_all(quote! { + #[inline] + fn #ident(byte: u8) -> bool { + #body + } + }); + self.tests.insert(ranges.clone(), ident); + } + &self.tests[&ranges] + } +} + +macro_rules! match_quote { + ($source:expr; $($byte:tt,)* ) => {match $source { + $( $byte => quote!($byte), )* + byte => quote!(#byte), + }} +} + +fn byte_to_tokens(byte: u8) -> TokenStream { + match_quote! { + byte; + b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9', + b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h', b'i', b'j', + b'k', b'l', b'm', b'n', b'o', b'p', b'q', b'r', b's', b't', + b'u', b'v', b'w', b'x', b'y', b'z', + b'A', b'B', b'C', b'D', b'E', b'F', b'G', b'H', b'I', b'J', + b'K', b'L', b'M', b'N', b'O', b'P', b'Q', b'R', b'S', b'T', + b'U', b'V', b'W', b'X', b'Y', b'Z', + b'!', b'@', b'#', b'$', b'%', b'^', b'&', b'*', b'(', b')', + b'{', b'}', b'[', b']', b'<', b'>', b'-', b'=', b'_', b'+', + b':', b';', b',', b'.', b'/', b'?', b'|', b'"', b'\'', b'\\', + } +} + +impl ToTokens for Range { + fn to_tokens(&self, tokens: &mut TokenStream) { + let Range { start, end } = self; + + tokens.append_all(byte_to_tokens(*start)); + + if start != end { + tokens.append_all(quote!(..=)); + tokens.append_all(byte_to_tokens(*end)); + } + } +} diff --git a/vendor/logos-codegen/src/generator/rope.rs b/vendor/logos-codegen/src/generator/rope.rs new file mode 100644 index 00000000..ae3b07ad --- /dev/null +++ b/vendor/logos-codegen/src/generator/rope.rs @@ -0,0 +1,39 @@ +use proc_macro2::TokenStream; +use quote::quote; + +use crate::generator::{Context, Generator}; +use crate::graph::Rope; + +impl Generator<'_> { + pub fn generate_rope(&mut self, rope: &Rope, mut ctx: Context) -> TokenStream { + let miss = ctx.miss(rope.miss.first(), self); + let read = ctx.read(rope.pattern.len()); + let then = self.goto(rope.then, ctx.advance(rope.pattern.len())); + + let pat = match rope.pattern.to_bytes() { + Some(bytes) => byte_slice_literal(&bytes), + None => { + let ranges = rope.pattern.iter(); + + quote!([#(#ranges),*]) + } + }; + + quote! { + match #read { + Some(#pat) => #then, + _ => #miss, + } + } + } +} + +fn byte_slice_literal(bytes: &[u8]) -> TokenStream { + if bytes.iter().any(|&b| !(0x20..0x7F).contains(&b)) { + return quote!(&[#(#bytes),*]); + } + + let slice = std::str::from_utf8(bytes).unwrap(); + + syn::parse_str(&format!("b{:?}", slice)).unwrap() +} diff --git a/vendor/logos-codegen/src/generator/tables.rs b/vendor/logos-codegen/src/generator/tables.rs new file mode 100644 index 00000000..f1e53273 --- /dev/null +++ b/vendor/logos-codegen/src/generator/tables.rs @@ -0,0 +1,77 @@ +use crate::util::ToIdent; +use proc_macro2::{Literal, TokenStream}; +use quote::{quote, ToTokens}; +use syn::Ident; + +pub struct TableStack { + tables: Vec<(Ident, [u8; 256])>, + shift: u8, +} + +pub struct TableView<'a> { + ident: &'a Ident, + table: &'a mut [u8; 256], + mask: u8, +} + +impl TableStack { + pub fn new() -> Self { + TableStack { + tables: vec![("COMPACT_TABLE_0".to_ident(), [0; 256])], + shift: 0, + } + } + + pub fn view(&mut self) -> TableView { + let mask = if self.shift < 8 { + // Reusing existing table with a shifted mask + let mask = 1u8 << self.shift; + + self.shift += 1; + + mask + } else { + // Need to create a new table + let ident = format!("COMPACT_TABLE_{}", self.tables.len()).to_ident(); + + self.tables.push((ident, [0; 256])); + self.shift = 1; + + 1 + }; + + let (ref ident, ref mut table) = self.tables.last_mut().unwrap(); + + TableView { ident, table, mask } + } +} + +impl<'a> TableView<'a> { + pub fn ident(&self) -> &'a Ident { + self.ident + } + + pub fn flag(&mut self, byte: u8) { + self.table[byte as usize] |= self.mask; + } + + pub fn mask(&self) -> Literal { + Literal::u8_unsuffixed(self.mask) + } +} + +impl ToTokens for TableStack { + fn to_tokens(&self, out: &mut TokenStream) { + if self.shift == 0 { + return; + } + + for (ident, table) in self.tables.iter() { + let bytes = table.iter().copied().map(Literal::u8_unsuffixed); + + out.extend(quote! { + static #ident: [u8; 256] = [#(#bytes),*]; + }); + } + } +} |
