| author | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
| commit | 8cdfa445d6629ffef4cb84967ff7017654045bc2 (patch) | |
| tree | 22f0b0907c024c78d26a731e2e1f5219407d8102 /vendor/httparse/src/simd | |
| parent | 4351c74c7c5f97156bc94d3a8549b9940ac80e3f (diff) | |
chore: add vendor directory
Diffstat (limited to 'vendor/httparse/src/simd')
| -rw-r--r-- | vendor/httparse/src/simd/avx2.rs | 206 |
| -rw-r--r-- | vendor/httparse/src/simd/mod.rs | 153 |
| -rw-r--r-- | vendor/httparse/src/simd/neon.rs | 258 |
| -rw-r--r-- | vendor/httparse/src/simd/runtime.rs | 57 |
| -rw-r--r-- | vendor/httparse/src/simd/sse42.rs | 142 |
| -rw-r--r-- | vendor/httparse/src/simd/swar.rs | 235 |
6 files changed, 1051 insertions, 0 deletions
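The patch below vendors httparse's SIMD parsing backends. `mod.rs` selects an implementation through `cfg` flags: AVX2 or SSE4.2 on x86/x86_64 (either pinned at compile time or dispatched at runtime via `runtime.rs`), NEON on aarch64, and the portable SWAR fallback in `swar.rs` everywhere else. The runtime dispatcher caches the detected feature level in an `AtomicU8` so later calls skip detection. Below is a minimal, standalone sketch of that caching pattern; it mirrors `runtime.rs` rather than reproducing it, and the `UNKNOWN` constant is added here for readability (the vendored code uses a literal `0`).

```rust
use std::sync::atomic::{AtomicU8, Ordering};

const UNKNOWN: u8 = 0; // "not detected yet"
const AVX2: u8 = 1;
const SSE42: u8 = 2;
const NOP: u8 = 3;

static RUNTIME_FEATURE: AtomicU8 = AtomicU8::new(UNKNOWN);

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn get_runtime_feature() -> u8 {
    let mut feature = RUNTIME_FEATURE.load(Ordering::Relaxed);
    if feature == UNKNOWN {
        // First call in this process: probe the CPU, then cache the answer.
        feature = if is_x86_feature_detected!("avx2") {
            AVX2
        } else if is_x86_feature_detected!("sse4.2") {
            SSE42
        } else {
            NOP
        };
        RUNTIME_FEATURE.store(feature, Ordering::Relaxed);
    }
    feature
}

fn main() {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    println!("detected feature id: {}", get_runtime_feature());
}
```

Two threads may race through the detection branch, but both store the same value, so `Relaxed` ordering is enough for this use.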
diff --git a/vendor/httparse/src/simd/avx2.rs b/vendor/httparse/src/simd/avx2.rs new file mode 100644 index 00000000..2adf0a28 --- /dev/null +++ b/vendor/httparse/src/simd/avx2.rs @@ -0,0 +1,206 @@ +use crate::iter::Bytes; + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 32 { + + let advance = match_url_char_32_avx(bytes.as_ref()); + + bytes.advance(advance); + + if advance != 32 { + return; + } + } + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_uri_vectored(bytes) +} + +#[inline(always)] +#[allow(non_snake_case, overflowing_literals)] +#[allow(unused)] +unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { + // NOTE: This check might be not necessary since this function is only used in + // `match_uri_vectored` where buffer overflow is taken care of. + debug_assert!(buf.len() >= 32); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // pointer to buffer + let ptr = buf.as_ptr(); + + // %x21-%x7e %x80-%xff + // + // Character ranges allowed by this function, can also be interpreted as: + // 33 =< (x != 127) =< 255 + // + // Create a vector full of DEL (0x7f) characters. + let DEL: __m256i = _mm256_set1_epi8(0x7f); + // Create a vector full of exclamation mark (!) (0x21) characters. + // Used as lower threshold, characters in URLs cannot be smaller than this. + let LOW: __m256i = _mm256_set1_epi8(0x21); + + // Load a chunk of 32 bytes from `ptr` as a vector. + // We can check 32 bytes in parallel at most with AVX2 since + // YMM registers can only have 256 bits most. + let dat = _mm256_lddqu_si256(ptr as *const _); + + // unsigned comparison dat >= LOW + // + // `_mm256_max_epu8` creates a new vector by comparing vectors `dat` and `LOW` + // and picks the max. values from each for all indices. + // So if a byte in `dat` is <= 32, it'll be represented as 33 + // which is the smallest valid character. + // + // Then, we compare the new vector with `dat` for equality. + // + // `_mm256_cmpeq_epi8` returns a new vector where; + // * matching bytes are set to 0xFF (all bits set), + // * nonmatching bytes are set to 0 (no bits set). + let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat); + // Similar to what we did before, but now invalid characters are set to 0xFF. + let del = _mm256_cmpeq_epi8(dat, DEL); + + // We glue the both comparisons via `_mm256_andnot_si256`. + // + // Since the representation of truthiness differ in these comparisons, + // we are in need of bitwise NOT to convert valid characters of `del`. + let bit = _mm256_andnot_si256(del, low); + // This creates a bitmask from the most significant bit of each byte. + // Simply, we're converting a vector value to scalar value here. + let res = _mm256_movemask_epi8(bit) as u32; + + // Count trailing zeros to find the first encountered invalid character. + // Bitwise NOT is required once again to flip truthiness. 
+ // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[target_feature(enable = "avx2")] +pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 32 { + let advance = match_header_value_char_32_avx(bytes.as_ref()); + bytes.advance(advance); + + if advance != 32 { + return; + } + } + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_header_value_vectored(bytes) +} + +#[inline(always)] +#[allow(non_snake_case)] +#[allow(unused)] +unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 32); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x09 %x20-%x7e %x80-%xff + // Create a vector full of horizontal tab (\t) (0x09) characters. + let TAB: __m256i = _mm256_set1_epi8(0x09); + // Create a vector full of DEL (0x7f) characters. + let DEL: __m256i = _mm256_set1_epi8(0x7f); + // Create a vector full of space (0x20) characters. + let LOW: __m256i = _mm256_set1_epi8(0x20); + + // Load a chunk of 32 bytes from `ptr` as a vector. + let dat = _mm256_lddqu_si256(ptr as *const _); + + // unsigned comparison dat >= LOW + // + // Same as what we do in `match_url_char_32_avx`. + // This time the lower threshold is set to space character though. + let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat); + // Check if `dat` includes `TAB` characters. + let tab = _mm256_cmpeq_epi8(dat, TAB); + // Check if `dat` includes `DEL` characters. + let del = _mm256_cmpeq_epi8(dat, DEL); + + // Combine all comparisons together, notice that we're also using OR + // to connect `low` and `tab` but flip bits of `del`. + // + // In the end, this is simply: + // ~del & (low | tab) + let bit = _mm256_andnot_si256(del, _mm256_or_si256(low, tab)); + // This creates a bitmask from the most significant bit of each byte. + // Creates a scalar value from vector value. + let res = _mm256_movemask_epi8(bit) as u32; + + // Count trailing zeros to find the first encountered invalid character. + // Bitwise NOT is required once again to flip truthiness. 
+ // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[test] +fn avx2_code_matches_uri_chars_table() { + if !is_x86_feature_detected!("avx2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[test] +fn avx2_code_matches_header_value_chars_table() { + if !is_x86_feature_detected!("avx2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let slice = [ + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', byte, b'_', + b'_', b'_', b'_', b'_', + ]; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 32 => true, + 26 => false, + _ => unreachable!(), + } +} diff --git a/vendor/httparse/src/simd/mod.rs b/vendor/httparse/src/simd/mod.rs new file mode 100644 index 00000000..0e4493a5 --- /dev/null +++ b/vendor/httparse/src/simd/mod.rs @@ -0,0 +1,153 @@ +mod swar; + +#[cfg(not(all( + httparse_simd, + any( + target_arch = "x86", + target_arch = "x86_64", + all( + target_arch = "aarch64", + httparse_simd_neon_intrinsics, + ) + ), +)))] +pub use self::swar::*; + +#[cfg(all( + httparse_simd, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod sse42; + +#[cfg(all( + httparse_simd, + any( + httparse_simd_target_feature_avx2, + not(httparse_simd_target_feature_sse42), + ), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod avx2; + +#[cfg(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod runtime; + +#[cfg(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +pub use self::runtime::*; + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_sse42, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod sse42_compile_time { + #[inline(always)] + pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) { + super::swar::match_header_name_vectored(b); + } + + #[inline(always)] + pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::sse42::match_uri_vectored(b) } + } + + #[inline(always)] + pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::sse42::match_header_value_vectored(b) } + } +} + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_sse42, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + 
target_arch = "x86_64", + ), +))] +pub use self::sse42_compile_time::*; + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_avx2, + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod avx2_compile_time { + #[inline(always)] + pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) { + super::swar::match_header_name_vectored(b); + } + + #[inline(always)] + pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::avx2::match_uri_vectored(b) } + } + + #[inline(always)] + pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::avx2::match_header_value_vectored(b) } + } +} + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_avx2, + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +pub use self::avx2_compile_time::*; + +#[cfg(all( + httparse_simd, + target_arch = "aarch64", + httparse_simd_neon_intrinsics, +))] +mod neon; + +#[cfg(all( + httparse_simd, + target_arch = "aarch64", + httparse_simd_neon_intrinsics, +))] +pub use self::neon::*; diff --git a/vendor/httparse/src/simd/neon.rs b/vendor/httparse/src/simd/neon.rs new file mode 100644 index 00000000..b059efb2 --- /dev/null +++ b/vendor/httparse/src/simd/neon.rs @@ -0,0 +1,258 @@ +use crate::iter::Bytes; +use core::arch::aarch64::*; + +#[inline] +pub fn match_header_name_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_header_name_vectored(bytes); +} + +#[inline] +pub fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_header_value_vectored(bytes); +} + +#[inline] +pub fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_url_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_uri_vectored(bytes); +} + +const fn bit_set(x: u8) -> bool { + // Validates if a byte is a valid header name character + // https://tools.ietf.org/html/rfc7230#section-3.2.6 + matches!(x, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' 
| b'^' | b'_' | b'`' | b'|' | b'~') +} + +// A 256-bit bitmap, split into two halves +// lower half contains bits whose higher nibble is <= 7 +// higher half contains bits whose higher nibble is >= 8 +const fn build_bitmap() -> ([u8; 16], [u8; 16]) { + let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F + let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF + let mut i = 0; + while i < 256 { + if bit_set(i as u8) { + // Nibbles + let (lo, hi) = (i & 0x0F, i >> 4); + if i < 128 { + bitmap_0_7[lo] |= 1 << hi; + } else { + bitmap_8_15[lo] |= 1 << hi; + } + } + i += 1; + } + (bitmap_0_7, bitmap_8_15) +} + +const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap(); + +// NOTE: adapted from 256-bit version, with upper 128-bit ops commented out +#[inline] +unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize { + let bitmaps = BITMAPS; + // NOTE: ideally compile-time constants + let (bitmap_0_7, _bitmap_8_15) = bitmaps; + let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr()); + // let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr()); + + // Initialize the bitmask_lookup. + const BITMASK_LOOKUP_DATA: [u8; 16] = + [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128]; + let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr()); + + // Load 16 input bytes. + let input = vld1q_u8(ptr); + + // Extract indices for row_0_7. + let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111; + + // Extract indices for row_8_15. + // let msb = vandq_u8(input, vdupq_n_u8(0x80)); + // let indices_8_15 = veorq_u8(indices_0_7, msb); + + // Fetch row_0_7 and row_8_15. + let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7); + // let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15); + + // Calculate a bitmask, i.e. (1 << hi_nibble % 8). + let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4)); + + // Choose rows halves depending on higher nibbles. + // let bitsets = vorrq_u8(row_0_7, row_8_15); + let bitsets = row_0_7; + + // Finally check which bytes belong to the set. + let tmp = vandq_u8(bitsets, bitmask); + let result = vceqq_u8(tmp, bitmask); + + offsetz(result) as usize +} + +#[inline] +unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize { + let input = vld1q_u8(ptr); + + // Check that b'!' 
<= and b != 127 + let result = vcleq_u8(vdupq_n_u8(b'!'), input); + + // Disallow del + let del = vceqq_u8(input, vdupq_n_u8(0x7F)); + let result = vbicq_u8(result, del); + + offsetz(result) as usize +} + +#[inline] +unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize { + let input = vld1q_u8(ptr); + + // Check that b' ' <= and b != 127 or b == 9 + let result = vcleq_u8(vdupq_n_u8(b' '), input); + + // Allow tab + let tab = vceqq_u8(input, vdupq_n_u8(0x09)); + let result = vorrq_u8(result, tab); + + // Disallow del + let del = vceqq_u8(input, vdupq_n_u8(0x7F)); + let result = vbicq_u8(result, del); + + offsetz(result) as usize +} + +#[inline] +unsafe fn offsetz(x: uint8x16_t) -> u32 { + // NOT the vector since it's faster to operate with zeros instead + offsetnz(vmvnq_u8(x)) +} + +#[inline] +unsafe fn offsetnz(x: uint8x16_t) -> u32 { + // Extract two u64 + let x = vreinterpretq_u64_u8(x); + // Extract to general purpose registers to perform clz + let low: u64 = vgetq_lane_u64::<0>(x); + let high: u64 = vgetq_lane_u64::<1>(x); + + #[inline] + fn clz(x: u64) -> u32 { + // perf: rust will unroll this loop + // and it's much faster than rbit + clz so voila + for (i, b) in x.to_ne_bytes().iter().copied().enumerate() { + if b != 0 { + return i as u32; + } + } + 8 // Technically not reachable since zero-guarded + } + + if low != 0 { + clz(low) + } else if high != 0 { + return 8 + clz(high); + } else { + return 16; + } +} + +#[test] +fn neon_code_matches_uri_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[test] +fn neon_code_matches_header_value_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[test] +fn neon_code_matches_header_name_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_name_vectored)); + + for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_name_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let mut slice = [b'_'; 16]; + slice[10] = byte; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 16 => true, + 10 => false, + x => panic!("unexpected pos: {}", x), + } +} diff --git a/vendor/httparse/src/simd/runtime.rs b/vendor/httparse/src/simd/runtime.rs new file mode 100644 index 00000000..c523a921 --- /dev/null +++ b/vendor/httparse/src/simd/runtime.rs @@ -0,0 +1,57 @@ +use std::sync::atomic::{AtomicU8, Ordering}; +use crate::iter::Bytes; +use super::avx2; +use super::sse42; + +const AVX2: u8 = 1; +const SSE42: u8 = 2; +const NOP: u8 = 3; + +fn detect_runtime_feature() -> u8 { + if is_x86_feature_detected!("avx2") { + AVX2 + } else if is_x86_feature_detected!("sse4.2") { + SSE42 + } else { + NOP + } +} + +static RUNTIME_FEATURE: AtomicU8 = 
AtomicU8::new(0); + +#[inline] +fn get_runtime_feature() -> u8 { + let mut feature = RUNTIME_FEATURE.load(Ordering::Relaxed); + if feature == 0 { + feature = detect_runtime_feature(); + RUNTIME_FEATURE.store(feature, Ordering::Relaxed); + } + + feature +} + +pub fn match_header_name_vectored(bytes: &mut Bytes) { + super::swar::match_header_name_vectored(bytes); +} + +pub fn match_uri_vectored(bytes: &mut Bytes) { + // SAFETY: calls are guarded by a feature check + unsafe { + match get_runtime_feature() { + AVX2 => avx2::match_uri_vectored(bytes), + SSE42 => sse42::match_uri_vectored(bytes), + _ /* NOP */ => super::swar::match_uri_vectored(bytes), + } + } +} + +pub fn match_header_value_vectored(bytes: &mut Bytes) { + // SAFETY: calls are guarded by a feature check + unsafe { + match get_runtime_feature() { + AVX2 => avx2::match_header_value_vectored(bytes), + SSE42 => sse42::match_header_value_vectored(bytes), + _ /* NOP */ => super::swar::match_header_value_vectored(bytes), + } + } +} diff --git a/vendor/httparse/src/simd/sse42.rs b/vendor/httparse/src/simd/sse42.rs new file mode 100644 index 00000000..0fabdfeb --- /dev/null +++ b/vendor/httparse/src/simd/sse42.rs @@ -0,0 +1,142 @@ +use crate::iter::Bytes; + +#[target_feature(enable = "sse4.2")] +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + let advance = match_url_char_16_sse(bytes.as_ref()); + + bytes.advance(advance); + + if advance != 16 { + return; + } + } + super::swar::match_uri_vectored(bytes); +} + +#[inline(always)] +#[allow(non_snake_case)] +unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 16); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x21-%x7e %x80-%xff + let DEL: __m128i = _mm_set1_epi8(0x7f); + let LOW: __m128i = _mm_set1_epi8(0x21); + + let dat = _mm_lddqu_si128(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat); + let del = _mm_cmpeq_epi8(dat, DEL); + let bit = _mm_andnot_si128(del, low); + let res = _mm_movemask_epi8(bit) as u16; + + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[target_feature(enable = "sse4.2")] +pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + let advance = match_header_value_char_16_sse(bytes.as_ref()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + super::swar::match_header_value_vectored(bytes); +} + +#[inline(always)] +#[allow(non_snake_case)] +unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 16); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x09 %x20-%x7e %x80-%xff + let TAB: __m128i = _mm_set1_epi8(0x09); + let DEL: __m128i = _mm_set1_epi8(0x7f); + let LOW: __m128i = _mm_set1_epi8(0x20); + + let dat = _mm_lddqu_si128(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat); + let tab = _mm_cmpeq_epi8(dat, TAB); + let del = _mm_cmpeq_epi8(dat, DEL); + let bit = _mm_andnot_si128(del, _mm_or_si128(low, tab)); + let res = _mm_movemask_epi8(bit) as u16; + + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[test] +fn sse_code_matches_uri_chars_table() { + if 
!is_x86_feature_detected!("sse4.2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[test] +fn sse_code_matches_header_value_chars_table() { + if !is_x86_feature_detected!("sse4.2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[allow(clippy::missing_safety_doc)] +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let slice = [ + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', byte, b'_', + b'_', b'_', b'_', b'_', + ]; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 16 => true, + 10 => false, + _ => unreachable!(), + } +} diff --git a/vendor/httparse/src/simd/swar.rs b/vendor/httparse/src/simd/swar.rs new file mode 100644 index 00000000..e1028d05 --- /dev/null +++ b/vendor/httparse/src/simd/swar.rs @@ -0,0 +1,235 @@ +/// SWAR: SIMD Within A Register +/// SIMD validator backend that validates register-sized chunks of data at a time. +use crate::{is_header_name_token, is_header_value_token, is_uri_token, Bytes}; + +// Adapt block-size to match native register size, i.e: 32bit => 4, 64bit => 8 +const BLOCK_SIZE: usize = core::mem::size_of::<usize>(); +type ByteBlock = [u8; BLOCK_SIZE]; + +#[inline] +pub fn match_uri_vectored(bytes: &mut Bytes) { + loop { + if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_uri_char_8_swar(bytes8); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. + unsafe { + bytes.advance(n); + } + if n == BLOCK_SIZE { + continue; + } + } + if let Some(b) = bytes.peek() { + if is_uri_token(b) { + // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte + // in bytes, so calling advance is safe. + unsafe { + bytes.advance(1); + } + continue; + } + } + break; + } +} + +#[inline] +pub fn match_header_value_vectored(bytes: &mut Bytes) { + loop { + if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_header_value_char_8_swar(bytes8); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. + unsafe { + bytes.advance(n); + } + if n == BLOCK_SIZE { + continue; + } + } + if let Some(b) = bytes.peek() { + if is_header_value_token(b) { + // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte + // in bytes, so calling advance is safe. + unsafe { + bytes.advance(1); + } + continue; + } + } + break; + } +} + +#[inline] +pub fn match_header_name_vectored(bytes: &mut Bytes) { + while let Some(block) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_block(is_header_name_token, block); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. 
+ unsafe { + bytes.advance(n); + } + if n != BLOCK_SIZE { + return; + } + } + // SAFETY: match_tail processes at most the remaining data in `bytes`. advances `bytes` to the + // end, but no further. + unsafe { bytes.advance(match_tail(is_header_name_token, bytes.as_ref())) }; +} + +// Matches "tail", i.e: when we have <BLOCK_SIZE bytes in the buffer, should be uncommon +#[cold] +#[inline] +fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize { + for (i, &b) in bytes.iter().enumerate() { + if !f(b) { + return i; + } + } + bytes.len() +} + +// Naive fallback block matcher +#[inline(always)] +fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize { + for (i, &b) in block.iter().enumerate() { + if !f(b) { + return i; + } + } + BLOCK_SIZE +} + +// A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44) +// creates a u64 whose bytes are each equal to b +const fn uniform_block(b: u8) -> usize { + (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize +} + +// A byte-wise range-check on an entire word/block, +// ensuring all bytes in the word satisfy `33 <= (x != 127) <= 255` +#[inline] +fn match_uri_char_8_swar(block: ByteBlock) -> usize { + // 33 <= (x != 127) <= 255 + const M: u8 = 0x21; + // uniform block full of exclamation mark (!) (33). + const BM: usize = uniform_block(M); + // uniform block full of 1. + const ONE: usize = uniform_block(0x01); + // uniform block full of DEL (127). + const DEL: usize = uniform_block(0x7f); + // uniform block full of 128. + const M128: usize = uniform_block(128); + + let x = usize::from_ne_bytes(block); // Really just a transmute + let lt = x.wrapping_sub(BM) & !x; // <= m + + let xor_del = x ^ DEL; + let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL + + offsetnz((lt | eq_del) & M128) +} + +// A byte-wise range-check on an entire word/block, +// ensuring all bytes in the word satisfy `32 <= (x != 127) <= 255` +#[inline] +fn match_header_value_char_8_swar(block: ByteBlock) -> usize { + // 32 <= (x != 127) <= 255 + const M: u8 = 0x20; + // uniform block full of exclamation mark (!) (33). + const BM: usize = uniform_block(M); + // uniform block full of 1. + const ONE: usize = uniform_block(0x01); + // uniform block full of DEL (127). + const DEL: usize = uniform_block(0x7f); + // uniform block full of 128. 
+ const M128: usize = uniform_block(128); + + let x = usize::from_ne_bytes(block); // Really just a transmute + let lt = x.wrapping_sub(BM) & !x; // <= m + + let xor_del = x ^ DEL; + let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL + + offsetnz((lt | eq_del) & M128) +} + +/// Check block to find offset of first non-zero byte +// NOTE: Curiously `block.trailing_zeros() >> 3` appears to be slower, maybe revisit +#[inline] +fn offsetnz(block: usize) -> usize { + // fast path optimistic case (common for long valid sequences) + if block == 0 { + return BLOCK_SIZE; + } + + // perf: rust will unroll this loop + for (i, b) in block.to_ne_bytes().iter().copied().enumerate() { + if b != 0 { + return i; + } + } + unreachable!() +} + +#[test] +fn test_is_header_value_block() { + let is_header_value_block = |b| match_header_value_char_8_swar(b) == BLOCK_SIZE; + + // 0..32 => false + for b in 0..32_u8 { + assert!(!is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + // 32..=126 => true + for b in 32..=126_u8 { + assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + // 127 => false + assert!(!is_header_value_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F'); + // 128..=255 => true + for b in 128..=255_u8 { + assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + + + #[cfg(target_pointer_width = "64")] + { + // A few sanity checks on non-uniform bytes for safe-measure + assert!(!is_header_value_block(*b"foo.com\n")); + assert!(!is_header_value_block(*b"o.com\r\nU")); + } +} + +#[test] +fn test_is_uri_block() { + let is_uri_block = |b| match_uri_char_8_swar(b) == BLOCK_SIZE; + + // 0..33 => false + for b in 0..33_u8 { + assert!(!is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } + // 33..=126 => true + for b in 33..=126_u8 { + assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } + // 127 => false + assert!(!is_uri_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F'); + // 128..=255 => true + for b in 128..=255_u8 { + assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } +} + +#[test] +fn test_offsetnz() { + let seq = [0_u8; BLOCK_SIZE]; + for i in 0..BLOCK_SIZE { + let mut seq = seq; + seq[i] = 1; + let x = usize::from_ne_bytes(seq); + assert_eq!(offsetnz(x), i); + } +} |
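For targets without usable SIMD, `swar.rs` performs the same byte classification with plain integer arithmetic: load a register-sized block, use the classic "byte less than N" and "byte equal to N" subtraction tricks to set the high bit of every invalid byte, and report the offset of the first such byte. Here is a standalone sketch of the URI check, fixed to `u64` for clarity (the vendored code works on `usize` so the block width matches the native register) and with illustrative function names:

```rust
/// Builds a word whose eight bytes all equal `b`.
const fn uniform_block(b: u8) -> u64 {
    b as u64 * 0x0101_0101_0101_0101
}

/// Offset of the first byte that is not a valid URI character
/// (valid: 0x21..=0xFF, excluding DEL 0x7F), or 8 if all bytes pass.
fn match_uri_char_8(block: [u8; 8]) -> usize {
    const LOW: u64 = uniform_block(0x21); // '!', the smallest valid byte
    const ONE: u64 = uniform_block(0x01);
    const DEL: u64 = uniform_block(0x7f);
    const MSB: u64 = uniform_block(0x80);

    let x = u64::from_ne_bytes(block);
    // High bit set in `lt` for bytes below 0x21 ("has a byte less than N").
    let lt = x.wrapping_sub(LOW) & !x;
    // High bit set in `eq_del` for bytes equal to 0x7F ("has a zero byte" on x ^ DEL).
    let xor_del = x ^ DEL;
    let eq_del = xor_del.wrapping_sub(ONE) & !xor_del;

    // Keep only the per-byte high bits, then locate the first offender.
    offset_of_first_nonzero_byte((lt | eq_del) & MSB)
}

fn offset_of_first_nonzero_byte(word: u64) -> usize {
    word.to_ne_bytes()
        .iter()
        .position(|&b| b != 0)
        .unwrap_or(8) // 8 => every byte passed the check
}

fn main() {
    assert_eq!(match_uri_char_8(*b"/index.h"), 8); // all valid
    assert_eq!(match_uri_char_8(*b"/a b/c/d"), 2); // space rejected at offset 2
    assert_eq!(match_uri_char_8([0x7f; 8]), 0);    // DEL rejected immediately
    println!("swar sketch checks passed");
}
```

The caller advances by the returned offset and, whenever that offset is smaller than the block size, finishes the remainder with the scalar per-byte check, as `match_uri_vectored` in `swar.rs` does above.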
