| author | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
|---|---|---|
| committer | mo khan <mo@mokhan.ca> | 2025-07-02 18:36:06 -0600 |
| commit | 8cdfa445d6629ffef4cb84967ff7017654045bc2 (patch) | |
| tree | 22f0b0907c024c78d26a731e2e1f5219407d8102 /vendor/httparse/src/simd | |
| parent | 4351c74c7c5f97156bc94d3a8549b9940ac80e3f (diff) | |
chore: add vendor directory
Diffstat (limited to 'vendor/httparse/src/simd')
| -rw-r--r-- | vendor/httparse/src/simd/avx2.rs | 206 |
| -rw-r--r-- | vendor/httparse/src/simd/mod.rs | 153 |
| -rw-r--r-- | vendor/httparse/src/simd/neon.rs | 258 |
| -rw-r--r-- | vendor/httparse/src/simd/runtime.rs | 57 |
| -rw-r--r-- | vendor/httparse/src/simd/sse42.rs | 142 |
| -rw-r--r-- | vendor/httparse/src/simd/swar.rs | 235 |
6 files changed, 1051 insertions, 0 deletions
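The patch below vendors httparse's SIMD parsing backends. `mod.rs` selects an implementation through `cfg` flags: AVX2 or SSE4.2 on x86/x86_64 (either pinned at compile time or dispatched at runtime via `runtime.rs`), NEON on aarch64, and the portable SWAR fallback in `swar.rs` everywhere else. The runtime dispatcher caches the detected feature level in an `AtomicU8` so later calls skip detection. Below is a minimal, standalone sketch of that caching pattern; it mirrors `runtime.rs` rather than reproducing it, and the `UNKNOWN` constant is added here for readability (the vendored code uses a literal `0`).

```rust
use std::sync::atomic::{AtomicU8, Ordering};

const UNKNOWN: u8 = 0; // "not detected yet"
const AVX2: u8 = 1;
const SSE42: u8 = 2;
const NOP: u8 = 3;

static RUNTIME_FEATURE: AtomicU8 = AtomicU8::new(UNKNOWN);

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn get_runtime_feature() -> u8 {
    let mut feature = RUNTIME_FEATURE.load(Ordering::Relaxed);
    if feature == UNKNOWN {
        // First call in this process: probe the CPU, then cache the answer.
        feature = if is_x86_feature_detected!("avx2") {
            AVX2
        } else if is_x86_feature_detected!("sse4.2") {
            SSE42
        } else {
            NOP
        };
        RUNTIME_FEATURE.store(feature, Ordering::Relaxed);
    }
    feature
}

fn main() {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    println!("detected feature id: {}", get_runtime_feature());
}
```

Two threads may race through the detection branch, but both store the same value, so `Relaxed` ordering is enough for this use.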
diff --git a/vendor/httparse/src/simd/avx2.rs b/vendor/httparse/src/simd/avx2.rs new file mode 100644 index 00000000..2adf0a28 --- /dev/null +++ b/vendor/httparse/src/simd/avx2.rs @@ -0,0 +1,206 @@ +use crate::iter::Bytes; + +#[inline] +#[target_feature(enable = "avx2")] +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 32 { + + let advance = match_url_char_32_avx(bytes.as_ref()); + + bytes.advance(advance); + + if advance != 32 { + return; + } + } + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_uri_vectored(bytes) +} + +#[inline(always)] +#[allow(non_snake_case, overflowing_literals)] +#[allow(unused)] +unsafe fn match_url_char_32_avx(buf: &[u8]) -> usize { + // NOTE: This check might be not necessary since this function is only used in + // `match_uri_vectored` where buffer overflow is taken care of. + debug_assert!(buf.len() >= 32); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + // pointer to buffer + let ptr = buf.as_ptr(); + + // %x21-%x7e %x80-%xff + // + // Character ranges allowed by this function, can also be interpreted as: + // 33 =< (x != 127) =< 255 + // + // Create a vector full of DEL (0x7f) characters. + let DEL: __m256i = _mm256_set1_epi8(0x7f); + // Create a vector full of exclamation mark (!) (0x21) characters. + // Used as lower threshold, characters in URLs cannot be smaller than this. + let LOW: __m256i = _mm256_set1_epi8(0x21); + + // Load a chunk of 32 bytes from `ptr` as a vector. + // We can check 32 bytes in parallel at most with AVX2 since + // YMM registers can only have 256 bits most. + let dat = _mm256_lddqu_si256(ptr as *const _); + + // unsigned comparison dat >= LOW + // + // `_mm256_max_epu8` creates a new vector by comparing vectors `dat` and `LOW` + // and picks the max. values from each for all indices. + // So if a byte in `dat` is <= 32, it'll be represented as 33 + // which is the smallest valid character. + // + // Then, we compare the new vector with `dat` for equality. + // + // `_mm256_cmpeq_epi8` returns a new vector where; + // * matching bytes are set to 0xFF (all bits set), + // * nonmatching bytes are set to 0 (no bits set). + let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat); + // Similar to what we did before, but now invalid characters are set to 0xFF. + let del = _mm256_cmpeq_epi8(dat, DEL); + + // We glue the both comparisons via `_mm256_andnot_si256`. + // + // Since the representation of truthiness differ in these comparisons, + // we are in need of bitwise NOT to convert valid characters of `del`. + let bit = _mm256_andnot_si256(del, low); + // This creates a bitmask from the most significant bit of each byte. + // Simply, we're converting a vector value to scalar value here. + let res = _mm256_movemask_epi8(bit) as u32; + + // Count trailing zeros to find the first encountered invalid character. + // Bitwise NOT is required once again to flip truthiness. 
+ // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[target_feature(enable = "avx2")] +pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 32 { + let advance = match_header_value_char_32_avx(bytes.as_ref()); + bytes.advance(advance); + + if advance != 32 { + return; + } + } + // NOTE: use SWAR for <32B, more efficient than falling back to SSE4.2 + super::swar::match_header_value_vectored(bytes) +} + +#[inline(always)] +#[allow(non_snake_case)] +#[allow(unused)] +unsafe fn match_header_value_char_32_avx(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 32); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x09 %x20-%x7e %x80-%xff + // Create a vector full of horizontal tab (\t) (0x09) characters. + let TAB: __m256i = _mm256_set1_epi8(0x09); + // Create a vector full of DEL (0x7f) characters. + let DEL: __m256i = _mm256_set1_epi8(0x7f); + // Create a vector full of space (0x20) characters. + let LOW: __m256i = _mm256_set1_epi8(0x20); + + // Load a chunk of 32 bytes from `ptr` as a vector. + let dat = _mm256_lddqu_si256(ptr as *const _); + + // unsigned comparison dat >= LOW + // + // Same as what we do in `match_url_char_32_avx`. + // This time the lower threshold is set to space character though. + let low = _mm256_cmpeq_epi8(_mm256_max_epu8(dat, LOW), dat); + // Check if `dat` includes `TAB` characters. + let tab = _mm256_cmpeq_epi8(dat, TAB); + // Check if `dat` includes `DEL` characters. + let del = _mm256_cmpeq_epi8(dat, DEL); + + // Combine all comparisons together, notice that we're also using OR + // to connect `low` and `tab` but flip bits of `del`. + // + // In the end, this is simply: + // ~del & (low | tab) + let bit = _mm256_andnot_si256(del, _mm256_or_si256(low, tab)); + // This creates a bitmask from the most significant bit of each byte. + // Creates a scalar value from vector value. + let res = _mm256_movemask_epi8(bit) as u32; + + // Count trailing zeros to find the first encountered invalid character. + // Bitwise NOT is required once again to flip truthiness. 
+ // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[test] +fn avx2_code_matches_uri_chars_table() { + if !is_x86_feature_detected!("avx2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[test] +fn avx2_code_matches_header_value_chars_table() { + if !is_x86_feature_detected!("avx2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let slice = [ + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', byte, b'_', + b'_', b'_', b'_', b'_', + ]; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 32 => true, + 26 => false, + _ => unreachable!(), + } +} diff --git a/vendor/httparse/src/simd/mod.rs b/vendor/httparse/src/simd/mod.rs new file mode 100644 index 00000000..0e4493a5 --- /dev/null +++ b/vendor/httparse/src/simd/mod.rs @@ -0,0 +1,153 @@ +mod swar; + +#[cfg(not(all( + httparse_simd, + any( + target_arch = "x86", + target_arch = "x86_64", + all( + target_arch = "aarch64", + httparse_simd_neon_intrinsics, + ) + ), +)))] +pub use self::swar::*; + +#[cfg(all( + httparse_simd, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod sse42; + +#[cfg(all( + httparse_simd, + any( + httparse_simd_target_feature_avx2, + not(httparse_simd_target_feature_sse42), + ), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod avx2; + +#[cfg(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod runtime; + +#[cfg(all( + httparse_simd, + not(any( + httparse_simd_target_feature_sse42, + httparse_simd_target_feature_avx2, + )), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +pub use self::runtime::*; + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_sse42, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod sse42_compile_time { + #[inline(always)] + pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) { + super::swar::match_header_name_vectored(b); + } + + #[inline(always)] + pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::sse42::match_uri_vectored(b) } + } + + #[inline(always)] + pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::sse42::match_header_value_vectored(b) } + } +} + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_sse42, + not(httparse_simd_target_feature_avx2), + any( + target_arch = "x86", + 
target_arch = "x86_64", + ), +))] +pub use self::sse42_compile_time::*; + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_avx2, + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +mod avx2_compile_time { + #[inline(always)] + pub fn match_header_name_vectored(b: &mut crate::iter::Bytes<'_>) { + super::swar::match_header_name_vectored(b); + } + + #[inline(always)] + pub fn match_uri_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::avx2::match_uri_vectored(b) } + } + + #[inline(always)] + pub fn match_header_value_vectored(b: &mut crate::iter::Bytes<'_>) { + // SAFETY: calls are guarded by a compile time feature check + unsafe { crate::simd::avx2::match_header_value_vectored(b) } + } +} + +#[cfg(all( + httparse_simd, + httparse_simd_target_feature_avx2, + any( + target_arch = "x86", + target_arch = "x86_64", + ), +))] +pub use self::avx2_compile_time::*; + +#[cfg(all( + httparse_simd, + target_arch = "aarch64", + httparse_simd_neon_intrinsics, +))] +mod neon; + +#[cfg(all( + httparse_simd, + target_arch = "aarch64", + httparse_simd_neon_intrinsics, +))] +pub use self::neon::*; diff --git a/vendor/httparse/src/simd/neon.rs b/vendor/httparse/src/simd/neon.rs new file mode 100644 index 00000000..b059efb2 --- /dev/null +++ b/vendor/httparse/src/simd/neon.rs @@ -0,0 +1,258 @@ +use crate::iter::Bytes; +use core::arch::aarch64::*; + +#[inline] +pub fn match_header_name_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_header_name_vectored(bytes); +} + +#[inline] +pub fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_header_value_vectored(bytes); +} + +#[inline] +pub fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + // SAFETY: ensured that there are at least 16 bytes remaining + unsafe { + let advance = match_url_char_16_neon(bytes.as_ref().as_ptr()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + } + super::swar::match_uri_vectored(bytes); +} + +const fn bit_set(x: u8) -> bool { + // Validates if a byte is a valid header name character + // https://tools.ietf.org/html/rfc7230#section-3.2.6 + matches!(x, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'!' | b'#' | b'$' | b'%' | b'&' | b'\'' | b'*' | b'+' | b'-' | b'.' 
| b'^' | b'_' | b'`' | b'|' | b'~') +} + +// A 256-bit bitmap, split into two halves +// lower half contains bits whose higher nibble is <= 7 +// higher half contains bits whose higher nibble is >= 8 +const fn build_bitmap() -> ([u8; 16], [u8; 16]) { + let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F + let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF + let mut i = 0; + while i < 256 { + if bit_set(i as u8) { + // Nibbles + let (lo, hi) = (i & 0x0F, i >> 4); + if i < 128 { + bitmap_0_7[lo] |= 1 << hi; + } else { + bitmap_8_15[lo] |= 1 << hi; + } + } + i += 1; + } + (bitmap_0_7, bitmap_8_15) +} + +const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap(); + +// NOTE: adapted from 256-bit version, with upper 128-bit ops commented out +#[inline] +unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize { + let bitmaps = BITMAPS; + // NOTE: ideally compile-time constants + let (bitmap_0_7, _bitmap_8_15) = bitmaps; + let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr()); + // let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr()); + + // Initialize the bitmask_lookup. + const BITMASK_LOOKUP_DATA: [u8; 16] = + [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128]; + let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr()); + + // Load 16 input bytes. + let input = vld1q_u8(ptr); + + // Extract indices for row_0_7. + let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111; + + // Extract indices for row_8_15. + // let msb = vandq_u8(input, vdupq_n_u8(0x80)); + // let indices_8_15 = veorq_u8(indices_0_7, msb); + + // Fetch row_0_7 and row_8_15. + let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7); + // let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15); + + // Calculate a bitmask, i.e. (1 << hi_nibble % 8). + let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4)); + + // Choose rows halves depending on higher nibbles. + // let bitsets = vorrq_u8(row_0_7, row_8_15); + let bitsets = row_0_7; + + // Finally check which bytes belong to the set. + let tmp = vandq_u8(bitsets, bitmask); + let result = vceqq_u8(tmp, bitmask); + + offsetz(result) as usize +} + +#[inline] +unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize { + let input = vld1q_u8(ptr); + + // Check that b'!' 
<= and b != 127 + let result = vcleq_u8(vdupq_n_u8(b'!'), input); + + // Disallow del + let del = vceqq_u8(input, vdupq_n_u8(0x7F)); + let result = vbicq_u8(result, del); + + offsetz(result) as usize +} + +#[inline] +unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize { + let input = vld1q_u8(ptr); + + // Check that b' ' <= and b != 127 or b == 9 + let result = vcleq_u8(vdupq_n_u8(b' '), input); + + // Allow tab + let tab = vceqq_u8(input, vdupq_n_u8(0x09)); + let result = vorrq_u8(result, tab); + + // Disallow del + let del = vceqq_u8(input, vdupq_n_u8(0x7F)); + let result = vbicq_u8(result, del); + + offsetz(result) as usize +} + +#[inline] +unsafe fn offsetz(x: uint8x16_t) -> u32 { + // NOT the vector since it's faster to operate with zeros instead + offsetnz(vmvnq_u8(x)) +} + +#[inline] +unsafe fn offsetnz(x: uint8x16_t) -> u32 { + // Extract two u64 + let x = vreinterpretq_u64_u8(x); + // Extract to general purpose registers to perform clz + let low: u64 = vgetq_lane_u64::<0>(x); + let high: u64 = vgetq_lane_u64::<1>(x); + + #[inline] + fn clz(x: u64) -> u32 { + // perf: rust will unroll this loop + // and it's much faster than rbit + clz so voila + for (i, b) in x.to_ne_bytes().iter().copied().enumerate() { + if b != 0 { + return i as u32; + } + } + 8 // Technically not reachable since zero-guarded + } + + if low != 0 { + clz(low) + } else if high != 0 { + return 8 + clz(high); + } else { + return 16; + } +} + +#[test] +fn neon_code_matches_uri_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[test] +fn neon_code_matches_header_value_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[test] +fn neon_code_matches_header_name_chars_table() { + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_name_vectored)); + + for (b, allowed) in crate::TOKEN_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_name_vectored), + allowed, + "byte_is_allowed({:?}) should be {:?}", + b, + allowed, + ); + } + } +} + +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let mut slice = [b'_'; 16]; + slice[10] = byte; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 16 => true, + 10 => false, + x => panic!("unexpected pos: {}", x), + } +} diff --git a/vendor/httparse/src/simd/runtime.rs b/vendor/httparse/src/simd/runtime.rs new file mode 100644 index 00000000..c523a921 --- /dev/null +++ b/vendor/httparse/src/simd/runtime.rs @@ -0,0 +1,57 @@ +use std::sync::atomic::{AtomicU8, Ordering}; +use crate::iter::Bytes; +use super::avx2; +use super::sse42; + +const AVX2: u8 = 1; +const SSE42: u8 = 2; +const NOP: u8 = 3; + +fn detect_runtime_feature() -> u8 { + if is_x86_feature_detected!("avx2") { + AVX2 + } else if is_x86_feature_detected!("sse4.2") { + SSE42 + } else { + NOP + } +} + +static RUNTIME_FEATURE: AtomicU8 = 
AtomicU8::new(0); + +#[inline] +fn get_runtime_feature() -> u8 { + let mut feature = RUNTIME_FEATURE.load(Ordering::Relaxed); + if feature == 0 { + feature = detect_runtime_feature(); + RUNTIME_FEATURE.store(feature, Ordering::Relaxed); + } + + feature +} + +pub fn match_header_name_vectored(bytes: &mut Bytes) { + super::swar::match_header_name_vectored(bytes); +} + +pub fn match_uri_vectored(bytes: &mut Bytes) { + // SAFETY: calls are guarded by a feature check + unsafe { + match get_runtime_feature() { + AVX2 => avx2::match_uri_vectored(bytes), + SSE42 => sse42::match_uri_vectored(bytes), + _ /* NOP */ => super::swar::match_uri_vectored(bytes), + } + } +} + +pub fn match_header_value_vectored(bytes: &mut Bytes) { + // SAFETY: calls are guarded by a feature check + unsafe { + match get_runtime_feature() { + AVX2 => avx2::match_header_value_vectored(bytes), + SSE42 => sse42::match_header_value_vectored(bytes), + _ /* NOP */ => super::swar::match_header_value_vectored(bytes), + } + } +} diff --git a/vendor/httparse/src/simd/sse42.rs b/vendor/httparse/src/simd/sse42.rs new file mode 100644 index 00000000..0fabdfeb --- /dev/null +++ b/vendor/httparse/src/simd/sse42.rs @@ -0,0 +1,142 @@ +use crate::iter::Bytes; + +#[target_feature(enable = "sse4.2")] +pub unsafe fn match_uri_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + let advance = match_url_char_16_sse(bytes.as_ref()); + + bytes.advance(advance); + + if advance != 16 { + return; + } + } + super::swar::match_uri_vectored(bytes); +} + +#[inline(always)] +#[allow(non_snake_case)] +unsafe fn match_url_char_16_sse(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 16); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x21-%x7e %x80-%xff + let DEL: __m128i = _mm_set1_epi8(0x7f); + let LOW: __m128i = _mm_set1_epi8(0x21); + + let dat = _mm_lddqu_si128(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat); + let del = _mm_cmpeq_epi8(dat, DEL); + let bit = _mm_andnot_si128(del, low); + let res = _mm_movemask_epi8(bit) as u16; + + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[target_feature(enable = "sse4.2")] +pub unsafe fn match_header_value_vectored(bytes: &mut Bytes) { + while bytes.as_ref().len() >= 16 { + let advance = match_header_value_char_16_sse(bytes.as_ref()); + bytes.advance(advance); + + if advance != 16 { + return; + } + } + super::swar::match_header_value_vectored(bytes); +} + +#[inline(always)] +#[allow(non_snake_case)] +unsafe fn match_header_value_char_16_sse(buf: &[u8]) -> usize { + debug_assert!(buf.len() >= 16); + + #[cfg(target_arch = "x86")] + use core::arch::x86::*; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::*; + + let ptr = buf.as_ptr(); + + // %x09 %x20-%x7e %x80-%xff + let TAB: __m128i = _mm_set1_epi8(0x09); + let DEL: __m128i = _mm_set1_epi8(0x7f); + let LOW: __m128i = _mm_set1_epi8(0x20); + + let dat = _mm_lddqu_si128(ptr as *const _); + // unsigned comparison dat >= LOW + let low = _mm_cmpeq_epi8(_mm_max_epu8(dat, LOW), dat); + let tab = _mm_cmpeq_epi8(dat, TAB); + let del = _mm_cmpeq_epi8(dat, DEL); + let bit = _mm_andnot_si128(del, _mm_or_si128(low, tab)); + let res = _mm_movemask_epi8(bit) as u16; + + // TODO: use .trailing_ones() once MSRV >= 1.46 + (!res).trailing_zeros() as usize +} + +#[test] +fn sse_code_matches_uri_chars_table() { + if 
!is_x86_feature_detected!("sse4.2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_uri_vectored)); + + for (b, allowed) in crate::URI_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_uri_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[test] +fn sse_code_matches_header_value_chars_table() { + if !is_x86_feature_detected!("sse4.2") { + return; + } + + #[allow(clippy::undocumented_unsafe_blocks)] + unsafe { + assert!(byte_is_allowed(b'_', match_header_value_vectored)); + + for (b, allowed) in crate::HEADER_VALUE_MAP.iter().cloned().enumerate() { + assert_eq!( + byte_is_allowed(b as u8, match_header_value_vectored), allowed, + "byte_is_allowed({:?}) should be {:?}", b, allowed, + ); + } + } +} + +#[allow(clippy::missing_safety_doc)] +#[cfg(test)] +unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool { + let slice = [ + b'_', b'_', b'_', b'_', + b'_', b'_', b'_', b'_', + b'_', b'_', byte, b'_', + b'_', b'_', b'_', b'_', + ]; + let mut bytes = Bytes::new(&slice); + + f(&mut bytes); + + match bytes.pos() { + 16 => true, + 10 => false, + _ => unreachable!(), + } +} diff --git a/vendor/httparse/src/simd/swar.rs b/vendor/httparse/src/simd/swar.rs new file mode 100644 index 00000000..e1028d05 --- /dev/null +++ b/vendor/httparse/src/simd/swar.rs @@ -0,0 +1,235 @@ +/// SWAR: SIMD Within A Register +/// SIMD validator backend that validates register-sized chunks of data at a time. +use crate::{is_header_name_token, is_header_value_token, is_uri_token, Bytes}; + +// Adapt block-size to match native register size, i.e: 32bit => 4, 64bit => 8 +const BLOCK_SIZE: usize = core::mem::size_of::<usize>(); +type ByteBlock = [u8; BLOCK_SIZE]; + +#[inline] +pub fn match_uri_vectored(bytes: &mut Bytes) { + loop { + if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_uri_char_8_swar(bytes8); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. + unsafe { + bytes.advance(n); + } + if n == BLOCK_SIZE { + continue; + } + } + if let Some(b) = bytes.peek() { + if is_uri_token(b) { + // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte + // in bytes, so calling advance is safe. + unsafe { + bytes.advance(1); + } + continue; + } + } + break; + } +} + +#[inline] +pub fn match_header_value_vectored(bytes: &mut Bytes) { + loop { + if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_header_value_char_8_swar(bytes8); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. + unsafe { + bytes.advance(n); + } + if n == BLOCK_SIZE { + continue; + } + } + if let Some(b) = bytes.peek() { + if is_header_value_token(b) { + // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte + // in bytes, so calling advance is safe. + unsafe { + bytes.advance(1); + } + continue; + } + } + break; + } +} + +#[inline] +pub fn match_header_name_vectored(bytes: &mut Bytes) { + while let Some(block) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { + let n = match_block(is_header_name_token, block); + // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes + // in `bytes`, so calling `advance(n)` is safe. 
+ unsafe { + bytes.advance(n); + } + if n != BLOCK_SIZE { + return; + } + } + // SAFETY: match_tail processes at most the remaining data in `bytes`. advances `bytes` to the + // end, but no further. + unsafe { bytes.advance(match_tail(is_header_name_token, bytes.as_ref())) }; +} + +// Matches "tail", i.e: when we have <BLOCK_SIZE bytes in the buffer, should be uncommon +#[cold] +#[inline] +fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize { + for (i, &b) in bytes.iter().enumerate() { + if !f(b) { + return i; + } + } + bytes.len() +} + +// Naive fallback block matcher +#[inline(always)] +fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize { + for (i, &b) in block.iter().enumerate() { + if !f(b) { + return i; + } + } + BLOCK_SIZE +} + +// A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44) +// creates a u64 whose bytes are each equal to b +const fn uniform_block(b: u8) -> usize { + (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize +} + +// A byte-wise range-check on an entire word/block, +// ensuring all bytes in the word satisfy `33 <= (x != 127) <= 255` +#[inline] +fn match_uri_char_8_swar(block: ByteBlock) -> usize { + // 33 <= (x != 127) <= 255 + const M: u8 = 0x21; + // uniform block full of exclamation mark (!) (33). + const BM: usize = uniform_block(M); + // uniform block full of 1. + const ONE: usize = uniform_block(0x01); + // uniform block full of DEL (127). + const DEL: usize = uniform_block(0x7f); + // uniform block full of 128. + const M128: usize = uniform_block(128); + + let x = usize::from_ne_bytes(block); // Really just a transmute + let lt = x.wrapping_sub(BM) & !x; // <= m + + let xor_del = x ^ DEL; + let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL + + offsetnz((lt | eq_del) & M128) +} + +// A byte-wise range-check on an entire word/block, +// ensuring all bytes in the word satisfy `32 <= (x != 127) <= 255` +#[inline] +fn match_header_value_char_8_swar(block: ByteBlock) -> usize { + // 32 <= (x != 127) <= 255 + const M: u8 = 0x20; + // uniform block full of exclamation mark (!) (33). + const BM: usize = uniform_block(M); + // uniform block full of 1. + const ONE: usize = uniform_block(0x01); + // uniform block full of DEL (127). + const DEL: usize = uniform_block(0x7f); + // uniform block full of 128. 
+ const M128: usize = uniform_block(128); + + let x = usize::from_ne_bytes(block); // Really just a transmute + let lt = x.wrapping_sub(BM) & !x; // <= m + + let xor_del = x ^ DEL; + let eq_del = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL + + offsetnz((lt | eq_del) & M128) +} + +/// Check block to find offset of first non-zero byte +// NOTE: Curiously `block.trailing_zeros() >> 3` appears to be slower, maybe revisit +#[inline] +fn offsetnz(block: usize) -> usize { + // fast path optimistic case (common for long valid sequences) + if block == 0 { + return BLOCK_SIZE; + } + + // perf: rust will unroll this loop + for (i, b) in block.to_ne_bytes().iter().copied().enumerate() { + if b != 0 { + return i; + } + } + unreachable!() +} + +#[test] +fn test_is_header_value_block() { + let is_header_value_block = |b| match_header_value_char_8_swar(b) == BLOCK_SIZE; + + // 0..32 => false + for b in 0..32_u8 { + assert!(!is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + // 32..=126 => true + for b in 32..=126_u8 { + assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + // 127 => false + assert!(!is_header_value_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F'); + // 128..=255 => true + for b in 128..=255_u8 { + assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}", b); + } + + + #[cfg(target_pointer_width = "64")] + { + // A few sanity checks on non-uniform bytes for safe-measure + assert!(!is_header_value_block(*b"foo.com\n")); + assert!(!is_header_value_block(*b"o.com\r\nU")); + } +} + +#[test] +fn test_is_uri_block() { + let is_uri_block = |b| match_uri_char_8_swar(b) == BLOCK_SIZE; + + // 0..33 => false + for b in 0..33_u8 { + assert!(!is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } + // 33..=126 => true + for b in 33..=126_u8 { + assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } + // 127 => false + assert!(!is_uri_block([b'\x7F'; BLOCK_SIZE]), "b={}", b'\x7F'); + // 128..=255 => true + for b in 128..=255_u8 { + assert!(is_uri_block([b; BLOCK_SIZE]), "b={}", b); + } +} + +#[test] +fn test_offsetnz() { + let seq = [0_u8; BLOCK_SIZE]; + for i in 0..BLOCK_SIZE { + let mut seq = seq; + seq[i] = 1; + let x = usize::from_ne_bytes(seq); + assert_eq!(offsetnz(x), i); + } +} |
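For targets without usable SIMD, `swar.rs` performs the same byte classification with plain integer arithmetic: load a register-sized block, use the classic "byte less than N" and "byte equal to N" subtraction tricks to set the high bit of every invalid byte, and report the offset of the first such byte. Here is a standalone sketch of the URI check, fixed to `u64` for clarity (the vendored code works on `usize` so the block width matches the native register) and with illustrative function names:

```rust
/// Builds a word whose eight bytes all equal `b`.
const fn uniform_block(b: u8) -> u64 {
    b as u64 * 0x0101_0101_0101_0101
}

/// Offset of the first byte that is not a valid URI character
/// (valid: 0x21..=0xFF, excluding DEL 0x7F), or 8 if all bytes pass.
fn match_uri_char_8(block: [u8; 8]) -> usize {
    const LOW: u64 = uniform_block(0x21); // '!', the smallest valid byte
    const ONE: u64 = uniform_block(0x01);
    const DEL: u64 = uniform_block(0x7f);
    const MSB: u64 = uniform_block(0x80);

    let x = u64::from_ne_bytes(block);
    // High bit set in `lt` for bytes below 0x21 ("has a byte less than N").
    let lt = x.wrapping_sub(LOW) & !x;
    // High bit set in `eq_del` for bytes equal to 0x7F ("has a zero byte" on x ^ DEL).
    let xor_del = x ^ DEL;
    let eq_del = xor_del.wrapping_sub(ONE) & !xor_del;

    // Keep only the per-byte high bits, then locate the first offender.
    offset_of_first_nonzero_byte((lt | eq_del) & MSB)
}

fn offset_of_first_nonzero_byte(word: u64) -> usize {
    word.to_ne_bytes()
        .iter()
        .position(|&b| b != 0)
        .unwrap_or(8) // 8 => every byte passed the check
}

fn main() {
    assert_eq!(match_uri_char_8(*b"/index.h"), 8); // all valid
    assert_eq!(match_uri_char_8(*b"/a b/c/d"), 2); // space rejected at offset 2
    assert_eq!(match_uri_char_8([0x7f; 8]), 0);    // DEL rejected immediately
    println!("swar sketch checks passed");
}
```

The caller advances by the returned offset and, whenever that offset is smaller than the block size, finishes the remainder with the scalar per-byte check, as `match_uri_vectored` in `swar.rs` does above.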
