chore: vendor dependencies

author: mo khan <mo@mokhan.ca> 2025-07-10 13:11:11 -0600
committer: mo khan <mo@mokhan.ca> 2025-07-10 13:11:11 -0600
commit: 01959b16a21b22b5df5f16569c2a8e8f92beecef (patch)
tree: 32afa5d747c5466345c59ec52161a7cba3d6d755 /vendor/iri-string/src/normalize
parent: ff30574117a996df332e23d1fb6f65259b316b5b (diff)
3 files changed, 1004 insertions, 0 deletions
diff --git a/vendor/iri-string/src/normalize/error.rs b/vendor/iri-string/src/normalize/error.rs
new file mode 100644
index 00000000..a5c5c895
--- /dev/null
+++ b/vendor/iri-string/src/normalize/error.rs
@@ -0,0 +1,26 @@
+//! Normalization and resolution error.
+
+use core::fmt;
+
+/// IRI normalization and resolution error.
+///
+/// For detail about resolution failure, see [the module documentation][`crate::resolve`].
+#[derive(Debug, Clone)]
+pub struct Error(());
+
+impl Error {
+    /// Creates a new error.
+    pub(crate) fn new() -> Self {
+        Self(())
+    }
+}
+
+impl fmt::Display for Error {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.write_str("unresolvable IRI")
+    }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {}
diff --git a/vendor/iri-string/src/normalize/path.rs b/vendor/iri-string/src/normalize/path.rs
new file mode 100644
index 00000000..4f3e3397
--- /dev/null
+++ b/vendor/iri-string/src/normalize/path.rs
@@ -0,0 +1,620 @@
+//! Path normalization.
+
+use core::fmt;
+use core::ops::Range;
+
+use crate::parser::str::{find_split_hole, rfind};
+use crate::spec::{Spec, UriSpec};
+
+use super::pct_case::PctCaseNormalized;
+use super::{Error, NormalizationMode, NormalizationOp};
+
+/// Path that is (possibly) not yet processed or being processed.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum Path<'a> {
+    /// The result. No more processing is needed.
+    Done(&'a str),
+    /// Not yet completely processed path.
+    NeedsProcessing(PathToNormalize<'a>),
+}
+
+/// Path that needs merge and/or dot segment removal.
+///
+/// # Invariants
+///
+/// If the first field (prefix field) is not `None`, it must end with a slash.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct PathToNormalize<'a>(Option<&'a str>, &'a str);
+
+impl<'a> PathToNormalize<'a> {
+    /// Creates a `PathToNormalize` from the given single path.
+    #[inline]
+    #[must_use]
+    pub(crate) fn from_single_path(path: &'a str) -> Self {
+        Self(None, path)
+    }
+
+    /// Creates a `PathToNormalize` from the given base and reference paths to be resolved.
+    #[must_use]
+    pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self {
+        if reference.starts_with('/') {
+            return Self(None, reference);
+        }
+
+        match rfind(base.as_bytes(), b'/') {
+            Some(last_slash_pos) => Self(Some(&base[..=last_slash_pos]), reference),
+            None => Self(None, reference),
+        }
+    }
+
+    /// Returns true if the path is empty string.
+    #[inline]
+    #[must_use]
+    fn is_empty(&self) -> bool {
+        // If `self.0` is `Some(_)`, it ends with a slash, i.e. it is not empty.
+        self.0.is_none() && self.1.is_empty()
+    }
+
+    /// Returns the length of the not yet normalized path.
+    #[inline]
+    #[must_use]
+    pub(super) fn len(&self) -> usize {
+        self.len_prefix() + self.1.len()
+    }
+
+    /// Returns the length of the prefix part.
+    ///
+    /// Returns 0 if the prefix part is empty.
+    #[inline]
+    #[must_use]
+    fn len_prefix(&self) -> usize {
+        self.0.map_or(0, |s| s.len())
+    }
+
+    /// Returns a byte at the given position.
+    #[must_use]
+    fn byte_at(&self, mut i: usize) -> Option<u8> {
+        if let Some(prefix) = self.0 {
+            if i < prefix.len() {
+                return Some(prefix.as_bytes()[i]);
+            }
+            i -= prefix.len();
+        }
+        self.1.as_bytes().get(i).copied()
+    }
+
+    /// Returns the position of the first slash at or after the given position.
+    #[must_use]
+    fn find_next_slash(&self, scan_start: usize) -> Option<usize> {
+        if let Some(prefix) = self.0 {
+            let prefix_len = prefix.len();
+            if scan_start < prefix_len {
+                prefix[scan_start..].find('/').map(|rel| rel + scan_start)
+            } else {
+                let local_i = scan_start - prefix_len;
+                self.1[local_i..].find('/').map(|rel| rel + scan_start)
+            }
+        } else {
+            self.1[scan_start..].find('/').map(|rel| rel + scan_start)
+        }
+    }
+
+    /// Removes the first `len` bytes from the beginning of `self`.
+    fn remove_start(&mut self, len: usize) {
+        if let Some(prefix) = self.0 {
+            if let Some(suffix_trim_len) = len.checked_sub(prefix.len()) {
+                self.0 = None;
+                self.1 = &self.1[suffix_trim_len..];
+            } else {
+                self.0 = Some(&prefix[len..]);
+            }
+        } else {
+            self.1 = &self.1[len..];
+        }
+    }
+
+    /// Removes the prefix segments that are ignorable on normalization.
+    // Skips the prefix dot segments without leading slashes (such as `./`,
+    // `../`, and `../.././`).
+    // This is necessary because such segments should be removed with the
+    // FOLLOWING slashes, not leading slashes.
+    fn remove_ignorable_prefix(&mut self) {
+        while let Some(seg) = PathSegmentsIter::new(self).next() {
+            if seg.has_leading_slash {
+                // The first segment starting with a slash is not target.
+                break;
+            }
+            match seg.kind(self) {
+                SegmentKind::Dot | SegmentKind::DotDot => {
+                    // Attempt to skip the following slash by `+ 1`.
+                    let skip = self.len().min(seg.range.end + 1);
+                    self.remove_start(skip);
+                }
+                SegmentKind::Normal => break,
+            }
+        }
+    }
+}
+
+impl PathToNormalize<'_> {
+    /// Writes the normalized path.
+    pub(crate) fn fmt_write_normalize<S: Spec, W: fmt::Write>(
+        &self,
+        f: &mut W,
+        op: NormalizationOp,
+        authority_is_present: bool,
+    ) -> fmt::Result {
+        debug_assert!(
+            self.0.map_or(true, |s| s.ends_with('/')),
+            "[validity] the prefix field of `PathToNormalize` should end with a slash"
+        );
+
+        if self.is_empty() {
+            return Ok(());
+        }
+
+        if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath)
+            && !authority_is_present
+            && self.byte_at(0) != Some(b'/')
+        {
+            // Treat the path as "opaque", i.e. do not apply dot segments removal.
+            // See <https://github.com/lo48576/iri-string/issues/29>.
+            debug_assert!(
+                op.mode.case_pct_normalization(),
+                "[consistency] case/pct normalization should still be applied"
+            );
+            if let Some(prefix) = self.0 {
+                write!(f, "{}", PctCaseNormalized::<S>::new(prefix))?;
+            }
+            write!(f, "{}", PctCaseNormalized::<S>::new(self.1))?;
+            return Ok(());
+        }
+
+        let mut rest = *self;
+
+        // Skip the prefix dot segments without leading slashes (such as `./`,
+        // `../`, and `../.././`).
+        // This is necessary because such segments should be removed with the
+        // FOLLOWING slashes, not leading slashes.
+        rest.remove_ignorable_prefix();
+        if rest.is_empty() {
+            // Path consists of only `./`s and `../`s (possibly ending with `.` or `..`).
+            // In this case, if the authority component is present, the result
+            // should be `/`, not empty.
+            if authority_is_present {
+                f.write_char('/')?;
+            }
+            return Ok(());
+        }
+
+        // None: No segments are written yet.
+        // Some(false): Something other than `/` is already written as the path.
+        // Some(true): Only a `/` is written as the path.
+        let mut only_a_slash_is_written = None;
+        let mut too_deep_area_may_have_dot_segments = true;
+        while !rest.is_empty() && too_deep_area_may_have_dot_segments {
+            /// The size of the queue to track the path segments.
+            ///
+            /// This should be nonzero.
+            const QUEUE_SIZE: usize = 8;
+
+            {
+                // Skip `/.` and `/..` segments at the head.
+                let mut skipped_len = 0;
+                for seg in PathSegmentsIter::new(&rest) {
+                    match seg.kind(&rest) {
+                        SegmentKind::Dot | SegmentKind::DotDot => {
+                            debug_assert!(
+                                seg.has_leading_slash,
+                                "[consistency] `.` or `..` segments without a
+                                 leading slash have already been skipped"
+                            );
+                            skipped_len = seg.range.end;
+                        }
+                        _ => break,
+                    }
+                }
+                rest.remove_start(skipped_len);
+                if rest.is_empty() {
+                    // Finished with a dot segment.
+                    // The last `/.` or `/..` should be replaced with `/`.
+                    if !authority_is_present && (only_a_slash_is_written == Some(true)) {
+                        // Insert a dot segment to break the prefix `//`.
+                        // Without this, the path starts with `//` and it may
+                        // be confused with the prefix of an authority.
+                        f.write_str(".//")?;
+                    } else {
+                        f.write_char('/')?;
+                    }
+                    break;
+                }
+            }
+
+            let mut queue: [Option<&'_ str>; QUEUE_SIZE] = Default::default();
+            let mut level: usize = 0;
+            let mut first_segment_has_leading_slash = false;
+
+            // Find higher path segments.
+            let mut end = 0;
+            for seg in PathSegmentsIter::new(&rest) {
+                let kind = seg.kind(&rest);
+                match kind {
+                    SegmentKind::Dot => {
+                        too_deep_area_may_have_dot_segments = true;
+                    }
+                    SegmentKind::DotDot => {
+                        level = level.saturating_sub(1);
+                        too_deep_area_may_have_dot_segments = true;
+                        if level < queue.len() {
+                            queue[level] = None;
+                        }
+                    }
+                    SegmentKind::Normal => {
+                        if level < queue.len() {
+                            queue[level] = Some(seg.segment(&rest));
+                            too_deep_area_may_have_dot_segments = false;
+                            end = seg.range.end;
+                            if level == 0 {
+                                first_segment_has_leading_slash = seg.has_leading_slash;
+                            }
+                        }
+                        level += 1;
+                    }
+                }
+            }
+
+            // Write as many path segments as possible, and update the internal state.
+            for segname in queue.iter().flatten() {
+                Self::emit_segment::<S, _>(
+                    f,
+                    &mut only_a_slash_is_written,
+                    first_segment_has_leading_slash,
+                    segname,
+                    authority_is_present,
+                    op,
+                )?;
+            }
+
+            rest.remove_start(end);
+        }
+
+        if !rest.is_empty() {
+            // No need of searching dot segments anymore.
+            assert!(
+                !too_deep_area_may_have_dot_segments,
+                "[consistency] loop condition of the previous loop"
+            );
+            // Apply only normalization (if needed).
+            for seg in PathSegmentsIter::new(&rest) {
+                assert_eq!(
+                    seg.kind(&rest),
+                    SegmentKind::Normal,
+                    "[consistency] already confirmed that there are no more dot segments"
+                );
+                let segname = seg.segment(&rest);
+                Self::emit_segment::<S, _>(
+                    f,
+                    &mut only_a_slash_is_written,
+                    seg.has_leading_slash,
+                    segname,
+                    authority_is_present,
+                    op,
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Emits a non-dot segment and update the current state.
+    //
+    // `first_segment_has_leading_slash` can be any value if the segment is not the first one.
+    fn emit_segment<S: Spec, W: fmt::Write>(
+        f: &mut W,
+        only_a_slash_is_written: &mut Option<bool>,
+        first_segment_has_leading_slash: bool,
+        segname: &str,
+        authority_is_present: bool,
+        op: NormalizationOp,
+    ) -> fmt::Result {
+        // Omit the leading slash of the segment only if the segment is
+        // the first one and marked as not having a leading slash.
+        match *only_a_slash_is_written {
+            None => {
+                // First segment.
+                // This pass can be possible if `./` is repeated `QUEUE_SIZE`
+                // times at the beginning.
+                if first_segment_has_leading_slash {
+                    f.write_char('/')?;
+                }
+                *only_a_slash_is_written =
+                    Some(first_segment_has_leading_slash && segname.is_empty());
+            }
+            Some(only_a_slash) => {
+                if only_a_slash && !authority_is_present {
+                    // Apply serialization like WHATWG URL Standard.
+                    // This prevents `<scheme=foo>:<path=//bar>` from being written as
+                    // `foo://bar`, which is interpreted as
+                    // `<scheme=foo>://<authority=bar>`. Prepending `./`, the
+                    // serialization result would be `foo:/.//bar`, which is safe.
+                    f.write_str("./")?;
+                    *only_a_slash_is_written = Some(false);
+                }
+                f.write_char('/')?;
+            }
+        }
+
+        // Write the segment name.
+        if op.mode.case_pct_normalization() {
+            write!(f, "{}", PctCaseNormalized::<S>::new(segname))
+        } else {
+            f.write_str(segname)
+        }
+    }
+
+    /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent.
+    ///
+    /// Returns `Ok(())` when normalizable, returns `Err(_)` if not.
+    pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> {
+        /// A sink to get the prefix of the input.
+        #[derive(Default)]
+        struct PrefixRetriever {
+            /// The buffer to remember the prefix of the input.
+            buf: [u8; 3],
+            /// The next write position in the buffer.
+            cursor: usize,
+        }
+        impl PrefixRetriever {
+            /// Returns the read prefix data.
+            #[inline]
+            #[must_use]
+            fn as_bytes(&self) -> &[u8] {
+                &self.buf[..self.cursor]
+            }
+        }
+        impl fmt::Write for PrefixRetriever {
+            fn write_str(&mut self, s: &str) -> fmt::Result {
+                if !s.is_empty() && (self.cursor >= self.buf.len()) {
+                    // Enough bytes are read.
+                    return Err(fmt::Error);
+                }
+                self.buf[self.cursor..]
+                    .iter_mut()
+                    .zip(s.bytes())
+                    .for_each(|(dest, src)| *dest = src);
+                self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len());
+                Ok(())
+            }
+        }
+
+        let mut prefix = PrefixRetriever::default();
+        // The failure of this write indicates more than 3 characters are read.
+        // This is safe to ignore since the check needs only 3 characters.
+        let _ = self.fmt_write_normalize::<UriSpec, _>(
+            &mut prefix,
+            NormalizationOp {
+                mode: NormalizationMode::None,
+            },
+            // Assume the authority is absent.
+            false,
+        );
+
+        if prefix.as_bytes() == b"/./" {
+            Err(Error::new())
+        } else {
+            Ok(())
+        }
+    }
+}
+
+/// Characteristic of a path.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum PathCharacteristic {
+    /// Absolute path, not special.
+    CommonAbsolute,
+    /// Relative path, not special.
+    CommonRelative,
+    /// The first path segment of the relative path has one or more colon characters.
+    RelativeFirstSegmentHasColon,
+    /// The path starts with the double slash.
+    StartsWithDoubleSlash,
+}
+
+impl PathCharacteristic {
+    /// Returns true if the path is absolute.
+    #[inline]
+    #[must_use]
+    pub(crate) fn is_absolute(self) -> bool {
+        matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash)
+    }
+
+    /// Returns the characteristic of the path.
+    pub(crate) fn from_path_to_display<S: Spec>(
+        path: &PathToNormalize<'_>,
+        op: NormalizationOp,
+        authority_is_present: bool,
+    ) -> Self {
+        /// Dummy writer to get necessary values.
+        #[derive(Default, Clone, Copy)]
+        struct Writer {
+            /// Result.
+            result: Option<PathCharacteristic>,
+            /// Whether the normalized path is absolute.
+            is_absolute: Option<bool>,
+        }
+        impl fmt::Write for Writer {
+            fn write_str(&mut self, mut s: &str) -> fmt::Result {
+                if self.result.is_some() {
+                    // Nothing more to do.
+                    return Err(fmt::Error);
+                }
+                while !s.is_empty() {
+                    if self.is_absolute.is_none() {
+                        // The first input.
+                        match s.strip_prefix('/') {
+                            Some(rest) => {
+                                self.is_absolute = Some(true);
+                                s = rest;
+                            }
+                            None => {
+                                self.is_absolute = Some(false);
+                            }
+                        }
+                        continue;
+                    }
+                    if self.is_absolute == Some(true) {
+                        let result = if s.starts_with('/') {
+                            PathCharacteristic::StartsWithDoubleSlash
+                        } else {
+                            PathCharacteristic::CommonAbsolute
+                        };
+                        self.result = Some(result);
+                        return Err(fmt::Error);
+                    }
+                    // Processing the first segment of the relative path.
+                    match find_split_hole(s, b'/') {
+                        Some((first_seg, _rest)) => {
+                            let result = if first_seg.contains(':') {
+                                PathCharacteristic::RelativeFirstSegmentHasColon
+                            } else {
+                                PathCharacteristic::CommonRelative
+                            };
+                            self.result = Some(result);
+                            return Err(fmt::Error);
+                        }
+                        None => {
+                            // `s` might not be the complete first segment.
+                            if s.contains(':') {
+                                self.result =
+                                    Some(PathCharacteristic::RelativeFirstSegmentHasColon);
+                                return Err(fmt::Error);
+                            }
+                            break;
+                        }
+                    }
+                }
+                Ok(())
+            }
+        }
+
+        let mut writer = Writer::default();
+        match path.fmt_write_normalize::<S, _>(&mut writer, op, authority_is_present) {
+            // Empty path.
+            Ok(_) => PathCharacteristic::CommonRelative,
+            Err(_) => writer
+                .result
+                .expect("[consistency] the formatting quits early by `Err` when the check is done"),
+        }
+    }
+}
+
+/// Path segment kind.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum SegmentKind {
+    /// `.` or the equivalents.
+    Dot,
+    /// `..` or the equivalents.
+    DotDot,
+    /// Other normal (not special) segments.
+    Normal,
+}
+
+impl SegmentKind {
+    /// Creates a new `SegmentKind` from the given segment name.
+    #[must_use]
+    fn from_segment(s: &str) -> Self {
+        match s {
+            "." | "%2E" | "%2e" => SegmentKind::Dot,
+            ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E"
+            | "%2e%2e" => SegmentKind::DotDot,
+            _ => SegmentKind::Normal,
+        }
+    }
+}
+
+/// A segment with optional leading slash.
+#[derive(Debug, Clone)]
+struct PathSegment {
+    /// Presence of a leading slash.
+    has_leading_slash: bool,
+    /// Range of the segment name (without any slashes).
+    range: Range<usize>,
+}
+
+impl PathSegment {
+    /// Returns the segment without any slashes.
+    #[inline]
+    #[must_use]
+    fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str {
+        if let Some(prefix) = path.0 {
+            let prefix_len = prefix.len();
+            if self.range.end <= prefix_len {
+                &prefix[self.range.clone()]
+            } else {
+                let range = (self.range.start - prefix_len)..(self.range.end - prefix_len);
+                &path.1[range]
+            }
+        } else {
+            &path.1[self.range.clone()]
+        }
+    }
+
+    /// Returns the segment kind.
+    #[inline]
+    #[must_use]
+    fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind {
+        SegmentKind::from_segment(self.segment(path))
+    }
+}
+
+/// Iterator of path segments.
+struct PathSegmentsIter<'a> {
+    /// Path.
+    path: &'a PathToNormalize<'a>,
+    /// Current cursor position.
+    cursor: usize,
+}
+
+impl<'a> PathSegmentsIter<'a> {
+    /// Creates a new iterator of path segments.
+    #[inline]
+    #[must_use]
+    fn new(path: &'a PathToNormalize<'a>) -> Self {
+        Self { path, cursor: 0 }
+    }
+}
+
+impl Iterator for PathSegmentsIter<'_> {
+    type Item = PathSegment;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let path_len = self.path.len();
+        if self.cursor >= path_len {
+            return None;
+        }
+        let has_leading_slash = self.path.byte_at(self.cursor) == Some(b'/');
+
+        let prefix_len = self.path.len_prefix();
+        if (prefix_len != 0) && (self.cursor == prefix_len - 1) {
+            debug_assert!(has_leading_slash);
+            let end = self.path.1.find('/').unwrap_or(self.path.1.len()) + prefix_len;
+            self.cursor = end;
+            return Some(PathSegment {
+                has_leading_slash,
+                range: prefix_len..end,
+            });
+        }
+
+        if has_leading_slash {
+            // Skip the leading slash.
+            self.cursor += 1;
+        };
+        let start = self.cursor;
+        self.cursor = self.path.find_next_slash(self.cursor).unwrap_or(path_len);
+
+        Some(PathSegment {
+            has_leading_slash,
+            range: start..self.cursor,
+        })
+    }
+}
diff --git a/vendor/iri-string/src/normalize/pct_case.rs b/vendor/iri-string/src/normalize/pct_case.rs
new file mode 100644
index 00000000..75e0a777
--- /dev/null
+++ b/vendor/iri-string/src/normalize/pct_case.rs
@@ -0,0 +1,358 @@
+//! Percent-encoding normalization and case normalization.
+
+use core::cmp::Ordering;
+use core::fmt::{self, Write as _};
+use core::marker::PhantomData;
+
+use crate::format::eq_str_display;
+use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue};
+use crate::parser::str::{find_split_hole, take_first_char};
+use crate::parser::trusted::take_xdigits2;
+use crate::spec::Spec;
+
+/// Returns true if the given string is percent-encoding normalized and case
+/// normalized.
+///
+/// Note that normalization of ASCII-only host requires additional case
+/// normalization, so checking by this function is not sufficient for that case.
+pub(crate) fn is_pct_case_normalized<S: Spec>(s: &str) -> bool {
+    eq_str_display(s, &PctCaseNormalized::<S>::new(s))
+}
+
+/// Returns a character for the slice.
+///
+/// Essentially equivalent to `core::str::from_utf8(bytes).ok().and_then(|s| s.chars().next())`,
+/// but this function fully trusts that the input is a valid UTF-8 string with
+/// only one character.
+fn into_char_trusted(bytes: &[u8]) -> Result<char, ()> {
+    /// The bit mask to get the content part in a continue byte.
+    const CONTINUE_BYTE_MASK: u8 = 0b_0011_1111;
+    /// Minimum valid values for a code point in a UTF-8 sequence of 2, 3, and 4 bytes.
+    const MIN: [u32; 3] = [0x80, 0x800, 0x1_0000];
+
+    let len = bytes.len();
+    let c: u32 = match len {
+        2 => (u32::from(bytes[0] & 0b_0001_1111) << 6) | u32::from(bytes[1] & CONTINUE_BYTE_MASK),
+        3 => {
+            (u32::from(bytes[0] & 0b_0000_1111) << 12)
+                | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 6)
+                | u32::from(bytes[2] & CONTINUE_BYTE_MASK)
+        }
+        4 => {
+            (u32::from(bytes[0] & 0b_0000_0111) << 18)
+                | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 12)
+                | (u32::from(bytes[2] & CONTINUE_BYTE_MASK) << 6)
+                | u32::from(bytes[3] & CONTINUE_BYTE_MASK)
+        }
+        len => unreachable!(
+            "[consistency] expected 2, 3, or 4 bytes for a character, but got {len} as the length"
+        ),
+    };
+    if c < MIN[len - 2] {
+        // Redundant UTF-8 encoding.
+        return Err(());
+    }
+    // Can be an invalid Unicode code point.
+    char::from_u32(c).ok_or(())
+}
+
+/// Writable as a normalized path segment percent-encoding IRI.
+///
+/// This wrapper does the things below when being formatted:
+///
+/// * Decode unnecessarily percent-encoded characters.
+/// * Convert alphabetic characters in percent-encoded triplets to uppercase.
+///
+/// Note that this does not newly encode raw characters.
+///
+/// # Safety
+///
+/// The given string should be the valid path segment.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct PctCaseNormalized<'a, S> {
+    /// Valid segment name to normalize.
+    segname: &'a str,
+    /// Spec.
+    _spec: PhantomData<fn() -> S>,
+}
+
+impl<'a, S: Spec> PctCaseNormalized<'a, S> {
+    /// Creates a new `PctCaseNormalized` value.
+    #[inline]
+    #[must_use]
+    pub(crate) fn new(source: &'a str) -> Self {
+        Self {
+            segname: source,
+            _spec: PhantomData,
+        }
+    }
+}
+
+impl<S: Spec> fmt::Display for PctCaseNormalized<'_, S> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let mut rest = self.segname;
+
+        'outer_loop: while !rest.is_empty() {
+            // Scan the next percent-encoded triplet.
+            let (prefix, after_percent) = match find_split_hole(rest, b'%') {
+                Some(v) => v,
+                None => return f.write_str(rest),
+            };
+            // Write the string before the percent-encoded triplet.
+            f.write_str(prefix)?;
+            // Decode the percent-encoded triplet.
+            let (first_decoded, after_first_triplet) = take_xdigits2(after_percent);
+            rest = after_first_triplet;
+
+            if first_decoded.is_ascii() {
+                if is_ascii_unreserved(first_decoded) {
+                    // Unreserved. Print the decoded.
+                    f.write_char(char::from(first_decoded))?;
+                } else {
+                    write!(f, "%{:02X}", first_decoded)?;
+                }
+                continue 'outer_loop;
+            }
+
+            // Continue byte cannot be the first byte of a character.
+            if is_utf8_byte_continue(first_decoded) {
+                write!(f, "%{:02X}", first_decoded)?;
+                continue 'outer_loop;
+            }
+
+            // Get the expected length of decoded char.
+            let expected_char_len = match (first_decoded & 0xf0).cmp(&0b1110_0000) {
+                Ordering::Less => 2,
+                Ordering::Equal => 3,
+                Ordering::Greater => 4,
+            };
+
+            // Get continue bytes.
+            let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len];
+            for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() {
+                match take_first_char(rest) {
+                    Some(('%', after_percent)) => {
+                        let (byte, after_triplet) = take_xdigits2(after_percent);
+                        if !is_utf8_byte_continue(byte) {
+                            // Note that `byte` can start the new string.
+                            // Leave the byte in the `rest` for next try (i.e.
+                            // don't update `rest` in this case).
+                            c_buf[..=i]
+                                .iter()
+                                .try_for_each(|b| write!(f, "%{:02X}", b))?;
+                            continue 'outer_loop;
+                        }
+                        *buf_dest = byte;
+                        rest = after_triplet;
+                    }
+                    // If the next character is not `%`, decoded bytes so far
+                    // won't be valid UTF-8 byte sequence.
+                    // Write the read percent-encoded triplets without decoding.
+                    // Note that all characters in `&c_buf[1..]` (if available)
+                    // will be decoded to "continue byte" of UTF-8, so they
+                    // cannot be the start of a valid UTF-8 byte sequence if
+                    // decoded.
+                    Some((c, after_percent)) => {
+                        c_buf[..=i]
+                            .iter()
+                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
+                        f.write_char(c)?;
+                        rest = after_percent;
+                        continue 'outer_loop;
+                    }
+                    None => {
+                        c_buf[..=i]
+                            .iter()
+                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
+                        // Reached the end of the string.
+                        break 'outer_loop;
+                    }
+                }
+            }
+
+            // Decode the bytes into a character.
+            match into_char_trusted(&c_buf[..expected_char_len]) {
+                Ok(decoded_c) => {
+                    if is_unreserved::<S>(decoded_c) {
+                        // Unreserved. Print the decoded.
+                        f.write_char(decoded_c)?;
+                    } else {
+                        c_buf[0..expected_char_len]
+                            .iter()
+                            .try_for_each(|b| write!(f, "%{:02X}", b))?;
+                    }
+                }
+                Err(_) => {
+                    // Skip decoding of the entire sequence of pct-encoded triplets loaded
+                    // in `c_buf`. This is valid from the reasons below.
+                    //
+                    // * The first byte in `c_buf` is valid as the first byte, and it tells the
+                    //   expected number of bytes for a code unit. The cases the bytes being too
+                    //   short and the sequence being incomplete have already been handled, and
+                    //   the execution does not reach here then.
+                    // * All of the non-first bytes are checked if they are valid as UTF8 continue
+                    //   bytes by `is_utf8_byte_continue()`. If they're not, the decoding of
+                    //   that codepoint is aborted and the bytes in the buffer are immediately
+                    //   emitted as pct-encoded, and the execution does not reach here. This
+                    //   means that the bytes in the current `c_buf` have passed these tests.
+                    // * Since all of the non-first bytes are UTF8 continue bytes, none of
+                    //   them can start a new valid UTF-8 byte sequence. This means that
+                    //   if the bytes in the buffer do not constitute a valid UTF-8 byte
+                    //   sequence, the whole buffer can immediately be emitted as pct-encoded.
+
+                    debug_assert!(
+                        c_buf[1..expected_char_len]
+                            .iter()
+                            .copied()
+                            .all(is_utf8_byte_continue),
+                        "[consistency] all non-first bytes have been \
+                         confirmed that they are UTF-8 continue bytes"
+                    );
+                    // Note that the first pct-encoded triplet is stripped from
+                    // `after_first_triplet`.
+                    rest = &after_first_triplet[((expected_char_len - 1) * 3)..];
+                    c_buf[0..expected_char_len]
+                        .iter()
+                        .try_for_each(|b| write!(f, "%{:02X}", b))?;
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Writable as a normalized ASCII-only `host` (and optionally `port` followed).
+///
+/// This is a thin borrowed wrapper; the normalization itself is performed
+/// lazily by the `Display` implementation, which writes literal characters in
+/// ASCII lowercase and re-emits percent-encoded triplets either decoded or
+/// with uppercase hex digits.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct NormalizedAsciiOnlyHost<'a> {
+    /// Valid host (and additionally port) to normalize.
+    host_port: &'a str,
+}
+
+impl<'a> NormalizedAsciiOnlyHost<'a> {
+    /// Creates a new `NormalizedAsciiOnlyHost` value.
+    ///
+    /// The argument is stored as is; no validation is performed here.
+    ///
+    /// # Preconditions
+    ///
+    /// The given string should be the valid ASCII-only `host` or
+    /// `host ":" port` after percent-encoding normalization.
+    /// In other words, [`parser::trusted::is_ascii_only_host`] should return
+    /// true for the given value.
+    ///
+    /// The precondition is not checked by this constructor. Formatting the
+    /// returned value asserts that every percent-encoded triplet decodes to
+    /// an ASCII byte, and panics otherwise.
+    ///
+    /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host`
+    #[inline]
+    #[must_use]
+    pub(crate) fn new(host_port: &'a str) -> Self {
+        Self { host_port }
+    }
+}
+
+impl fmt::Display for NormalizedAsciiOnlyHost<'_> {
+    /// Writes the wrapped `host` (or `host ":" port`) in its normalized form.
+    ///
+    /// Literal characters are written in ASCII lowercase. A percent-encoded
+    /// triplet is decoded (and lowercased) when `is_ascii_unreserved` accepts
+    /// the decoded byte; otherwise it is re-emitted with uppercase hex digits.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a percent-encoded triplet decodes to a non-ASCII byte, since
+    /// that violates the precondition of [`NormalizedAsciiOnlyHost::new`].
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Emits `s` with every character converted to ASCII lowercase.
+        fn write_lowercased(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
+            s.chars()
+                .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))
+        }
+
+        let mut remaining = self.host_port;
+
+        while !remaining.is_empty() {
+            // Split at the next `%`; without one, the rest is purely literal.
+            let (literal, after_percent) = match find_split_hole(remaining, b'%') {
+                Some(parts) => parts,
+                None => return write_lowercased(f, remaining),
+            };
+            write_lowercased(f, literal)?;
+
+            // Consume the two hex digits that follow the `%`.
+            let (octet, tail) = take_xdigits2(after_percent);
+            remaining = tail;
+
+            assert!(
+                octet.is_ascii(),
+                "[consistency] this function requires ASCII-only host as an argument"
+            );
+
+            if is_ascii_unreserved(octet) {
+                // The triplet is redundant: emit the character itself, lowercased.
+                let decoded = char::from(octet.to_ascii_lowercase());
+                f.write_char(decoded)?;
+            } else {
+                // Keep it encoded, normalizing the hex digits to uppercase.
+                write!(f, "%{:02X}", octet)?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+// Tests for `PctCaseNormalized`, comparing its `Display` output under
+// `UriSpec` and `IriSpec`. They need `to_string()`, hence the `alloc` gate.
+#[cfg(test)]
+#[cfg(feature = "alloc")]
+mod tests {
+    use super::*;
+
+    // Imported explicitly only when `alloc` is enabled without `std`
+    // (presumably because the crate is `no_std` in that configuration).
+    #[cfg(all(feature = "alloc", not(feature = "std")))]
+    use alloc::string::ToString;
+
+    use crate::spec::{IriSpec, UriSpec};
+
+    /// Triplets that do not form valid UTF-8 stay percent-encoded under both
+    /// specs; only the hex digits are uppercased.
+    #[test]
+    fn invalid_utf8() {
+        assert_eq!(
+            PctCaseNormalized::<UriSpec>::new("%80%cc%cc%cc").to_string(),
+            "%80%CC%CC%CC"
+        );
+        assert_eq!(
+            PctCaseNormalized::<IriSpec>::new("%80%cc%cc%cc").to_string(),
+            "%80%CC%CC%CC"
+        );
+    }
+
+    /// The UTF-8 encoding of U+03B1 stays percent-encoded (uppercased) under
+    /// `UriSpec`, but is decoded to the character itself under `IriSpec`.
+    #[test]
+    fn iri_unreserved() {
+        assert_eq!(
+            PctCaseNormalized::<UriSpec>::new("%ce%b1").to_string(),
+            "%CE%B1"
+        );
+        assert_eq!(
+            PctCaseNormalized::<IriSpec>::new("%ce%b1").to_string(),
+            "\u{03B1}"
+        );
+    }
+
+    /// A decodable sequence surrounded by stray bytes: under `IriSpec` only
+    /// the middle `%ce%b1` is decoded, while the leading `%ce` and trailing
+    /// `%b1` stay percent-encoded.
+    #[test]
+    fn iri_middle_decode() {
+        assert_eq!(
+            PctCaseNormalized::<UriSpec>::new("%ce%ce%b1%b1").to_string(),
+            "%CE%CE%B1%B1"
+        );
+        assert_eq!(
+            PctCaseNormalized::<IriSpec>::new("%ce%ce%b1%b1").to_string(),
+            "%CE\u{03B1}%B1"
+        );
+    }
+
+    /// `%3f` (an encoded `?`) is not decoded under either spec; only the hex
+    /// digits are uppercased.
+    #[test]
+    fn ascii_reserved() {
+        assert_eq!(PctCaseNormalized::<UriSpec>::new("%3f").to_string(), "%3F");
+        assert_eq!(PctCaseNormalized::<IriSpec>::new("%3f").to_string(), "%3F");
+    }
+
+    /// `%3c%3e` (encoded `<` and `>`) is not decoded under either spec; only
+    /// the hex digits are uppercased.
+    #[test]
+    fn ascii_forbidden() {
+        assert_eq!(
+            PctCaseNormalized::<UriSpec>::new("%3c%3e").to_string(),
+            "%3C%3E"
+        );
+        assert_eq!(
+            PctCaseNormalized::<IriSpec>::new("%3c%3e").to_string(),
+            "%3C%3E"
+        );
+    }
+
+    /// `%7e` is decoded to `~` under both specs, and the literal `a` that
+    /// follows is passed through unchanged.
+    #[test]
+    fn ascii_unreserved() {
+        assert_eq!(PctCaseNormalized::<UriSpec>::new("%7ea").to_string(), "~a");
+        assert_eq!(PctCaseNormalized::<IriSpec>::new("%7ea").to_string(), "~a");
+    }
+}
author	mo khan <mo@mokhan.ca>	2025-07-10 13:11:11 -0600
committer	mo khan <mo@mokhan.ca>	2025-07-10 13:11:11 -0600
commit	01959b16a21b22b5df5f16569c2a8e8f92beecef (patch)
tree	32afa5d747c5466345c59ec52161a7cba3d6d755 /vendor/iri-string/src/normalize
parent	ff30574117a996df332e23d1fb6f65259b316b5b (diff)