From 01959b16a21b22b5df5f16569c2a8e8f92beecef Mon Sep 17 00:00:00 2001 From: mo khan Date: Thu, 10 Jul 2025 13:11:11 -0600 Subject: chore: vendor dependencies --- vendor/iri-string/src/normalize/error.rs | 26 ++ vendor/iri-string/src/normalize/path.rs | 620 ++++++++++++++++++++++++++++ vendor/iri-string/src/normalize/pct_case.rs | 358 ++++++++++++++++ 3 files changed, 1004 insertions(+) create mode 100644 vendor/iri-string/src/normalize/error.rs create mode 100644 vendor/iri-string/src/normalize/path.rs create mode 100644 vendor/iri-string/src/normalize/pct_case.rs (limited to 'vendor/iri-string/src/normalize') diff --git a/vendor/iri-string/src/normalize/error.rs b/vendor/iri-string/src/normalize/error.rs new file mode 100644 index 00000000..a5c5c895 --- /dev/null +++ b/vendor/iri-string/src/normalize/error.rs @@ -0,0 +1,26 @@ +//! Normalization and resolution error. + +use core::fmt; + +/// IRI normalization and resolution error. +/// +/// For detail about resolution failure, see [the module documentation][`crate::resolve`]. +#[derive(Debug, Clone)] +pub struct Error(()); + +impl Error { + /// Creates a new error. + pub(crate) fn new() -> Self { + Self(()) + } +} + +impl fmt::Display for Error { + #[inline] + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("unresolvable IRI") + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error {} diff --git a/vendor/iri-string/src/normalize/path.rs b/vendor/iri-string/src/normalize/path.rs new file mode 100644 index 00000000..4f3e3397 --- /dev/null +++ b/vendor/iri-string/src/normalize/path.rs @@ -0,0 +1,620 @@ +//! Path normalization. + +use core::fmt; +use core::ops::Range; + +use crate::parser::str::{find_split_hole, rfind}; +use crate::spec::{Spec, UriSpec}; + +use super::pct_case::PctCaseNormalized; +use super::{Error, NormalizationMode, NormalizationOp}; + +/// Path that is (possibly) not yet processed or being processed. 
+#[derive(Debug, Clone, Copy)] +pub(crate) enum Path<'a> { + /// The result. No more processing is needed. + Done(&'a str), + /// Not yet completely processed path. + NeedsProcessing(PathToNormalize<'a>), +} + +/// Path that needs merge and/or dot segment removal. +/// +/// # Invariants +/// +/// If the first field (prefix field) is not `None`, it must end with a slash. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PathToNormalize<'a>(Option<&'a str>, &'a str); + +impl<'a> PathToNormalize<'a> { + /// Creates a `PathToNormalize` from the given single path. + #[inline] + #[must_use] + pub(crate) fn from_single_path(path: &'a str) -> Self { + Self(None, path) + } + + /// Creates a `PathToNormalize` from the given base and reference paths to be resolved. + #[must_use] + pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self { + if reference.starts_with('/') { + return Self(None, reference); + } + + match rfind(base.as_bytes(), b'/') { + Some(last_slash_pos) => Self(Some(&base[..=last_slash_pos]), reference), + None => Self(None, reference), + } + } + + /// Returns true if the path is empty string. + #[inline] + #[must_use] + fn is_empty(&self) -> bool { + // If `self.0` is `Some(_)`, it ends with a slash, i.e. it is not empty. + self.0.is_none() && self.1.is_empty() + } + + /// Returns the length of the not yet normalized path. + #[inline] + #[must_use] + pub(super) fn len(&self) -> usize { + self.len_prefix() + self.1.len() + } + + /// Returns the length of the prefix part. + /// + /// Returns 0 if the prefix part is empty. + #[inline] + #[must_use] + fn len_prefix(&self) -> usize { + self.0.map_or(0, |s| s.len()) + } + + /// Returns a byte at the given position. 
+ #[must_use] + fn byte_at(&self, mut i: usize) -> Option { + if let Some(prefix) = self.0 { + if i < prefix.len() { + return Some(prefix.as_bytes()[i]); + } + i -= prefix.len(); + } + self.1.as_bytes().get(i).copied() + } + + /// Returns the position of the next slash of the byte at the given position. + #[must_use] + fn find_next_slash(&self, scan_start: usize) -> Option { + if let Some(prefix) = self.0 { + let prefix_len = prefix.len(); + if scan_start < prefix_len { + prefix[scan_start..].find('/').map(|rel| rel + scan_start) + } else { + let local_i = scan_start - prefix_len; + self.1[local_i..].find('/').map(|rel| rel + scan_start) + } + } else { + self.1[scan_start..].find('/').map(|rel| rel + scan_start) + } + } + + /// Removes the `len` characters from the beginning of `self`. + fn remove_start(&mut self, len: usize) { + if let Some(prefix) = self.0 { + if let Some(suffix_trim_len) = len.checked_sub(prefix.len()) { + self.0 = None; + self.1 = &self.1[suffix_trim_len..]; + } else { + self.0 = Some(&prefix[len..]); + } + } else { + self.1 = &self.1[len..]; + } + } + + /// Removes the prefix that are ignorable on normalization. + // Skips the prefix dot segments without leading slashes (such as `./`, + // `../`, and `../.././`). + // This is necessary because such segments should be removed with the + // FOLLOWING slashes, not leading slashes. + fn remove_ignorable_prefix(&mut self) { + while let Some(seg) = PathSegmentsIter::new(self).next() { + if seg.has_leading_slash { + // The first segment starting with a slash is not target. + break; + } + match seg.kind(self) { + SegmentKind::Dot | SegmentKind::DotDot => { + // Attempt to skip the following slash by `+ 1`. + let skip = self.len().min(seg.range.end + 1); + self.remove_start(skip); + } + SegmentKind::Normal => break, + } + } + } +} + +impl PathToNormalize<'_> { + /// Writes the normalized path. 
+ pub(crate) fn fmt_write_normalize( + &self, + f: &mut W, + op: NormalizationOp, + authority_is_present: bool, + ) -> fmt::Result { + debug_assert!( + self.0.map_or(true, |s| s.ends_with('/')), + "[validity] the prefix field of `PathToNormalize` should end with a slash" + ); + + if self.is_empty() { + return Ok(()); + } + + if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath) + && !authority_is_present + && self.byte_at(0) != Some(b'/') + { + // Treat the path as "opaque", i.e. do not apply dot segments removal. + // See . + debug_assert!( + op.mode.case_pct_normalization(), + "[consistency] case/pct normalization should still be applied" + ); + if let Some(prefix) = self.0 { + write!(f, "{}", PctCaseNormalized::::new(prefix))?; + } + write!(f, "{}", PctCaseNormalized::::new(self.1))?; + return Ok(()); + } + + let mut rest = *self; + + // Skip the prefix dot segments without leading slashes (such as `./`, + // `../`, and `../.././`). + // This is necessary because such segments should be removed with the + // FOLLOWING slashes, not leading slashes. + rest.remove_ignorable_prefix(); + if rest.is_empty() { + // Path consists of only `/.`s and `/..`s. + // In this case, if the authority component is present, the result + // should be `/`, not empty. + if authority_is_present { + f.write_char('/')?; + } + return Ok(()); + } + + // None: No segments are written yet. + // Some(false): Something other than `/` is already written as the path. + // Some(true): Only a `/` is written as the path. + let mut only_a_slash_is_written = None; + let mut too_deep_area_may_have_dot_segments = true; + while !rest.is_empty() && too_deep_area_may_have_dot_segments { + /// The size of the queue to track the path segments. + /// + /// This should be nonzero. + const QUEUE_SIZE: usize = 8; + + { + // Skip `/.` and `/..` segments at the head. 
+ let mut skipped_len = 0; + for seg in PathSegmentsIter::new(&rest) { + match seg.kind(&rest) { + SegmentKind::Dot | SegmentKind::DotDot => { + debug_assert!( + seg.has_leading_slash, + "[consistency] `.` or `..` segments without a + leading slash have already been skipped" + ); + skipped_len = seg.range.end; + } + _ => break, + } + } + rest.remove_start(skipped_len); + if rest.is_empty() { + // Finished with a dot segment. + // The last `/.` or `/..` should be replaced to `/`. + if !authority_is_present && (only_a_slash_is_written == Some(true)) { + // Insert a dot segment to break the prefix `//`. + // Without this, the path starts with `//` and it may + // be confused with the prefix of an authority. + f.write_str(".//")?; + } else { + f.write_char('/')?; + } + break; + } + } + + let mut queue: [Option<&'_ str>; QUEUE_SIZE] = Default::default(); + let mut level: usize = 0; + let mut first_segment_has_leading_slash = false; + + // Find higher path segments. + let mut end = 0; + for seg in PathSegmentsIter::new(&rest) { + let kind = seg.kind(&rest); + match kind { + SegmentKind::Dot => { + too_deep_area_may_have_dot_segments = true; + } + SegmentKind::DotDot => { + level = level.saturating_sub(1); + too_deep_area_may_have_dot_segments = true; + if level < queue.len() { + queue[level] = None; + } + } + SegmentKind::Normal => { + if level < queue.len() { + queue[level] = Some(seg.segment(&rest)); + too_deep_area_may_have_dot_segments = false; + end = seg.range.end; + if level == 0 { + first_segment_has_leading_slash = seg.has_leading_slash; + } + } + level += 1; + } + } + } + + // Write the path segments as possible, and update the internal state. + for segname in queue.iter().flatten() { + Self::emit_segment::( + f, + &mut only_a_slash_is_written, + first_segment_has_leading_slash, + segname, + authority_is_present, + op, + )?; + } + + rest.remove_start(end); + } + + if !rest.is_empty() { + // No need of searching dot segments anymore. 
+ assert!( + !too_deep_area_may_have_dot_segments, + "[consistency] loop condition of the previous loop" + ); + // Apply only normalization (if needed). + for seg in PathSegmentsIter::new(&rest) { + assert_eq!( + seg.kind(&rest), + SegmentKind::Normal, + "[consistency] already confirmed that there are no more dot segments" + ); + let segname = seg.segment(&rest); + Self::emit_segment::( + f, + &mut only_a_slash_is_written, + seg.has_leading_slash, + segname, + authority_is_present, + op, + )?; + } + } + + Ok(()) + } + + /// Emits a non-dot segment and updates the current state. + // + // `first_segment_has_leading_slash` can be any value if the segment is not the first one. + fn emit_segment( + f: &mut W, + only_a_slash_is_written: &mut Option, + first_segment_has_leading_slash: bool, + segname: &str, + authority_is_present: bool, + op: NormalizationOp, + ) -> fmt::Result { + // Omit the leading slash of the segment only if the segment is + // the first one and marked as not having a leading slash. + match *only_a_slash_is_written { + None => { + // First segment. + // This pass can be possible if `./` is repeated `QUEUE_SIZE` + // times at the beginning. + if first_segment_has_leading_slash { + f.write_char('/')?; + } + *only_a_slash_is_written = + Some(first_segment_has_leading_slash && segname.is_empty()); + } + Some(only_a_slash) => { + if only_a_slash && !authority_is_present { + // Apply serialization like WHATWG URL Standard. + // This prevents scheme `foo` with path `//bar` from being written as + // `foo://bar`, which would be interpreted as scheme `foo` followed by + // authority `bar`. Prepending `./`, the + // serialization result would be `foo:/.//bar`, which is safe. + f.write_str("./")?; + *only_a_slash_is_written = Some(false); + } + f.write_char('/')?; + } + } + + // Write the segment name. + if op.mode.case_pct_normalization() { + write!(f, "{}", PctCaseNormalized::::new(segname)) + } else { + f.write_str(segname) + } + } + + /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent.
+ /// + /// Returns `Ok(())` when normalizable, returns `Err(_)` if not. + pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> { + /// A sink to get the prefix of the input. + #[derive(Default)] + struct PrefixRetriever { + /// The buffer to remember the prefix of the input. + buf: [u8; 3], + /// The next write position in the buffer. + cursor: usize, + } + impl PrefixRetriever { + /// Returns the read prefix data. + #[inline] + #[must_use] + fn as_bytes(&self) -> &[u8] { + &self.buf[..self.cursor] + } + } + impl fmt::Write for PrefixRetriever { + fn write_str(&mut self, s: &str) -> fmt::Result { + if !s.is_empty() && (self.cursor >= self.buf.len()) { + // Enough bytes are read. + return Err(fmt::Error); + } + self.buf[self.cursor..] + .iter_mut() + .zip(s.bytes()) + .for_each(|(dest, src)| *dest = src); + self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len()); + Ok(()) + } + } + + let mut prefix = PrefixRetriever::default(); + // The failure of this write indicates more than 3 characters are read. + // This is safe to ignore since the check needs only 3 characters. + let _ = self.fmt_write_normalize::( + &mut prefix, + NormalizationOp { + mode: NormalizationMode::None, + }, + // Assume the authority is absent. + false, + ); + + if prefix.as_bytes() == b"/./" { + Err(Error::new()) + } else { + Ok(()) + } + } +} + +/// Characteristic of a path. +#[derive(Debug, Clone, Copy)] +pub(crate) enum PathCharacteristic { + /// Absolute path, not special. + CommonAbsolute, + /// Relative path, not special. + CommonRelative, + /// The first path segment of the relative path has one or more colon characters. + RelativeFirstSegmentHasColon, + /// The path starts with the double slash. + StartsWithDoubleSlash, +} + +impl PathCharacteristic { + /// Returns true if the path is absolute.
+ #[inline] + #[must_use] + pub(crate) fn is_absolute(self) -> bool { + matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash) + } + + /// Returns the characteristic of the path. + pub(crate) fn from_path_to_display( + path: &PathToNormalize<'_>, + op: NormalizationOp, + authority_is_present: bool, + ) -> Self { + /// Dummy writer to get necessary values. + #[derive(Default, Clone, Copy)] + struct Writer { + /// Result. + result: Option, + /// Whether the normalized path is absolute. + is_absolute: Option, + } + impl fmt::Write for Writer { + fn write_str(&mut self, mut s: &str) -> fmt::Result { + if self.result.is_some() { + // Nothing more to do. + return Err(fmt::Error); + } + while !s.is_empty() { + if self.is_absolute.is_none() { + // The first input. + match s.strip_prefix('/') { + Some(rest) => { + self.is_absolute = Some(true); + s = rest; + } + None => { + self.is_absolute = Some(false); + } + } + continue; + } + if self.is_absolute == Some(true) { + let result = if s.starts_with('/') { + PathCharacteristic::StartsWithDoubleSlash + } else { + PathCharacteristic::CommonAbsolute + }; + self.result = Some(result); + return Err(fmt::Error); + } + // Processing the first segment of the relative path. + match find_split_hole(s, b'/') { + Some((first_seg, _rest)) => { + let result = if first_seg.contains(':') { + PathCharacteristic::RelativeFirstSegmentHasColon + } else { + PathCharacteristic::CommonRelative + }; + self.result = Some(result); + return Err(fmt::Error); + } + None => { + // `s` might not be the complete first segment. + if s.contains(':') { + self.result = + Some(PathCharacteristic::RelativeFirstSegmentHasColon); + return Err(fmt::Error); + } + break; + } + } + } + Ok(()) + } + } + + let mut writer = Writer::default(); + match path.fmt_write_normalize::(&mut writer, op, authority_is_present) { + // Empty path. 
+ Ok(_) => PathCharacteristic::CommonRelative, + Err(_) => writer + .result + .expect("[consistency] the formatting quits early by `Err` when the check is done"), + } + } +} + +/// Path segment kind. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum SegmentKind { + /// `.` or the equivalents. + Dot, + /// `..` or the equivalents. + DotDot, + /// Other normal (not special) segments. + Normal, +} + +impl SegmentKind { + /// Creates a new `SegmentKind` from the given segment name. + #[must_use] + fn from_segment(s: &str) -> Self { + match s { + "." | "%2E" | "%2e" => SegmentKind::Dot, + ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E" + | "%2e%2e" => SegmentKind::DotDot, + _ => SegmentKind::Normal, + } + } +} + +/// A segment with optional leading slash. +#[derive(Debug, Clone)] +struct PathSegment { + /// Presence of a leading slash. + has_leading_slash: bool, + /// Range of the segment name (without any slashes). + range: Range, +} + +impl PathSegment { + /// Returns the segment without any slashes. + #[inline] + #[must_use] + fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str { + if let Some(prefix) = path.0 { + let prefix_len = prefix.len(); + if self.range.end <= prefix_len { + &prefix[self.range.clone()] + } else { + let range = (self.range.start - prefix_len)..(self.range.end - prefix_len); + &path.1[range] + } + } else { + &path.1[self.range.clone()] + } + } + + /// Returns the segment kind. + #[inline] + #[must_use] + fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind { + SegmentKind::from_segment(self.segment(path)) + } +} + +/// Iterator of path segments. +struct PathSegmentsIter<'a> { + /// Path. + path: &'a PathToNormalize<'a>, + /// Current cursor position. + cursor: usize, +} + +impl<'a> PathSegmentsIter<'a> { + /// Creates a new iterator of path segments. 
+ #[inline] + #[must_use] + fn new(path: &'a PathToNormalize<'a>) -> Self { + Self { path, cursor: 0 } + } +} + +impl Iterator for PathSegmentsIter<'_> { + type Item = PathSegment; + + fn next(&mut self) -> Option { + let path_len = self.path.len(); + if self.cursor >= path_len { + return None; + } + let has_leading_slash = self.path.byte_at(self.cursor) == Some(b'/'); + + let prefix_len = self.path.len_prefix(); + if (prefix_len != 0) && (self.cursor == prefix_len - 1) { + debug_assert!(has_leading_slash); + let end = self.path.1.find('/').unwrap_or(self.path.1.len()) + prefix_len; + self.cursor = end; + return Some(PathSegment { + has_leading_slash, + range: prefix_len..end, + }); + } + + if has_leading_slash { + // Skip the leading slash. + self.cursor += 1; + }; + let start = self.cursor; + self.cursor = self.path.find_next_slash(self.cursor).unwrap_or(path_len); + + Some(PathSegment { + has_leading_slash, + range: start..self.cursor, + }) + } +} diff --git a/vendor/iri-string/src/normalize/pct_case.rs b/vendor/iri-string/src/normalize/pct_case.rs new file mode 100644 index 00000000..75e0a777 --- /dev/null +++ b/vendor/iri-string/src/normalize/pct_case.rs @@ -0,0 +1,358 @@ +//! Percent-encoding normalization and case normalization. + +use core::cmp::Ordering; +use core::fmt::{self, Write as _}; +use core::marker::PhantomData; + +use crate::format::eq_str_display; +use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue}; +use crate::parser::str::{find_split_hole, take_first_char}; +use crate::parser::trusted::take_xdigits2; +use crate::spec::Spec; + +/// Returns true if the given string is percent-encoding normalized and case +/// normalized. +/// +/// Note that normalization of ASCII-only host requires additional case +/// normalization, so checking by this function is not sufficient for that case. 
+pub(crate) fn is_pct_case_normalized(s: &str) -> bool { + eq_str_display(s, &PctCaseNormalized::::new(s)) +} + +/// Decodes a 2-, 3-, or 4-byte UTF-8 sequence into a single character. +/// +/// Only the payload bits of each byte are extracted; the lead/continue marker +/// bits are trusted and not validated here, so the caller is expected to have +/// checked them already. +/// +/// Returns `Err(())` if the sequence is a redundant (overlong) encoding or if +/// the decoded value is not a valid `char`. +/// +/// # Panics +/// +/// Panics if the length of `bytes` is not 2, 3, or 4. +fn into_char_trusted(bytes: &[u8]) -> Result { + /// The bit mask to get the content part in a continue byte. + const CONTINUE_BYTE_MASK: u8 = 0b_0011_1111; + /// Minimum valid values for a code point in a UTF-8 sequence of 2, 3, and 4 bytes. + const MIN: [u32; 3] = [0x80, 0x800, 0x1_0000]; + + let len = bytes.len(); + let c: u32 = match len { + 2 => (u32::from(bytes[0] & 0b_0001_1111) << 6) | u32::from(bytes[1] & CONTINUE_BYTE_MASK), + 3 => { + (u32::from(bytes[0] & 0b_0000_1111) << 12) + | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 6) + | u32::from(bytes[2] & CONTINUE_BYTE_MASK) + } + 4 => { + (u32::from(bytes[0] & 0b_0000_0111) << 18) + | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 12) + | (u32::from(bytes[2] & CONTINUE_BYTE_MASK) << 6) + | u32::from(bytes[3] & CONTINUE_BYTE_MASK) + } + len => unreachable!( + "[consistency] expected 2, 3, or 4 bytes for a character, but got {len} as the length" + ), + }; + if c < MIN[len - 2] { + // Redundant UTF-8 encoding. + return Err(()); + } + // Can be an invalid Unicode code point. + char::from_u32(c).ok_or(()) +} + +/// Writable as a normalized path segment percent-encoding IRI. +/// +/// This wrapper does the things below when being formatted: +/// +/// * Decode unnecessarily percent-encoded characters. +/// * Convert alphabetic characters uppercase in percent-encoded triplets. +/// +/// Note that this does not newly encode raw characters. +/// +/// # Safety +/// +/// The given string should be the valid path segment. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PctCaseNormalized<'a, S> { + /// Valid segment name to normalize. + segname: &'a str, + /// Spec.
+ _spec: PhantomData S>, +} + +impl<'a, S: Spec> PctCaseNormalized<'a, S> { + /// Creates a new `PctCaseNormalized` value. + #[inline] + #[must_use] + pub(crate) fn new(source: &'a str) -> Self { + Self { + segname: source, + _spec: PhantomData, + } + } +} + +impl fmt::Display for PctCaseNormalized<'_, S> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut rest = self.segname; + + 'outer_loop: while !rest.is_empty() { + // Scan the next percent-encoded triplet. + let (prefix, after_percent) = match find_split_hole(rest, b'%') { + Some(v) => v, + None => return f.write_str(rest), + }; + // Write the string before the percent-encoded triplet. + f.write_str(prefix)?; + // Decode the percent-encoded triplet. + let (first_decoded, after_first_triplet) = take_xdigits2(after_percent); + rest = after_first_triplet; + + if first_decoded.is_ascii() { + if is_ascii_unreserved(first_decoded) { + // Unreserved. Print the decoded. + f.write_char(char::from(first_decoded))?; + } else { + write!(f, "%{:02X}", first_decoded)?; + } + continue 'outer_loop; + } + + // Continue byte cannot be the first byte of a character. + if is_utf8_byte_continue(first_decoded) { + write!(f, "%{:02X}", first_decoded)?; + continue 'outer_loop; + } + + // Get the expected length of decoded char. + let expected_char_len = match (first_decoded & 0xf0).cmp(&0b1110_0000) { + Ordering::Less => 2, + Ordering::Equal => 3, + Ordering::Greater => 4, + }; + + // Get continue bytes. + let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len]; + for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() { + match take_first_char(rest) { + Some(('%', after_percent)) => { + let (byte, after_triplet) = take_xdigits2(after_percent); + if !is_utf8_byte_continue(byte) { + // Note that `byte` can start the new string. + // Leave the byte in the `rest` for next try (i.e. + // don't update `rest` in this case). 
+ c_buf[..=i] + .iter() + .try_for_each(|b| write!(f, "%{:02X}", b))?; + continue 'outer_loop; + } + *buf_dest = byte; + rest = after_triplet; + } + // If the next character is not `%`, decoded bytes so far + // won't be valid UTF-8 byte sequence. + // Write the read percent-encoded triplets without decoding. + // Note that all characters in `&c_buf[1..]` (if available) + // will be decoded to "continue byte" of UTF-8, so they + // cannot be the start of a valid UTF-8 byte sequence if + // decoded. + Some((c, after_percent)) => { + c_buf[..=i] + .iter() + .try_for_each(|b| write!(f, "%{:02X}", b))?; + f.write_char(c)?; + rest = after_percent; + continue 'outer_loop; + } + None => { + c_buf[..=i] + .iter() + .try_for_each(|b| write!(f, "%{:02X}", b))?; + // Reached the end of the string. + break 'outer_loop; + } + } + } + + // Decode the bytes into a character. + match into_char_trusted(&c_buf[..expected_char_len]) { + Ok(decoded_c) => { + if is_unreserved::(decoded_c) { + // Unreserved. Print the decoded. + f.write_char(decoded_c)?; + } else { + c_buf[0..expected_char_len] + .iter() + .try_for_each(|b| write!(f, "%{:02X}", b))?; + } + } + Err(_) => { + // Skip decoding of the entire sequence of pct-encoded triplets loaded + // in `c_buf`. This is valid from the reasons below. + // + // * The first byte in `c_buf` is valid as the first byte, and it tells the + // expected number of bytes for a code unit. The cases the bytes being too + // short and the sequence being incomplete have already been handled, and + // the execution does not reach here then. + // * All of the non-first bytes are checked if they are valid as UTF8 continue + // bytes by `is_utf8_byte_continue()`. If they're not, the decoding of + // that codepoint is aborted and the bytes in the buffer are immediately + // emitted as pct-encoded, and the execution does not reach here. This + // means that the bytes in the current `c_buf` have passed these tests. 
+ // * Since all of the non-first bytes are UTF8 continue bytes, none of + // them can start a new valid UTF-8 byte sequence. This means that + // if the bytes in the buffer do not constitute a valid UTF-8 byte + // sequence, the whole buffer can immediately be emitted as pct-encoded. + + debug_assert!( + c_buf[1..expected_char_len] + .iter() + .copied() + .all(is_utf8_byte_continue), + "[consistency] all non-first bytes have been \ + confirmed that they are UTF-8 continue bytes" + ); + // Note that the first pct-encoded triplet is stripped from + // `after_first_triplet`. + rest = &after_first_triplet[((expected_char_len - 1) * 3)..]; + c_buf[0..expected_char_len] + .iter() + .try_for_each(|b| write!(f, "%{:02X}", b))?; + } + } + } + + Ok(()) + } +} + +/// Writable as a normalized ASCII-only `host` (optionally followed by `port`). +#[derive(Debug, Clone, Copy)] +pub(crate) struct NormalizedAsciiOnlyHost<'a> { + /// Valid host (optionally followed by port) to normalize. + host_port: &'a str, +} + +impl<'a> NormalizedAsciiOnlyHost<'a> { + /// Creates a new `NormalizedAsciiOnlyHost` value. + /// + /// # Preconditions + /// + /// The given string should be the valid ASCII-only `host` or + /// `host ":" port` after percent-encoding normalization. + /// In other words, [`parser::trusted::is_ascii_only_host`] should return + /// true for the given value. + /// + /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host` + #[inline] + #[must_use] + pub(crate) fn new(host_port: &'a str) -> Self { + Self { host_port } + } +} + +impl fmt::Display for NormalizedAsciiOnlyHost<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let mut rest = self.host_port; + + while !rest.is_empty() { + // Scan the next percent-encoded triplet.
+ let (prefix, after_percent) = match find_split_hole(rest, b'%') { + Some(v) => v, + None => { + return rest + .chars() + .try_for_each(|c| f.write_char(c.to_ascii_lowercase())); + } + }; + // Write the string before the percent-encoded triplet. + prefix + .chars() + .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))?; + // Decode the percent-encoded triplet. + let (first_decoded, after_triplet) = take_xdigits2(after_percent); + rest = after_triplet; + + assert!( + first_decoded.is_ascii(), + "[consistency] this function requires ASCII-only host as an argument" + ); + + if is_ascii_unreserved(first_decoded) { + // Unreserved. Convert to lowercase and print. + f.write_char(char::from(first_decoded.to_ascii_lowercase()))?; + } else { + write!(f, "%{:02X}", first_decoded)?; + } + } + + Ok(()) + } +} + +#[cfg(test)] +#[cfg(feature = "alloc")] +mod tests { + use super::*; + + #[cfg(all(feature = "alloc", not(feature = "std")))] + use alloc::string::ToString; + + use crate::spec::{IriSpec, UriSpec}; + + #[test] + fn invalid_utf8() { + assert_eq!( + PctCaseNormalized::::new("%80%cc%cc%cc").to_string(), + "%80%CC%CC%CC" + ); + assert_eq!( + PctCaseNormalized::::new("%80%cc%cc%cc").to_string(), + "%80%CC%CC%CC" + ); + } + + #[test] + fn iri_unreserved() { + assert_eq!( + PctCaseNormalized::::new("%ce%b1").to_string(), + "%CE%B1" + ); + assert_eq!( + PctCaseNormalized::::new("%ce%b1").to_string(), + "\u{03B1}" + ); + } + + #[test] + fn iri_middle_decode() { + assert_eq!( + PctCaseNormalized::::new("%ce%ce%b1%b1").to_string(), + "%CE%CE%B1%B1" + ); + assert_eq!( + PctCaseNormalized::::new("%ce%ce%b1%b1").to_string(), + "%CE\u{03B1}%B1" + ); + } + + #[test] + fn ascii_reserved() { + assert_eq!(PctCaseNormalized::::new("%3f").to_string(), "%3F"); + assert_eq!(PctCaseNormalized::::new("%3f").to_string(), "%3F"); + } + + #[test] + fn ascii_forbidden() { + assert_eq!( + PctCaseNormalized::::new("%3c%3e").to_string(), + "%3C%3E" + ); + assert_eq!( + 
PctCaseNormalized::::new("%3c%3e").to_string(), + "%3C%3E" + ); + } + + #[test] + fn ascii_unreserved() { + assert_eq!(PctCaseNormalized::::new("%7ea").to_string(), "~a"); + assert_eq!(PctCaseNormalized::::new("%7ea").to_string(), "~a"); + } +} -- cgit v1.2.3