summaryrefslogtreecommitdiff
path: root/vendor/iri-string/src/normalize
diff options
context:
space:
mode:
author mo khan <mo@mokhan.ca> 2025-07-10 13:11:11 -0600
committer mo khan <mo@mokhan.ca> 2025-07-10 13:11:11 -0600
commit 01959b16a21b22b5df5f16569c2a8e8f92beecef (patch)
tree 32afa5d747c5466345c59ec52161a7cba3d6d755 /vendor/iri-string/src/normalize
parent ff30574117a996df332e23d1fb6f65259b316b5b (diff)
chore: vendor dependencies
Diffstat (limited to 'vendor/iri-string/src/normalize')
-rw-r--r-- vendor/iri-string/src/normalize/error.rs 26
-rw-r--r-- vendor/iri-string/src/normalize/path.rs 620
-rw-r--r-- vendor/iri-string/src/normalize/pct_case.rs 358
3 files changed, 1004 insertions, 0 deletions
diff --git a/vendor/iri-string/src/normalize/error.rs b/vendor/iri-string/src/normalize/error.rs
new file mode 100644
index 00000000..a5c5c895
--- /dev/null
+++ b/vendor/iri-string/src/normalize/error.rs
@@ -0,0 +1,26 @@
+//! Normalization and resolution error.
+
+use core::fmt;
+
+/// Opaque error reported when IRI normalization or resolution fails.
+///
+/// For detail about resolution failure, see [the module documentation][`crate::resolve`].
+#[derive(Debug, Clone)]
+pub struct Error(()); // The private unit field carries no data; values are created through `Error::new`.
+
+impl Error {
+ /// Creates a new error; visible only inside the crate.
+ pub(crate) fn new() -> Self {
+ Self(())
+ }
+}
+
+impl fmt::Display for Error {
+ #[inline]
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ f.write_str("unresolvable IRI") // Fixed message: the error value holds no further detail.
+ }
+}
+
+#[cfg(feature = "std")]
+impl std::error::Error for Error {} // Every trait method keeps its default implementation.
diff --git a/vendor/iri-string/src/normalize/path.rs b/vendor/iri-string/src/normalize/path.rs
new file mode 100644
index 00000000..4f3e3397
--- /dev/null
+++ b/vendor/iri-string/src/normalize/path.rs
@@ -0,0 +1,620 @@
+//! Path normalization.
+
+use core::fmt;
+use core::ops::Range;
+
+use crate::parser::str::{find_split_hole, rfind};
+use crate::spec::{Spec, UriSpec};
+
+use super::pct_case::PctCaseNormalized;
+use super::{Error, NormalizationMode, NormalizationOp};
+
+/// Path that is either already in its final form or still needs processing.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum Path<'a> {
+ /// The result. No more processing is needed.
+ Done(&'a str),
+ /// Path that still needs merge and/or dot segment removal (see `PathToNormalize`).
+ NeedsProcessing(PathToNormalize<'a>),
+}
+
+/// Path that needs merge and/or dot segment removal, held as an optional prefix (first field) followed by the rest of the path (second field).
+///
+/// # Invariants
+///
+/// If the first field (prefix field) is not `None`, it must end with a slash (checked by a `debug_assert!` in `fmt_write_normalize`).
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct PathToNormalize<'a>(Option<&'a str>, &'a str);
+
+impl<'a> PathToNormalize<'a> {
+ /// Creates a `PathToNormalize` from the given single path, leaving the prefix field as `None`.
+ #[inline]
+ #[must_use]
+ pub(crate) fn from_single_path(path: &'a str) -> Self {
+ Self(None, path)
+ }
+
+ /// Creates a `PathToNormalize` from the given base and reference paths to be resolved: the base is kept up to and including its last slash as the prefix, unless the reference starts with `/` or the base has no slash.
+ #[must_use]
+ pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self {
+ if reference.starts_with('/') {
+ return Self(None, reference);
+ }
+
+ match rfind(base.as_bytes(), b'/') {
+ Some(last_slash_pos) => Self(Some(&base[..=last_slash_pos]), reference),
+ None => Self(None, reference),
+ }
+ }
+
+ /// Returns true if the path is empty string.
+ #[inline]
+ #[must_use]
+ fn is_empty(&self) -> bool {
+ // If `self.0` is `Some(_)`, it ends with a slash, i.e. it is not empty.
+ self.0.is_none() && self.1.is_empty()
+ }
+
+ /// Returns the length in bytes of the not yet normalized path (prefix plus the rest).
+ #[inline]
+ #[must_use]
+ pub(super) fn len(&self) -> usize {
+ self.len_prefix() + self.1.len()
+ }
+
+ /// Returns the length in bytes of the prefix part.
+ ///
+ /// Returns 0 if the prefix part is absent.
+ #[inline]
+ #[must_use]
+ fn len_prefix(&self) -> usize {
+ self.0.map_or(0, |s| s.len())
+ }
+
+ /// Returns the byte at the given position, counted across the prefix and the rest, or `None` if the position is past the end.
+ #[must_use]
+ fn byte_at(&self, mut i: usize) -> Option<u8> {
+ if let Some(prefix) = self.0 {
+ if i < prefix.len() {
+ return Some(prefix.as_bytes()[i]);
+ }
+ i -= prefix.len();
+ }
+ self.1.as_bytes().get(i).copied()
+ }
+
+ /// Returns the position (in the whole path) of the first slash at or after `scan_start`, or `None` if there is none.
+ #[must_use]
+ fn find_next_slash(&self, scan_start: usize) -> Option<usize> {
+ if let Some(prefix) = self.0 {
+ let prefix_len = prefix.len();
+ if scan_start < prefix_len { // Only the prefix is searched here; by the type invariant it ends with a slash.
+ prefix[scan_start..].find('/').map(|rel| rel + scan_start)
+ } else {
+ let local_i = scan_start - prefix_len;
+ self.1[local_i..].find('/').map(|rel| rel + scan_start)
+ }
+ } else {
+ self.1[scan_start..].find('/').map(|rel| rel + scan_start)
+ }
+ }
+
+ /// Removes the first `len` bytes from the beginning of `self`, dropping the prefix field once it is fully consumed.
+ fn remove_start(&mut self, len: usize) {
+ if let Some(prefix) = self.0 {
+ if let Some(suffix_trim_len) = len.checked_sub(prefix.len()) {
+ self.0 = None;
+ self.1 = &self.1[suffix_trim_len..];
+ } else {
+ self.0 = Some(&prefix[len..]);
+ }
+ } else {
+ self.1 = &self.1[len..];
+ }
+ }
+
+ /// Removes the leading segments that are ignorable on normalization.
+ // Skips the prefix dot segments without leading slashes (such as `./`,
+ // `../`, and `../.././`).
+ // This is necessary because such segments should be removed with the
+ // FOLLOWING slashes, not leading slashes.
+ fn remove_ignorable_prefix(&mut self) {
+ while let Some(seg) = PathSegmentsIter::new(self).next() {
+ if seg.has_leading_slash {
+ // The first segment starting with a slash is not target.
+ break;
+ }
+ match seg.kind(self) {
+ SegmentKind::Dot | SegmentKind::DotDot => {
+ // Attempt to skip the following slash by `+ 1` (clamped to the path length when no slash follows).
+ let skip = self.len().min(seg.range.end + 1);
+ self.remove_start(skip);
+ }
+ SegmentKind::Normal => break,
+ }
+ }
+ }
+}
+
+impl PathToNormalize<'_> {
+ /// Writes the normalized path.
+ ///
+ /// Dot segments are removed unless the path is treated as opaque (see the
+ /// `PreserveAuthoritylessRelativePath` branch below). When
+ /// `op.mode.case_pct_normalization()` is true, every written segment also
+ /// goes through `PctCaseNormalized`.
+ ///
+ /// `authority_is_present` tells whether the IRI being written has an
+ /// authority component. When it is absent, a `./` is inserted wherever the
+ /// output path would otherwise start with `//`.
+ ///
+ /// Returns the first error reported by the writer `f`, if any.
+ pub(crate) fn fmt_write_normalize<S: Spec, W: fmt::Write>(
+ &self,
+ f: &mut W,
+ op: NormalizationOp,
+ authority_is_present: bool,
+ ) -> fmt::Result {
+ debug_assert!(
+ self.0.map_or(true, |s| s.ends_with('/')),
+ "[validity] the prefix field of `PathToNormalize` should end with a slash"
+ );
+
+ if self.is_empty() {
+ return Ok(());
+ }
+
+ if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath)
+ && !authority_is_present
+ && self.byte_at(0) != Some(b'/')
+ {
+ // Treat the path as "opaque", i.e. do not apply dot segments removal.
+ // See <https://github.com/lo48576/iri-string/issues/29>.
+ debug_assert!(
+ op.mode.case_pct_normalization(),
+ "[consistency] case/pct normalization should still be applied"
+ );
+ if let Some(prefix) = self.0 {
+ write!(f, "{}", PctCaseNormalized::<S>::new(prefix))?;
+ }
+ write!(f, "{}", PctCaseNormalized::<S>::new(self.1))?;
+ return Ok(());
+ }
+
+ let mut rest = *self;
+
+ // Skip the prefix dot segments without leading slashes (such as `./`,
+ // `../`, and `../.././`).
+ // This is necessary because such segments should be removed with the
+ // FOLLOWING slashes, not leading slashes.
+ rest.remove_ignorable_prefix();
+ if rest.is_empty() {
+ // Path consists of only dot segments without leading slashes
+ // (such as `.`, `..`, and `../.`), and all of them were skipped.
+ // In this case, if the authority component is present, the result
+ // should be `/`, not empty.
+ if authority_is_present {
+ f.write_char('/')?;
+ }
+ return Ok(());
+ }
+
+ // None: No segments are written yet.
+ // Some(false): Something other than `/` is already written as the path.
+ // Some(true): Only a `/` is written as the path.
+ let mut only_a_slash_is_written = None;
+ let mut too_deep_area_may_have_dot_segments = true;
+ while !rest.is_empty() && too_deep_area_may_have_dot_segments {
+ /// The size of the queue to track the path segments.
+ ///
+ /// This should be nonzero.
+ const QUEUE_SIZE: usize = 8;
+
+ {
+ // Skip `/.` and `/..` segments at the head.
+ let mut skipped_len = 0;
+ for seg in PathSegmentsIter::new(&rest) {
+ match seg.kind(&rest) {
+ SegmentKind::Dot | SegmentKind::DotDot => {
+ // The trailing `\` joins the two source lines so that
+ // the message contains no raw newline or indentation.
+ debug_assert!(
+ seg.has_leading_slash,
+ "[consistency] `.` or `..` segments without a \
+ leading slash have already been skipped"
+ );
+ skipped_len = seg.range.end;
+ }
+ _ => break,
+ }
+ }
+ rest.remove_start(skipped_len);
+ if rest.is_empty() {
+ // Finished with a dot segment.
+ // The last `/.` or `/..` should be replaced to `/`.
+ if !authority_is_present && (only_a_slash_is_written == Some(true)) {
+ // Insert a dot segment to break the prefix `//`.
+ // Without this, the path starts with `//` and it may
+ // be confused with the prefix of an authority.
+ f.write_str(".//")?;
+ } else {
+ f.write_char('/')?;
+ }
+ break;
+ }
+ }
+
+ let mut queue: [Option<&'_ str>; QUEUE_SIZE] = Default::default();
+ let mut level: usize = 0;
+ let mut first_segment_has_leading_slash = false;
+
+ // Find higher path segments.
+ // `end` tracks how far `rest` is covered by the segments kept in `queue`.
+ let mut end = 0;
+ for seg in PathSegmentsIter::new(&rest) {
+ let kind = seg.kind(&rest);
+ match kind {
+ SegmentKind::Dot => {
+ too_deep_area_may_have_dot_segments = true;
+ }
+ SegmentKind::DotDot => {
+ level = level.saturating_sub(1);
+ too_deep_area_may_have_dot_segments = true;
+ if level < queue.len() {
+ queue[level] = None;
+ }
+ }
+ SegmentKind::Normal => {
+ if level < queue.len() {
+ queue[level] = Some(seg.segment(&rest));
+ too_deep_area_may_have_dot_segments = false;
+ end = seg.range.end;
+ if level == 0 {
+ first_segment_has_leading_slash = seg.has_leading_slash;
+ }
+ }
+ level += 1;
+ }
+ }
+ }
+
+ // Write the path segments as possible, and update the internal state.
+ for segname in queue.iter().flatten() {
+ Self::emit_segment::<S, _>(
+ f,
+ &mut only_a_slash_is_written,
+ first_segment_has_leading_slash,
+ segname,
+ authority_is_present,
+ op,
+ )?;
+ }
+
+ rest.remove_start(end);
+ }
+
+ if !rest.is_empty() {
+ // No need of searching dot segments anymore.
+ assert!(
+ !too_deep_area_may_have_dot_segments,
+ "[consistency] loop condition of the previous loop"
+ );
+ // Apply only normalization (if needed).
+ for seg in PathSegmentsIter::new(&rest) {
+ assert_eq!(
+ seg.kind(&rest),
+ SegmentKind::Normal,
+ "[consistency] already confirmed that there are no more dot segments"
+ );
+ let segname = seg.segment(&rest);
+ Self::emit_segment::<S, _>(
+ f,
+ &mut only_a_slash_is_written,
+ seg.has_leading_slash,
+ segname,
+ authority_is_present,
+ op,
+ )?;
+ }
+ }
+
+ Ok(())
+ }
+
+ /// Emits a non-dot segment (with its leading slash when applicable) and updates `only_a_slash_is_written`.
+ //
+ // `first_segment_has_leading_slash` can be any value if the segment is not the first one.
+ fn emit_segment<S: Spec, W: fmt::Write>(
+ f: &mut W,
+ only_a_slash_is_written: &mut Option<bool>,
+ first_segment_has_leading_slash: bool,
+ segname: &str,
+ authority_is_present: bool,
+ op: NormalizationOp,
+ ) -> fmt::Result {
+ // Omit the leading slash of the segment only if the segment is
+ // the first one and marked as not having a leading slash.
+ match *only_a_slash_is_written {
+ None => {
+ // First segment.
+ // This pass can be possible if `./` is repeated `QUEUE_SIZE`
+ // times at the beginning.
+ if first_segment_has_leading_slash {
+ f.write_char('/')?;
+ }
+ *only_a_slash_is_written =
+ Some(first_segment_has_leading_slash && segname.is_empty()); // True only when the output so far is exactly `/`.
+ }
+ Some(only_a_slash) => {
+ if only_a_slash && !authority_is_present {
+ // Apply serialization like WHATWG URL Standard.
+ // This prevents `<scheme=foo>:<path=//bar>` from being written as
+ // `foo://bar`, which is interpreted as
+ // `<scheme=foo>://<authority=bar>`. Prepending `./`, the
+ // serialization result would be `foo:/.//bar`, which is safe.
+ f.write_str("./")?;
+ *only_a_slash_is_written = Some(false);
+ }
+ f.write_char('/')?;
+ }
+ }
+
+ // Write the segment name, percent/case-normalized only when the mode asks for it.
+ if op.mode.case_pct_normalization() {
+ write!(f, "{}", PctCaseNormalized::<S>::new(segname))
+ } else {
+ f.write_str(segname)
+ }
+ }
+
+ /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent, by inspecting the first three bytes of the normalized output.
+ ///
+ /// Returns `Ok(())` when normalizable, returns `Err(_)` if the output starts with `/./`.
+ pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> {
+ /// A sink to get the prefix of the input.
+ #[derive(Default)]
+ struct PrefixRetriever {
+ /// The buffer to remember the prefix of the input.
+ buf: [u8; 3],
+ /// The next write position in the buffer.
+ cursor: usize,
+ }
+ impl PrefixRetriever {
+ /// Returns the read prefix data.
+ #[inline]
+ #[must_use]
+ fn as_bytes(&self) -> &[u8] {
+ &self.buf[..self.cursor]
+ }
+ }
+ impl fmt::Write for PrefixRetriever {
+ fn write_str(&mut self, s: &str) -> fmt::Result {
+ if !s.is_empty() && (self.cursor >= self.buf.len()) {
+ // Enough bytes are read; the error makes the formatting stop early.
+ return Err(fmt::Error);
+ }
+ self.buf[self.cursor..]
+ .iter_mut()
+ .zip(s.bytes())
+ .for_each(|(dest, src)| *dest = src);
+ self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len());
+ Ok(())
+ }
+ }
+
+ let mut prefix = PrefixRetriever::default();
+ // The failure of this write indicates more than 3 bytes are written.
+ // This is safe to ignore since the check needs only the first 3 bytes.
+ let _ = self.fmt_write_normalize::<UriSpec, _>(
+ &mut prefix,
+ NormalizationOp {
+ mode: NormalizationMode::None,
+ },
+ // Assume the authority is absent.
+ false,
+ );
+
+ if prefix.as_bytes() == b"/./" { // NOTE(review): presumably only the `./` inserted to avoid a leading `//` can produce this prefix — confirm.
+ Err(Error::new())
+ } else {
+ Ok(())
+ }
+ }
+}
+
+/// Characteristic of a path.
+#[derive(Debug, Clone, Copy)]
+pub(crate) enum PathCharacteristic {
+ /// Absolute path, not special.
+ CommonAbsolute,
+ /// Relative path, not special.
+ CommonRelative,
+ /// The first path segment of the relative path has one or more colon characters.
+ RelativeFirstSegmentHasColon,
+ /// The path starts with the double slash.
+ StartsWithDoubleSlash,
+}
+
+impl PathCharacteristic {
+ /// Returns true if the path is absolute (`CommonAbsolute` or `StartsWithDoubleSlash`).
+ #[inline]
+ #[must_use]
+ pub(crate) fn is_absolute(self) -> bool {
+ matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash)
+ }
+
+ /// Returns the characteristic of the path as `fmt_write_normalize` would write it, inspecting only the beginning of that output.
+ pub(crate) fn from_path_to_display<S: Spec>(
+ path: &PathToNormalize<'_>,
+ op: NormalizationOp,
+ authority_is_present: bool,
+ ) -> Self {
+ /// Dummy writer that classifies the written text and stops the formatting by returning `Err` once the result is known.
+ #[derive(Default, Clone, Copy)]
+ struct Writer {
+ /// Result.
+ result: Option<PathCharacteristic>,
+ /// Whether the normalized path is absolute (`None` until the first non-empty write).
+ is_absolute: Option<bool>,
+ }
+ impl fmt::Write for Writer {
+ fn write_str(&mut self, mut s: &str) -> fmt::Result {
+ if self.result.is_some() {
+ // Nothing more to do.
+ return Err(fmt::Error);
+ }
+ while !s.is_empty() {
+ if self.is_absolute.is_none() {
+ // The first input.
+ match s.strip_prefix('/') {
+ Some(rest) => {
+ self.is_absolute = Some(true);
+ s = rest;
+ }
+ None => {
+ self.is_absolute = Some(false);
+ }
+ }
+ continue;
+ }
+ if self.is_absolute == Some(true) {
+ let result = if s.starts_with('/') {
+ PathCharacteristic::StartsWithDoubleSlash
+ } else {
+ PathCharacteristic::CommonAbsolute
+ };
+ self.result = Some(result);
+ return Err(fmt::Error);
+ }
+ // Processing the first segment of the relative path.
+ match find_split_hole(s, b'/') {
+ Some((first_seg, _rest)) => {
+ let result = if first_seg.contains(':') {
+ PathCharacteristic::RelativeFirstSegmentHasColon
+ } else {
+ PathCharacteristic::CommonRelative
+ };
+ self.result = Some(result);
+ return Err(fmt::Error);
+ }
+ None => {
+ // `s` might not be the complete first segment.
+ if s.contains(':') {
+ self.result =
+ Some(PathCharacteristic::RelativeFirstSegmentHasColon);
+ return Err(fmt::Error);
+ }
+ break;
+ }
+ }
+ }
+ Ok(())
+ }
+ }
+
+ let mut writer = Writer::default();
+ match path.fmt_write_normalize::<S, _>(&mut writer, op, authority_is_present) {
+ // Empty path. NOTE(review): `Ok` is also reached when the whole output is a lone `/` or a single relative segment without `:` or `/` — confirm `CommonRelative` is intended for `/`.
+ Ok(_) => PathCharacteristic::CommonRelative,
+ Err(_) => writer
+ .result
+ .expect("[consistency] the formatting quits early by `Err` when the check is done"),
+ }
+ }
+}
+
+/// Path segment kind.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum SegmentKind {
+ /// `.` or its percent-encoded equivalents (`%2E`, `%2e`).
+ Dot,
+ /// `..` or the equivalents with either dot percent-encoded.
+ DotDot,
+ /// Other normal (not special) segments.
+ Normal,
+}
+
+impl SegmentKind {
+ /// Creates a new `SegmentKind` from the given segment name (without slashes), treating `%2E` and `%2e` as a dot.
+ #[must_use]
+ fn from_segment(s: &str) -> Self {
+ match s {
+ "." | "%2E" | "%2e" => SegmentKind::Dot,
+ ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E"
+ | "%2e%2e" => SegmentKind::DotDot,
+ _ => SegmentKind::Normal,
+ }
+ }
+}
+
+/// A segment with optional leading slash.
+#[derive(Debug, Clone)]
+struct PathSegment {
+ /// Presence of a leading slash.
+ has_leading_slash: bool,
+ /// Byte range of the segment name (without any slashes), counted from the start of the whole path including the prefix.
+ range: Range<usize>,
+}
+
+impl PathSegment {
+ /// Returns the segment without any slashes, taken from the prefix or from the rest of `path` depending on where `self.range` lies.
+ #[inline]
+ #[must_use]
+ fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str {
+ if let Some(prefix) = path.0 {
+ let prefix_len = prefix.len();
+ if self.range.end <= prefix_len {
+ &prefix[self.range.clone()]
+ } else {
+ let range = (self.range.start - prefix_len)..(self.range.end - prefix_len); // Rebase onto the second field.
+ &path.1[range]
+ }
+ } else {
+ &path.1[self.range.clone()]
+ }
+ }
+
+ /// Returns the segment kind of the segment name within `path`.
+ #[inline]
+ #[must_use]
+ fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind {
+ SegmentKind::from_segment(self.segment(path))
+ }
+}
+
+/// Iterator of path segments, yielding one `PathSegment` per step.
+struct PathSegmentsIter<'a> {
+ /// Path.
+ path: &'a PathToNormalize<'a>,
+ /// Current cursor position (byte offset into the whole path, prefix included).
+ cursor: usize,
+}
+
+impl<'a> PathSegmentsIter<'a> {
+ /// Creates a new iterator of path segments, starting at the beginning of `path`.
+ #[inline]
+ #[must_use]
+ fn new(path: &'a PathToNormalize<'a>) -> Self {
+ Self { path, cursor: 0 }
+ }
+}
+
+impl Iterator for PathSegmentsIter<'_> {
+ type Item = PathSegment;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ let path_len = self.path.len();
+ if self.cursor >= path_len {
+ return None;
+ }
+ let has_leading_slash = self.path.byte_at(self.cursor) == Some(b'/');
+
+ let prefix_len = self.path.len_prefix();
+ if (prefix_len != 0) && (self.cursor == prefix_len - 1) { // Cursor is on the last byte of the prefix, so the segment name lies in the second field.
+ debug_assert!(has_leading_slash);
+ let end = self.path.1.find('/').unwrap_or(self.path.1.len()) + prefix_len;
+ self.cursor = end;
+ return Some(PathSegment {
+ has_leading_slash,
+ range: prefix_len..end,
+ });
+ }
+
+ if has_leading_slash {
+ // Skip the leading slash.
+ self.cursor += 1;
+ };
+ let start = self.cursor;
+ self.cursor = self.path.find_next_slash(self.cursor).unwrap_or(path_len); // Stop at the next slash, or at the end of the path.
+
+ Some(PathSegment {
+ has_leading_slash,
+ range: start..self.cursor,
+ })
+ }
+}
diff --git a/vendor/iri-string/src/normalize/pct_case.rs b/vendor/iri-string/src/normalize/pct_case.rs
new file mode 100644
index 00000000..75e0a777
--- /dev/null
+++ b/vendor/iri-string/src/normalize/pct_case.rs
@@ -0,0 +1,358 @@
+//! Percent-encoding normalization and case normalization.
+
+use core::cmp::Ordering;
+use core::fmt::{self, Write as _};
+use core::marker::PhantomData;
+
+use crate::format::eq_str_display;
+use crate::parser::char::{is_ascii_unreserved, is_unreserved, is_utf8_byte_continue};
+use crate::parser::str::{find_split_hole, take_first_char};
+use crate::parser::trusted::take_xdigits2;
+use crate::spec::Spec;
+
+/// Returns true if the given string is percent-encoding normalized and case
+/// normalized; `s` is compared with its `PctCaseNormalized` form via `eq_str_display`.
+///
+/// Note that normalization of ASCII-only host requires additional case
+/// normalization, so checking by this function is not sufficient for that case.
+pub(crate) fn is_pct_case_normalized<S: Spec>(s: &str) -> bool {
+ eq_str_display(s, &PctCaseNormalized::<S>::new(s))
+}
+
+/// Decodes a sequence of 2, 3, or 4 bytes into a single character.
+///
+/// Roughly equivalent to
+/// `core::str::from_utf8(bytes).ok().and_then(|s| s.chars().next())`, but this
+/// function trusts the caller that every non-first byte is a UTF-8 continue
+/// byte and that the slice length is the one announced by the first byte.
+///
+/// Returns `Err(())` when the bytes are not a valid UTF-8 encoding of one
+/// character: a redundant (overlong) encoding, a value that is not a Unicode
+/// scalar value, or a 4-byte sequence whose first byte is in `0xF8..=0xFF`.
+///
+/// # Panics
+///
+/// Panics if `bytes.len()` is not 2, 3, or 4.
+fn into_char_trusted(bytes: &[u8]) -> Result<char, ()> {
+ /// The bit mask to get the content part in a continue byte.
+ const CONTINUE_BYTE_MASK: u8 = 0b_0011_1111;
+ /// Minimum valid values for a code point in a UTF-8 sequence of 2, 3, and 4 bytes.
+ const MIN: [u32; 3] = [0x80, 0x800, 0x1_0000];
+
+ let len = bytes.len();
+ let c: u32 = match len {
+ 2 => (u32::from(bytes[0] & 0b_0001_1111) << 6) | u32::from(bytes[1] & CONTINUE_BYTE_MASK),
+ 3 => {
+ (u32::from(bytes[0] & 0b_0000_1111) << 12)
+ | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 6)
+ | u32::from(bytes[2] & CONTINUE_BYTE_MASK)
+ }
+ 4 => {
+ // The formatter in this file selects a 4-byte length for every first
+ // byte in `0xF0..=0xFF`, but only `0b1111_0xxx` can start a 4-byte
+ // UTF-8 sequence. Without this check the mask below silently drops
+ // the extra high bit, e.g. `F8 90 80 80` would decode as U+10000.
+ if bytes[0] & 0b_1111_1000 != 0b_1111_0000 {
+ return Err(());
+ }
+ (u32::from(bytes[0] & 0b_0000_0111) << 18)
+ | (u32::from(bytes[1] & CONTINUE_BYTE_MASK) << 12)
+ | (u32::from(bytes[2] & CONTINUE_BYTE_MASK) << 6)
+ | u32::from(bytes[3] & CONTINUE_BYTE_MASK)
+ }
+ len => unreachable!(
+ "[consistency] expected 2, 3, or 4 bytes for a character, but got {len} as the length"
+ ),
+ };
+ if c < MIN[len - 2] {
+ // Redundant UTF-8 encoding.
+ return Err(());
+ }
+ // Can be an invalid Unicode code point (a surrogate or a value above U+10FFFF).
+ char::from_u32(c).ok_or(())
+}
+
+/// Writable as a percent-encoding normalized and case normalized path segment.
+///
+/// This wrapper does the things below when being formatted:
+///
+/// * Decode unnecessarily percent-encoded characters.
+/// * Convert alphabetic characters to uppercase in percent-encoded triplets.
+///
+/// Note that this does not newly encode raw characters.
+///
+/// # Preconditions
+///
+/// The given string should be the valid path segment.
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct PctCaseNormalized<'a, S> {
+ /// Valid segment name to normalize.
+ segname: &'a str,
+ /// Spec.
+ _spec: PhantomData<fn() -> S>,
+}
+
+impl<'a, S: Spec> PctCaseNormalized<'a, S> {
+ /// Creates a new `PctCaseNormalized` value; the source is only stored here and normalized when formatted.
+ #[inline]
+ #[must_use]
+ pub(crate) fn new(source: &'a str) -> Self {
+ Self {
+ segname: source,
+ _spec: PhantomData,
+ }
+ }
+}
+
+impl<S: Spec> fmt::Display for PctCaseNormalized<'_, S> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let mut rest = self.segname;
+
+ 'outer_loop: while !rest.is_empty() {
+ // Scan the next percent-encoded triplet.
+ let (prefix, after_percent) = match find_split_hole(rest, b'%') {
+ Some(v) => v,
+ None => return f.write_str(rest),
+ };
+ // Write the string before the percent-encoded triplet.
+ f.write_str(prefix)?;
+ // Decode the percent-encoded triplet.
+ let (first_decoded, after_first_triplet) = take_xdigits2(after_percent);
+ rest = after_first_triplet;
+
+ if first_decoded.is_ascii() {
+ if is_ascii_unreserved(first_decoded) {
+ // Unreserved. Print the decoded.
+ f.write_char(char::from(first_decoded))?;
+ } else {
+ write!(f, "%{:02X}", first_decoded)?;
+ }
+ continue 'outer_loop;
+ }
+
+ // Continue byte cannot be the first byte of a character.
+ if is_utf8_byte_continue(first_decoded) {
+ write!(f, "%{:02X}", first_decoded)?;
+ continue 'outer_loop;
+ }
+
+ // Get the expected length of decoded char from the high nibble of the first byte.
+ let expected_char_len = match (first_decoded & 0xf0).cmp(&0b1110_0000) {
+ Ordering::Less => 2,
+ Ordering::Equal => 3,
+ Ordering::Greater => 4,
+ };
+
+ // Get continue bytes.
+ let c_buf = &mut [first_decoded, 0, 0, 0][..expected_char_len];
+ for (i, buf_dest) in c_buf[1..].iter_mut().enumerate() {
+ match take_first_char(rest) {
+ Some(('%', after_percent)) => {
+ let (byte, after_triplet) = take_xdigits2(after_percent);
+ if !is_utf8_byte_continue(byte) {
+ // Note that `byte` can start the new string.
+ // Leave the byte in the `rest` for next try (i.e.
+ // don't update `rest` in this case).
+ c_buf[..=i]
+ .iter()
+ .try_for_each(|b| write!(f, "%{:02X}", b))?;
+ continue 'outer_loop;
+ }
+ *buf_dest = byte;
+ rest = after_triplet;
+ }
+ // If the next character is not `%`, decoded bytes so far
+ // won't be valid UTF-8 byte sequence.
+ // Write the read percent-encoded triplets without decoding.
+ // Note that all characters in `&c_buf[1..]` (if available)
+ // will be decoded to "continue byte" of UTF-8, so they
+ // cannot be the start of a valid UTF-8 byte sequence if
+ // decoded.
+ Some((c, after_percent)) => {
+ c_buf[..=i]
+ .iter()
+ .try_for_each(|b| write!(f, "%{:02X}", b))?;
+ f.write_char(c)?;
+ rest = after_percent;
+ continue 'outer_loop;
+ }
+ None => {
+ c_buf[..=i]
+ .iter()
+ .try_for_each(|b| write!(f, "%{:02X}", b))?;
+ // Reached the end of the string.
+ break 'outer_loop;
+ }
+ }
+ }
+
+ // Decode the bytes into a character.
+ match into_char_trusted(&c_buf[..expected_char_len]) {
+ Ok(decoded_c) => {
+ if is_unreserved::<S>(decoded_c) {
+ // Unreserved. Print the decoded.
+ f.write_char(decoded_c)?;
+ } else {
+ c_buf[0..expected_char_len]
+ .iter()
+ .try_for_each(|b| write!(f, "%{:02X}", b))?;
+ }
+ }
+ Err(_) => {
+ // Skip decoding of the entire sequence of pct-encoded triplets loaded
+ // in `c_buf`. This is valid for the reasons below.
+ //
+ // * The first byte in `c_buf` is valid as the first byte, and it tells the
+ // expected number of bytes for a code unit. The cases the bytes being too
+ // short and the sequence being incomplete have already been handled, and
+ // the execution does not reach here then.
+ // * All of the non-first bytes are checked if they are valid as UTF8 continue
+ // bytes by `is_utf8_byte_continue()`. If they're not, the decoding of
+ // that codepoint is aborted and the bytes in the buffer are immediately
+ // emitted as pct-encoded, and the execution does not reach here. This
+ // means that the bytes in the current `c_buf` have passed these tests.
+ // * Since all of the non-first bytes are UTF8 continue bytes, any of
+ // them cannot start the new valid UTF-8 byte sequence. This means that
+ // if the bytes in the buffer do not constitute a valid UTF-8 byte
+ // sequence, the whole buffer can immediately be emitted as pct-encoded.
+
+ debug_assert!(
+ c_buf[1..expected_char_len]
+ .iter()
+ .copied()
+ .all(is_utf8_byte_continue),
+ "[consistency] all non-first bytes have been \
+ confirmed that they are UTF-8 continue bytes"
+ );
+ // Note that the first pct-encoded triplet is stripped from
+ // `after_first_triplet`, so only the remaining triplets (3 bytes each) are skipped.
+ rest = &after_first_triplet[((expected_char_len - 1) * 3)..];
+ c_buf[0..expected_char_len]
+ .iter()
+ .try_for_each(|b| write!(f, "%{:02X}", b))?;
+ }
+ }
+ }
+
+ Ok(())
+ }
+}
+
+/// Writable as a normalized ASCII-only `host` (optionally followed by a `port`).
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct NormalizedAsciiOnlyHost<'a> {
+ /// Valid host (and optionally port) to normalize.
+ host_port: &'a str,
+}
+
+impl<'a> NormalizedAsciiOnlyHost<'a> {
+ /// Creates a new `NormalizedAsciiOnlyHost` value; the precondition below is not checked here.
+ ///
+ /// # Preconditions
+ ///
+ /// The given string should be the valid ASCII-only `host` or
+ /// `host ":" port` after percent-encoding normalization.
+ /// In other words, [`parser::trusted::is_ascii_only_host`] should return
+ /// true for the given value.
+ ///
+ /// [`parser::trusted::is_ascii_only_host`]: `crate::parser::trusted::is_ascii_only_host`
+ #[inline]
+ #[must_use]
+ pub(crate) fn new(host_port: &'a str) -> Self {
+ Self { host_port }
+ }
+}
+
+impl fmt::Display for NormalizedAsciiOnlyHost<'_> {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ let mut rest = self.host_port;
+
+ while !rest.is_empty() {
+ // Scan the next percent-encoded triplet.
+ let (prefix, after_percent) = match find_split_hole(rest, b'%') {
+ Some(v) => v,
+ None => {
+ return rest
+ .chars()
+ .try_for_each(|c| f.write_char(c.to_ascii_lowercase()));
+ }
+ };
+ // Write the string before the percent-encoded triplet, converted to ASCII lowercase.
+ prefix
+ .chars()
+ .try_for_each(|c| f.write_char(c.to_ascii_lowercase()))?;
+ // Decode the percent-encoded triplet.
+ let (first_decoded, after_triplet) = take_xdigits2(after_percent);
+ rest = after_triplet;
+
+ assert!( // Not a `debug_assert!`: a non-ASCII triplet panics in release builds too.
+ first_decoded.is_ascii(),
+ "[consistency] this function requires ASCII-only host as an argument"
+ );
+
+ if is_ascii_unreserved(first_decoded) {
+ // Unreserved. Convert to lowercase and print.
+ f.write_char(char::from(first_decoded.to_ascii_lowercase()))?;
+ } else {
+ write!(f, "%{:02X}", first_decoded)?; // Kept percent-encoded, with uppercase hex digits.
+ }
+ }
+
+ Ok(())
+ }
+}
+
+#[cfg(test)]
+#[cfg(feature = "alloc")]
+mod tests {
+ use super::*;
+
+ #[cfg(all(feature = "alloc", not(feature = "std")))]
+ use alloc::string::ToString;
+
+ use crate::spec::{IriSpec, UriSpec};
+
+ #[test]
+ fn invalid_utf8() { // Bytes that are not valid UTF-8 stay percent-encoded; only the hex case changes.
+ assert_eq!(
+ PctCaseNormalized::<UriSpec>::new("%80%cc%cc%cc").to_string(),
+ "%80%CC%CC%CC"
+ );
+ assert_eq!(
+ PctCaseNormalized::<IriSpec>::new("%80%cc%cc%cc").to_string(),
+ "%80%CC%CC%CC"
+ );
+ }
+
+ #[test]
+ fn iri_unreserved() { // U+03B1 is decoded under `IriSpec` but stays encoded under `UriSpec`.
+ assert_eq!(
+ PctCaseNormalized::<UriSpec>::new("%ce%b1").to_string(),
+ "%CE%B1"
+ );
+ assert_eq!(
+ PctCaseNormalized::<IriSpec>::new("%ce%b1").to_string(),
+ "\u{03B1}"
+ );
+ }
+
+ #[test]
+ fn iri_middle_decode() { // A decodable character surrounded by stray triplets is still decoded under `IriSpec`.
+ assert_eq!(
+ PctCaseNormalized::<UriSpec>::new("%ce%ce%b1%b1").to_string(),
+ "%CE%CE%B1%B1"
+ );
+ assert_eq!(
+ PctCaseNormalized::<IriSpec>::new("%ce%ce%b1%b1").to_string(),
+ "%CE\u{03B1}%B1"
+ );
+ }
+
+ #[test]
+ fn ascii_reserved() { // `?` stays percent-encoded.
+ assert_eq!(PctCaseNormalized::<UriSpec>::new("%3f").to_string(), "%3F");
+ assert_eq!(PctCaseNormalized::<IriSpec>::new("%3f").to_string(), "%3F");
+ }
+
+ #[test]
+ fn ascii_forbidden() { // `<` and `>` stay percent-encoded.
+ assert_eq!(
+ PctCaseNormalized::<UriSpec>::new("%3c%3e").to_string(),
+ "%3C%3E"
+ );
+ assert_eq!(
+ PctCaseNormalized::<IriSpec>::new("%3c%3e").to_string(),
+ "%3C%3E"
+ );
+ }
+
+ #[test]
+ fn ascii_unreserved() { // `%7e` is decoded to `~`.
+ assert_eq!(PctCaseNormalized::<UriSpec>::new("%7ea").to_string(), "~a");
+ assert_eq!(PctCaseNormalized::<IriSpec>::new("%7ea").to_string(), "~a");
+ }
+}