diff options
Diffstat (limited to 'vendor/iri-string/src/normalize/path.rs')
| -rw-r--r-- | vendor/iri-string/src/normalize/path.rs | 620 |
1 files changed, 620 insertions, 0 deletions
diff --git a/vendor/iri-string/src/normalize/path.rs b/vendor/iri-string/src/normalize/path.rs new file mode 100644 index 00000000..4f3e3397 --- /dev/null +++ b/vendor/iri-string/src/normalize/path.rs @@ -0,0 +1,620 @@ +//! Path normalization. + +use core::fmt; +use core::ops::Range; + +use crate::parser::str::{find_split_hole, rfind}; +use crate::spec::{Spec, UriSpec}; + +use super::pct_case::PctCaseNormalized; +use super::{Error, NormalizationMode, NormalizationOp}; + +/// Path that is (possibly) not yet processed or being processed. +#[derive(Debug, Clone, Copy)] +pub(crate) enum Path<'a> { + /// The result. No more processing is needed. + Done(&'a str), + /// Not yet completely processed path. + NeedsProcessing(PathToNormalize<'a>), +} + +/// Path that needs merge and/or dot segment removal. +/// +/// # Invariants +/// +/// If the first field (prefix field) is not `None`, it must end with a slash. +#[derive(Debug, Clone, Copy)] +pub(crate) struct PathToNormalize<'a>(Option<&'a str>, &'a str); + +impl<'a> PathToNormalize<'a> { + /// Creates a `PathToNormalize` from the given single path. + #[inline] + #[must_use] + pub(crate) fn from_single_path(path: &'a str) -> Self { + Self(None, path) + } + + /// Creates a `PathToNormalize` from the given base and reference paths to be resolved. + #[must_use] + pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self { + if reference.starts_with('/') { + return Self(None, reference); + } + + match rfind(base.as_bytes(), b'/') { + Some(last_slash_pos) => Self(Some(&base[..=last_slash_pos]), reference), + None => Self(None, reference), + } + } + + /// Returns true if the path is empty string. + #[inline] + #[must_use] + fn is_empty(&self) -> bool { + // If `self.0` is `Some(_)`, it ends with a slash, i.e. it is not empty. + self.0.is_none() && self.1.is_empty() + } + + /// Returns the length of the not yet normalized path. + #[inline] + #[must_use] + pub(super) fn len(&self) -> usize { + self.len_prefix() + self.1.len() + } + + /// Returns the length of the prefix part. + /// + /// Returns 0 if the prefix part is empty. + #[inline] + #[must_use] + fn len_prefix(&self) -> usize { + self.0.map_or(0, |s| s.len()) + } + + /// Returns a byte at the given position. + #[must_use] + fn byte_at(&self, mut i: usize) -> Option<u8> { + if let Some(prefix) = self.0 { + if i < prefix.len() { + return Some(prefix.as_bytes()[i]); + } + i -= prefix.len(); + } + self.1.as_bytes().get(i).copied() + } + + /// Returns the position of the next slash of the byte at the given position. + #[must_use] + fn find_next_slash(&self, scan_start: usize) -> Option<usize> { + if let Some(prefix) = self.0 { + let prefix_len = prefix.len(); + if scan_start < prefix_len { + prefix[scan_start..].find('/').map(|rel| rel + scan_start) + } else { + let local_i = scan_start - prefix_len; + self.1[local_i..].find('/').map(|rel| rel + scan_start) + } + } else { + self.1[scan_start..].find('/').map(|rel| rel + scan_start) + } + } + + /// Removes the `len` characters from the beginning of `self`. + fn remove_start(&mut self, len: usize) { + if let Some(prefix) = self.0 { + if let Some(suffix_trim_len) = len.checked_sub(prefix.len()) { + self.0 = None; + self.1 = &self.1[suffix_trim_len..]; + } else { + self.0 = Some(&prefix[len..]); + } + } else { + self.1 = &self.1[len..]; + } + } + + /// Removes the prefix that are ignorable on normalization. + // Skips the prefix dot segments without leading slashes (such as `./`, + // `../`, and `../.././`). + // This is necessary because such segments should be removed with the + // FOLLOWING slashes, not leading slashes. + fn remove_ignorable_prefix(&mut self) { + while let Some(seg) = PathSegmentsIter::new(self).next() { + if seg.has_leading_slash { + // The first segment starting with a slash is not target. + break; + } + match seg.kind(self) { + SegmentKind::Dot | SegmentKind::DotDot => { + // Attempt to skip the following slash by `+ 1`. + let skip = self.len().min(seg.range.end + 1); + self.remove_start(skip); + } + SegmentKind::Normal => break, + } + } + } +} + +impl PathToNormalize<'_> { + /// Writes the normalized path. + pub(crate) fn fmt_write_normalize<S: Spec, W: fmt::Write>( + &self, + f: &mut W, + op: NormalizationOp, + authority_is_present: bool, + ) -> fmt::Result { + debug_assert!( + self.0.map_or(true, |s| s.ends_with('/')), + "[validity] the prefix field of `PathToNormalize` should end with a slash" + ); + + if self.is_empty() { + return Ok(()); + } + + if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath) + && !authority_is_present + && self.byte_at(0) != Some(b'/') + { + // Treat the path as "opaque", i.e. do not apply dot segments removal. + // See <https://github.com/lo48576/iri-string/issues/29>. + debug_assert!( + op.mode.case_pct_normalization(), + "[consistency] case/pct normalization should still be applied" + ); + if let Some(prefix) = self.0 { + write!(f, "{}", PctCaseNormalized::<S>::new(prefix))?; + } + write!(f, "{}", PctCaseNormalized::<S>::new(self.1))?; + return Ok(()); + } + + let mut rest = *self; + + // Skip the prefix dot segments without leading slashes (such as `./`, + // `../`, and `../.././`). + // This is necessary because such segments should be removed with the + // FOLLOWING slashes, not leading slashes. + rest.remove_ignorable_prefix(); + if rest.is_empty() { + // Path consists of only `/.`s and `/..`s. + // In this case, if the authority component is present, the result + // should be `/`, not empty. + if authority_is_present { + f.write_char('/')?; + } + return Ok(()); + } + + // None: No segments are written yet. + // Some(false): Something other than `/` is already written as the path. + // Some(true): Only a `/` is written as the path. + let mut only_a_slash_is_written = None; + let mut too_deep_area_may_have_dot_segments = true; + while !rest.is_empty() && too_deep_area_may_have_dot_segments { + /// The size of the queue to track the path segments. + /// + /// This should be nonzero. + const QUEUE_SIZE: usize = 8; + + { + // Skip `/.` and `/..` segments at the head. + let mut skipped_len = 0; + for seg in PathSegmentsIter::new(&rest) { + match seg.kind(&rest) { + SegmentKind::Dot | SegmentKind::DotDot => { + debug_assert!( + seg.has_leading_slash, + "[consistency] `.` or `..` segments without a + leading slash have already been skipped" + ); + skipped_len = seg.range.end; + } + _ => break, + } + } + rest.remove_start(skipped_len); + if rest.is_empty() { + // Finished with a dot segment. + // The last `/.` or `/..` should be replaced to `/`. + if !authority_is_present && (only_a_slash_is_written == Some(true)) { + // Insert a dot segment to break the prefix `//`. + // Without this, the path starts with `//` and it may + // be confused with the prefix of an authority. + f.write_str(".//")?; + } else { + f.write_char('/')?; + } + break; + } + } + + let mut queue: [Option<&'_ str>; QUEUE_SIZE] = Default::default(); + let mut level: usize = 0; + let mut first_segment_has_leading_slash = false; + + // Find higher path segments. + let mut end = 0; + for seg in PathSegmentsIter::new(&rest) { + let kind = seg.kind(&rest); + match kind { + SegmentKind::Dot => { + too_deep_area_may_have_dot_segments = true; + } + SegmentKind::DotDot => { + level = level.saturating_sub(1); + too_deep_area_may_have_dot_segments = true; + if level < queue.len() { + queue[level] = None; + } + } + SegmentKind::Normal => { + if level < queue.len() { + queue[level] = Some(seg.segment(&rest)); + too_deep_area_may_have_dot_segments = false; + end = seg.range.end; + if level == 0 { + first_segment_has_leading_slash = seg.has_leading_slash; + } + } + level += 1; + } + } + } + + // Write the path segments as possible, and update the internal state. + for segname in queue.iter().flatten() { + Self::emit_segment::<S, _>( + f, + &mut only_a_slash_is_written, + first_segment_has_leading_slash, + segname, + authority_is_present, + op, + )?; + } + + rest.remove_start(end); + } + + if !rest.is_empty() { + // No need of searching dot segments anymore. + assert!( + !too_deep_area_may_have_dot_segments, + "[consistency] loop condition of the previous loop" + ); + // Apply only normalization (if needed). + for seg in PathSegmentsIter::new(&rest) { + assert_eq!( + seg.kind(&rest), + SegmentKind::Normal, + "[consistency] already confirmed that there are no more dot segments" + ); + let segname = seg.segment(&rest); + Self::emit_segment::<S, _>( + f, + &mut only_a_slash_is_written, + seg.has_leading_slash, + segname, + authority_is_present, + op, + )?; + } + } + + Ok(()) + } + + /// Emits a non-dot segment and update the current state. + // + // `first_segment_has_leading_slash` can be any value if the segment is not the first one. + fn emit_segment<S: Spec, W: fmt::Write>( + f: &mut W, + only_a_slash_is_written: &mut Option<bool>, + first_segment_has_leading_slash: bool, + segname: &str, + authority_is_present: bool, + op: NormalizationOp, + ) -> fmt::Result { + // Omit the leading slash of the segment only if the segment is + // the first one and marked as not having a leading slash. + match *only_a_slash_is_written { + None => { + // First segment. + // This pass can be possible if `./` is repeated `QUEUE_SIZE` + // times at the beginning. + if first_segment_has_leading_slash { + f.write_char('/')?; + } + *only_a_slash_is_written = + Some(first_segment_has_leading_slash && segname.is_empty()); + } + Some(only_a_slash) => { + if only_a_slash && !authority_is_present { + // Apply serialization like WHATWG URL Standard. + // This prevents `<scheme=foo>:<path=//bar>` from written as + // `foo://bar`, which is interpreted as + // `<scheme=foo>://<authority=bar>`. Prepending `./`, the + // serialization result would be `foo:/.//bar`, which is safe. + f.write_str("./")?; + *only_a_slash_is_written = Some(false); + } + f.write_char('/')?; + } + } + + // Write the segment name. + if op.mode.case_pct_normalization() { + write!(f, "{}", PctCaseNormalized::<S>::new(segname)) + } else { + f.write_str(segname) + } + } + + /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent. + /// + /// Returns `Ok(())` when normalizable, returns `Err(_)` if not. + pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> { + /// A sink to get the prefix of the input. + #[derive(Default)] + struct PrefixRetriever { + /// The buffer to remember the prefix of the input. + buf: [u8; 3], + /// The next write position in the buffer. + cursor: usize, + } + impl PrefixRetriever { + /// Returns the read prefix data. + #[inline] + #[must_use] + fn as_bytes(&self) -> &[u8] { + &self.buf[..self.cursor] + } + } + impl fmt::Write for PrefixRetriever { + fn write_str(&mut self, s: &str) -> fmt::Result { + if !s.is_empty() && (self.cursor >= self.buf.len()) { + // Enough bytes are read. + return Err(fmt::Error); + } + self.buf[self.cursor..] + .iter_mut() + .zip(s.bytes()) + .for_each(|(dest, src)| *dest = src); + self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len()); + Ok(()) + } + } + + let mut prefix = PrefixRetriever::default(); + // The failure of this write indicates more than 3 characters are read. + // This is safe to ignore since the check needs only 3 characters. + let _ = self.fmt_write_normalize::<UriSpec, _>( + &mut prefix, + NormalizationOp { + mode: NormalizationMode::None, + }, + // Assume the authority is absent. + false, + ); + + if prefix.as_bytes() == b"/./" { + Err(Error::new()) + } else { + Ok(()) + } + } +} + +/// Characteristic of a path. +#[derive(Debug, Clone, Copy)] +pub(crate) enum PathCharacteristic { + /// Absolute path, not special. + CommonAbsolute, + /// Absolute path, not special. + CommonRelative, + /// The first path segment of the relative path has one or more colon characters. + RelativeFirstSegmentHasColon, + /// The path starts with the double slash. + StartsWithDoubleSlash, +} + +impl PathCharacteristic { + /// Returns true if the path is absolute. + #[inline] + #[must_use] + pub(crate) fn is_absolute(self) -> bool { + matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash) + } + + /// Returns the characteristic of the path. + pub(crate) fn from_path_to_display<S: Spec>( + path: &PathToNormalize<'_>, + op: NormalizationOp, + authority_is_present: bool, + ) -> Self { + /// Dummy writer to get necessary values. + #[derive(Default, Clone, Copy)] + struct Writer { + /// Result. + result: Option<PathCharacteristic>, + /// Whether the normalized path is absolute. + is_absolute: Option<bool>, + } + impl fmt::Write for Writer { + fn write_str(&mut self, mut s: &str) -> fmt::Result { + if self.result.is_some() { + // Nothing more to do. + return Err(fmt::Error); + } + while !s.is_empty() { + if self.is_absolute.is_none() { + // The first input. + match s.strip_prefix('/') { + Some(rest) => { + self.is_absolute = Some(true); + s = rest; + } + None => { + self.is_absolute = Some(false); + } + } + continue; + } + if self.is_absolute == Some(true) { + let result = if s.starts_with('/') { + PathCharacteristic::StartsWithDoubleSlash + } else { + PathCharacteristic::CommonAbsolute + }; + self.result = Some(result); + return Err(fmt::Error); + } + // Processing the first segment of the relative path. + match find_split_hole(s, b'/') { + Some((first_seg, _rest)) => { + let result = if first_seg.contains(':') { + PathCharacteristic::RelativeFirstSegmentHasColon + } else { + PathCharacteristic::CommonRelative + }; + self.result = Some(result); + return Err(fmt::Error); + } + None => { + // `s` might not be the complete first segment. + if s.contains(':') { + self.result = + Some(PathCharacteristic::RelativeFirstSegmentHasColon); + return Err(fmt::Error); + } + break; + } + } + } + Ok(()) + } + } + + let mut writer = Writer::default(); + match path.fmt_write_normalize::<S, _>(&mut writer, op, authority_is_present) { + // Empty path. + Ok(_) => PathCharacteristic::CommonRelative, + Err(_) => writer + .result + .expect("[consistency] the formatting quits early by `Err` when the check is done"), + } + } +} + +/// Path segment kind. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum SegmentKind { + /// `.` or the equivalents. + Dot, + /// `..` or the equivalents. + DotDot, + /// Other normal (not special) segments. + Normal, +} + +impl SegmentKind { + /// Creates a new `SegmentKind` from the given segment name. + #[must_use] + fn from_segment(s: &str) -> Self { + match s { + "." | "%2E" | "%2e" => SegmentKind::Dot, + ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E" + | "%2e%2e" => SegmentKind::DotDot, + _ => SegmentKind::Normal, + } + } +} + +/// A segment with optional leading slash. +#[derive(Debug, Clone)] +struct PathSegment { + /// Presence of a leading slash. + has_leading_slash: bool, + /// Range of the segment name (without any slashes). + range: Range<usize>, +} + +impl PathSegment { + /// Returns the segment without any slashes. + #[inline] + #[must_use] + fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str { + if let Some(prefix) = path.0 { + let prefix_len = prefix.len(); + if self.range.end <= prefix_len { + &prefix[self.range.clone()] + } else { + let range = (self.range.start - prefix_len)..(self.range.end - prefix_len); + &path.1[range] + } + } else { + &path.1[self.range.clone()] + } + } + + /// Returns the segment kind. + #[inline] + #[must_use] + fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind { + SegmentKind::from_segment(self.segment(path)) + } +} + +/// Iterator of path segments. +struct PathSegmentsIter<'a> { + /// Path. + path: &'a PathToNormalize<'a>, + /// Current cursor position. + cursor: usize, +} + +impl<'a> PathSegmentsIter<'a> { + /// Creates a new iterator of path segments. + #[inline] + #[must_use] + fn new(path: &'a PathToNormalize<'a>) -> Self { + Self { path, cursor: 0 } + } +} + +impl Iterator for PathSegmentsIter<'_> { + type Item = PathSegment; + + fn next(&mut self) -> Option<Self::Item> { + let path_len = self.path.len(); + if self.cursor >= path_len { + return None; + } + let has_leading_slash = self.path.byte_at(self.cursor) == Some(b'/'); + + let prefix_len = self.path.len_prefix(); + if (prefix_len != 0) && (self.cursor == prefix_len - 1) { + debug_assert!(has_leading_slash); + let end = self.path.1.find('/').unwrap_or(self.path.1.len()) + prefix_len; + self.cursor = end; + return Some(PathSegment { + has_leading_slash, + range: prefix_len..end, + }); + } + + if has_leading_slash { + // Skip the leading slash. + self.cursor += 1; + }; + let start = self.cursor; + self.cursor = self.path.find_next_slash(self.cursor).unwrap_or(path_len); + + Some(PathSegment { + has_leading_slash, + range: start..self.cursor, + }) + } +} |
