diff options
Diffstat (limited to 'vendor/iri-string/src/parser/trusted.rs')
| -rw-r--r-- | vendor/iri-string/src/parser/trusted.rs | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/vendor/iri-string/src/parser/trusted.rs b/vendor/iri-string/src/parser/trusted.rs new file mode 100644 index 00000000..f15c075e --- /dev/null +++ b/vendor/iri-string/src/parser/trusted.rs @@ -0,0 +1,476 @@ +//! Fast parsers for trusted (already validated) input. +//! +//! Using this in wrong way will lead to unexpected wrong result. + +pub(crate) mod authority; + +use core::cmp::Ordering; +use core::num::NonZeroUsize; + +use crate::components::{RiReferenceComponents, Splitter}; +use crate::format::eq_str_display; +use crate::normalize::{is_pct_case_normalized, NormalizedAsciiOnlyHost, NormalizednessCheckMode}; +use crate::parser::str::{find_split2, find_split3, find_split4_hole, find_split_hole}; +use crate::spec::Spec; +use crate::types::RiReferenceStr; + +/// Eats a `scheme` and a following colon, and returns the rest and the scheme. +/// +/// Returns `(rest, scheme)`. +/// +/// This should be called at the head of an absolute IRIs/URIs. +#[must_use] +fn scheme_colon(i: &str) -> (&str, &str) { + let (scheme, rest) = + find_split_hole(i, b':').expect("[precondition] absolute IRIs must have `scheme` part"); + (rest, scheme) +} + +/// Eats a `scheme` and a following colon if available, and returns the rest and the scheme. +/// +/// This should be called at the head of an `IRI-reference` or similar. +#[must_use] +fn scheme_colon_opt(i: &str) -> (&str, Option<&str>) { + match find_split4_hole(i, b':', b'/', b'?', b'#') { + Some((scheme, b':', rest)) => (rest, Some(scheme)), + _ => (i, None), + } +} + +/// Eats double slash and the following authority if available, and returns the authority. +/// +/// This should be called at the head of an `IRI-reference`, or at the result of `scheme_colon`. +#[must_use] +fn slash_slash_authority_opt(i: &str) -> (&str, Option<&str>) { + let s = match i.strip_prefix("//") { + Some(rest) => rest, + None => return (i, None), + }; + // `i` might match `path-abempty` (which can start with `//`), but it is not + // allowed as `relative-part`, so no need to care `path-abempty` rule here. + // A slash, question mark, and hash character won't appear in `authority`. + match find_split3(s, b'/', b'?', b'#') { + Some((authority, rest)) => (rest, Some(authority)), + None => ("", Some(s)), + } +} + +/// Eats a string until the query, and returns that part (excluding `?` for the query). +#[must_use] +fn until_query(i: &str) -> (&str, &str) { + // `?` won't appear before the query part. + match find_split2(i, b'?', b'#') { + Some((before_query, rest)) => (rest, before_query), + None => ("", i), + } +} + +/// Decomposes query and fragment, if available. +/// +/// The string must starts with `?`, or `#`, or be empty. +#[must_use] +fn decompose_query_and_fragment(i: &str) -> (Option<&str>, Option<&str>) { + match i.as_bytes().first().copied() { + None => (None, None), + Some(b'?') => { + let rest = &i[1..]; + match find_split_hole(rest, b'#') { + Some((query, fragment)) => (Some(query), Some(fragment)), + None => (Some(rest), None), + } + } + Some(c) => { + debug_assert_eq!(c, b'#'); + (None, Some(&i[1..])) + } + } +} + +/// Decomposes the given valid `IRI-reference`. +#[must_use] +pub(crate) fn decompose_iri_reference<S: Spec>( + i: &RiReferenceStr<S>, +) -> RiReferenceComponents<'_, S> { + /// Inner function to avoid unnecessary monomorphizations on `S`. + fn decompose(i: &str) -> Splitter { + let len = i.len(); + + let (i, scheme_end) = { + let (i, scheme) = scheme_colon_opt(i); + let end = scheme.and_then(|s| NonZeroUsize::new(s.len())); + (i, end) + }; + let (i, authority_end) = { + // 2: "//".len() + let start = len - i.len() + 2; + // `authority` does not contain the two slashes of `://'. + let (i, authority) = slash_slash_authority_opt(i); + let end = authority.and_then(|s| NonZeroUsize::new(start + s.len())); + (i, end) + }; + let (i, _path) = until_query(i); + + let (query_start, fragment_start) = { + // This could theoretically be zero if `len` is `usize::MAX` and + // `i` has neither a query nor a fragment. However, this is + // practically impossible. + let after_first_prefix = NonZeroUsize::new((len - i.len()).wrapping_add(1)); + + let (query, fragment) = decompose_query_and_fragment(i); + match (query.is_some(), fragment) { + (true, Some(fragment)) => { + (after_first_prefix, NonZeroUsize::new(len - fragment.len())) + } + (true, None) => (after_first_prefix, None), + (false, Some(_fragment)) => (None, after_first_prefix), + (false, None) => (None, None), + } + }; + + Splitter::new(scheme_end, authority_end, query_start, fragment_start) + } + + RiReferenceComponents { + iri: i, + splitter: decompose(i.as_str()), + } +} + +/// Extracts `scheme` part from an IRI reference. +/// +/// # Precondition +/// +/// The given string must be a valid IRI reference. +#[inline] +#[must_use] +pub(crate) fn extract_scheme(i: &str) -> Option<&str> { + scheme_colon_opt(i).1 +} + +/// Extracts `scheme` part from an absolute IRI. +/// +/// # Precondition +/// +/// The given string must be a valid absolute IRI. +#[inline] +#[must_use] +pub(crate) fn extract_scheme_absolute(i: &str) -> &str { + scheme_colon(i).1 +} + +/// Extracts `authority` part from an IRI reference. +/// +/// # Precondition +/// +/// The given string must be a valid IRI reference. +#[inline] +#[must_use] +pub(crate) fn extract_authority(i: &str) -> Option<&str> { + let (i, _scheme) = scheme_colon_opt(i); + slash_slash_authority_opt(i).1 +} + +/// Extracts `authority` part from an absolute IRI. +/// +/// # Precondition +/// +/// The given string must be a valid absolute IRI. +#[inline] +#[must_use] +pub(crate) fn extract_authority_absolute(i: &str) -> Option<&str> { + let (i, _scheme) = scheme_colon(i); + slash_slash_authority_opt(i).1 +} + +/// Extracts `authority` part from a relative IRI. +/// +/// # Precondition +/// +/// The given string must be a valid relative IRI. +#[inline] +#[must_use] +pub(crate) fn extract_authority_relative(i: &str) -> Option<&str> { + slash_slash_authority_opt(i).1 +} + +/// Extracts `path` part from an IRI reference. +/// +/// # Precondition +/// +/// The given string must be a valid IRI reference. +#[inline] +#[must_use] +pub(crate) fn extract_path(i: &str) -> &str { + let (i, _scheme) = scheme_colon_opt(i); + let (i, _authority) = slash_slash_authority_opt(i); + until_query(i).1 +} + +/// Extracts `path` part from an absolute IRI. +/// +/// # Precondition +/// +/// The given string must be a valid absolute IRI. +#[inline] +#[must_use] +pub(crate) fn extract_path_absolute(i: &str) -> &str { + let (i, _scheme) = scheme_colon(i); + let (i, _authority) = slash_slash_authority_opt(i); + until_query(i).1 +} + +/// Extracts `path` part from a relative IRI. +/// +/// # Precondition +/// +/// The given string must be a valid relative IRI. +#[inline] +#[must_use] +pub(crate) fn extract_path_relative(i: &str) -> &str { + let (i, _authority) = slash_slash_authority_opt(i); + until_query(i).1 +} + +/// Extracts `query` part from an IRI reference. +/// +/// # Precondition +/// +/// The given string must be a valid IRI reference. +#[inline] +#[must_use] +pub(crate) fn extract_query(i: &str) -> Option<&str> { + let (i, _before_query) = until_query(i); + decompose_query_and_fragment(i).0 +} + +/// Extracts `query` part from an `absolute-IRI` string. +/// +/// # Precondition +/// +/// The given string must be a valid `absolute-IRI` string. +#[must_use] +pub(crate) fn extract_query_absolute_iri(i: &str) -> Option<&str> { + let (i, _before_query) = until_query(i); + if i.is_empty() { + None + } else { + debug_assert_eq!( + i.as_bytes().first(), + Some(&b'?'), + "`absolute-IRI` string must not have `fragment part" + ); + Some(&i[1..]) + } +} + +/// Splits an IRI string into the prefix and the fragment part. +/// +/// A leading `#` character is truncated if the fragment part exists. +/// +/// # Precondition +/// +/// The given string must be a valid IRI reference. +#[inline] +#[must_use] +pub(crate) fn split_fragment(iri: &str) -> (&str, Option<&str>) { + // It is completely OK to find the first `#` character from valid IRI to get fragment part, + // because the spec says that there are no `#` characters before the fragment part. + // + // > ``` + // > scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + // > ``` + // > + // > --- [RFC 3986, section 3.1. Scheme](https://tools.ietf.org/html/rfc3986#section-3.1) + // + // > The authority component is preceded by a double slash ("//") and is terminated by the + // > next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end + // > of the URI. + // > + // > --- [RFC 3986, section 3.2. Authority](https://tools.ietf.org/html/rfc3986#section-3.2) + // + // > The path is terminated by the first question mark ("?") or number sign ("#") + // > character, or by the end of the URI. + // > + // > --- [RFC 3986, section 3.3. Path](https://tools.ietf.org/html/rfc3986#section-3.3) + // + // > The query component is indicated by the first question mark ("?") character and + // > terminated by a number sign ("#") character or by the end of the URI. + // > + // > --- [RFC 3986, section 3.4. Query](https://tools.ietf.org/html/rfc3986#section-3.4) + match find_split_hole(iri, b'#') { + Some((prefix, fragment)) => (prefix, Some(fragment)), + None => (iri, None), + } +} + +/// Returns the fragment part of the given IRI. +/// +/// A leading `#` character of the fragment is truncated. +#[inline] +#[must_use] +pub(crate) fn extract_fragment(iri: &str) -> Option<&str> { + split_fragment(iri).1 +} + +/// Returns `Ok(_)` if the string is normalized. +/// +/// If this function returns `true`, normalization input and output will be identical. +/// +/// In this function, "normalized" means that any of the normalization below +/// won't change the input on normalization: +/// +/// * syntax-based normalization, +/// * case normalization, +/// * percent-encoding normalization, and +/// * path segment normalizaiton. +/// +/// Note that scheme-based normalization is not considered. +#[must_use] +pub(crate) fn is_normalized<S: Spec>(i: &str, mode: NormalizednessCheckMode) -> bool { + let (i, scheme) = scheme_colon(i); + let (after_authority, authority) = slash_slash_authority_opt(i); + let (_after_path, path) = until_query(after_authority); + + // Syntax-based normalization: uppercase chars in `scheme` should be + // converted to lowercase. + if scheme.bytes().any(|b| b.is_ascii_uppercase()) { + return false; + } + + // Case normalization: ASCII alphabets in US-ASCII only `host` should be + // normalized to lowercase. + // Case normalization: ASCII alphabets in percent-encoding triplet should be + // normalized to uppercase. + // Percent-encoding normalization: unresreved characters should be decoded + // in `userinfo`, `host`, `path`, `query`, and `fragments`. + // Path segment normalization: the path should not have dot segments (`.` + // and/or `..`). + // + // Note that `authority` can have percent-encoded `userinfo`. + if let Some(authority) = authority { + let authority_components = authority::decompose_authority(authority); + + // Check `host`. + let host = authority_components.host(); + let host_is_normalized = if is_ascii_only_host(host) { + eq_str_display(host, &NormalizedAsciiOnlyHost::new(host)) + } else { + // If the host is not ASCII-only, conversion to lowercase is not performed. + is_pct_case_normalized::<S>(host) + }; + if !host_is_normalized { + return false; + } + + // Check pencent encodings in `userinfo`. + if let Some(userinfo) = authority_components.userinfo() { + if !is_pct_case_normalized::<S>(userinfo) { + return false; + } + } + } + + // Check `path`. + // + // Syntax-based normalization: Dot segments might be removed. + // Note that we don't have to care `%2e` and `%2E` since `.` is unreserved + // and they will be decoded if not normalized. + // Also note that WHATWG serialization will use `/.//` as a path prefix if + // the path is absolute and won't modify the path if the path is relative. + // + // Percent-encoding normalization: unresreved characters should be decoded + // in `path`, `query`, and `fragments`. + let path_span_no_dot_segments = if authority.is_some() { + Some(path) + } else { + match mode { + NormalizednessCheckMode::Default => Some(path.strip_prefix("/.//").unwrap_or(path)), + NormalizednessCheckMode::Rfc3986 => Some(path), + NormalizednessCheckMode::PreserveAuthoritylessRelativePath => { + if path.starts_with('/') { + // Absolute. + Some(path.strip_prefix("/.//").unwrap_or(path)) + } else { + // Relative. Treat the path as "opaque". No span to check. + None + } + } + } + }; + if let Some(path_span_no_dot_segments) = path_span_no_dot_segments { + if path_span_no_dot_segments + .split('/') + .any(|segment| matches!(segment, "." | "..")) + { + return false; + } + } + is_pct_case_normalized::<S>(after_authority) +} + +/// Decodes two hexdigits into a byte. +/// +/// # Preconditions +/// +/// The parameters `upper` and `lower` should be an ASCII hexadecimal digit. +#[must_use] +pub(super) fn hexdigits_to_byte([upper, lower]: [u8; 2]) -> u8 { + let i_upper = match (upper & 0xf0).cmp(&0x40) { + Ordering::Less => upper - b'0', + Ordering::Equal => upper - (b'A' - 10), + Ordering::Greater => upper - (b'a' - 10), + }; + let i_lower = match (lower & 0xf0).cmp(&0x40) { + Ordering::Less => lower - b'0', + Ordering::Equal => lower - (b'A' - 10), + Ordering::Greater => lower - (b'a' - 10), + }; + (i_upper << 4) + i_lower +} + +/// Converts the first two hexdigit bytes in the buffer into a byte. +/// +/// # Panics +/// +/// Panics if the string does not start with two hexdigits. +#[must_use] +pub(crate) fn take_xdigits2(s: &str) -> (u8, &str) { + let mut bytes = s.bytes(); + let upper_xdigit = bytes + .next() + .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference"); + let lower_xdigit = bytes + .next() + .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference"); + let v = hexdigits_to_byte([upper_xdigit, lower_xdigit]); + (v, &s[2..]) +} + +/// Returns true if the given `host`/`ihost` string consists of only US-ASCII characters. +/// +/// # Precondition +/// +/// The given string should be valid `host` or `host ":" port` string. +#[must_use] +pub(crate) fn is_ascii_only_host(mut host: &str) -> bool { + while let Some((i, c)) = host + .char_indices() + .find(|(_i, c)| !c.is_ascii() || *c == '%') + { + if c != '%' { + // Non-ASCII character found. + debug_assert!(!c.is_ascii()); + return false; + } + // Percent-encoded character found. + let after_pct = &host[(i + 1)..]; + let (byte, rest) = take_xdigits2(after_pct); + if !byte.is_ascii() { + return false; + } + host = rest; + } + + // Neither non-ASCII characters nor percent-encoded characters found. + true +} |
