//! Normalization. //! //! # IRI normalization (and resolution) can fail //! //! Though this is not explicitly stated in RFC 3986, IRI normalization can fail. //! For example, `foo:.///bar`, `foo:./..//bar`, and `foo:/..//bar` are all //! normalized to `foo://bar` as a string. However, IRI without authority (note //! that this is different from "with empty authority") cannot have a path //! starting with `//`, since it is ambiguous and can be interpreted as an IRI //! with authority. So, `foo://bar` is decomposed as scheme `foo`, authority //! `bar`, and empty path. The expected result is the combination of scheme //! `foo`, no authority, and path `//bar` (though this is not possible to //! serialize), so the algorithm fails as it cannot return the intended result. //! //! IRI resolution can also fail since it (conditionally) invokes normalization //! during the resolution process. For example, resolving a reference `.///bar` //! or `/..//bar` against the base `foo:` fail. //! //! Thus, IRI resolution can fail for some abnormal cases. //! //! Note that this kind of failure can happen only when the base IRI has no //! authority and empty path. This would be rare in the wild, since many people //! would use an IRI with authority part, such as `http://`. //! //! If you are handling `scheme://`-style URIs and IRIs, don't worry about the //! failure. Currently no cases are known to fail when at least one of the base //! IRI or the relative IRI contains authorities. //! //! To know what will happen on resolution failure, see the module documentation //! for [`resolve`][`crate::resolve`]. //! //! ## Examples //! //! ### Normalization failure //! //! ``` //! # #[cfg(feature = "alloc")] { //! use iri_string::normalize::Error; //! use iri_string::types::{IriAbsoluteStr, IriReferenceStr}; //! //! let base = IriAbsoluteStr::new("foo:.///bar")?; //! assert!( //! base.normalize().ensure_rfc3986_normalizable().is_err(), //! "this normalization should fails without WAHTWG URL Standard serialization" //! ); //! # } //! # Ok::<_, iri_string::validate::Error>(()) //! ``` //! //! ### Resolution failure //! //! ``` //! # #[cfg(feature = "alloc")] { //! use iri_string::types::{IriAbsoluteStr, IriReferenceStr}; //! //! let base = IriAbsoluteStr::new("scheme:")?; //! { //! let reference = IriReferenceStr::new(".///bar")?; //! let result = reference.resolve_against(base) //! .ensure_rfc3986_normalizable(); //! assert!(result.is_err()); //! } //! //! { //! let reference2 = IriReferenceStr::new("/..//bar")?; //! // Resulting string will be `scheme://bar`, but `bar` should be a path //! // segment, not a host. So, the semantically correct target IRI cannot //! // be represented. //! let result2 = reference2.resolve_against(base) //! .ensure_rfc3986_normalizable(); //! assert!(result2.is_err()); //! } //! # } //! # Ok::<_, iri_string::validate::Error>(()) //! ``` mod error; mod path; mod pct_case; use core::fmt::{self, Display as _, Write as _}; use core::marker::PhantomData; #[cfg(feature = "alloc")] use alloc::collections::TryReserveError; use crate::components::{RiReferenceComponents, Splitter}; #[cfg(feature = "alloc")] use crate::format::{ToDedicatedString, ToStringFallible}; use crate::parser::str::rfind_split_hole; use crate::parser::trusted::is_ascii_only_host; use crate::spec::Spec; use crate::types::{RiAbsoluteStr, RiReferenceStr, RiStr}; #[cfg(feature = "alloc")] use crate::types::{RiAbsoluteString, RiString}; pub use self::error::Error; pub(crate) use self::path::{Path, PathCharacteristic, PathToNormalize}; pub(crate) use self::pct_case::{ is_pct_case_normalized, NormalizedAsciiOnlyHost, PctCaseNormalized, }; /// Normalization algorithm. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) enum NormalizationMode { /// No normalization. None, /// Default normalization mode. /// /// Applies RFC 3986 normalization whenever possible. When not possible, /// applies serialization algorithm defined in WHATWG URL standard. Default, /// WHATWG-like normalization mode. /// /// Preserves relative path as is (modulo case/pct normalization) when the /// authority component is absent. PreserveAuthoritylessRelativePath, } impl NormalizationMode { /// Returns true if case normalization and percent-encoding normalization should be applied. /// /// Note that even when this option is `true`, plain US-ASCII characters /// won't be automatically lowered. Users should apply case normalization /// for US-ASCII only `host` component by themselves. #[inline] #[must_use] fn case_pct_normalization(self) -> bool { match self { Self::None => false, Self::Default | Self::PreserveAuthoritylessRelativePath => true, } } } /// Normalizedness check algorithm. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) enum NormalizednessCheckMode { /// Default algorithm (corresponding to [`NormalizationMode::Default`]). Default, /// Strict RFC 3986 normalization. Rfc3986, /// WHATWG-like normalization algorithm (corresponding to /// [`NormalizationMode::PreserveAuthoritylessRelativePath`]). PreserveAuthoritylessRelativePath, } /// Normalization operation. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub(crate) struct NormalizationOp { /// Normalization mode. pub(crate) mode: NormalizationMode, } /// Spec-agnostic IRI normalization/resolution input. #[derive(Debug, Clone, Copy)] pub(crate) struct NormalizationInput<'a> { /// Target scheme. scheme: &'a str, /// Target authority. authority: Option<&'a str>, /// Target path without dot-removal. path: Path<'a>, /// Target query. query: Option<&'a str>, /// Target fragment. fragment: Option<&'a str>, /// Normalization type. op: NormalizationOp, } impl<'a> NormalizationInput<'a> { /// Creates a `NormalizedInput` from IRIs to resolve. #[inline] #[must_use] pub(crate) fn with_resolution_params( base_components: &RiReferenceComponents<'a, S>, reference: &'a RiReferenceStr, ) -> Self { let r = RiReferenceComponents::from(reference); Self::create_normalization_input( r.iri.as_str(), &r.splitter, base_components.iri.as_str(), &base_components.splitter, ) } /// Creates a `NormalizationInput` from components to resolve an IRI. #[must_use] fn create_normalization_input( r_iri: &'a str, r: &Splitter, b_iri: &'a str, b: &Splitter, ) -> Self { /// The toplevel component the reference has. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum RefToplevel { /// Scheme. Scheme, /// Authority. Authority, /// Path. Path, /// Query. Query, /// Reference is empty or has only fragment. None, } impl RefToplevel { /// Choose a component from either of the reference or the base, /// based on the toplevel component of the reference. #[inline] #[must_use] fn choose_then(self, component: RefToplevel, reference: F, base: G) -> T where F: FnOnce() -> T, G: FnOnce() -> T, { if self <= component { reference() } else { base() } } } let ref_toplevel = if r.has_scheme() { RefToplevel::Scheme } else if r.has_authority() { RefToplevel::Authority } else if !r.is_path_empty(r_iri.len()) { RefToplevel::Path } else if r.has_query() { RefToplevel::Query } else { RefToplevel::None }; let path = match ref_toplevel { RefToplevel::Scheme | RefToplevel::Authority => { Path::NeedsProcessing(PathToNormalize::from_single_path(r.path_str(r_iri))) } RefToplevel::Path => { let r_path = r.path_str(r_iri); if r_path.starts_with('/') { Path::NeedsProcessing(PathToNormalize::from_single_path(r_path)) } else { // About this branch, see // . // // > o If the base URI has a defined authority component and an empty // > path, then return a string consisting of "/" concatenated with the // > reference's path; otherwise, let b_path = b.path_str(b_iri); let b_path = if b.has_authority() && b_path.is_empty() { "/" } else { b_path }; Path::NeedsProcessing(PathToNormalize::from_paths_to_be_resolved( b_path, r_path, )) } } RefToplevel::Query | RefToplevel::None => Path::Done(b.path_str(b_iri)), }; Self { scheme: r.scheme_str(r_iri).unwrap_or_else(|| { b.scheme_str(b_iri) .expect("[validity] non-relative IRI must have a scheme") }), authority: ref_toplevel.choose_then( RefToplevel::Authority, || r.authority_str(r_iri), || b.authority_str(b_iri), ), path, query: ref_toplevel.choose_then( RefToplevel::Query, || r.query_str(r_iri), || b.query_str(b_iri), ), fragment: r.fragment_str(r_iri), op: NormalizationOp { mode: NormalizationMode::None, }, } } } impl<'a, S: Spec> From<&'a RiStr> for NormalizationInput<'a> { fn from(iri: &'a RiStr) -> Self { let components = RiReferenceComponents::::from(iri.as_ref()); let (scheme, authority, path, query, fragment) = components.to_major(); let scheme = scheme.expect("[validity] `absolute IRI must have `scheme`"); let path = Path::NeedsProcessing(PathToNormalize::from_single_path(path)); NormalizationInput { scheme, authority, path, query, fragment, op: NormalizationOp { mode: NormalizationMode::None, }, } } } #[cfg(feature = "alloc")] impl<'a, S: Spec> From<&'a RiString> for NormalizationInput<'a> { #[inline] fn from(iri: &'a RiString) -> Self { Self::from(iri.as_slice()) } } impl<'a, S: Spec> From<&'a RiAbsoluteStr> for NormalizationInput<'a> { fn from(iri: &'a RiAbsoluteStr) -> Self { let components = RiReferenceComponents::::from(iri.as_ref()); let (scheme, authority, path, query, fragment) = components.to_major(); let scheme = scheme.expect("[validity] `absolute IRI must have `scheme`"); let path = Path::NeedsProcessing(PathToNormalize::from_single_path(path)); NormalizationInput { scheme, authority, path, query, fragment, op: NormalizationOp { mode: NormalizationMode::None, }, } } } #[cfg(feature = "alloc")] impl<'a, S: Spec> From<&'a RiAbsoluteString> for NormalizationInput<'a> { #[inline] fn from(iri: &'a RiAbsoluteString) -> Self { Self::from(iri.as_slice()) } } impl NormalizationInput<'_> { /// Checks if the path is normalizable by RFC 3986 algorithm. /// /// Returns `Ok(())` when normalizable, returns `Err(_)` if not. pub(crate) fn ensure_rfc3986_normalizable(&self) -> Result<(), Error> { if self.authority.is_some() { return Ok(()); } match self.path { Path::Done(_) => Ok(()), Path::NeedsProcessing(path) => path.ensure_rfc3986_normalizable_with_authority_absent(), } } } /// Writable as a normalized IRI. /// /// Note that this implicitly apply serialization rule defined by WHATWG URL /// Standard (to handle normalization impossible by RFC 3986) because `Display` /// should not fail by reasons other than backend I/O failure. If you make the /// normalization fail in such cases, check if the path starts with `/./`. /// When the normalization succeeds by RFC 3986 algorithm, the path never starts /// with `/./`. struct NormalizedInner<'a, S> { /// Spec-agnostic normalization input. input: NormalizationInput<'a>, /// Spec. _spec: PhantomData S>, } impl fmt::Debug for NormalizedInner<'_, S> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Normalized") .field("input", &self.input) .finish() } } impl<'a, S: Spec> NormalizedInner<'a, S> { /// Creates a new `Normalized` object from the given input. #[inline] #[must_use] fn from_input(input: NormalizationInput<'a>) -> Self { Self { input, _spec: PhantomData, } } } impl fmt::Display for NormalizedInner<'_, S> { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // Write the scheme. if self.input.op.mode.case_pct_normalization() { normalize_scheme(f, self.input.scheme)?; } else { f.write_str(self.input.scheme)?; } f.write_str(":")?; // Write the authority if available. if let Some(authority) = self.input.authority { f.write_str("//")?; if self.input.op.mode.case_pct_normalization() { normalize_authority::(f, authority)?; } else { // No case/pct normalization. f.write_str(authority)?; } } // Process and write the path. match self.input.path { Path::Done(s) => { if self.input.op.mode.case_pct_normalization() { // Normalize the path. PathToNormalize::from_single_path(s).fmt_write_normalize::( f, self.input.op, self.input.authority.is_some(), )? } else { // No normalization. f.write_str(s)? } } Path::NeedsProcessing(path) => { path.fmt_write_normalize::(f, self.input.op, self.input.authority.is_some())? } } // Write the query if available. if let Some(query) = self.input.query { f.write_char('?')?; if self.input.op.mode.case_pct_normalization() { normalize_query::(f, query)?; } else { f.write_str(query)?; } } // Write the fragment if available. if let Some(fragment) = self.input.fragment { f.write_char('#')?; if self.input.op.mode.case_pct_normalization() { normalize_fragment::(f, fragment)?; } else { f.write_str(fragment)?; } } Ok(()) } } /// Writes the normalized scheme. pub(crate) fn normalize_scheme(f: &mut fmt::Formatter<'_>, scheme: &str) -> fmt::Result { // Apply case normalization. // // > namely, that the scheme and US-ASCII only host are case // > insensitive and therefore should be normalized to lowercase. // > // > --- . // // Note that `scheme` consists of only ASCII characters and contains // no percent-encoded characters. scheme .chars() .map(|c| c.to_ascii_lowercase()) .try_for_each(|c| f.write_char(c)) } /// Writes the normalized authority. fn normalize_authority(f: &mut fmt::Formatter<'_>, authority: &str) -> fmt::Result { let host_port = match rfind_split_hole(authority, b'@') { Some((userinfo, host_port)) => { // Don't lowercase `userinfo` even if it is ASCII only. `userinfo` // is not a part of `host`. PctCaseNormalized::::new(userinfo).fmt(f)?; f.write_char('@')?; host_port } None => authority, }; normalize_host_port::(f, host_port) } /// Writes the normalized host and port. pub(crate) fn normalize_host_port( f: &mut fmt::Formatter<'_>, host_port: &str, ) -> fmt::Result { // If the suffix is a colon, it is a delimiter between the host and empty // port. An empty port should be removed during normalization (see RFC 3986 // section 3.2.3), so strip it. // // > URI producers and normalizers should omit the port component and its // > ":" delimiter if port is empty or if its value would be the same as // > that of the scheme's default. // > // > --- [RFC 3986 section 3.2.3. Port](https://www.rfc-editor.org/rfc/rfc3986.html#section-3.2.3) let host_port = host_port.strip_suffix(':').unwrap_or(host_port); // Apply case normalization and percent-encoding normalization to `host`. // Optional `":" port` part only consists of an ASCII colon and ASCII // digits, so this won't affect to the test result. if is_ascii_only_host(host_port) { // If the host is ASCII characters only, make plain alphabets lower case. NormalizedAsciiOnlyHost::new(host_port).fmt(f) } else { PctCaseNormalized::::new(host_port).fmt(f) } } /// Writes the normalized query without the '?' prefix. pub(crate) fn normalize_query(f: &mut fmt::Formatter<'_>, query: &str) -> fmt::Result { // Apply percent-encoding normalization. PctCaseNormalized::::new(query).fmt(f) } /// Writes the normalized query without the '#' prefix. pub(crate) fn normalize_fragment( f: &mut fmt::Formatter<'_>, fragment: &str, ) -> fmt::Result { // Apply percent-encoding normalization. PctCaseNormalized::::new(fragment).fmt(f) } /// Normalized OR resolved IRI. /// /// Resolved IRI can be represented by this type. In that case, the result might /// not be normalized. If you want the IRI resolution result to be normalized, /// use [`enable_normalization`][`Self::enable_normalization`] method. /// /// [`Display`]: `core::fmt::Display` pub struct Normalized<'a, T: ?Sized> { /// Spec-agnostic normalization input. input: NormalizationInput<'a>, /// Expected result type. _ty_str: PhantomData T>, } impl fmt::Debug for Normalized<'_, T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Normalized") .field("input", &self.input) .finish() } } impl<'a, T: ?Sized> Normalized<'a, T> { /// Creates a new `Normalized` object from the given input. #[inline] #[must_use] pub(crate) fn from_input(input: NormalizationInput<'a>) -> Self { Self { input, _ty_str: PhantomData, } } /// Enables the normalization. /// /// This lets the normalizer apply the case normalization, percent-encoding /// normalization, and dot segments removal. #[inline] pub fn enable_normalization(&mut self) { self.input.op.mode = NormalizationMode::Default; } /// Enables the normalization that preserve relative path under some condition. /// /// Note that this normalization algorithm is not compatible with RFC 3986 /// algorithm for some inputs. /// /// See [`RiStr::normalize_but_preserve_authorityless_relative_path()`] /// for detail. #[inline] pub fn enable_normalization_preserving_authorityless_relative_path(&mut self) { self.input.op.mode = NormalizationMode::PreserveAuthoritylessRelativePath; } /// Returns `Self` with normalization enabled. #[inline] #[must_use] pub fn and_normalize(mut self) -> Self { self.enable_normalization(); self } /// Returns `Self` with special normalization enabled. /// /// Note that this normalization algorithm is not compatible with RFC 3986 /// algorithm for some inputs. /// /// See [`RiStr::normalize_but_preserve_authorityless_relative_path()`] /// for detail. #[inline] #[must_use] pub fn and_normalize_but_preserve_authorityless_relative_path(mut self) -> Self { self.enable_normalization_preserving_authorityless_relative_path(); self } /// Checks if the path is normalizable by RFC 3986 algorithm. /// /// Returns `Ok(())` when normalizable, returns `Err(_)` if not. #[inline] pub fn ensure_rfc3986_normalizable(&self) -> Result<(), Error> { self.input.ensure_rfc3986_normalizable() } } impl fmt::Display for Normalized<'_, RiStr> { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { NormalizedInner::::from_input(self.input).fmt(f) } } impl fmt::Display for Normalized<'_, RiAbsoluteStr> { #[inline] fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { NormalizedInner::::from_input(self.input).fmt(f) } } #[cfg(feature = "alloc")] impl ToDedicatedString for Normalized<'_, RiStr> { type Target = RiString; fn try_to_dedicated_string(&self) -> Result { let s = self.try_to_string()?; Ok(TryFrom::try_from(s).expect("[validity] the normalization result must be a valid IRI")) } } #[cfg(feature = "alloc")] impl From>> for RiString { #[inline] fn from(v: Normalized<'_, RiStr>) -> Self { v.to_dedicated_string() } } #[cfg(feature = "alloc")] impl From<&Normalized<'_, RiStr>> for RiString { #[inline] fn from(v: &Normalized<'_, RiStr>) -> Self { v.to_dedicated_string() } } #[cfg(feature = "alloc")] impl ToDedicatedString for Normalized<'_, RiAbsoluteStr> { type Target = RiAbsoluteString; fn try_to_dedicated_string(&self) -> Result { let s = self.try_to_string()?; Ok(TryFrom::try_from(s).expect("[validity] the normalization result must be a valid IRI")) } } #[cfg(feature = "alloc")] impl From>> for RiAbsoluteString { #[inline] fn from(v: Normalized<'_, RiAbsoluteStr>) -> Self { v.to_dedicated_string() } } #[cfg(feature = "alloc")] impl From<&Normalized<'_, RiAbsoluteStr>> for RiAbsoluteString { #[inline] fn from(v: &Normalized<'_, RiAbsoluteStr>) -> Self { v.to_dedicated_string() } }