Diffstat (limited to 'vendor/unicode-normalization/scripts/unicode.py')
| -rwxr-xr-x | vendor/unicode-normalization/scripts/unicode.py | 617 |
1 file changed, 0 insertions, 617 deletions
diff --git a/vendor/unicode-normalization/scripts/unicode.py b/vendor/unicode-normalization/scripts/unicode.py
deleted file mode 100755
index d9bb69cf..00000000
--- a/vendor/unicode-normalization/scripts/unicode.py
+++ /dev/null
@@ -1,617 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright 2011-2018 The Rust Project Developers. See the COPYRIGHT
-# file at the top-level directory of this distribution and at
-# http://rust-lang.org/COPYRIGHT.
-#
-# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-# option. This file may not be copied, modified, or distributed
-# except according to those terms.
-
-# This script uses the following Unicode tables:
-# - DerivedNormalizationProps.txt
-# - NormalizationTest.txt
-# - UnicodeData.txt
-# - StandardizedVariants.txt
-#
-# Since this should not require frequent updates, we just store this
-# out-of-line and check the tables.rs and normalization_tests.rs files into git.
-import collections
-import urllib.request
-from itertools import batched
-
-UNICODE_VERSION = "16.0.0"
-UCD_URL = "https://www.unicode.org/Public/%s/ucd/" % UNICODE_VERSION
-
-PREAMBLE = """// Copyright 2012-2018 The Rust Project Developers. See the COPYRIGHT
-// file at the top-level directory of this distribution and at
-// http://rust-lang.org/COPYRIGHT.
-//
-// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-// option. This file may not be copied, modified, or distributed
-// except according to those terms.
-
-// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
-
-#![allow(missing_docs)]
-"""
-
-NormalizationTest = collections.namedtuple(
-    "NormalizationTest",
-    ["source", "nfc", "nfd", "nfkc", "nfkd"],
-)
-
-# Mapping taken from Table 12 from:
-# http://www.unicode.org/reports/tr44/#General_Category_Values
-expanded_categories = {
-    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
-    'Lm': ['L'], 'Lo': ['L'],
-    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
-    'Nd': ['N'], 'Nl': ['N'], 'No': ['No'],
-    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
-    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
-    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
-    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
-    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
-}
-
-# Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
-# http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
-S_BASE, L_COUNT, V_COUNT, T_COUNT = 0xAC00, 19, 21, 28
-S_COUNT = L_COUNT * V_COUNT * T_COUNT
-
-class UnicodeData(object):
-    def __init__(self):
-        self._load_unicode_data()
-        self.norm_props = self._load_norm_props()
-        self.norm_tests = self._load_norm_tests()
-
-        self.canon_comp = self._compute_canonical_comp()
-        self.canon_fully_decomp, self.compat_fully_decomp = self._compute_fully_decomposed()
-
-        self.cjk_compat_variants_fully_decomp = {}
-        self._load_cjk_compat_ideograph_variants()
-
-        def stats(name, table):
-            count = sum(len(v) for v in table.values())
-            print("%s: %d chars => %d decomposed chars" % (name, len(table), count))
-
-        print("Decomposition table stats:")
-        stats("Canonical decomp", self.canon_decomp)
-        stats("Compatible decomp", self.compat_decomp)
-        stats("Canonical fully decomp", self.canon_fully_decomp)
-        stats("Compatible fully decomp", self.compat_fully_decomp)
-        stats("CJK Compat Variants fully decomp", self.cjk_compat_variants_fully_decomp)
-
-        self.ss_leading, self.ss_trailing = self._compute_stream_safe_tables()
-
-    def _fetch(self, filename):
-        resp = urllib.request.urlopen(UCD_URL + filename)
-        return resp.read().decode('utf-8')
-
-    def _load_unicode_data(self):
-        self.name_to_char_int = {}
-        self.combining_classes = {}
-        self.compat_decomp = {}
-        self.canon_decomp = {}
-        self.general_category_mark = []
-        self.general_category_public_assigned = []
-
-        assigned_start = 0;
-        prev_char_int = -1;
-        prev_name = "";
-
-        for line in self._fetch("UnicodeData.txt").splitlines():
-            # See ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
-            pieces = line.split(';')
-            assert len(pieces) == 15
-            char, name, category, cc, decomp = pieces[0], pieces[1], pieces[2], pieces[3], pieces[5]
-            char_int = int(char, 16)
-
-            name = pieces[1].strip()
-            self.name_to_char_int[name] = char_int
-
-            if cc != '0':
-                self.combining_classes[char_int] = cc
-
-            if decomp.startswith('<'):
-                self.compat_decomp[char_int] = [int(c, 16) for c in decomp.split()[1:]]
-            elif decomp != '':
-                self.canon_decomp[char_int] = [int(c, 16) for c in decomp.split()]
-
-            if category == 'M' or 'M' in expanded_categories.get(category, []):
-                self.general_category_mark.append(char_int)
-
-            assert category != 'Cn', "Unexpected: Unassigned codepoint in UnicodeData.txt"
-            if category not in ['Co', 'Cs']:
-                if char_int != prev_char_int + 1 and not is_first_and_last(prev_name, name):
-                    self.general_category_public_assigned.append((assigned_start, prev_char_int))
-                    assigned_start = char_int
-                prev_char_int = char_int
-                prev_name = name;
-
-        self.general_category_public_assigned.append((assigned_start, prev_char_int))
-
-    def _load_cjk_compat_ideograph_variants(self):
-        for line in self._fetch("StandardizedVariants.txt").splitlines():
-            strip_comments = line.split('#', 1)[0].strip()
-            if not strip_comments:
-                continue
-
-            variation_sequence, description, differences = strip_comments.split(';')
-            description = description.strip()
-
-            # Don't use variations that only apply in particular shaping environments.
-            if differences:
-                continue
-
-            # Look for entries where the description field is a codepoint name.
-            if description not in self.name_to_char_int:
-                continue
-
-            # Only consider the CJK Compatibility Ideographs.
-            if not description.startswith('CJK COMPATIBILITY IDEOGRAPH-'):
-                continue
-
-            char_int = self.name_to_char_int[description]
-
-            assert not char_int in self.combining_classes, "Unexpected: CJK compat variant with a combining class"
-            assert not char_int in self.compat_decomp, "Unexpected: CJK compat variant and compatibility decomposition"
-            assert len(self.canon_decomp[char_int]) == 1, "Unexpected: CJK compat variant and non-singleton canonical decomposition"
-            # If we ever need to handle Hangul here, we'll need to handle it separately.
-            assert not (S_BASE <= char_int < S_BASE + S_COUNT)
-
-            cjk_compat_variant_parts = [int(c, 16) for c in variation_sequence.split()]
-            for c in cjk_compat_variant_parts:
-                assert not c in self.canon_decomp, "Unexpected: CJK compat variant is unnormalized (canon)"
-                assert not c in self.compat_decomp, "Unexpected: CJK compat variant is unnormalized (compat)"
-            self.cjk_compat_variants_fully_decomp[char_int] = cjk_compat_variant_parts
-
-    def _load_norm_props(self):
-        props = collections.defaultdict(list)
-
-        for line in self._fetch("DerivedNormalizationProps.txt").splitlines():
-            (prop_data, _, _) = line.partition("#")
-            prop_pieces = prop_data.split(";")
-
-            if len(prop_pieces) < 2:
-                continue
-
-            assert len(prop_pieces) <= 3
-            (low, _, high) = prop_pieces[0].strip().partition("..")
-
-            prop = prop_pieces[1].strip()
-
-            data = None
-            if len(prop_pieces) == 3:
-                data = prop_pieces[2].strip()
-
-            props[prop].append((low, high, data))
-
-        return props
-
-    def _load_norm_tests(self):
-        tests = []
-        for line in self._fetch("NormalizationTest.txt").splitlines():
-            (test_data, _, _) = line.partition("#")
-            test_pieces = test_data.split(";")
-
-            if len(test_pieces) < 5:
-                continue
-
-            source, nfc, nfd, nfkc, nfkd = [[c.strip() for c in p.split()] for p in test_pieces[:5]]
-            tests.append(NormalizationTest(source, nfc, nfd, nfkc, nfkd))
-
-        return tests
-
-    def _compute_canonical_comp(self):
-        canon_comp = {}
-        comp_exclusions = [
-            (int(low, 16), int(high or low, 16))
-            for low, high, _ in self.norm_props["Full_Composition_Exclusion"]
-        ]
-        for char_int, decomp in self.canon_decomp.items():
-            if any(lo <= char_int <= hi for lo, hi in comp_exclusions):
-                continue
-
-            assert len(decomp) == 2
-            assert (decomp[0], decomp[1]) not in canon_comp
-            canon_comp[(decomp[0], decomp[1])] = char_int
-
-        return canon_comp
-
-    def _compute_fully_decomposed(self):
-        """
-        Even though the decomposition algorithm is recursive, it is possible
-        to precompute the recursion at table generation time with modest
-        increase to the table size. Then, for these precomputed tables, we
-        note that 1) compatible decomposition is a subset of canonical
-        decomposition and 2) they mostly agree on their intersection.
-        Therefore, we don't store entries in the compatible table for
-        characters that decompose the same way under canonical decomposition.
-
-        Decomposition table stats:
-        Canonical decomp: 2060 chars => 3085 decomposed chars
-        Compatible decomp: 3662 chars => 5440 decomposed chars
-        Canonical fully decomp: 2060 chars => 3404 decomposed chars
-        Compatible fully decomp: 3678 chars => 5599 decomposed chars
-
-        The upshot is that decomposition code is very simple and easy to inline
-        at mild code size cost.
-        """
-        def _decompose(char_int, compatible):
-            # 7-bit ASCII never decomposes
-            if char_int <= 0x7f:
-                yield char_int
-                return
-
-            # Assert that we're handling Hangul separately.
-            assert not (S_BASE <= char_int < S_BASE + S_COUNT)
-
-            decomp = self.canon_decomp.get(char_int)
-            if decomp is not None:
-                for decomposed_ch in decomp:
-                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
-                        yield fully_decomposed_ch
-                return
-
-            if compatible and char_int in self.compat_decomp:
-                for decomposed_ch in self.compat_decomp[char_int]:
-                    for fully_decomposed_ch in _decompose(decomposed_ch, compatible):
-                        yield fully_decomposed_ch
-                return
-
-            yield char_int
-            return
-
-        end_codepoint = max(
-            max(self.canon_decomp.keys()),
-            max(self.compat_decomp.keys()),
-        )
-
-        canon_fully_decomp = {}
-        compat_fully_decomp = {}
-
-        for char_int in range(0, end_codepoint + 1):
-            # Always skip Hangul, since it's more efficient to represent its
-            # decomposition programmatically.
-            if S_BASE <= char_int < S_BASE + S_COUNT:
-                continue
-
-            canon = list(_decompose(char_int, False))
-            if not (len(canon) == 1 and canon[0] == char_int):
-                canon_fully_decomp[char_int] = canon
-
-            compat = list(_decompose(char_int, True))
-            if not (len(compat) == 1 and compat[0] == char_int):
-                compat_fully_decomp[char_int] = compat
-
-        # Since canon_fully_decomp is a subset of compat_fully_decomp, we don't
-        # need to store their overlap when they agree. When they don't agree,
-        # store the decomposition in the compatibility table since we'll check
-        # that first when normalizing to NFKD.
-        assert set(canon_fully_decomp) <= set(compat_fully_decomp)
-
-        for ch in set(canon_fully_decomp) & set(compat_fully_decomp):
-            if canon_fully_decomp[ch] == compat_fully_decomp[ch]:
-                del compat_fully_decomp[ch]
-
-        return canon_fully_decomp, compat_fully_decomp
-
-    def _compute_stream_safe_tables(self):
-        """
-        To make a text stream-safe with the Stream-Safe Text Process (UAX15-D4),
-        we need to be able to know the number of contiguous non-starters *after*
-        applying compatibility decomposition to each character.
-
-        We can do this incrementally by computing the number of leading and
-        trailing non-starters for each character's compatibility decomposition
-        with the following rules:
-
-        1) If a character is not affected by compatibility decomposition, look
-           up its canonical combining class to find out if it's a non-starter.
-        2) All Hangul characters are starters, even under decomposition.
-        3) Otherwise, very few decomposing characters have a nonzero count
-           of leading or trailing non-starters, so store these characters
-           with their associated counts in a separate table.
-        """
-        leading_nonstarters = {}
-        trailing_nonstarters = {}
-
-        for c in set(self.canon_fully_decomp) | set(self.compat_fully_decomp):
-            decomposed = self.compat_fully_decomp.get(c) or self.canon_fully_decomp[c]
-
-            num_leading = 0
-            for d in decomposed:
-                if d not in self.combining_classes:
-                    break
-                num_leading += 1
-
-            num_trailing = 0
-            for d in reversed(decomposed):
-                if d not in self.combining_classes:
-                    break
-                num_trailing += 1
-
-            if num_leading > 0:
-                leading_nonstarters[c] = num_leading
-            if num_trailing > 0:
-                trailing_nonstarters[c] = num_trailing
-
-        return leading_nonstarters, trailing_nonstarters
-
-hexify = lambda c: '{:04X}'.format(c)
-
-# Test whether `first` and `last` are corresponding "<..., First>" and
-# "<..., Last>" markers.
-def is_first_and_last(first, last):
-    if not first.startswith('<') or not first.endswith(', First>'):
-        return False
-    if not last.startswith('<') or not last.endswith(', Last>'):
-        return False
-    return first[1:-8] == last[1:-7]
-
-def gen_mph_data(name, d, kv_type, kv_callback, kv_row_width):
-    (salt, keys) = minimal_perfect_hash(d)
-    out.write(f"\npub(crate) const {name.upper()}_SALT: &[u16] = &[\n")
-    for s_row in batched(salt, 13):
-        out.write("   ")
-        for s in s_row:
-            out.write(f" 0x{s:03X},")
-        out.write("\n")
-    out.write("];\n")
-    out.write(f"pub(crate) const {name.upper()}_KV: &[{kv_type}] = &[\n")
-    for k_row in batched(keys, kv_row_width):
-        out.write("   ")
-        for k in k_row:
-            out.write(f" {kv_callback(k)},")
-        out.write("\n")
-    out.write("];\n")
-
-def gen_combining_class(combining_classes, out):
-    gen_mph_data('canonical_combining_class', combining_classes, 'u32',
-        lambda k: f"0x{int(combining_classes[k]) | (k << 8):07X}", 8)
-
-def gen_composition_table(canon_comp, out):
-    table = {}
-    for (c1, c2), c3 in canon_comp.items():
-        if c1 < 0x10000 and c2 < 0x10000:
-            table[(c1 << 16) | c2] = c3
-    (salt, keys) = minimal_perfect_hash(table)
-    gen_mph_data('COMPOSITION_TABLE', table, '(u32, char)',
-        lambda k: f"(0x{k:08X}, '\\u{{{table[k]:06X}}}')", 1)
-
-    out.write("pub(crate) fn composition_table_astral(c1: char, c2: char) -> Option<char> {\n")
-    out.write("    match (c1, c2) {\n")
-    for (c1, c2), c3 in sorted(canon_comp.items()):
-        if c1 >= 0x10000 or c2 >= 0x10000:
-            out.write("        ('\\u{%s}', '\\u{%s}') => Some('\\u{%s}'),\n" % (hexify(c1), hexify(c2), hexify(c3)))
-
-    out.write("        _ => None,\n")
-    out.write("    }\n")
-    out.write("}\n")
-
-def gen_decomposition_tables(canon_decomp, compat_decomp, cjk_compat_variants_decomp, out):
-    tables = [(canon_decomp, 'canonical'), (compat_decomp, 'compatibility'), (cjk_compat_variants_decomp, 'cjk_compat_variants')]
-    for table, name in tables:
-        offsets = {}
-        offset = 0
-        out.write("pub(crate) const %s_DECOMPOSED_CHARS: &[char] = &[\n" % name.upper())
-        for k, v in table.items():
-            offsets[k] = offset
-            offset += len(v)
-            for c in v:
-                out.write("    '\\u{%s}',\n" % hexify(c))
-        # The largest offset must fit in a u16.
-        assert offset < 65536
-        out.write("];\n")
-        gen_mph_data(name + '_decomposed', table, "(u32, (u16, u16))",
-            lambda k: f"(0x{k:05X}, (0x{offsets[k]:03X}, 0x{len(table[k]):X}))", 1)
-
-def gen_qc_match(prop_table, out):
-    out.write("    match c {\n")
-
-    for low, high, data in prop_table:
-        assert data in ('N', 'M')
-        result = "No" if data == 'N' else "Maybe"
-        if high:
-            out.write(r"        '\u{%s}'..='\u{%s}' => %s," % (low, high, result))
-        else:
-            out.write(r"        '\u{%s}' => %s," % (low, result))
-        out.write("\n")
-
-    out.write("        _ => Yes,\n")
-    out.write("    }\n")
-
-def gen_nfc_qc(prop_tables, out):
-    out.write("\n#[inline]\n")
-    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
-    out.write("pub fn qc_nfc(c: char) -> IsNormalized {\n")
-    gen_qc_match(prop_tables['NFC_QC'], out)
-    out.write("}\n")
-
-def gen_nfkc_qc(prop_tables, out):
-    out.write("#[inline]\n")
-    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
-    out.write("pub fn qc_nfkc(c: char) -> IsNormalized {\n")
-    gen_qc_match(prop_tables['NFKC_QC'], out)
-    out.write("}\n")
-
-def gen_nfd_qc(prop_tables, out):
-    out.write("#[inline]\n")
-    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
-    out.write("pub fn qc_nfd(c: char) -> IsNormalized {\n")
-    gen_qc_match(prop_tables['NFD_QC'], out)
-    out.write("}\n")
-
-def gen_nfkd_qc(prop_tables, out):
-    out.write("#[inline]\n")
-    out.write("#[allow(ellipsis_inclusive_range_patterns)]\n")
-    out.write("pub fn qc_nfkd(c: char) -> IsNormalized {\n")
-    gen_qc_match(prop_tables['NFKD_QC'], out)
-    out.write("}\n")
-
-def gen_combining_mark(general_category_mark, out):
-    gen_mph_data('combining_mark', general_category_mark, 'u32',
-        lambda k: '0x{:05X}'.format(k), 10)
-
-def gen_public_assigned(general_category_public_assigned, out):
-    # This could be done as a hash but the table is somewhat small.
-    out.write("#[inline]\n")
-    out.write("pub fn is_public_assigned(c: char) -> bool {\n")
-    out.write("    match c {\n")
-
-    start = True
-    for first, last in general_category_public_assigned:
-        if start:
-            out.write("        ")
-            start = False
-        else:
-            out.write("\n        | ")
-        if first == last:
-            out.write("'\\u{%s}'" % hexify(first))
-        else:
-            out.write("'\\u{%s}'..='\\u{%s}'" % (hexify(first), hexify(last)))
-    out.write(" => true,\n")
-
-    out.write("        _ => false,\n")
-    out.write("    }\n")
-    out.write("}\n")
-
-def gen_stream_safe(leading, trailing, out):
-    # This could be done as a hash but the table is very small.
-    out.write("#[inline]\n")
-    out.write("pub fn stream_safe_leading_nonstarters(c: char) -> usize {\n")
-    out.write("    match c {\n")
-
-    for char, num_leading in sorted(leading.items()):
-        out.write("        '\\u{%s}' => %d,\n" % (hexify(char), num_leading))
-
-    out.write("        _ => 0,\n")
-    out.write("    }\n")
-    out.write("}\n")
-
-    gen_mph_data('trailing_nonstarters', trailing, 'u32',
-        lambda k: f"0x{int(trailing[k]) | (k << 8):07X}", 8)
-
-def gen_tests(tests, out):
-    out.write("""#[derive(Debug)]
-pub struct NormalizationTest {
-    pub source: &'static str,
-    pub nfc: &'static str,
-    pub nfd: &'static str,
-    pub nfkc: &'static str,
-    pub nfkd: &'static str,
-}
-
-""")
-
-    out.write("pub const NORMALIZATION_TESTS: &[NormalizationTest] = &[\n")
-    str_literal = lambda s: '"%s"' % "".join("\\u{%s}" % c for c in s)
-
-    for test in tests:
-        out.write("    NormalizationTest {\n")
-        out.write("        source: %s,\n" % str_literal(test.source))
-        out.write("        nfc: %s,\n" % str_literal(test.nfc))
-        out.write("        nfd: %s,\n" % str_literal(test.nfd))
-        out.write("        nfkc: %s,\n" % str_literal(test.nfkc))
-        out.write("        nfkd: %s,\n" % str_literal(test.nfkd))
-        out.write("    },\n")
-
-    out.write("];\n")
-
-# Guaranteed to be less than n.
-def my_hash(x, salt, n):
-    # This is hash based on the theory that multiplication is efficient
-    mask_32 = 0xffffffff
-    y = ((x + salt) * 2654435769) & mask_32
-    y ^= (x * 0x31415926) & mask_32
-    return (y * n) >> 32
-
-# Compute minimal perfect hash function, d can be either a dict or list of keys.
-def minimal_perfect_hash(d):
-    n = len(d)
-    buckets = dict((h, []) for h in range(n))
-    for key in d:
-        h = my_hash(key, 0, n)
-        buckets[h].append(key)
-    bsorted = [(len(buckets[h]), h) for h in range(n)]
-    bsorted.sort(reverse = True)
-    claimed = [False] * n
-    salts = [0] * n
-    keys = [0] * n
-    for (bucket_size, h) in bsorted:
-        # Note: the traditional perfect hashing approach would also special-case
-        # bucket_size == 1 here and assign any empty slot, rather than iterating
-        # until rehash finds an empty slot. But we're not doing that so we can
-        # avoid the branch.
-        if bucket_size == 0:
-            break
-        else:
-            for salt in range(1, 32768):
-                rehashes = [my_hash(key, salt, n) for key in buckets[h]]
-                # Make sure there are no rehash collisions within this bucket.
-                if all(not claimed[hash] for hash in rehashes):
-                    if len(set(rehashes)) < bucket_size:
-                        continue
-                    salts[h] = salt
-                    for key in buckets[h]:
-                        rehash = my_hash(key, salt, n)
-                        claimed[rehash] = True
-                        keys[rehash] = key
-                    break
-            if salts[h] == 0:
-                print("minimal perfect hashing failed")
-                # Note: if this happens (because of unfortunate data), then there are
-                # a few things that could be done. First, the hash function could be
-                # tweaked. Second, the bucket order could be scrambled (especially the
-                # singletons). Right now, the buckets are sorted, which has the advantage
-                # of being deterministic.
-                #
-                # As a more extreme approach, the singleton bucket optimization could be
-                # applied (give the direct address for singleton buckets, rather than
-                # relying on a rehash). That is definitely the more standard approach in
-                # the minimal perfect hashing literature, but in testing the branch was a
-                # significant slowdown.
-                exit(1)
-    return (salts, keys)
-
-if __name__ == '__main__':
-    data = UnicodeData()
-    with open("tables.rs", "w", newline = "\n") as out:
-        out.write(PREAMBLE)
-        out.write("use crate::quick_check::IsNormalized;\n")
-        out.write("use crate::quick_check::IsNormalized::*;\n")
-        out.write("\n")
-
-        version = "(%s, %s, %s)" % tuple(UNICODE_VERSION.split("."))
-        out.write("#[allow(unused)]\n")
-        out.write("pub const UNICODE_VERSION: (u8, u8, u8) = %s;\n" % version)
-
-        gen_combining_class(data.combining_classes, out)
-
-        gen_composition_table(data.canon_comp, out)
-
-        gen_decomposition_tables(data.canon_fully_decomp, data.compat_fully_decomp, data.cjk_compat_variants_fully_decomp, out)
-
-        gen_combining_mark(data.general_category_mark, out)
-
-        gen_public_assigned(data.general_category_public_assigned, out)
-
-        gen_nfc_qc(data.norm_props, out)
-
-        gen_nfkc_qc(data.norm_props, out)
-
-        gen_nfd_qc(data.norm_props, out)
-
-        gen_nfkd_qc(data.norm_props, out)
-
-        gen_stream_safe(data.ss_leading, data.ss_trailing, out)
-
-    with open("normalization_tests.rs", "w", newline = "\n") as out:
-        out.write(PREAMBLE)
-        gen_tests(data.norm_tests, out)
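
The deleted script skips the Hangul syllable block (U+AC00..U+D7A3) in every generated table because those syllables decompose arithmetically, per the conjoining Jamo rules in Unicode Section 3.12. The following is a minimal illustrative sketch of that arithmetic, not part of the deleted file; L_BASE, V_BASE, and T_BASE are the standard Jamo base codepoints, and decompose_hangul is a name invented here.

S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
L_COUNT, V_COUNT, T_COUNT = 19, 21, 28
S_COUNT = L_COUNT * V_COUNT * T_COUNT  # 11172 precomposed syllables

def decompose_hangul(s):
    # Map a precomposed Hangul syllable to its conjoining Jamo sequence.
    s_index = s - S_BASE
    assert 0 <= s_index < S_COUNT
    l = L_BASE + s_index // (V_COUNT * T_COUNT)
    v = V_BASE + (s_index % (V_COUNT * T_COUNT)) // T_COUNT
    t = s_index % T_COUNT
    return [l, v] if t == 0 else [l, v, T_BASE + t]

# U+D4DB decomposes to <U+1111, U+1171, U+11B6> (the example used in UAX #15).
assert decompose_hangul(0xD4DB) == [0x1111, 0x1171, 0x11B6]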
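UnicodeData.txt compresses large assigned blocks into paired "<..., First>" / "<..., Last>" rows, which is what is_first_and_last detects so that _load_unicode_data does not split a public-assigned range at the jump in codepoints. Below is a standalone copy of that check with a real pair of range-marker names, for illustration only.

def is_first_and_last(first, last):
    if not first.startswith('<') or not first.endswith(', First>'):
        return False
    if not last.startswith('<') or not last.endswith(', Last>'):
        return False
    return first[1:-8] == last[1:-7]

assert is_first_and_last('<CJK Ideograph Extension A, First>',
                         '<CJK Ideograph Extension A, Last>')
assert not is_first_and_last('<Hangul Syllable, First>', '<CJK Ideograph, Last>')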
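The combining-class and trailing-non-starter tables pack each entry into a single u32, with the codepoint in the upper bits and the 0..255 property value in the low byte; that is what the (k << 8) expressions and 0x%07X formatting in gen_combining_class and gen_stream_safe produce. A small sketch of that encoding follows; pack and unpack are illustrative helpers, not names from the script.

def pack(codepoint, value):
    # Key in the high bits, one-byte value in the low byte.
    assert 0 <= value <= 0xFF
    return (codepoint << 8) | value

def unpack(entry):
    return entry >> 8, entry & 0xFF

# U+0301 COMBINING ACUTE ACCENT has canonical combining class 230 (0xE6).
entry = pack(0x0301, 230)
assert entry == 0x0301E6
assert unpack(entry) == (0x0301, 230)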
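gen_composition_table only hashes composition pairs whose both halves fit in 16 bits, packing each pair into one u32 key; pairs with an astral half fall back to the generated composition_table_astral match instead. A sketch of the BMP key, with pair_key as an illustrative name:

def pair_key(c1, c2):
    # Both starters must be BMP codepoints to go into the hashed table.
    assert c1 < 0x10000 and c2 < 0x10000
    return (c1 << 16) | c2

# 'A' (U+0041) followed by COMBINING ACUTE ACCENT (U+0301) recomposes to U+00C1,
# so the hashed table maps key 0x00410301 to that character.
assert pair_key(0x0041, 0x0301) == 0x00410301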
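gen_decomposition_tables flattens all decompositions into one *_DECOMPOSED_CHARS array and keys each codepoint to an (offset, length) pair, which is why the largest offset has to fit in a u16. The sketch below illustrates that indexing with two hand-picked entries; the chars and index literals are made up for the example, not generated data.

def lookup_decomposition(chars, index, codepoint):
    # Slice the shared character pool using the stored (offset, length) pair.
    offset, length = index[codepoint]
    return chars[offset:offset + length]

chars = [0x0041, 0x0301, 0x0043, 0x0327]     # flattened decomposed characters
index = {0x00C1: (0, 2), 0x00C7: (2, 2)}     # U+00C1 -> A + acute, U+00C7 -> C + cedilla
assert lookup_decomposition(chars, index, 0x00C7) == [0x0043, 0x0327]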
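The leading/trailing counts built by _compute_stream_safe_tables can be sanity-checked against the standard library's unicodedata module, which exposes the canonical combining class directly. This sketch mirrors the same two scans over an already-decomposed sequence; nonstarter_counts is an illustrative helper, not part of the deleted file.

import unicodedata

def nonstarter_counts(decomposed):
    # Count non-starters (combining class != 0) at the head and tail of a
    # fully decomposed sequence of codepoints.
    flags = [unicodedata.combining(chr(c)) != 0 for c in decomposed]
    leading = 0
    for f in flags:
        if not f:
            break
        leading += 1
    trailing = 0
    for f in reversed(flags):
        if not f:
            break
        trailing += 1
    return leading, trailing

# U+0344 fully decomposes to <U+0308, U+0301>, both non-starters.
assert nonstarter_counts([0x0308, 0x0301]) == (2, 2)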
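gen_mph_data emits a salt array and a key/value array per property, built by minimal_perfect_hash. The sketch below shows how such a pair of arrays is meant to be probed at lookup time, inferred from the generation code (the crate's actual consumer is Rust); mph_lookup is a name invented for illustration, and my_hash is copied from the script so the snippet is self-contained.

def my_hash(x, salt, n):
    # Same mixing function as in the deleted script; the result is always < n.
    mask_32 = 0xffffffff
    y = ((x + salt) * 2654435769) & mask_32
    y ^= (x * 0x31415926) & mask_32
    return (y * n) >> 32

def mph_lookup(x, salts, keys):
    # Two rounds: the unsalted round picks the bucket's salt, the salted round
    # picks the final slot; a hit is confirmed by comparing the stored key.
    n = len(salts)
    slot = my_hash(x, salts[my_hash(x, 0, n)], n)
    return keys[slot] == x

# Round-trip property: for every key k used to build the tables,
#   salts, keys = minimal_perfect_hash(d); mph_lookup(k, salts, keys) is True.
# The generated KV entries additionally pack a value next to each key.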
