From 45df4d0d9b577fecee798d672695fe24ff57fb1b Mon Sep 17 00:00:00 2001
From: mo khan
Date: Tue, 15 Jul 2025 16:37:08 -0600
Subject: feat: migrate from Cedar to SpiceDB authorization system

This is a major architectural change that replaces the Cedar policy-based
authorization system with SpiceDB's relation-based authorization.

Key changes:
- Migrate the implementation from Rust to Go
- Replace Cedar policies with SpiceDB schema and relationships
- Switch the Envoy `ext_authz` backend from Cedar to SpiceDB permission checks
- Update the build system and dependencies for the Go ecosystem
- Maintain the Envoy integration for external authorization

This change enables more flexible permission modeling through SpiceDB's
Google Zanzibar-inspired relation-based system, supporting complex
hierarchical permissions that were difficult to express in Cedar.

Breaking change: existing Cedar policies and Rust-based configuration
will no longer work and need to be migrated to SpiceDB schema.
---
 vendor/unicode-width/scripts/unicode.py | 2156 ------------------------------
 1 file changed, 2156 deletions(-)
 delete mode 100755 vendor/unicode-width/scripts/unicode.py

diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py
deleted file mode 100755
index 320da14e..00000000
--- a/vendor/unicode-width/scripts/unicode.py
+++ /dev/null
@@ -1,2156 +0,0 @@
-#!/usr/bin/env python3
-#
-# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
-# file at the top-level directory of this distribution and at
-# http://rust-lang.org/COPYRIGHT.
-#
-# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
-# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
-# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
-# option. This file may not be copied, modified, or distributed
-# except according to those terms.
-
-# This script uses the following Unicode tables:
-#
-# - DerivedCoreProperties.txt
-# - EastAsianWidth.txt
-# - HangulSyllableType.txt
-# - NormalizationTest.txt (for tests only)
-# - PropList.txt
-# - ReadMe.txt
-# - UnicodeData.txt
-# - auxiliary/GraphemeBreakProperty.txt
-# - emoji/emoji-data.txt
-# - emoji/emoji-variation-sequences.txt
-# - extracted/DerivedGeneralCategory.txt
-#
-# Since this should not require frequent updates, we just store this
-# out-of-line and check the generated module into git.
-
-import enum
-import math
-import operator
-import os
-import re
-import sys
-import urllib.request
-from collections import defaultdict
-from itertools import batched
-from typing import Callable, Iterable
-
-UNICODE_VERSION = "15.1.0"
-"""The version of the Unicode data files to download."""
-
-NUM_CODEPOINTS = 0x110000
-"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
-
-MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
-"""The maximum number of bits required to represent a Unicode codepoint."""
-
-
-class OffsetType(enum.IntEnum):
-    """Represents the data type of a lookup table's offsets. Each variant's value is the
-    number of bits required to represent that variant's type."""
-
-    U2 = 2
-    """Offsets are 2-bit unsigned integers, packed four-per-byte."""
-    U4 = 4
-    """Offsets are 4-bit unsigned integers, packed two-per-byte."""
-    U8 = 8
-    """Each offset is a single byte (u8)."""
-
-
-MODULE_PATH = "../src/tables.rs"
-"""The path of the emitted Rust module (relative to the working directory)"""
-
-TABLE_SPLITS = [7, 13]
-"""The splits between the bits of the codepoint used to index each subtable.
-Adjust these values to change the sizes of the subtables"""
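-
-# A small illustrative helper (not part of the original script) showing how
-# TABLE_SPLITS = [7, 13] partitions a codepoint into the three indices used by
-# the generated Rust `lookup_width`: bits 13 and up select a root entry,
-# bits 7..13 a middle entry, and bits 0..7 a leaf entry.
-def _split_codepoint(cp: int) -> tuple[int, int, int]:
-    root_idx = cp >> TABLE_SPLITS[1]
-    middle_idx = (cp >> TABLE_SPLITS[0]) & (2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]) - 1)
-    leaf_idx = cp & (2 ** TABLE_SPLITS[0] - 1)
-    return (root_idx, middle_idx, leaf_idx)
-
-
-# For example, U+1F600 GRINNING FACE splits into (0xF, 0x2C, 0x00).
-assert _split_codepoint(0x1F600) == (0xF, 0x2C, 0x00)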
-
-Codepoint = int
-BitPos = int
-
-
-def fetch_open(filename: str, local_prefix: str = "", emoji: bool = False):
-    """Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
-    fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure.
-    """
-    basename = os.path.basename(filename)
-    localname = os.path.join(local_prefix, basename)
-    if not os.path.exists(localname):
-        if emoji:
-            prefix = f"emoji/{UNICODE_VERSION[:-2]}"
-        else:
-            prefix = f"{UNICODE_VERSION}/ucd"
-        urllib.request.urlretrieve(
-            f"https://www.unicode.org/Public/{prefix}/{filename}",
-            localname,
-        )
-    try:
-        return open(localname, encoding="utf-8")
-    except OSError:
-        sys.stderr.write(f"cannot load {localname}")
-        sys.exit(1)
-
-
-def load_unicode_version() -> tuple[int, int, int]:
-    """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
-    with fetch_open("ReadMe.txt") as readme:
-        pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
-        return tuple(map(int, re.search(pattern, readme.read()).groups()))  # type: ignore
-
-
-def load_property(filename: str, pattern: str, action: Callable[[int], None]):
-    with fetch_open(filename) as properties:
-        single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+")
-        multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+")
-
-        for line in properties.readlines():
-            raw_data = None  # (low, high)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            for cp in range(low, high + 1):
-                action(cp)
-
-
-def to_sorted_ranges(iter: Iterable[Codepoint]) -> list[tuple[Codepoint, Codepoint]]:
-    "Creates a sorted list of ranges from an iterable of codepoints"
-    lst = [c for c in iter]
-    lst.sort()
-    ret = []
-    for cp in lst:
-        if len(ret) > 0 and ret[-1][1] == cp - 1:
-            ret[-1] = (ret[-1][0], cp)
-        else:
-            ret.append((cp, cp))
-    return ret
-
-
-class EastAsianWidth(enum.IntEnum):
-    """Represents the width of a Unicode character according to UAX #11.
-    All East Asian Width classes resolve into either
-    `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
-    """
-
-    NARROW = 1
-    """ One column wide. """
-    WIDE = 2
-    """ Two columns wide. """
-    AMBIGUOUS = 3
-    """ Two columns wide in a CJK context. One column wide in all other contexts. """
-
-
-class CharWidthInTable(enum.IntEnum):
-    """Represents the width of a Unicode character as stored in the tables."""
-
-    ZERO = 0
-    ONE = 1
-    TWO = 2
-    SPECIAL = 3
-
-
-class WidthState(enum.IntEnum):
-    """
-    Width calculation proceeds according to a state machine.
-    We iterate over the characters of the string from back to front;
-    the next character encountered determines the transition to take.
-
-    The integer values of these variants have special meaning:
-    - Top bit: whether this is Vs16
-    - 2nd from top: whether this is Vs15
-    - 3rd bit from top: whether this is transparent to emoji/text presentation
-      (if set, should also set 4th)
-    - 4th bit: whether to set top bit on emoji presentation.
- If this is set but 3rd is not, the width mode is related to zwj sequences - - 5th from top: whether this is unaffected by ligature-transparent - - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state - where no ZWJ has been encountered yet; encountering one flips this on""" - - # BASIC WIDTHS - - ZERO = 0x1_0000 - "Zero columns wide." - - NARROW = 0x1_0001 - "One column wide." - - WIDE = 0x1_0002 - "Two columns wide." - - THREE = 0x1_0003 - "Three columns wide." - - # \r\n - LINE_FEED = 0b0000_0000_0000_0001 - "\\n (CRLF has width 1)" - - # EMOJI - - # Emoji skintone modifiers - EMOJI_MODIFIER = 0b0000_0000_0000_0010 - "`Emoji_Modifier`" - - # Emoji ZWJ sequences - - REGIONAL_INDICATOR = 0b0000_0000_0000_0011 - "`Regional_Indicator`" - - SEVERAL_REGIONAL_INDICATOR = 0b0000_0000_0000_0100 - "At least two `Regional_Indicator`in sequence" - - EMOJI_PRESENTATION = 0b0000_0000_0000_0101 - "`Emoji_Presentation`" - - ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0110 - "\\u200D `Emoji_Presentation`" - - VS16_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0110 - "\\uFE0F \\u200D `Emoji_Presentation`" - - KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0111 - "\\u20E3 \\u200D `Emoji_Presentation`" - - VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0111 - "\\uFE0F \\u20E3 \\u200D `Emoji_Presentation`" - - REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1001 - "`Regional_Indicator` \\u200D `Emoji_Presentation`" - - EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1010 - "(`Regional_Indicator` `Regional_Indicator`)+ \\u200D `Emoji_Presentation`" - - ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1011 - "(`Regional_Indicator` `Regional_Indicator`)+ `Regional_Indicator` \\u200D `Emoji_Presentation`" - - TAG_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0000 - "\\uE007F \\u200D `Emoji_Presentation`" - - TAG_D1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0001 - "\\uE0030..=\\uE0039 \\uE007F \\u200D `Emoji_Presentation`" - - TAG_D2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0010 - "(\\uE0030..=\\uE0039){2} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_D3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0011 - "(\\uE0030..=\\uE0039){3} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1001 - "\\uE0061..=\\uE007A \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1010 - "(\\uE0061..=\\uE007A){2} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1011 - "(\\uE0061..=\\uE007A){3} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A4_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1100 - "(\\uE0061..=\\uE007A){4} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A5_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1101 - "(\\uE0061..=\\uE007A){35} \\uE007F \\u200D `Emoji_Presentation`" - - TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110 - "(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`" - - # VARIATION SELECTORS - - # Text presentation sequences (not CJK) - VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000 - "\\uFE0E (text presentation sequences)" - - # Emoji presentation sequences - VARIATION_SELECTOR_16 = 0b1000_0000_0000_0000 - "\\uFE0F (emoji presentation sequences)" - - # ARABIC LAM ALEF - - JOINING_GROUP_ALEF = 0b0011_0000_1111_1111 - "Joining_Group=Alef (Arabic Lam-Alef ligature)" - - # COMBINING SOLIDUS (CJK only) - - COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1100_1111_1111 - "\\u0338 (CJK only, makes <, 
=, > width 2)" - - # SOLIDUS + ALEF (solidus is Joining_Type=Transparent) - SOLIDUS_OVERLAY_ALEF = 0b0011_1000_1111_1111 - "\\u0338 followed by Joining_Group=Alef" - - # SCRIPT ZWJ LIGATURES - - # Hebrew alef lamed - - HEBREW_LETTER_LAMED = 0b0011_1000_0000_0000 - "\\u05DC (Alef-ZWJ-Lamed ligature)" - - ZWJ_HEBREW_LETTER_LAMED = 0b0011_1100_0000_0000 - "\\u200D\\u05DC (Alef-ZWJ-Lamed ligature)" - - # Buginese ya - - BUGINESE_LETTER_YA = 0b0011_1000_0000_0001 - "\\u1A10 ( + ya ligature)" - - ZWJ_BUGINESE_LETTER_YA = 0b0011_1100_0000_0001 - "\\u200D\\u1A10 ( + ya ligature)" - - BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_1100_0000_0010 - "\\u1A17\\u200D\\u1A10 ( + ya ligature)" - - # Tifinagh bi-consonants - - TIFINAGH_CONSONANT = 0b0011_1000_0000_0011 - "\\u2D31..=\\u2D65 or \\u2D6F (joined by ZWJ or \\u2D7F TIFINAGH CONSONANT JOINER)" - - ZWJ_TIFINAGH_CONSONANT = 0b0011_1100_0000_0011 - "ZWJ then \\u2D31..=\\u2D65 or \\u2D6F" - - TIFINAGH_JOINER_CONSONANT = 0b0011_1100_0000_0100 - "\\u2D7F then \\u2D31..=\\u2D65 or \\u2D6F" - - # Lisu tone letters - LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_1100_0000_0101 - "\\uA4FC or \\uA4FD (https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078)" - - # Old Turkic orkhon ec - orkhon i - - OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1000_0000_0110 - "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" - - ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110 - "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" - - # Khmer coeng signs - - KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111 - "\\u1780..=\\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF" - - def table_width(self) -> CharWidthInTable: - "The width of a character as stored in the lookup tables." - match self: - case WidthState.ZERO: - return CharWidthInTable.ZERO - case WidthState.NARROW: - return CharWidthInTable.ONE - case WidthState.WIDE: - return CharWidthInTable.TWO - case _: - return CharWidthInTable.SPECIAL - - def is_carried(self) -> bool: - "Whether this corresponds to a non-default `WidthInfo`." - return int(self) <= 0xFFFF - - def width_alone(self) -> int: - "The width of a character with this type when it appears alone." - match self: - case ( - WidthState.ZERO - | WidthState.COMBINING_LONG_SOLIDUS_OVERLAY - | WidthState.VARIATION_SELECTOR_15 - | WidthState.VARIATION_SELECTOR_16 - ): - return 0 - case ( - WidthState.WIDE - | WidthState.EMOJI_MODIFIER - | WidthState.EMOJI_PRESENTATION - ): - return 2 - case WidthState.THREE: - return 3 - case _: - return 1 - - def is_cjk_only(self) -> bool: - return self in [ - WidthState.COMBINING_LONG_SOLIDUS_OVERLAY, - WidthState.SOLIDUS_OVERLAY_ALEF, - ] - - def is_non_cjk_only(self) -> bool: - return self == WidthState.VARIATION_SELECTOR_15 - - -assert len(set([v.value for v in WidthState])) == len([v.value for v in WidthState]) - - -def load_east_asian_widths() -> list[EastAsianWidth]: - """Return a list of effective widths, indexed by codepoint. - Widths are determined by fetching and parsing `EastAsianWidth.txt`. - - `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. - - `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. - - `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`.""" - - with fetch_open("EastAsianWidth.txt") as eaw: - # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." - single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") - # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." 
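-        # e.g. parsing "3001..3003;W # ..." yields the groups ("3001", "3003", "W")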
-        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
-        # map between width category code and condensed width
-        width_codes = {
-            **{c: EastAsianWidth.NARROW for c in ["N", "Na", "H"]},
-            **{c: EastAsianWidth.WIDE for c in ["W", "F"]},
-            "A": EastAsianWidth.AMBIGUOUS,
-        }
-
-        width_map = []
-        current = 0
-        for line in eaw.readlines():
-            raw_data = None  # (low, high, width)
-            if match := single.match(line):
-                raw_data = (match.group(1), match.group(1), match.group(2))
-            elif match := multiple.match(line):
-                raw_data = (match.group(1), match.group(2), match.group(3))
-            else:
-                continue
-            low = int(raw_data[0], 16)
-            high = int(raw_data[1], 16)
-            width = width_codes[raw_data[2]]
-
-            assert current <= high
-            while current <= high:
-                # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
-                # All such codepoints are implicitly given Neutral width (resolves to narrow)
-                width_map.append(EastAsianWidth.NARROW if current < low else width)
-                current += 1
-
-    while len(width_map) < NUM_CODEPOINTS:
-        # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
-        width_map.append(EastAsianWidth.NARROW)
-
-    # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
-    load_property(
-        "extracted/DerivedGeneralCategory.txt",
-        r"(?:Lu|Ll|Lt|Lm|Lo|Sk)",
-        lambda cp: (
-            operator.setitem(width_map, cp, EastAsianWidth.NARROW)
-            if width_map[cp] == EastAsianWidth.AMBIGUOUS
-            else None
-        ),
-    )
-
-    # GREEK ANO TELEIA: NFC decomposes to U+00B7 MIDDLE DOT
-    width_map[0x0387] = EastAsianWidth.AMBIGUOUS
-
-    # Canonical equivalence for symbols with stroke
-    with fetch_open("UnicodeData.txt") as udata:
-        single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-Z]+) 0338;")
-        for line in udata.readlines():
-            if match := single.match(line):
-                composed = int(match.group(1), 16)
-                decomposed = int(match.group(2), 16)
-                if width_map[decomposed] == EastAsianWidth.AMBIGUOUS:
-                    width_map[composed] = EastAsianWidth.AMBIGUOUS
-
-    return width_map
-
-
-def load_zero_widths() -> list[bool]:
-    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
-    character. `c` is considered a zero-width character if
-
-    - it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
-    - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
-    - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
-    - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
-    """
-
-    zw_map = [False] * NUM_CODEPOINTS
-
-    # `Default_Ignorable_Code_Point`s also have 0 width:
-    # https://www.unicode.org/faq/unsup_char.html#3
-    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
-    #
-    # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
-    # as well as a few `Mc` characters that need to be included so that
-    # canonically equivalent sequences have the same width.
-    load_property(
-        "DerivedCoreProperties.txt",
-        r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
-        lambda cp: operator.setitem(zw_map, cp, True),
-    )
-
-    # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
-    # as they canonically decompose to two characters with this property,
-    # but they aren't.
-    for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
-        zw_map[c] = True
-
-    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
-    # as zero-width.
This matches the behavior of glibc `wcwidth`. - # - # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`, - # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine - # into a single wide grapheme. So we treat vowel and trailing jamo as - # 0-width, such that only the width of the leading jamo is counted - # and the resulting grapheme has width 2. - # - # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul) - load_property( - "HangulSyllableType.txt", - r"(?:V|T)", - lambda cp: operator.setitem(zw_map, cp, True), - ) - - # Syriac abbreviation mark: - # Zero-width `Prepended_Concatenation_Mark` - zw_map[0x070F] = True - - # Some Arabic Prepended_Concatenation_Mark`s - # https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G27820 - zw_map[0x0605] = True - zw_map[0x0890] = True - zw_map[0x0891] = True - zw_map[0x08E2] = True - - # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]` - gcb_prepend = set() - load_property( - "auxiliary/GraphemeBreakProperty.txt", - "Prepend", - lambda cp: gcb_prepend.add(cp), - ) - load_property( - "PropList.txt", - "Prepended_Concatenation_Mark", - lambda cp: gcb_prepend.remove(cp), - ) - for cp in gcb_prepend: - zw_map[cp] = True - - # HANGUL CHOSEONG FILLER - # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have - # zero width. However, the expected usage is to combine it with vowel or trailing jamo - # (which are considered 0-width on their own) to form a composed Hangul syllable with - # width 2. Therefore, we treat it as having width 2. - zw_map[0x115F] = False - - # TIFINAGH CONSONANT JOINER - # (invisible only when used to join two Tifinagh consonants - zw_map[0x2D7F] = False - - # DEVANAGARI CARET - # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447 - zw_map[0xA8FA] = True - - return zw_map - - -def load_width_maps() -> tuple[list[WidthState], list[WidthState]]: - """Load complete width table, including characters needing special handling. 
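-    Ambiguous-width characters resolve to wide in the East Asian table and to narrow in the other.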
- (Returns 2 tables, one for East Asian and one for not.)""" - - eaws = load_east_asian_widths() - zws = load_zero_widths() - - not_ea = [] - ea = [] - - for eaw, zw in zip(eaws, zws): - if zw: - not_ea.append(WidthState.ZERO) - ea.append(WidthState.ZERO) - else: - if eaw == EastAsianWidth.WIDE: - not_ea.append(WidthState.WIDE) - else: - not_ea.append(WidthState.NARROW) - - if eaw == EastAsianWidth.NARROW: - ea.append(WidthState.NARROW) - else: - ea.append(WidthState.WIDE) - - # Joining_Group=Alef (Arabic Lam-Alef ligature) - alef_joining = [] - load_property( - "extracted/DerivedJoiningGroup.txt", - "Alef", - lambda cp: alef_joining.append(cp), - ) - - # Regional indicators - regional_indicators = [] - load_property( - "PropList.txt", - "Regional_Indicator", - lambda cp: regional_indicators.append(cp), - ) - - # Emoji modifiers - emoji_modifiers = [] - load_property( - "emoji/emoji-data.txt", - "Emoji_Modifier", - lambda cp: emoji_modifiers.append(cp), - ) - - # Default emoji presentation (for ZWJ sequences) - emoji_presentation = [] - load_property( - "emoji/emoji-data.txt", - "Emoji_Presentation", - lambda cp: emoji_presentation.append(cp), - ) - - for cps, width in [ - ([0x0A], WidthState.LINE_FEED), - ([0x05DC], WidthState.HEBREW_LETTER_LAMED), - (alef_joining, WidthState.JOINING_GROUP_ALEF), - (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER), - ( - [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF], - WidthState.KHMER_COENG_ELIGIBLE_LETTER, - ), - ([0x17A4], WidthState.WIDE), - ([0x17D8], WidthState.THREE), - ([0x1A10], WidthState.BUGINESE_LETTER_YA), - (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT), - ([0x2D6F], WidthState.TIFINAGH_CONSONANT), - ([0xA4FC], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), - ([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU), - ([0xFE0F], WidthState.VARIATION_SELECTOR_16), - ([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I), - (emoji_presentation, WidthState.EMOJI_PRESENTATION), - (emoji_modifiers, WidthState.EMOJI_MODIFIER), - (regional_indicators, WidthState.REGIONAL_INDICATOR), - ]: - for cp in cps: - not_ea[cp] = width - ea[cp] = width - - # East-Asian only - ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY - - # Not East Asian only - not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15 - - return (not_ea, ea) - - -def load_joining_group_lam() -> list[tuple[Codepoint, Codepoint]]: - "Returns a list of character ranges with Joining_Group=Lam" - lam_joining = [] - load_property( - "extracted/DerivedJoiningGroup.txt", - "Lam", - lambda cp: lam_joining.append(cp), - ) - - return to_sorted_ranges(lam_joining) - - -def load_non_transparent_zero_widths( - width_map: list[WidthState], -) -> list[tuple[Codepoint, Codepoint]]: - "Returns a list of characters with zero width but not 'Joining_Type=Transparent'" - - zero_widths = set() - for cp, width in enumerate(width_map): - if width.width_alone() == 0: - zero_widths.add(cp) - transparent = set() - load_property( - "extracted/DerivedJoiningType.txt", - "T", - lambda cp: transparent.add(cp), - ) - - return to_sorted_ranges(zero_widths - transparent) - - -def load_ligature_transparent() -> list[tuple[Codepoint, Codepoint]]: - """Returns a list of character ranges 
corresponding to all combining marks that are also - `Default_Ignorable_Code_Point`s, plus ZWJ. This is the set of characters that won't interrupt - a ligature.""" - default_ignorables = set() - load_property( - "DerivedCoreProperties.txt", - "Default_Ignorable_Code_Point", - lambda cp: default_ignorables.add(cp), - ) - - combining_marks = set() - load_property( - "extracted/DerivedGeneralCategory.txt", - "(?:Mc|Mn|Me)", - lambda cp: combining_marks.add(cp), - ) - - default_ignorable_combinings = default_ignorables.intersection(combining_marks) - default_ignorable_combinings.add(0x200D) # ZWJ - - return to_sorted_ranges(default_ignorable_combinings) - - -def load_solidus_transparent( - ligature_transparents: list[tuple[Codepoint, Codepoint]], - cjk_width_map: list[WidthState], -) -> list[tuple[Codepoint, Codepoint]]: - """Characters expanding to a canonical combining class above 1, plus `ligature_transparent`s from above. - Ranges matching ones in `ligature_transparent` exactly are excluded (for compression), so it needs to bechecked also. - """ - - ccc_above_1 = set() - load_property( - "extracted/DerivedCombiningClass.txt", - "(?:[2-9]|(?:[1-9][0-9]+))", - lambda cp: ccc_above_1.add(cp), - ) - - for lo, hi in ligature_transparents: - for cp in range(lo, hi + 1): - ccc_above_1.add(cp) - - num_chars = len(ccc_above_1) - - # Recursive decompositions - while True: - with fetch_open("UnicodeData.txt") as udata: - single = re.compile(r"([0-9A-Z]+);.*?;.*?;.*?;.*?;([0-9A-F ]+);") - for line in udata.readlines(): - if match := single.match(line): - composed = int(match.group(1), 16) - decomposed = [int(c, 16) for c in match.group(2).split(" ")] - if all([c in ccc_above_1 for c in decomposed]): - ccc_above_1.add(composed) - if len(ccc_above_1) == num_chars: - break - else: - num_chars = len(ccc_above_1) - - for cp in ccc_above_1: - if cp != 0xFE0F: - assert ( - cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL - ), f"U+{cp:X}" - - sorted = to_sorted_ranges(ccc_above_1) - return list(filter(lambda range: range not in ligature_transparents, sorted)) - - -def load_normalization_tests() -> list[tuple[str, str, str, str, str]]: - def parse_codepoints(cps: str) -> str: - return "".join(map(lambda cp: chr(int(cp, 16)), cps.split(" "))) - - with fetch_open("NormalizationTest.txt") as normtests: - ret = [] - single = re.compile( - r"^([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);" - ) - for line in normtests.readlines(): - if match := single.match(line): - ret.append( - ( - parse_codepoints(match.group(1)), - parse_codepoints(match.group(2)), - parse_codepoints(match.group(3)), - parse_codepoints(match.group(4)), - parse_codepoints(match.group(5)), - ) - ) - return ret - - -def make_special_ranges( - width_map: list[WidthState], -) -> list[tuple[tuple[Codepoint, Codepoint], WidthState]]: - "Assign ranges of characters to their special behavior (used in match)" - ret = [] - can_merge_with_prev = False - for cp, width in enumerate(width_map): - if width == WidthState.EMOJI_PRESENTATION: - can_merge_with_prev = False - elif width.table_width() == CharWidthInTable.SPECIAL: - if can_merge_with_prev and ret[-1][1] == width: - ret[-1] = ((ret[-1][0][0], cp), width) - else: - ret.append(((cp, cp), width)) - can_merge_with_prev = True - return ret - - -class Bucket: - """A bucket contains a group of codepoints and an ordered width list. If one bucket's width - list overlaps with another's width list, those buckets can be merged via `try_extend`. 
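-    For example, a bucket with widths [1, 2] and one with widths [1, 2, 0] can merge,
-    because one width list is a prefix of the other.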
- """ - - def __init__(self): - """Creates an empty bucket.""" - self.entry_set = set() - self.widths = [] - - def append(self, codepoint: Codepoint, width: CharWidthInTable): - """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" - self.entry_set.add((codepoint, width)) - self.widths.append(width) - - def try_extend(self, attempt: "Bucket") -> bool: - """If either `self` or `attempt`'s width list starts with the other bucket's width list, - set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints - into `self`, and return `True`. Otherwise, return `False`.""" - (less, more) = (self.widths, attempt.widths) - if len(self.widths) > len(attempt.widths): - (less, more) = (attempt.widths, self.widths) - if less != more[: len(less)]: - return False - self.entry_set |= attempt.entry_set - self.widths = more - return True - - def entries(self) -> list[tuple[Codepoint, CharWidthInTable]]: - """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" - result = list(self.entry_set) - result.sort() - return result - - def width(self) -> CharWidthInTable | None: - """If all codepoints in this bucket have the same width, return that width; otherwise, - return `None`.""" - if len(self.widths) == 0: - return None - potential_width = self.widths[0] - for width in self.widths[1:]: - if potential_width != width: - return None - return potential_width - - -def make_buckets( - entries: Iterable[tuple[int, CharWidthInTable]], low_bit: BitPos, cap_bit: BitPos -) -> list[Bucket]: - """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All - codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the - same bucket. Returns a list of the buckets in increasing order of those bits.""" - num_bits = cap_bit - low_bit - assert num_bits > 0 - buckets = [Bucket() for _ in range(0, 2**num_bits)] - mask = (1 << num_bits) - 1 - for codepoint, width in entries: - buckets[(codepoint >> low_bit) & mask].append(codepoint, width) - return buckets - - -class Table: - """Represents a lookup table. Each table contains a certain number of subtables; each - subtable is indexed by a contiguous bit range of the codepoint and contains a list - of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) - - Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should - be indexed by sub-table `i` in the next-level lookup table. The entries of this table are - indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The - key to compression is that two different buckets in two different sub-tables may have the - same width list, which means that they can be merged into the same bucket. - - If no bucket contains two codepoints with different widths, calling `indices_to_widths` will - discard the buckets and convert the entries into `EffectiveWidth` values.""" - - def __init__( - self, - name: str, - entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], - secondary_entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], - low_bit: BitPos, - cap_bit: BitPos, - offset_type: OffsetType, - align: int, - bytes_per_row: int | None = None, - starting_indexed: list[Bucket] = [], - cfged: bool = False, - ): - """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator - in `entry_groups`. 
Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, - and each table entry is represented in the format specified by `offset_type`. Asserts - that this table is actually representable with `offset_type`.""" - starting_indexed_len = len(starting_indexed) - self.name = name - self.low_bit = low_bit - self.cap_bit = cap_bit - self.offset_type = offset_type - self.entries: list[int] = [] - self.indexed: list[Bucket] = list(starting_indexed) - self.align = align - self.bytes_per_row = bytes_per_row - self.cfged = cfged - - buckets: list[Bucket] = [] - for entries in entry_groups: - buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) - - for bucket in buckets: - for i, existing in enumerate(self.indexed): - if existing.try_extend(bucket): - self.entries.append(i) - break - else: - self.entries.append(len(self.indexed)) - self.indexed.append(bucket) - - self.primary_len = len(self.entries) - self.primary_bucket_len = len(self.indexed) - - buckets = [] - for entries in secondary_entry_groups: - buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) - - for bucket in buckets: - for i, existing in enumerate(self.indexed): - if existing.try_extend(bucket): - self.entries.append(i) - break - else: - self.entries.append(len(self.indexed)) - self.indexed.append(bucket) - - # Validate offset type - max_index = 1 << int(self.offset_type) - for index in self.entries: - assert index < max_index, f"{index} <= {max_index}" - - self.indexed = self.indexed[starting_indexed_len:] - - def indices_to_widths(self): - """Destructively converts the indices in this table to the `EffectiveWidth` values of - their buckets. Assumes that no bucket contains codepoints with different widths. - """ - self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) # type: ignore - del self.indexed - - def buckets(self): - """Returns an iterator over this table's buckets.""" - return self.indexed - - def to_bytes(self) -> list[int]: - """Returns this table's entries as a list of bytes. The bytes are formatted according to - the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries - to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will - contain four packed 2-bit entries.""" - entries_per_byte = 8 // int(self.offset_type) - byte_array = [] - for i in range(0, len(self.entries), entries_per_byte): - byte = 0 - for j in range(0, entries_per_byte): - byte |= self.entries[i + j] << (j * int(self.offset_type)) - byte_array.append(byte) - return byte_array - - -def make_tables( - width_map: list[WidthState], - cjk_width_map: list[WidthState], -) -> list[Table]: - """Creates a table for each configuration in `table_cfgs`, with the first config corresponding - to the top-level lookup table, the second config corresponding to the second-level lookup - table, and so forth. 
`entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs - to include in the top-level table.""" - - entries = enumerate([w.table_width() for w in width_map]) - cjk_entries = enumerate([w.table_width() for w in cjk_width_map]) - - root_table = Table( - "WIDTH_ROOT", - [entries], - [], - TABLE_SPLITS[1], - MAX_CODEPOINT_BITS, - OffsetType.U8, - 128, - ) - - cjk_root_table = Table( - "WIDTH_ROOT_CJK", - [cjk_entries], - [], - TABLE_SPLITS[1], - MAX_CODEPOINT_BITS, - OffsetType.U8, - 128, - starting_indexed=root_table.indexed, - cfged=True, - ) - - middle_table = Table( - "WIDTH_MIDDLE", - map(lambda bucket: bucket.entries(), root_table.buckets()), - map(lambda bucket: bucket.entries(), cjk_root_table.buckets()), - TABLE_SPLITS[0], - TABLE_SPLITS[1], - OffsetType.U8, - 2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]), - bytes_per_row=2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]), - ) - - leaves_table = Table( - "WIDTH_LEAVES", - map( - lambda bucket: bucket.entries(), - middle_table.buckets()[: middle_table.primary_bucket_len], - ), - map( - lambda bucket: bucket.entries(), - middle_table.buckets()[middle_table.primary_bucket_len :], - ), - 0, - TABLE_SPLITS[0], - OffsetType.U2, - 2 ** (TABLE_SPLITS[0] - 2), - bytes_per_row=2 ** (TABLE_SPLITS[0] - 2), - ) - - return [root_table, cjk_root_table, middle_table, leaves_table] - - -def load_emoji_presentation_sequences() -> list[Codepoint]: - """Outputs a list of cpodepoints, corresponding to all the valid characters for starting - an emoji presentation sequence.""" - - with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: - # Match all emoji presentation sequences - # (one codepoint followed by U+FE0F, and labeled "emoji style") - sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style") - codepoints = [] - for line in sequences.readlines(): - if match := sequence.match(line): - cp = int(match.group(1), 16) - codepoints.append(cp) - return codepoints - - -def load_text_presentation_sequences() -> list[Codepoint]: - """Outputs a list of codepoints, corresponding to all the valid characters - whose widths change with a text presentation sequence.""" - - text_presentation_seq_codepoints = set() - with fetch_open("emoji/emoji-variation-sequences.txt") as sequences: - # Match all text presentation sequences - # (one codepoint followed by U+FE0E, and labeled "text style") - sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style") - for line in sequences.readlines(): - if match := sequence.match(line): - cp = int(match.group(1), 16) - text_presentation_seq_codepoints.add(cp) - - default_emoji_codepoints = set() - - load_property( - "emoji/emoji-data.txt", - "Emoji_Presentation", - lambda cp: default_emoji_codepoints.add(cp), - ) - - codepoints = [] - for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints): - # "Enclosed Ideographic Supplement" block; - # wide even in text presentation - if not cp in range(0x1F200, 0x1F300): - codepoints.append(cp) - - codepoints.sort() - return codepoints - - -def load_emoji_modifier_bases() -> list[Codepoint]: - """Outputs a list of codepoints, corresponding to all the valid characters - whose widths change with a text presentation sequence.""" - - ret = [] - load_property( - "emoji/emoji-data.txt", - "Emoji_Modifier_Base", - lambda cp: ret.append(cp), - ) - ret.sort() - return ret - - -def make_presentation_sequence_table( - seqs: list[Codepoint], - lsb: int = 10, -) -> tuple[list[tuple[int, int]], list[list[int]]]: - """Generates 2-level lookup table for whether a 
codepoint might start an emoji variation sequence. - The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB. - """ - - prefixes_dict = defaultdict(set) - for cp in seqs: - prefixes_dict[cp >> lsb].add(cp & (2**lsb - 1)) - - msbs: list[int] = list(prefixes_dict.keys()) - - leaves: list[list[int]] = [] - for cps in prefixes_dict.values(): - leaf = [0] * (2 ** (lsb - 3)) - for cp in cps: - idx_in_leaf, bit_shift = divmod(cp, 8) - leaf[idx_in_leaf] |= 1 << bit_shift - leaves.append(leaf) - - indexes = [(msb, index) for (index, msb) in enumerate(msbs)] - - # Cull duplicate leaves - i = 0 - while i < len(leaves): - first_idx = leaves.index(leaves[i]) - if first_idx == i: - i += 1 - else: - for j in range(0, len(indexes)): - if indexes[j][1] == i: - indexes[j] = (indexes[j][0], first_idx) - elif indexes[j][1] > i: - indexes[j] = (indexes[j][0], indexes[j][1] - 1) - - leaves.pop(i) - - return (indexes, leaves) - - -def make_ranges_table( - seqs: list[Codepoint], -) -> tuple[list[tuple[int, int]], list[list[tuple[int, int]]]]: - """Generates 2-level lookup table for a binary property of a codepoint. - First level is all but the last byte, second level is ranges for last byte - """ - - prefixes_dict = defaultdict(list) - for cp in seqs: - prefixes_dict[cp >> 8].append(cp & 0xFF) - - msbs: list[int] = list(prefixes_dict.keys()) - - leaves: list[list[tuple[int, int]]] = [] - for cps in prefixes_dict.values(): - leaf = [] - for cp in cps: - if len(leaf) > 0 and leaf[-1][1] == cp - 1: - leaf[-1] = (leaf[-1][0], cp) - else: - leaf.append((cp, cp)) - leaves.append(leaf) - - indexes = [(msb, index) for (index, msb) in enumerate(msbs)] - - # Cull duplicate leaves - i = 0 - while i < len(leaves): - first_idx = leaves.index(leaves[i]) - if first_idx == i: - i += 1 - else: - for j in range(0, len(indexes)): - if indexes[j][1] == i: - indexes[j] = (indexes[j][0], first_idx) - elif indexes[j][1] > i: - indexes[j] = (indexes[j][0], indexes[j][1] - 1) - - leaves.pop(i) - - return (indexes, leaves) - - -def lookup_fns( - is_cjk: bool, - special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]], - joining_group_lam: list[tuple[Codepoint, Codepoint]], -) -> str: - if is_cjk: - cfg = '#[cfg(feature = "cjk")]\n' - cjk_lo = "_cjk" - cjk_cap = "_CJK" - ambig = "wide" - else: - cfg = "" - cjk_lo = "" - cjk_cap = "" - ambig = "narrow" - s = f""" -/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by -/// consulting a multi-level lookup table. -/// -/// # Maintenance -/// The tables themselves are autogenerated but this function is hardcoded. You should have -/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) -/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the -/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes. -{cfg}#[inline] -fn lookup_width{cjk_lo}(c: char) -> (u8, WidthInfo) {{ - let cp = c as usize; - - let t1_offset = WIDTH_ROOT{cjk_cap}.0[cp >> {TABLE_SPLITS[1]}]; - - // Each sub-table in WIDTH_MIDDLE is 7 bits, and each stored entry is a byte, - // so each sub-table is 128 bytes in size. - // (Sub-tables are selected using the computed offset from the previous table.) - let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> {TABLE_SPLITS[0]} & 0x{(2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]) - 1):X}]; - - // Each sub-table in WIDTH_LEAVES is 6 bits, but each stored entry is 2 bits. 
- // This is accomplished by packing four stored entries into one byte. - // So each sub-table is 2**(7-2) == 32 bytes in size. - // Since this is the last table, each entry represents an encoded width. - let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x{(2 ** (TABLE_SPLITS[0] - 2) - 1):X}]; - - // Extract the packed width - let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; - - if width < 3 {{ - (width, WidthInfo::DEFAULT) - }} else {{ - match c {{ -""" - - for (lo, hi), width in special_ranges: - s += f" '\\u{{{lo:X}}}'" - if hi != lo: - s += f"..='\\u{{{hi:X}}}'" - if width.is_carried(): - width_info = width.name - else: - width_info = "DEFAULT" - s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n" - - s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION), - }} - }} -}} - -/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or -/// `None` if `c` is a control character. -/// Ambiguous width characters are treated as {ambig}. -{cfg}#[inline] -pub fn single_char_width{cjk_lo}(c: char) -> Option {{ - if c < '\\u{{7F}}' {{ - if c >= '\\u{{20}}' {{ - // U+0020 to U+007F (exclusive) are single-width ASCII codepoints - Some(1) - }} else {{ - // U+0000 to U+0020 (exclusive) are control codes - None - }} - }} else if c >= '\\u{{A0}}' {{ - // No characters >= U+00A0 are control codes, so we can consult the lookup tables - Some(lookup_width{cjk_lo}(c).0.into()) - }} else {{ - // U+007F to U+00A0 (exclusive) are control codes - None - }} -}} - -/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`. -/// Ambiguous width characters are treated as {ambig}. -{cfg}#[inline] -fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{ - if next_info.is_emoji_presentation() {{ - if starts_emoji_presentation_seq(c) {{ - let width = if next_info.is_zwj_emoji_presentation() {{ - 0 - }} else {{ - 2 - }}; - return (width, WidthInfo::EMOJI_PRESENTATION); - }} else {{ - next_info = next_info.unset_emoji_presentation(); - }} - }}""" - - if is_cjk: - s += """ - if (matches!( - next_info, - WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY | WidthInfo::SOLIDUS_OVERLAY_ALEF - ) && matches!(c, '<' | '=' | '>')) - { - return (2, WidthInfo::DEFAULT); - }""" - - s += """ - if c <= '\\u{A0}' { - match c { - // According to the spec, LF should be width 1, which is how it is often rendered when it is forced to have a single-line rendering - // However, this makes it harder to use this crate to calculate line breaks, and breaks assumptions of downstream crates. 
- // https://github.com/unicode-rs/unicode-width/issues/60 - '\\n' => (0, WidthInfo::LINE_FEED), - '\\r' if next_info == WidthInfo::LINE_FEED => (0, WidthInfo::DEFAULT), - _ => (1, WidthInfo::DEFAULT), - } - } else { - // Fast path - if next_info != WidthInfo::DEFAULT { - if c == '\\u{FE0F}' { - return (0, next_info.set_emoji_presentation()); - }""" - - if not is_cjk: - s += """ - if c == '\\u{FE0E}' { - return (0, next_info.set_text_presentation()); - } - if next_info.is_text_presentation() { - if starts_non_ideographic_text_presentation_seq(c) { - return (1, WidthInfo::DEFAULT); - } else { - next_info = next_info.unset_text_presentation(); - } - }""" - - s += """ - if next_info.is_ligature_transparent() { - if c == '\\u{200D}' { - return (0, next_info.set_zwj_bit()); - } else if is_ligature_transparent(c) { - return (0, next_info); - } - } - - match (next_info, c) {""" - if is_cjk: - s += """ - (WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, _) if is_solidus_transparent(c) => { - return ( - lookup_width_cjk(c).0 as i8, - WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, - ); - } - (WidthInfo::JOINING_GROUP_ALEF, '\\u{0338}') => { - return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF); - } - // Arabic Lam-Alef ligature - ( - WidthInfo::JOINING_GROUP_ALEF | WidthInfo::SOLIDUS_OVERLAY_ALEF, - """ - else: - s += """ - // Arabic Lam-Alef ligature - ( - WidthInfo::JOINING_GROUP_ALEF, - """ - - tail = False - for lo, hi in joining_group_lam: - if tail: - s += " | " - tail = True - s += f"'\\u{{{lo:X}}}'" - if hi != lo: - s += f"..='\\u{{{hi:X}}}'" - s += """, - ) => return (0, WidthInfo::DEFAULT), - (WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => { - return (0, WidthInfo::JOINING_GROUP_ALEF); - } - - // Hebrew Alef-ZWJ-Lamed ligature - (WidthInfo::ZWJ_HEBREW_LETTER_LAMED, '\\u{05D0}') => { - return (0, WidthInfo::DEFAULT); - } - - // Khmer coeng signs - (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => { - return (-1, WidthInfo::DEFAULT); - } - - // Buginese ZWJ ya ligature - (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => { - return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA) - } - (WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA, '\\u{1A15}') => { - return (0, WidthInfo::DEFAULT) - } - - // Tifinagh bi-consonants - (WidthInfo::TIFINAGH_CONSONANT | WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D7F}') => { - return (1, WidthInfo::TIFINAGH_JOINER_CONSONANT); - } - (WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => { - return (0, WidthInfo::DEFAULT); - } - (WidthInfo::TIFINAGH_JOINER_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => { - return (-1, WidthInfo::DEFAULT); - } - - // Lisu tone letter combinations - (WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, '\\u{A4F8}'..='\\u{A4FB}') => { - return (0, WidthInfo::DEFAULT); - } - - // Old Turkic ligature - (WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\\u{10C32}') => { - return (0, WidthInfo::DEFAULT); - }""" - - s += f""" - // Emoji modifier - (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{ - return (0, WidthInfo::EMOJI_PRESENTATION); - }} - - // Regional indicator - ( - WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR, - '\\u{{1F1E6}}'..='\\u{{1F1FF}}', - ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR), - - // ZWJ emoji - ( - WidthInfo::EMOJI_PRESENTATION - | WidthInfo::SEVERAL_REGIONAL_INDICATOR - | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION - | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION - | WidthInfo::EMOJI_MODIFIER, - '\\u{{200D}}', - ) => 
return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION), - (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{ - return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION); - }} - (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{ - return (0, WidthInfo::EMOJI_PRESENTATION) - }} - (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{ - return (0, WidthInfo::EMOJI_PRESENTATION) - }} - (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{ - return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION); - }} - ( - WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION - | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION, - '\\u{{1F1E6}}'..='\\u{{1F1FF}}', - ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION), - ( - WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION, - '\\u{{1F1E6}}'..='\\u{{1F1FF}}', - ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION), - (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{ - return (0, WidthInfo::EMOJI_MODIFIER); - }} - (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{ - return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION); - }} - (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION); - }} - (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION) - }} - (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION) - }} - (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION) - }} - (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION) - }} - (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{ - return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION) - }} - ( - WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, - '\\u{{E0030}}'..='\\u{{E0039}}', - ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION), - (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ - return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION); - }} - (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{ - return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION); - }} - ( - WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION - | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION, - '\\u{{1F3F4}}', - ) => return (0, WidthInfo::EMOJI_PRESENTATION), - (WidthInfo::ZWJ_EMOJI_PRESENTATION, _) - if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION => - {{ - return (0, WidthInfo::EMOJI_PRESENTATION) - }} - - // Fallback - _ => {{}} - }} - }} - - let ret = lookup_width{cjk_lo}(c); - (ret.0 as i8, ret.1) - }} -}} - -{cfg}#[inline] -pub fn str_width{cjk_lo}(s: &str) -> usize {{ - s.chars() - .rfold( - (0, WidthInfo::DEFAULT), - |(sum, next_info), c| -> (usize, WidthInfo) {{ - let (add, info) = 
width_in_str{cjk_lo}(c, next_info); - (sum.wrapping_add_signed(isize::from(add)), info) - }}, - ) - .0 -}} -""" - - return s - - -def emit_module( - out_name: str, - unicode_version: tuple[int, int, int], - tables: list[Table], - special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]], - special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], WidthState]], - emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]], - text_presentation_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], - emoji_modifier_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]], - joining_group_lam: list[tuple[Codepoint, Codepoint]], - non_transparent_zero_widths: list[tuple[Codepoint, Codepoint]], - ligature_transparent: list[tuple[Codepoint, Codepoint]], - solidus_transparent: list[tuple[Codepoint, Codepoint]], - normalization_tests: list[tuple[str, str, str, str, str]], -): - """Outputs a Rust module to `out_name` using table data from `tables`. - If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`. - """ - if os.path.exists(out_name): - os.remove(out_name) - with open(out_name, "w", newline="\n", encoding="utf-8") as module: - module.write( - """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT -// file at the top-level directory of this distribution and at -// http://rust-lang.org/COPYRIGHT. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly - -use core::cmp::Ordering; - -#[derive(Clone, Copy, Debug, PartialEq, Eq)] -struct WidthInfo(u16); - -impl WidthInfo { - /// No special handling necessary - const DEFAULT: Self = Self(0); -""" - ) - - for variant in WidthState: - if variant.is_carried(): - if variant.is_cjk_only(): - module.write(' #[cfg(feature = "cjk")]\n') - module.write( - f" const {variant.name}: Self = Self(0b{variant.value:016b});\n" - ) - - module.write( - f""" - /// Whether this width mode is ligature_transparent - /// (has 5th MSB set.) - fn is_ligature_transparent(self) -> bool {{ - (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000 - }} - - /// Sets 6th MSB. 
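-    /// (Set when a ZWJ is encountered in a ligature-transparent sequence.)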
- fn set_zwj_bit(self) -> Self {{ - Self(self.0 | 0b0000_0100_0000_0000) - }} - - /// Has top bit set - fn is_emoji_presentation(self) -> bool {{ - (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000 - }} - - /// Has top bit set - fn is_zwj_emoji_presentation(self) -> bool {{ - (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000 - }} - - /// Set top bit - fn set_emoji_presentation(self) -> Self {{ - if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 - || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000 - {{ - Self(self.0 | 0b1000_0000_0000_0000) - }} else {{ - Self::VARIATION_SELECTOR_16 - }} - }} - - /// Clear top bit - fn unset_emoji_presentation(self) -> Self {{ - if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ - Self(self.0 & 0b0111_1111_1111_1111) - }} else {{ - Self::DEFAULT - }} - }} - - /// Has 2nd bit set - fn is_text_presentation(self) -> bool {{ - (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000 - }} - - /// Set 2nd bit - fn set_text_presentation(self) -> Self {{ - if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{ - Self(self.0 | 0b0100_0000_0000_0000) - }} else {{ - Self(0b0100_0000_0000_0000) - }} - }} - - /// Clear 2nd bit - fn unset_text_presentation(self) -> Self {{ - Self(self.0 & 0b1011_1111_1111_1111) - }} -}} - -/// The version of [Unicode](http://www.unicode.org/) -/// that this version of unicode-width is based on. -pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; -""" - ) - - module.write(lookup_fns(False, special_ranges, joining_group_lam)) - module.write(lookup_fns(True, special_ranges_cjk, joining_group_lam)) - - emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table - text_presentation_idx, text_presentation_leaves = text_presentation_table - emoji_modifier_idx, emoji_modifier_leaves = emoji_modifier_table - - module.write( - """ -/// Whether this character is a zero-width character with -/// `Joining_Type=Transparent`. Used by the Alef-Lamed ligatures. -/// See also [`is_ligature_transparent`], a near-subset of this (only ZWJ is excepted) -/// which is transparent for non-Arabic ligatures. -fn is_transparent_zero_width(c: char) -> bool { - if lookup_width(c).0 != 0 { - // Not zero-width - false - } else { - let cp: u32 = c.into(); - NON_TRANSPARENT_ZERO_WIDTHS - .binary_search_by(|&(lo, hi)| { - let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]); - let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]); - if cp < lo { - Ordering::Greater - } else if cp > hi { - Ordering::Less - } else { - Ordering::Equal - } - }) - .is_err() - } -} - -/// Whether this character is a default-ignorable combining mark -/// or ZWJ. These characters won't interrupt non-Arabic ligatures. -fn is_ligature_transparent(c: char) -> bool { - matches!(c, """ - ) - - tail = False - for lo, hi in ligature_transparent: - if tail: - module.write(" | ") - tail = True - module.write(f"'\\u{{{lo:X}}}'") - if hi != lo: - module.write(f"..='\\u{{{hi:X}}}'") - - module.write( - """) -} - -/// Whether this character is transparent wrt the effect of -/// U+0338 COMBINING LONG SOLIDUS OVERLAY -/// on its base character. 
-#[cfg(feature = "cjk")] -fn is_solidus_transparent(c: char) -> bool { - let cp: u32 = c.into(); - is_ligature_transparent(c) - || SOLIDUS_TRANSPARENT - .binary_search_by(|&(lo, hi)| { - let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]); - let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]); - if cp < lo { - Ordering::Greater - } else if cp > hi { - Ordering::Less - } else { - Ordering::Equal - } - }) - .is_ok() -} - -/// Whether this character forms an [emoji presentation sequence] -/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) -/// when followed by `'\\u{FEOF}'`. -/// Emoji presentation sequences are considered to have width 2. -#[inline] -pub fn starts_emoji_presentation_seq(c: char) -> bool { - let cp: u32 = c.into(); - // First level of lookup uses all but 10 LSB - let top_bits = cp >> 10; - let idx_of_leaf: usize = match top_bits { -""" - ) - - for msbs, i in emoji_presentation_idx: - module.write(f" 0x{msbs:X} => {i},\n") - - module.write( - """ _ => return false, - }; - // Extract the 3-9th (0-indexed) least significant bits of `cp`, - // and use them to index into `leaf_row`. - let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); - let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; - // Use the 3 LSB of `cp` to index into `leaf_byte`. - ((leaf_byte >> (cp & 7)) & 1) == 1 -} - -/// Returns `true` if `c` has default emoji presentation, but forms a [text presentation sequence] -/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence) -/// when followed by `'\\u{FEOE}'`, and is not ideographic. -/// Such sequences are considered to have width 1. -#[inline] -pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool { - let cp: u32 = c.into(); - // First level of lookup uses all but 8 LSB - let top_bits = cp >> 8; - let leaf: &[(u8, u8)] = match top_bits { -""" - ) - - for msbs, i in text_presentation_idx: - module.write(f" 0x{msbs:X} => &TEXT_PRESENTATION_LEAF_{i},\n") - - module.write( - """ _ => return false, - }; - - let bottom_bits = (cp & 0xFF) as u8; - leaf.binary_search_by(|&(lo, hi)| { - if bottom_bits < lo { - Ordering::Greater - } else if bottom_bits > hi { - Ordering::Less - } else { - Ordering::Equal - } - }) - .is_ok() -} - -/// Returns `true` if `c` is an `Emoji_Modifier_Base`. -#[inline] -pub fn is_emoji_modifier_base(c: char) -> bool { - let cp: u32 = c.into(); - // First level of lookup uses all but 8 LSB - let top_bits = cp >> 8; - let leaf: &[(u8, u8)] = match top_bits { -""" - ) - - for msbs, i in emoji_modifier_idx: - module.write(f" 0x{msbs:X} => &EMOJI_MODIFIER_LEAF_{i},\n") - - module.write( - """ _ => return false, - }; - - let bottom_bits = (cp & 0xFF) as u8; - leaf.binary_search_by(|&(lo, hi)| { - if bottom_bits < lo { - Ordering::Greater - } else if bottom_bits > hi { - Ordering::Less - } else { - Ordering::Equal - } - }) - .is_ok() -} - -#[repr(align(32))] -struct Align32(T); - -#[repr(align(64))] -struct Align64(T); - -#[repr(align(128))] -struct Align128(T); -""" - ) - - subtable_count = 1 - for i, table in enumerate(tables): - new_subtable_count = len(table.buckets()) - if i == len(tables) - 1: - table.indices_to_widths() # for the last table, indices == widths - byte_array = table.to_bytes() - - if table.bytes_per_row is None: - module.write( - f"/// Autogenerated. {subtable_count} sub-table(s). 
Consult [`lookup_width`] for layout info.\n"
- )
- if table.cfged:
- module.write('#[cfg(feature = "cjk")]\n')
- module.write(
- f"static {table.name}: Align{table.align}<[u8; {len(byte_array)}]> = Align{table.align}(["
- )
- for j, byte in enumerate(byte_array):
- # Add line breaks for every 16th entry (chosen to match what rustfmt does)
- if j % 16 == 0:
- module.write("\n ")
- module.write(f" 0x{byte:02X},")
- module.write("\n")
- else:
- num_rows = len(byte_array) // table.bytes_per_row
- num_primary_rows = (
- table.primary_len
- // (8 // int(table.offset_type))
- // table.bytes_per_row
- )
- module.write(
- f"""
-#[cfg(feature = "cjk")]
-const {table.name}_LEN: usize = {num_rows};
-#[cfg(not(feature = "cjk"))]
-const {table.name}_LEN: usize = {num_primary_rows};
-/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
-static {table.name}: Align{table.align}<[[u8; {table.bytes_per_row}]; {table.name}_LEN]> = Align{table.align}([\n"""
- )
- for row_num in range(0, num_rows):
- if row_num >= num_primary_rows:
- module.write(' #[cfg(feature = "cjk")]\n')
- module.write(" [\n")
- row = byte_array[
- row_num
- * table.bytes_per_row : (row_num + 1)
- * table.bytes_per_row
- ]
- for subrow in batched(row, 15):
- module.write(" ")
- for entry in subrow:
- module.write(f" 0x{entry:02X},")
- module.write("\n")
- module.write(" ],\n")
- module.write("]);\n")
- subtable_count = new_subtable_count
-
- # non transparent zero width table
-
- module.write(
- f"""
-/// Sorted list of codepoint ranges (inclusive)
-/// that are zero-width but not `Joining_Type=Transparent`
-/// FIXME: can we get better compression?
-static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); {len(non_transparent_zero_widths)}] = [
-"""
- )
-
- for lo, hi in non_transparent_zero_widths:
- module.write(
- f" ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
- )
-
- # solidus transparent table
-
- module.write(
- f"""];
-
-/// Sorted list of codepoint ranges (inclusive)
-/// that don't affect how the combining solidus applies
-/// (mostly ccc > 1).
-/// FIXME: can we get better compression?
-#[cfg(feature = "cjk")]
-static SOLIDUS_TRANSPARENT: [([u8; 3], [u8; 3]); {len(solidus_transparent)}] = [
-"""
- )
-
- for lo, hi in solidus_transparent:
- module.write(
- f" ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
- )
-
- # emoji table
-
- module.write(
- f"""];
-
-/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
-/// to get whether it can start an emoji presentation sequence.
-static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
-"""
- )
- for leaf in emoji_presentation_leaves:
- module.write(" [\n")
- for row in batched(leaf, 15):
- module.write(" ")
- for entry in row:
- module.write(f" 0x{entry:02X},")
- module.write("\n")
- module.write(" ],\n")
-
- module.write("]);\n")
-
- # text table
-
- for leaf_idx, leaf in enumerate(text_presentation_leaves):
- module.write(
- f"""
-#[rustfmt::skip]
-static TEXT_PRESENTATION_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
-"""
- )
- for lo, hi in leaf:
- module.write(f" (0x{lo:02X}, 0x{hi:02X}),\n")
- module.write("];\n")
-
- # emoji modifier table
-
- for leaf_idx, leaf in enumerate(emoji_modifier_leaves):
- module.write(
- f"""
-#[rustfmt::skip]
-static EMOJI_MODIFIER_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
-"""
- )
- for lo, hi in leaf:
- module.write(f" (0x{lo:02X}, 0x{hi:02X}),\n")
- module.write("];\n")
-
- test_width_variants = []
- test_width_variants_cjk = []
- for variant in WidthState:
- if variant.is_carried():
- if not variant.is_cjk_only():
- test_width_variants.append(variant)
- if not variant.is_non_cjk_only():
- test_width_variants_cjk.append(variant)
-
- module.write(
- f"""
-#[cfg(test)]
-mod tests {{
- use super::*;
-
- fn str_width_test(s: &str, init: WidthInfo) -> isize {{
- s.chars()
- .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
- let (add, info) = width_in_str(c, next_info);
- (sum.checked_add(isize::from(add)).unwrap(), info)
- }})
- .0
- }}
-
- #[cfg(feature = "cjk")]
- fn str_width_test_cjk(s: &str, init: WidthInfo) -> isize {{
- s.chars()
- .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
- let (add, info) = width_in_str_cjk(c, next_info);
- (sum.checked_add(isize::from(add)).unwrap(), info)
- }})
- .0
- }}
-
- #[test]
- fn test_normalization() {{
- for &(orig, nfc, nfd, nfkc, nfkd) in &NORMALIZATION_TEST {{
- for init in NORMALIZATION_TEST_WIDTHS {{
- assert_eq!(
- str_width_test(orig, init),
- str_width_test(nfc, init),
- "width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
- );
- assert_eq!(
- str_width_test(orig, init),
- str_width_test(nfd, init),
- "width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
- );
- assert_eq!(
- str_width_test(nfkc, init),
- str_width_test(nfkd, init),
- "width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}",
- );
- }}
-
- #[cfg(feature = "cjk")]
- for init in NORMALIZATION_TEST_WIDTHS_CJK {{
- assert_eq!(
- str_width_test_cjk(orig, init),
- str_width_test_cjk(nfc, init),
- "CJK width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
- );
- assert_eq!(
- str_width_test_cjk(orig, init),
- str_width_test_cjk(nfd, init),
- "CJK width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
- );
- assert_eq!(
- str_width_test_cjk(nfkc, init),
- str_width_test_cjk(nfkd, init),
- "CJK width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}",
- );
- }}
- }}
- }}
-
- static NORMALIZATION_TEST_WIDTHS: [WidthInfo; {len(test_width_variants) + 1}] = [
- WidthInfo::DEFAULT,\n"""
- )
-
- for variant in WidthState:
- if variant.is_carried() and not variant.is_cjk_only():
- module.write(f" WidthInfo::{variant.name},\n")
-
- module.write(
- f""" ];
-
- #[cfg(feature = "cjk")]
- static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; {len(test_width_variants_cjk) + 1}] = [
- WidthInfo::DEFAULT,\n"""
- )
-
- for variant 
in WidthState:
- if variant.is_carried() and not variant.is_non_cjk_only():
- module.write(f" WidthInfo::{variant.name},\n")
-
- module.write(
- f""" ];
-
- #[rustfmt::skip]
- static NORMALIZATION_TEST: [(&str, &str, &str, &str, &str); {len(normalization_tests)}] = [\n"""
- )
- for orig, nfc, nfd, nfkc, nfkd in normalization_tests:
- module.write(
- f' (r#"{orig}"#, r#"{nfc}"#, r#"{nfd}"#, r#"{nfkc}"#, r#"{nfkd}"#),\n'
- )
-
- module.write(" ];\n}\n")
-
-
-def main(module_path: str):
- """Obtain character data from the latest version of Unicode, transform it into a multi-level
- lookup table for character width, and write a Rust module utilizing that table to
- `module_path`.
-
- See `lib.rs` for documentation of the exact width rules.
- """
- version = load_unicode_version()
- print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
-
- (width_map, cjk_width_map) = load_width_maps()
-
- tables = make_tables(width_map, cjk_width_map)
-
- special_ranges = make_special_ranges(width_map)
- cjk_special_ranges = make_special_ranges(cjk_width_map)
-
- emoji_presentations = load_emoji_presentation_sequences()
- emoji_presentation_table = make_presentation_sequence_table(emoji_presentations)
-
- text_presentations = load_text_presentation_sequences()
- text_presentation_table = make_ranges_table(text_presentations)
-
- emoji_modifier_bases = load_emoji_modifier_bases()
- emoji_modifier_table = make_ranges_table(emoji_modifier_bases)
-
- joining_group_lam = load_joining_group_lam()
- non_transparent_zero_widths = load_non_transparent_zero_widths(width_map)
- ligature_transparent = load_ligature_transparent()
- solidus_transparent = load_solidus_transparent(ligature_transparent, cjk_width_map)
-
- normalization_tests = load_normalization_tests()
-
- fetch_open("emoji-test.txt", "../tests", emoji=True)
-
- print("------------------------")
- total_size = 0
- for i, table in enumerate(tables):
- size_bytes = len(table.to_bytes())
- print(f"Table {i} size: {size_bytes} bytes")
- total_size += size_bytes
-
- for s, table in [
- ("Emoji presentation", emoji_presentation_table),
- ]:
- index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 8)
- print(f"{s} index size: {index_size} bytes")
- total_size += index_size
- leaves_size = len(table[1]) * len(table[1][0])
- print(f"{s} leaves size: {leaves_size} bytes")
- total_size += leaves_size
-
- for s, table in [
- ("Text presentation", text_presentation_table),
- ("Emoji modifier", emoji_modifier_table),
- ]:
- index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 16)
- print(f"{s} index size: {index_size} bytes")
- total_size += index_size
- leaves_size = 2 * sum(map(len, table[1]))
- print(f"{s} leaves size: {leaves_size} bytes")
- total_size += leaves_size
-
- for s, table in [
- ("Non transparent zero width", non_transparent_zero_widths),
- ("Solidus transparent", solidus_transparent),
- ]:
- table_size = 6 * len(table)
- print(f"{s} table size: {table_size} bytes")
- total_size += table_size
- print("------------------------")
- print(f" Total size: {total_size} bytes")
-
- emit_module(
- out_name=module_path,
- unicode_version=version,
- tables=tables,
- special_ranges=special_ranges,
- special_ranges_cjk=cjk_special_ranges,
- emoji_presentation_table=emoji_presentation_table,
- text_presentation_table=text_presentation_table,
- emoji_modifier_table=emoji_modifier_table,
- joining_group_lam=joining_group_lam,
- non_transparent_zero_widths=non_transparent_zero_widths,
- 
ligature_transparent=ligature_transparent, - solidus_transparent=solidus_transparent, - normalization_tests=normalization_tests, - ) - print(f'Wrote to "{module_path}"') - - -if __name__ == "__main__": - main(MODULE_PATH) -- cgit v1.2.3
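
For reference, the table layout the deleted script emits for `starts_emoji_presentation_seq` can be sketched in a few lines of Python: a match on the codepoint's high bits selects a 128-byte leaf, bits 3..=9 select a byte within it, and the 3 LSB select a bit. This is a minimal sketch, not part of the patch or the script itself; the index and leaf contents are hypothetical stand-ins for the data generated from emoji/emoji-variation-sequences.txt, with a single entry marked for U+1F600.

# Minimal sketch of the emitted two-level bitmap lookup (hypothetical data).
EMOJI_PRESENTATION_INDEX = {0x1F600 >> 10: 0}  # high bits (cp >> 10) -> leaf number
EMOJI_PRESENTATION_LEAVES = [bytearray(128)]   # each leaf is a 1024-bit bitmap

# Hypothetically mark U+1F600 as starting an emoji presentation sequence.
cp = 0x1F600
EMOJI_PRESENTATION_LEAVES[0][(cp >> 3) & 0x7F] |= 1 << (cp & 0x7)

def starts_emoji_presentation_seq(cp: int) -> bool:
    leaf_idx = EMOJI_PRESENTATION_INDEX.get(cp >> 10)  # first level: all but 10 LSB
    if leaf_idx is None:
        return False
    leaf = EMOJI_PRESENTATION_LEAVES[leaf_idx]
    leaf_byte = leaf[(cp >> 3) & 0x7F]                 # bits 3..=9 pick a byte in the leaf
    return (leaf_byte >> (cp & 0x7)) & 1 == 1          # 3 LSB pick a bit in that byte

assert starts_emoji_presentation_seq(0x1F600)
assert not starts_emoji_presentation_seq(0x0041)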
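
The text presentation and `Emoji_Modifier_Base` tables use range-based leaves instead of bitmaps: a match on the codepoint's high bits (cp >> 8) selects a sorted list of inclusive (lo, hi) ranges over the low byte, which is then binary-searched. A comparable sketch follows, again with a hypothetical index and a single-leaf subset (U+1F466..U+1F469 really are Emoji_Modifier_Base characters).

from bisect import bisect

# Minimal sketch of the emitted range-leaf lookup (hypothetical subset of the data).
EMOJI_MODIFIER_INDEX = {0x1F4: 0}         # high bits (cp >> 8) -> leaf number
EMOJI_MODIFIER_LEAVES = [[(0x66, 0x69)]]  # sorted inclusive ranges of the low byte

def is_emoji_modifier_base(cp: int) -> bool:
    leaf_idx = EMOJI_MODIFIER_INDEX.get(cp >> 8)  # first level: all but the 8 LSB
    if leaf_idx is None:
        return False
    leaf = EMOJI_MODIFIER_LEAVES[leaf_idx]
    lo_byte = cp & 0xFF
    i = bisect(leaf, (lo_byte, 0xFF))             # binary search over (lo, hi) pairs
    return i > 0 and leaf[i - 1][0] <= lo_byte <= leaf[i - 1][1]

assert is_emoji_modifier_base(0x1F466)      # U+1F466 BOY is a modifier base
assert not is_emoji_modifier_base(0x1F4A9)  # U+1F4A9 PILE OF POO is not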