Diffstat (limited to 'vendor/unicode-width/scripts/unicode.py')
| -rwxr-xr-x | vendor/unicode-width/scripts/unicode.py | 2156 |
1 file changed, 2156 insertions, 0 deletions
diff --git a/vendor/unicode-width/scripts/unicode.py b/vendor/unicode-width/scripts/unicode.py
new file mode 100755
index 00000000..320da14e
--- /dev/null
+++ b/vendor/unicode-width/scripts/unicode.py
@@ -0,0 +1,2156 @@
+#!/usr/bin/env python3
+#
+# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
+# file at the top-level directory of this distribution and at
+# http://rust-lang.org/COPYRIGHT.
+#
+# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+# option. This file may not be copied, modified, or distributed
+# except according to those terms.
+
+# This script uses the following Unicode tables:
+#
+# - DerivedCoreProperties.txt
+# - EastAsianWidth.txt
+# - HangulSyllableType.txt
+# - NormalizationTest.txt (for tests only)
+# - PropList.txt
+# - ReadMe.txt
+# - UnicodeData.txt
+# - auxiliary/GraphemeBreakProperty.txt
+# - emoji/emoji-data.txt
+# - emoji/emoji-variation-sequences.txt
+# - extracted/DerivedGeneralCategory.txt
+#
+# Since this should not require frequent updates, we just store this
+# out-of-line and check the generated module into git.
+
+import enum
+import math
+import operator
+import os
+import re
+import sys
+import urllib.request
+from collections import defaultdict
+from itertools import batched
+from typing import Callable, Iterable
+
+UNICODE_VERSION = "15.1.0"
+"""The version of the Unicode data files to download."""
+
+NUM_CODEPOINTS = 0x110000
+"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""
+
+MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
+"""The maximum number of bits required to represent a Unicode codepoint."""
+
+
+class OffsetType(enum.IntEnum):
+    """Represents the data type of a lookup table's offsets. Each variant's value represents the
+    number of bits required to represent that variant's type."""
+
+    U2 = 2
+    """Offsets are 2-bit unsigned integers, packed four-per-byte."""
+    U4 = 4
+    """Offsets are 4-bit unsigned integers, packed two-per-byte."""
+    U8 = 8
+    """Each offset is a single byte (u8)."""
+
+
+MODULE_PATH = "../src/tables.rs"
+"""The path of the emitted Rust module (relative to the working directory)"""
+
+TABLE_SPLITS = [7, 13]
+"""The splits between the bits of the codepoint used to index each subtable.
+Adjust these values to change the sizes of the subtables"""
+
+Codepoint = int
+BitPos = int
+
+
+def fetch_open(filename: str, local_prefix: str = "", emoji: bool = False):
+    """Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
+    fetches it from `https://www.unicode.org/Public/`. Exits with code 1 on failure.
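+
+    Example (mirroring `load_unicode_version` below):
+
+        with fetch_open("ReadMe.txt") as readme:
+            text = readme.read()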
+ """ + basename = os.path.basename(filename) + localname = os.path.join(local_prefix, basename) + if not os.path.exists(localname): + if emoji: + prefix = f"emoji/{UNICODE_VERSION[:-2]}" + else: + prefix = f"{UNICODE_VERSION}/ucd" + urllib.request.urlretrieve( + f"https://www.unicode.org/Public/{prefix}/{filename}", + localname, + ) + try: + return open(localname, encoding="utf-8") + except OSError: + sys.stderr.write(f"cannot load {localname}") + sys.exit(1) + + +def load_unicode_version() -> tuple[int, int, int]: + """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" + with fetch_open("ReadMe.txt") as readme: + pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" + return tuple(map(int, re.search(pattern, readme.read()).groups())) # type: ignore + + +def load_property(filename: str, pattern: str, action: Callable[[int], None]): + with fetch_open(filename) as properties: + single = re.compile(rf"^([0-9A-F]+)\s*;\s*{pattern}\s+") + multiple = re.compile(rf"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*{pattern}\s+") + + for line in properties.readlines(): + raw_data = None # (low, high) + if match := single.match(line): + raw_data = (match.group(1), match.group(1)) + elif match := multiple.match(line): + raw_data = (match.group(1), match.group(2)) + else: + continue + low = int(raw_data[0], 16) + high = int(raw_data[1], 16) + for cp in range(low, high + 1): + action(cp) + + +def to_sorted_ranges(iter: Iterable[Codepoint]) -> list[tuple[Codepoint, Codepoint]]: + "Creates a sorted list of ranges from an iterable of codepoints" + lst = [c for c in iter] + lst.sort() + ret = [] + for cp in lst: + if len(ret) > 0 and ret[-1][1] == cp - 1: + ret[-1] = (ret[-1][0], cp) + else: + ret.append((cp, cp)) + return ret + + +class EastAsianWidth(enum.IntEnum): + """Represents the width of a Unicode character according to UAX 16. + All East Asian Width classes resolve into either + `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`. + """ + + NARROW = 1 + """ One column wide. """ + WIDE = 2 + """ Two columns wide. """ + AMBIGUOUS = 3 + """ Two columns wide in a CJK context. One column wide in all other contexts. """ + + +class CharWidthInTable(enum.IntEnum): + """Represents the width of a Unicode character + as stored in the tables.""" + + ZERO = 0 + ONE = 1 + TWO = 2 + SPECIAL = 3 + + +class WidthState(enum.IntEnum): + """ + Width calculation proceeds according to a state machine. + We iterate over the characters of the string from back to front; + the next character encountered determines the transition to take. + + The integer values of these variants have special meaning: + - Top bit: whether this is Vs16 + - 2nd from top: whether this is Vs15 + - 3rd bit from top: whether this is transparent to emoji/text presentation + (if set, should also set 4th) + - 4th bit: whether to set top bit on emoji presentation. + If this is set but 3rd is not, the width mode is related to zwj sequences + - 5th from top: whether this is unaffected by ligature-transparent + - 6th bit: if 4th is set but this one is not, then this is a ZWJ ligature state + where no ZWJ has been encountered yet; encountering one flips this on""" + + # BASIC WIDTHS + + ZERO = 0x1_0000 + "Zero columns wide." + + NARROW = 0x1_0001 + "One column wide." + + WIDE = 0x1_0002 + "Two columns wide." + + THREE = 0x1_0003 + "Three columns wide." 
+
+    # \r\n
+    LINE_FEED = 0b0000_0000_0000_0001
+    "\\n (CRLF has width 1)"
+
+    # EMOJI
+
+    # Emoji skintone modifiers
+    EMOJI_MODIFIER = 0b0000_0000_0000_0010
+    "`Emoji_Modifier`"
+
+    # Emoji ZWJ sequences
+
+    REGIONAL_INDICATOR = 0b0000_0000_0000_0011
+    "`Regional_Indicator`"
+
+    SEVERAL_REGIONAL_INDICATOR = 0b0000_0000_0000_0100
+    "At least two `Regional_Indicator` in sequence"
+
+    EMOJI_PRESENTATION = 0b0000_0000_0000_0101
+    "`Emoji_Presentation`"
+
+    ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0110
+    "\\u200D `Emoji_Presentation`"
+
+    VS16_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0110
+    "\\uFE0F \\u200D `Emoji_Presentation`"
+
+    KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b0001_0000_0000_0111
+    "\\u20E3 \\u200D `Emoji_Presentation`"
+
+    VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION = 0b1001_0000_0000_0111
+    "\\uFE0F \\u20E3 \\u200D `Emoji_Presentation`"
+
+    REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1001
+    "`Regional_Indicator` \\u200D `Emoji_Presentation`"
+
+    EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1010
+    "(`Regional_Indicator` `Regional_Indicator`)+ \\u200D `Emoji_Presentation`"
+
+    ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION = 0b0000_0000_0000_1011
+    "(`Regional_Indicator` `Regional_Indicator`)+ `Regional_Indicator` \\u200D `Emoji_Presentation`"
+
+    TAG_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0000
+    "\\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_D1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0001
+    "\\uE0030..=\\uE0039 \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_D2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0010
+    "(\\uE0030..=\\uE0039){2} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_D3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_0011
+    "(\\uE0030..=\\uE0039){3} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A1_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1001
+    "\\uE0061..=\\uE007A \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A2_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1010
+    "(\\uE0061..=\\uE007A){2} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A3_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1011
+    "(\\uE0061..=\\uE007A){3} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A4_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1100
+    "(\\uE0061..=\\uE007A){4} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A5_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1101
+    "(\\uE0061..=\\uE007A){5} \\uE007F \\u200D `Emoji_Presentation`"
+
+    TAG_A6_END_ZWJ_EMOJI_PRESENTATION = 0b0000_0000_0001_1110
+    "(\\uE0061..=\\uE007A){6} \\uE007F \\u200D `Emoji_Presentation`"
+
+    # VARIATION SELECTORS
+
+    # Text presentation sequences (not CJK)
+    VARIATION_SELECTOR_15 = 0b0100_0000_0000_0000
+    "\\uFE0E (text presentation sequences)"
+
+    # Emoji presentation sequences
+    VARIATION_SELECTOR_16 = 0b1000_0000_0000_0000
+    "\\uFE0F (emoji presentation sequences)"
+
+    # ARABIC LAM ALEF
+
+    JOINING_GROUP_ALEF = 0b0011_0000_1111_1111
+    "Joining_Group=Alef (Arabic Lam-Alef ligature)"
+
+    # COMBINING SOLIDUS (CJK only)
+
+    COMBINING_LONG_SOLIDUS_OVERLAY = 0b0011_1100_1111_1111
+    "\\u0338 (CJK only, makes <, =, > width 2)"
+
+    # SOLIDUS + ALEF (solidus is Joining_Type=Transparent)
+    SOLIDUS_OVERLAY_ALEF = 0b0011_1000_1111_1111
+    "\\u0338 followed by Joining_Group=Alef"
+
+    # SCRIPT ZWJ LIGATURES
+
+    # Hebrew alef lamed
+
+    HEBREW_LETTER_LAMED = 0b0011_1000_0000_0000
+    "\\u05DC (Alef-ZWJ-Lamed ligature)"
+
+    ZWJ_HEBREW_LETTER_LAMED = 0b0011_1100_0000_0000
+    "\\u200D\\u05DC (Alef-ZWJ-Lamed ligature)"
+
+    # Buginese <a -i> ya
+
+    BUGINESE_LETTER_YA = 0b0011_1000_0000_0001
+    "\\u1A10 (<a, -i> + ya ligature)"
ligature)" + + ZWJ_BUGINESE_LETTER_YA = 0b0011_1100_0000_0001 + "\\u200D\\u1A10 (<a, -i> + ya ligature)" + + BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA = 0b0011_1100_0000_0010 + "\\u1A17\\u200D\\u1A10 (<a, -i> + ya ligature)" + + # Tifinagh bi-consonants + + TIFINAGH_CONSONANT = 0b0011_1000_0000_0011 + "\\u2D31..=\\u2D65 or \\u2D6F (joined by ZWJ or \\u2D7F TIFINAGH CONSONANT JOINER)" + + ZWJ_TIFINAGH_CONSONANT = 0b0011_1100_0000_0011 + "ZWJ then \\u2D31..=\\u2D65 or \\u2D6F" + + TIFINAGH_JOINER_CONSONANT = 0b0011_1100_0000_0100 + "\\u2D7F then \\u2D31..=\\u2D65 or \\u2D6F" + + # Lisu tone letters + LISU_TONE_LETTER_MYA_NA_JEU = 0b0011_1100_0000_0101 + "\\uA4FC or \\uA4FD (https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G42078)" + + # Old Turkic orkhon ec - orkhon i + + OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1000_0000_0110 + "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" + + ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110 + "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" + + # Khmer coeng signs + + KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111 + "\\u1780..=\\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF" + + def table_width(self) -> CharWidthInTable: + "The width of a character as stored in the lookup tables." + match self: + case WidthState.ZERO: + return CharWidthInTable.ZERO + case WidthState.NARROW: + return CharWidthInTable.ONE + case WidthState.WIDE: + return CharWidthInTable.TWO + case _: + return CharWidthInTable.SPECIAL + + def is_carried(self) -> bool: + "Whether this corresponds to a non-default `WidthInfo`." + return int(self) <= 0xFFFF + + def width_alone(self) -> int: + "The width of a character with this type when it appears alone." + match self: + case ( + WidthState.ZERO + | WidthState.COMBINING_LONG_SOLIDUS_OVERLAY + | WidthState.VARIATION_SELECTOR_15 + | WidthState.VARIATION_SELECTOR_16 + ): + return 0 + case ( + WidthState.WIDE + | WidthState.EMOJI_MODIFIER + | WidthState.EMOJI_PRESENTATION + ): + return 2 + case WidthState.THREE: + return 3 + case _: + return 1 + + def is_cjk_only(self) -> bool: + return self in [ + WidthState.COMBINING_LONG_SOLIDUS_OVERLAY, + WidthState.SOLIDUS_OVERLAY_ALEF, + ] + + def is_non_cjk_only(self) -> bool: + return self == WidthState.VARIATION_SELECTOR_15 + + +assert len(set([v.value for v in WidthState])) == len([v.value for v in WidthState]) + + +def load_east_asian_widths() -> list[EastAsianWidth]: + """Return a list of effective widths, indexed by codepoint. + Widths are determined by fetching and parsing `EastAsianWidth.txt`. + + `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. + + `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. + + `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`.""" + + with fetch_open("EastAsianWidth.txt") as eaw: + # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." + single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)") + # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." 
+        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
+        # map between width category code and condensed width
+        width_codes = {
+            **{c: EastAsianWidth.NARROW for c in ["N", "Na", "H"]},
+            **{c: EastAsianWidth.WIDE for c in ["W", "F"]},
+            "A": EastAsianWidth.AMBIGUOUS,
+        }
+
+        width_map = []
+        current = 0
+        for line in eaw.readlines():
+            raw_data = None  # (low, high, width)
+            if match := single.match(line):
+                raw_data = (match.group(1), match.group(1), match.group(2))
+            elif match := multiple.match(line):
+                raw_data = (match.group(1), match.group(2), match.group(3))
+            else:
+                continue
+            low = int(raw_data[0], 16)
+            high = int(raw_data[1], 16)
+            width = width_codes[raw_data[2]]
+
+            assert current <= high
+            while current <= high:
+                # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
+                # All such codepoints are implicitly given Neutral width (resolves to narrow)
+                width_map.append(EastAsianWidth.NARROW if current < low else width)
+                current += 1
+
+        while len(width_map) < NUM_CODEPOINTS:
+            # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
+            width_map.append(EastAsianWidth.NARROW)
+
+    # Ambiguous `Letter`s and `Modifier_Symbol`s are narrow
+    load_property(
+        "extracted/DerivedGeneralCategory.txt",
+        r"(?:Lu|Ll|Lt|Lm|Lo|Sk)",
+        lambda cp: (
+            operator.setitem(width_map, cp, EastAsianWidth.NARROW)
+            if width_map[cp] == EastAsianWidth.AMBIGUOUS
+            else None
+        ),
+    )
+
+    # GREEK ANO TELEIA: NFC decomposes to U+00B7 MIDDLE DOT
+    width_map[0x0387] = EastAsianWidth.AMBIGUOUS
+
+    # Canonical equivalence for symbols with stroke
+    with fetch_open("UnicodeData.txt") as udata:
+        single = re.compile(r"([0-9A-F]+);.*?;.*?;.*?;.*?;([0-9A-F]+) 0338;")
+        for line in udata.readlines():
+            if match := single.match(line):
+                composed = int(match.group(1), 16)
+                decomposed = int(match.group(2), 16)
+                if width_map[decomposed] == EastAsianWidth.AMBIGUOUS:
+                    width_map[composed] = EastAsianWidth.AMBIGUOUS
+
+    return width_map
+
+
+def load_zero_widths() -> list[bool]:
+    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
+    character. `c` is considered a zero-width character if
+
+    - it has the `Default_Ignorable_Code_Point` property (determined from `DerivedCoreProperties.txt`),
+    - or if it has the `Grapheme_Extend` property (determined from `DerivedCoreProperties.txt`),
+    - or if it is one of eight characters that should be `Grapheme_Extend` but aren't due to a Unicode spec bug,
+    - or if it has a `Hangul_Syllable_Type` of `Vowel_Jamo` or `Trailing_Jamo` (determined from `HangulSyllableType.txt`).
+    """
+
+    zw_map = [False] * NUM_CODEPOINTS
+
+    # `Default_Ignorable_Code_Point`s also have 0 width:
+    # https://www.unicode.org/faq/unsup_char.html#3
+    # https://www.unicode.org/versions/Unicode15.1.0/ch05.pdf#G40095
+    #
+    # `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
+    # as well as a few `Mc` characters that need to be included so that
+    # canonically equivalent sequences have the same width.
+    load_property(
+        "DerivedCoreProperties.txt",
+        r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
+
+    # Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
+    # as they canonically decompose to two characters with this property,
+    # but they aren't.
+    for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
+        zw_map[c] = True
+
+    # Treat `Hangul_Syllable_Type`s of `Vowel_Jamo` and `Trailing_Jamo`
+    # as zero-width. This matches the behavior of glibc `wcwidth`.
+    #
+    # Decomposed Hangul characters consist of 3 parts: a `Leading_Jamo`,
+    # a `Vowel_Jamo`, and an optional `Trailing_Jamo`. Together these combine
+    # into a single wide grapheme. So we treat vowel and trailing jamo as
+    # 0-width, such that only the width of the leading jamo is counted
+    # and the resulting grapheme has width 2.
+    #
+    # (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
+    load_property(
+        "HangulSyllableType.txt",
+        r"(?:V|T)",
+        lambda cp: operator.setitem(zw_map, cp, True),
+    )
+
+    # Syriac abbreviation mark:
+    # Zero-width `Prepended_Concatenation_Mark`
+    zw_map[0x070F] = True
+
+    # Some Arabic `Prepended_Concatenation_Mark`s
+    # https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G27820
+    zw_map[0x0605] = True
+    zw_map[0x0890] = True
+    zw_map[0x0891] = True
+    zw_map[0x08E2] = True
+
+    # `[:Grapheme_Cluster_Break=Prepend:]-[:Prepended_Concatenation_Mark:]`
+    gcb_prepend = set()
+    load_property(
+        "auxiliary/GraphemeBreakProperty.txt",
+        "Prepend",
+        lambda cp: gcb_prepend.add(cp),
+    )
+    load_property(
+        "PropList.txt",
+        "Prepended_Concatenation_Mark",
+        lambda cp: gcb_prepend.remove(cp),
+    )
+    for cp in gcb_prepend:
+        zw_map[cp] = True
+
+    # HANGUL CHOSEONG FILLER
+    # U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
+    # zero width. However, the expected usage is to combine it with vowel or trailing jamo
+    # (which are considered 0-width on their own) to form a composed Hangul syllable with
+    # width 2. Therefore, we treat it as having width 2.
+    zw_map[0x115F] = False
+
+    # TIFINAGH CONSONANT JOINER
+    # (invisible only when used to join two Tifinagh consonants)
+    zw_map[0x2D7F] = False
+
+    # DEVANAGARI CARET
+    # https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
+    zw_map[0xA8FA] = True
+
+    return zw_map
+
+
+def load_width_maps() -> tuple[list[WidthState], list[WidthState]]:
+    """Load complete width table, including characters needing special handling.
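+    The tables differ in how ambiguous-width characters are resolved; the East
+    Asian table is the one the generated module consults under its "cjk" feature.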
+    (Returns 2 tables, one for East Asian and one for not.)"""
+
+    eaws = load_east_asian_widths()
+    zws = load_zero_widths()
+
+    not_ea = []
+    ea = []
+
+    for eaw, zw in zip(eaws, zws):
+        if zw:
+            not_ea.append(WidthState.ZERO)
+            ea.append(WidthState.ZERO)
+        else:
+            if eaw == EastAsianWidth.WIDE:
+                not_ea.append(WidthState.WIDE)
+            else:
+                not_ea.append(WidthState.NARROW)
+
+            if eaw == EastAsianWidth.NARROW:
+                ea.append(WidthState.NARROW)
+            else:
+                ea.append(WidthState.WIDE)
+
+    # Joining_Group=Alef (Arabic Lam-Alef ligature)
+    alef_joining = []
+    load_property(
+        "extracted/DerivedJoiningGroup.txt",
+        "Alef",
+        lambda cp: alef_joining.append(cp),
+    )
+
+    # Regional indicators
+    regional_indicators = []
+    load_property(
+        "PropList.txt",
+        "Regional_Indicator",
+        lambda cp: regional_indicators.append(cp),
+    )
+
+    # Emoji modifiers
+    emoji_modifiers = []
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Modifier",
+        lambda cp: emoji_modifiers.append(cp),
+    )
+
+    # Default emoji presentation (for ZWJ sequences)
+    emoji_presentation = []
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Presentation",
+        lambda cp: emoji_presentation.append(cp),
+    )
+
+    for cps, width in [
+        ([0x0A], WidthState.LINE_FEED),
+        ([0x05DC], WidthState.HEBREW_LETTER_LAMED),
+        (alef_joining, WidthState.JOINING_GROUP_ALEF),
+        (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER),
+        (
+            [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF],
+            WidthState.KHMER_COENG_ELIGIBLE_LETTER,
+        ),
+        ([0x17A4], WidthState.WIDE),
+        ([0x17D8], WidthState.THREE),
+        ([0x1A10], WidthState.BUGINESE_LETTER_YA),
+        (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT),
+        ([0x2D6F], WidthState.TIFINAGH_CONSONANT),
+        ([0xA4FC], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
+        ([0xA4FD], WidthState.LISU_TONE_LETTER_MYA_NA_JEU),
+        ([0xFE0F], WidthState.VARIATION_SELECTOR_16),
+        ([0x10C03], WidthState.OLD_TURKIC_LETTER_ORKHON_I),
+        (emoji_presentation, WidthState.EMOJI_PRESENTATION),
+        (emoji_modifiers, WidthState.EMOJI_MODIFIER),
+        (regional_indicators, WidthState.REGIONAL_INDICATOR),
+    ]:
+        for cp in cps:
+            not_ea[cp] = width
+            ea[cp] = width
+
+    # East-Asian only
+    ea[0x0338] = WidthState.COMBINING_LONG_SOLIDUS_OVERLAY
+
+    # Not East Asian only
+    not_ea[0xFE0E] = WidthState.VARIATION_SELECTOR_15
+
+    return (not_ea, ea)
+
+
+def load_joining_group_lam() -> list[tuple[Codepoint, Codepoint]]:
+    "Returns a list of character ranges with Joining_Group=Lam"
+    lam_joining = []
+    load_property(
+        "extracted/DerivedJoiningGroup.txt",
+        "Lam",
+        lambda cp: lam_joining.append(cp),
+    )
+
+    return to_sorted_ranges(lam_joining)
+
+
+def load_non_transparent_zero_widths(
+    width_map: list[WidthState],
+) -> list[tuple[Codepoint, Codepoint]]:
+    "Returns a list of characters with zero width but not 'Joining_Type=Transparent'"
+
+    zero_widths = set()
+    for cp, width in enumerate(width_map):
+        if width.width_alone() == 0:
+            zero_widths.add(cp)
+    transparent = set()
+    load_property(
+        "extracted/DerivedJoiningType.txt",
+        "T",
+        lambda cp: transparent.add(cp),
+    )
+
+    return to_sorted_ranges(zero_widths - transparent)
+
+
+def load_ligature_transparent() -> list[tuple[Codepoint, Codepoint]]:
+    """Returns a list of character ranges corresponding to all combining marks that are also
+    `Default_Ignorable_Code_Point`s, plus ZWJ. This is the set of characters that won't interrupt
+    a ligature."""
+    default_ignorables = set()
+    load_property(
+        "DerivedCoreProperties.txt",
+        "Default_Ignorable_Code_Point",
+        lambda cp: default_ignorables.add(cp),
+    )
+
+    combining_marks = set()
+    load_property(
+        "extracted/DerivedGeneralCategory.txt",
+        "(?:Mc|Mn|Me)",
+        lambda cp: combining_marks.add(cp),
+    )
+
+    default_ignorable_combinings = default_ignorables.intersection(combining_marks)
+    default_ignorable_combinings.add(0x200D)  # ZWJ
+
+    return to_sorted_ranges(default_ignorable_combinings)
+
+
+def load_solidus_transparent(
+    ligature_transparents: list[tuple[Codepoint, Codepoint]],
+    cjk_width_map: list[WidthState],
+) -> list[tuple[Codepoint, Codepoint]]:
+    """Characters expanding to a canonical combining class above 1, plus
+    `ligature_transparent`s from above. Ranges matching ones in `ligature_transparent`
+    exactly are excluded (for compression), so that table needs to be checked as well.
+    """
+
+    ccc_above_1 = set()
+    load_property(
+        "extracted/DerivedCombiningClass.txt",
+        "(?:[2-9]|(?:[1-9][0-9]+))",
+        lambda cp: ccc_above_1.add(cp),
+    )
+
+    for lo, hi in ligature_transparents:
+        for cp in range(lo, hi + 1):
+            ccc_above_1.add(cp)
+
+    num_chars = len(ccc_above_1)
+
+    # Recursive decompositions
+    while True:
+        with fetch_open("UnicodeData.txt") as udata:
+            single = re.compile(r"([0-9A-F]+);.*?;.*?;.*?;.*?;([0-9A-F ]+);")
+            for line in udata.readlines():
+                if match := single.match(line):
+                    composed = int(match.group(1), 16)
+                    decomposed = [int(c, 16) for c in match.group(2).split(" ")]
+                    if all([c in ccc_above_1 for c in decomposed]):
+                        ccc_above_1.add(composed)
+        if len(ccc_above_1) == num_chars:
+            break
+        else:
+            num_chars = len(ccc_above_1)
+
+    for cp in ccc_above_1:
+        if cp != 0xFE0F:
+            assert (
+                cjk_width_map[cp].table_width() != CharWidthInTable.SPECIAL
+            ), f"U+{cp:X}"
+
+    sorted = to_sorted_ranges(ccc_above_1)
+    return list(filter(lambda range: range not in ligature_transparents, sorted))
+
+
+def load_normalization_tests() -> list[tuple[str, str, str, str, str]]:
+    def parse_codepoints(cps: str) -> str:
+        return "".join(map(lambda cp: chr(int(cp, 16)), cps.split(" ")))
+
+    with fetch_open("NormalizationTest.txt") as normtests:
+        ret = []
+        single = re.compile(
+            r"^([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);([0-9A-F ]+);"
+        )
+        for line in normtests.readlines():
+            if match := single.match(line):
+                ret.append(
+                    (
+                        parse_codepoints(match.group(1)),
+                        parse_codepoints(match.group(2)),
+                        parse_codepoints(match.group(3)),
+                        parse_codepoints(match.group(4)),
+                        parse_codepoints(match.group(5)),
+                    )
+                )
+        return ret
+
+
+def make_special_ranges(
+    width_map: list[WidthState],
+) -> list[tuple[tuple[Codepoint, Codepoint], WidthState]]:
+    "Assign ranges of characters to their special behavior (used in match)"
+    ret = []
+    can_merge_with_prev = False
+    for cp, width in enumerate(width_map):
+        if width == WidthState.EMOJI_PRESENTATION:
+            can_merge_with_prev = False
+        elif width.table_width() == CharWidthInTable.SPECIAL:
+            if can_merge_with_prev and ret[-1][1] == width:
+                ret[-1] = ((ret[-1][0][0], cp), width)
+            else:
+                ret.append(((cp, cp), width))
+            can_merge_with_prev = True
+    return ret
+
+
+class Bucket:
+    """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
+    list is a prefix of another's width list, those buckets can be merged via `try_extend`.
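+
+    Illustrative sketch (assumed usage; mirrors what `make_buckets` below does):
+
+        b1 = Bucket()
+        b1.append(0x2460, CharWidthInTable.ONE)
+        b2 = Bucket()
+        b2.append(0x2461, CharWidthInTable.ONE)
+        assert b1.try_extend(b2)  # identical width lists, so the buckets merge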
+ """ + + def __init__(self): + """Creates an empty bucket.""" + self.entry_set = set() + self.widths = [] + + def append(self, codepoint: Codepoint, width: CharWidthInTable): + """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" + self.entry_set.add((codepoint, width)) + self.widths.append(width) + + def try_extend(self, attempt: "Bucket") -> bool: + """If either `self` or `attempt`'s width list starts with the other bucket's width list, + set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints + into `self`, and return `True`. Otherwise, return `False`.""" + (less, more) = (self.widths, attempt.widths) + if len(self.widths) > len(attempt.widths): + (less, more) = (attempt.widths, self.widths) + if less != more[: len(less)]: + return False + self.entry_set |= attempt.entry_set + self.widths = more + return True + + def entries(self) -> list[tuple[Codepoint, CharWidthInTable]]: + """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" + result = list(self.entry_set) + result.sort() + return result + + def width(self) -> CharWidthInTable | None: + """If all codepoints in this bucket have the same width, return that width; otherwise, + return `None`.""" + if len(self.widths) == 0: + return None + potential_width = self.widths[0] + for width in self.widths[1:]: + if potential_width != width: + return None + return potential_width + + +def make_buckets( + entries: Iterable[tuple[int, CharWidthInTable]], low_bit: BitPos, cap_bit: BitPos +) -> list[Bucket]: + """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All + codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the + same bucket. Returns a list of the buckets in increasing order of those bits.""" + num_bits = cap_bit - low_bit + assert num_bits > 0 + buckets = [Bucket() for _ in range(0, 2**num_bits)] + mask = (1 << num_bits) - 1 + for codepoint, width in entries: + buckets[(codepoint >> low_bit) & mask].append(codepoint, width) + return buckets + + +class Table: + """Represents a lookup table. Each table contains a certain number of subtables; each + subtable is indexed by a contiguous bit range of the codepoint and contains a list + of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) + + Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should + be indexed by sub-table `i` in the next-level lookup table. The entries of this table are + indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The + key to compression is that two different buckets in two different sub-tables may have the + same width list, which means that they can be merged into the same bucket. + + If no bucket contains two codepoints with different widths, calling `indices_to_widths` will + discard the buckets and convert the entries into `EffectiveWidth` values.""" + + def __init__( + self, + name: str, + entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], + secondary_entry_groups: Iterable[Iterable[tuple[int, CharWidthInTable]]], + low_bit: BitPos, + cap_bit: BitPos, + offset_type: OffsetType, + align: int, + bytes_per_row: int | None = None, + starting_indexed: list[Bucket] = [], + cfged: bool = False, + ): + """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator + in `entry_groups`. 
+        Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, and each table
+        entry is represented in the format specified by `offset_type`. Asserts that this
+        table is actually representable with `offset_type`."""
+        starting_indexed_len = len(starting_indexed)
+        self.name = name
+        self.low_bit = low_bit
+        self.cap_bit = cap_bit
+        self.offset_type = offset_type
+        self.entries: list[int] = []
+        self.indexed: list[Bucket] = list(starting_indexed)
+        self.align = align
+        self.bytes_per_row = bytes_per_row
+        self.cfged = cfged
+
+        buckets: list[Bucket] = []
+        for entries in entry_groups:
+            buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))
+
+        for bucket in buckets:
+            for i, existing in enumerate(self.indexed):
+                if existing.try_extend(bucket):
+                    self.entries.append(i)
+                    break
+            else:
+                self.entries.append(len(self.indexed))
+                self.indexed.append(bucket)
+
+        self.primary_len = len(self.entries)
+        self.primary_bucket_len = len(self.indexed)
+
+        buckets = []
+        for entries in secondary_entry_groups:
+            buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))
+
+        for bucket in buckets:
+            for i, existing in enumerate(self.indexed):
+                if existing.try_extend(bucket):
+                    self.entries.append(i)
+                    break
+            else:
+                self.entries.append(len(self.indexed))
+                self.indexed.append(bucket)
+
+        # Validate offset type
+        max_index = 1 << int(self.offset_type)
+        for index in self.entries:
+            assert index < max_index, f"{index} < {max_index}"
+
+        self.indexed = self.indexed[starting_indexed_len:]
+
+    def indices_to_widths(self):
+        """Destructively converts the indices in this table to the `EffectiveWidth` values of
+        their buckets. Assumes that no bucket contains codepoints with different widths.
+        """
+        self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))  # type: ignore
+        del self.indexed
+
+    def buckets(self):
+        """Returns an iterator over this table's buckets."""
+        return self.indexed
+
+    def to_bytes(self) -> list[int]:
+        """Returns this table's entries as a list of bytes. The bytes are formatted according to
+        the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
+        to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
+        contain four packed 2-bit entries."""
+        entries_per_byte = 8 // int(self.offset_type)
+        byte_array = []
+        for i in range(0, len(self.entries), entries_per_byte):
+            byte = 0
+            for j in range(0, entries_per_byte):
+                byte |= self.entries[i + j] << (j * int(self.offset_type))
+            byte_array.append(byte)
+        return byte_array
+
+
+def make_tables(
+    width_map: list[WidthState],
+    cjk_width_map: list[WidthState],
+) -> list[Table]:
+    """Creates the multi-level lookup tables: the first returned table is the
+    top-level (root) table, the second is its CJK variant, and the remaining
+    two are the shared middle and leaf tables.
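+    With the default `TABLE_SPLITS = [7, 13]` (an illustration derived from the
+    constants above), a codepoint `cp` is decomposed as `cp >> 13` (root index),
+    `(cp >> 7) & 0x3F` (middle index), and `cp & 0x7F` (leaf index).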
+    `width_map` and `cjk_width_map` assign a width to every codepoint in the
+    non-East-Asian and East Asian contexts, respectively."""
+
+    entries = enumerate([w.table_width() for w in width_map])
+    cjk_entries = enumerate([w.table_width() for w in cjk_width_map])
+
+    root_table = Table(
+        "WIDTH_ROOT",
+        [entries],
+        [],
+        TABLE_SPLITS[1],
+        MAX_CODEPOINT_BITS,
+        OffsetType.U8,
+        128,
+    )
+
+    cjk_root_table = Table(
+        "WIDTH_ROOT_CJK",
+        [cjk_entries],
+        [],
+        TABLE_SPLITS[1],
+        MAX_CODEPOINT_BITS,
+        OffsetType.U8,
+        128,
+        starting_indexed=root_table.indexed,
+        cfged=True,
+    )
+
+    middle_table = Table(
+        "WIDTH_MIDDLE",
+        map(lambda bucket: bucket.entries(), root_table.buckets()),
+        map(lambda bucket: bucket.entries(), cjk_root_table.buckets()),
+        TABLE_SPLITS[0],
+        TABLE_SPLITS[1],
+        OffsetType.U8,
+        2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]),
+        bytes_per_row=2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]),
+    )
+
+    leaves_table = Table(
+        "WIDTH_LEAVES",
+        map(
+            lambda bucket: bucket.entries(),
+            middle_table.buckets()[: middle_table.primary_bucket_len],
+        ),
+        map(
+            lambda bucket: bucket.entries(),
+            middle_table.buckets()[middle_table.primary_bucket_len :],
+        ),
+        0,
+        TABLE_SPLITS[0],
+        OffsetType.U2,
+        2 ** (TABLE_SPLITS[0] - 2),
+        bytes_per_row=2 ** (TABLE_SPLITS[0] - 2),
+    )
+
+    return [root_table, cjk_root_table, middle_table, leaves_table]
+
+
+def load_emoji_presentation_sequences() -> list[Codepoint]:
+    """Outputs a list of codepoints, corresponding to all the valid characters for starting
+    an emoji presentation sequence."""
+
+    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
+        # Match all emoji presentation sequences
+        # (one codepoint followed by U+FE0F, and labeled "emoji style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s*emoji style")
+        codepoints = []
+        for line in sequences.readlines():
+            if match := sequence.match(line):
+                cp = int(match.group(1), 16)
+                codepoints.append(cp)
+        return codepoints
+
+
+def load_text_presentation_sequences() -> list[Codepoint]:
+    """Outputs a list of codepoints, corresponding to all the valid characters
+    whose widths change with a text presentation sequence."""
+
+    text_presentation_seq_codepoints = set()
+    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
+        # Match all text presentation sequences
+        # (one codepoint followed by U+FE0E, and labeled "text style")
+        sequence = re.compile(r"^([0-9A-F]+)\s+FE0E\s*;\s*text style")
+        for line in sequences.readlines():
+            if match := sequence.match(line):
+                cp = int(match.group(1), 16)
+                text_presentation_seq_codepoints.add(cp)
+
+    default_emoji_codepoints = set()
+
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Presentation",
+        lambda cp: default_emoji_codepoints.add(cp),
+    )
+
+    codepoints = []
+    for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
+        # "Enclosed Ideographic Supplement" block;
+        # wide even in text presentation
+        if cp not in range(0x1F200, 0x1F300):
+            codepoints.append(cp)
+
+    codepoints.sort()
+    return codepoints
+
+
+def load_emoji_modifier_bases() -> list[Codepoint]:
+    """Outputs a list of codepoints, corresponding to all the valid characters
+    with the `Emoji_Modifier_Base` property."""
+
+    ret = []
+    load_property(
+        "emoji/emoji-data.txt",
+        "Emoji_Modifier_Base",
+        lambda cp: ret.append(cp),
+    )
+    ret.sort()
+    return ret
+
+
+def make_presentation_sequence_table(
+    seqs: list[Codepoint],
+    lsb: int = 10,
+) -> tuple[list[tuple[int, int]], list[list[int]]]:
+    """Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
+    The first level is a match on all but the 10 LSB, the second level is a
+    1024-bit bitmap for those 10 LSB.
+    """
+
+    prefixes_dict = defaultdict(set)
+    for cp in seqs:
+        prefixes_dict[cp >> lsb].add(cp & (2**lsb - 1))
+
+    msbs: list[int] = list(prefixes_dict.keys())
+
+    leaves: list[list[int]] = []
+    for cps in prefixes_dict.values():
+        leaf = [0] * (2 ** (lsb - 3))
+        for cp in cps:
+            idx_in_leaf, bit_shift = divmod(cp, 8)
+            leaf[idx_in_leaf] |= 1 << bit_shift
+        leaves.append(leaf)
+
+    indexes = [(msb, index) for (index, msb) in enumerate(msbs)]
+
+    # Cull duplicate leaves
+    i = 0
+    while i < len(leaves):
+        first_idx = leaves.index(leaves[i])
+        if first_idx == i:
+            i += 1
+        else:
+            for j in range(0, len(indexes)):
+                if indexes[j][1] == i:
+                    indexes[j] = (indexes[j][0], first_idx)
+                elif indexes[j][1] > i:
+                    indexes[j] = (indexes[j][0], indexes[j][1] - 1)
+
+            leaves.pop(i)
+
+    return (indexes, leaves)
+
+
+def make_ranges_table(
+    seqs: list[Codepoint],
+) -> tuple[list[tuple[int, int]], list[list[tuple[int, int]]]]:
+    """Generates 2-level lookup table for a binary property of a codepoint.
+    First level is all but the last byte, second level is ranges for last byte
+    """
+
+    prefixes_dict = defaultdict(list)
+    for cp in seqs:
+        prefixes_dict[cp >> 8].append(cp & 0xFF)
+
+    msbs: list[int] = list(prefixes_dict.keys())
+
+    leaves: list[list[tuple[int, int]]] = []
+    for cps in prefixes_dict.values():
+        leaf = []
+        for cp in cps:
+            if len(leaf) > 0 and leaf[-1][1] == cp - 1:
+                leaf[-1] = (leaf[-1][0], cp)
+            else:
+                leaf.append((cp, cp))
+        leaves.append(leaf)
+
+    indexes = [(msb, index) for (index, msb) in enumerate(msbs)]
+
+    # Cull duplicate leaves
+    i = 0
+    while i < len(leaves):
+        first_idx = leaves.index(leaves[i])
+        if first_idx == i:
+            i += 1
+        else:
+            for j in range(0, len(indexes)):
+                if indexes[j][1] == i:
+                    indexes[j] = (indexes[j][0], first_idx)
+                elif indexes[j][1] > i:
+                    indexes[j] = (indexes[j][0], indexes[j][1] - 1)
+
+            leaves.pop(i)
+
+    return (indexes, leaves)
+
+
+def lookup_fns(
+    is_cjk: bool,
+    special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
+    joining_group_lam: list[tuple[Codepoint, Codepoint]],
+) -> str:
+    if is_cjk:
+        cfg = '#[cfg(feature = "cjk")]\n'
+        cjk_lo = "_cjk"
+        cjk_cap = "_CJK"
+        ambig = "wide"
+    else:
+        cfg = ""
+        cjk_lo = ""
+        cjk_cap = ""
+        ambig = "narrow"
+    s = f"""
+/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
+/// consulting a multi-level lookup table.
+///
+/// # Maintenance
+/// The tables themselves are autogenerated but this function is hardcoded. You should have
+/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
+/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
+/// `make_tables` function in `unicode.py`) you must ensure that this code reflects those changes.
+{cfg}#[inline]
+fn lookup_width{cjk_lo}(c: char) -> (u8, WidthInfo) {{
+    let cp = c as usize;
+
+    let t1_offset = WIDTH_ROOT{cjk_cap}.0[cp >> {TABLE_SPLITS[1]}];
+
+    // Each sub-table in WIDTH_MIDDLE is 6 bits, and each stored entry is a byte,
+    // so each sub-table is 64 bytes in size.
+    // (Sub-tables are selected using the computed offset from the previous table.)
+    let t2_offset = WIDTH_MIDDLE.0[usize::from(t1_offset)][cp >> {TABLE_SPLITS[0]} & 0x{(2 ** (TABLE_SPLITS[1] - TABLE_SPLITS[0]) - 1):X}];
+
+    // Each sub-table in WIDTH_LEAVES is 7 bits, but each stored entry is 2 bits.
+    // This is accomplished by packing four stored entries into one byte.
+    // So each sub-table is 2**(7-2) == 32 bytes in size.
+    // Since this is the last table, each entry represents an encoded width.
+    let packed_widths = WIDTH_LEAVES.0[usize::from(t2_offset)][cp >> 2 & 0x{(2 ** (TABLE_SPLITS[0] - 2) - 1):X}];
+
+    // Extract the packed width
+    let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
+
+    if width < 3 {{
+        (width, WidthInfo::DEFAULT)
+    }} else {{
+        match c {{
+"""
+
+    for (lo, hi), width in special_ranges:
+        s += f"            '\\u{{{lo:X}}}'"
+        if hi != lo:
+            s += f"..='\\u{{{hi:X}}}'"
+        if width.is_carried():
+            width_info = width.name
+        else:
+            width_info = "DEFAULT"
+        s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n"
+
+    s += f"""            _ => (2, WidthInfo::EMOJI_PRESENTATION),
+        }}
+    }}
+}}
+
+/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
+/// `None` if `c` is a control character.
+/// Ambiguous width characters are treated as {ambig}.
+{cfg}#[inline]
+pub fn single_char_width{cjk_lo}(c: char) -> Option<usize> {{
+    if c < '\\u{{7F}}' {{
+        if c >= '\\u{{20}}' {{
+            // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
+            Some(1)
+        }} else {{
+            // U+0000 to U+0020 (exclusive) are control codes
+            None
+        }}
+    }} else if c >= '\\u{{A0}}' {{
+        // No characters >= U+00A0 are control codes, so we can consult the lookup tables
+        Some(lookup_width{cjk_lo}(c).0.into())
+    }} else {{
+        // U+007F to U+00A0 (exclusive) are control codes
+        None
+    }}
+}}
+
+/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`.
+/// Ambiguous width characters are treated as {ambig}.
+{cfg}#[inline]
+fn width_in_str{cjk_lo}(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) {{
+    if next_info.is_emoji_presentation() {{
+        if starts_emoji_presentation_seq(c) {{
+            let width = if next_info.is_zwj_emoji_presentation() {{
+                0
+            }} else {{
+                2
+            }};
+            return (width, WidthInfo::EMOJI_PRESENTATION);
+        }} else {{
+            next_info = next_info.unset_emoji_presentation();
+        }}
+    }}"""
+
+    if is_cjk:
+        s += """
+    if (matches!(
+        next_info,
+        WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY | WidthInfo::SOLIDUS_OVERLAY_ALEF
+    ) && matches!(c, '<' | '=' | '>'))
+    {
+        return (2, WidthInfo::DEFAULT);
+    }"""
+
+    s += """
+    if c <= '\\u{A0}' {
+        match c {
+            // According to the spec, LF should be width 1, which is how it is often rendered
+            // when it is forced to have a single-line rendering.
+            // However, this makes it harder to use this crate to calculate line breaks,
+            // and breaks assumptions of downstream crates.
+            // https://github.com/unicode-rs/unicode-width/issues/60
+            '\\n' => (0, WidthInfo::LINE_FEED),
+            '\\r' if next_info == WidthInfo::LINE_FEED => (0, WidthInfo::DEFAULT),
+            _ => (1, WidthInfo::DEFAULT),
+        }
+    } else {
+        // Fast path
+        if next_info != WidthInfo::DEFAULT {
+            if c == '\\u{FE0F}' {
+                return (0, next_info.set_emoji_presentation());
+            }"""
+
+    if not is_cjk:
+        s += """
+            if c == '\\u{FE0E}' {
+                return (0, next_info.set_text_presentation());
+            }
+            if next_info.is_text_presentation() {
+                if starts_non_ideographic_text_presentation_seq(c) {
+                    return (1, WidthInfo::DEFAULT);
+                } else {
+                    next_info = next_info.unset_text_presentation();
+                }
+            }"""
+
+    s += """
+            if next_info.is_ligature_transparent() {
+                if c == '\\u{200D}' {
+                    return (0, next_info.set_zwj_bit());
+                } else if is_ligature_transparent(c) {
+                    return (0, next_info);
+                }
+            }
+
+            match (next_info, c) {"""
+    if is_cjk:
+        s += """
+                (WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY, _) if is_solidus_transparent(c) => {
+                    return (
+                        lookup_width_cjk(c).0 as i8,
+                        WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY,
+                    );
+                }
+                (WidthInfo::JOINING_GROUP_ALEF, '\\u{0338}') => {
+                    return (0, WidthInfo::SOLIDUS_OVERLAY_ALEF);
+                }
+                // Arabic Lam-Alef ligature
+                (
+                    WidthInfo::JOINING_GROUP_ALEF | WidthInfo::SOLIDUS_OVERLAY_ALEF,
+                    """
+    else:
+        s += """
+                // Arabic Lam-Alef ligature
+                (
+                    WidthInfo::JOINING_GROUP_ALEF,
+                    """
+
+    tail = False
+    for lo, hi in joining_group_lam:
+        if tail:
+            s += " | "
+        tail = True
+        s += f"'\\u{{{lo:X}}}'"
+        if hi != lo:
+            s += f"..='\\u{{{hi:X}}}'"
+    s += """,
+                ) => return (0, WidthInfo::DEFAULT),
+                (WidthInfo::JOINING_GROUP_ALEF, _) if is_transparent_zero_width(c) => {
+                    return (0, WidthInfo::JOINING_GROUP_ALEF);
+                }
+
+                // Hebrew Alef-ZWJ-Lamed ligature
+                (WidthInfo::ZWJ_HEBREW_LETTER_LAMED, '\\u{05D0}') => {
+                    return (0, WidthInfo::DEFAULT);
+                }
+
+                // Khmer coeng signs
+                (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => {
+                    return (-1, WidthInfo::DEFAULT);
+                }
+
+                // Buginese <a, -i> ZWJ ya ligature
+                (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => {
+                    return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA)
+                }
+                (WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA, '\\u{1A15}') => {
+                    return (0, WidthInfo::DEFAULT)
+                }
+
+                // Tifinagh bi-consonants
+                (WidthInfo::TIFINAGH_CONSONANT | WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D7F}') => {
+                    return (1, WidthInfo::TIFINAGH_JOINER_CONSONANT);
+                }
+                (WidthInfo::ZWJ_TIFINAGH_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
+                    return (0, WidthInfo::DEFAULT);
+                }
+                (WidthInfo::TIFINAGH_JOINER_CONSONANT, '\\u{2D31}'..='\\u{2D65}' | '\\u{2D6F}') => {
+                    return (-1, WidthInfo::DEFAULT);
+                }
+
+                // Lisu tone letter combinations
+                (WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, '\\u{A4F8}'..='\\u{A4FB}') => {
+                    return (0, WidthInfo::DEFAULT);
+                }
+
+                // Old Turkic ligature
+                (WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, '\\u{10C32}') => {
+                    return (0, WidthInfo::DEFAULT);
+                }"""
+
+    s += f"""
+                // Emoji modifier
+                (WidthInfo::EMOJI_MODIFIER, _) if is_emoji_modifier_base(c) => {{
+                    return (0, WidthInfo::EMOJI_PRESENTATION);
+                }}
+
+                // Regional indicator
+                (
+                    WidthInfo::REGIONAL_INDICATOR | WidthInfo::SEVERAL_REGIONAL_INDICATOR,
+                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
+                ) => return (1, WidthInfo::SEVERAL_REGIONAL_INDICATOR),
+
+                // ZWJ emoji
+                (
+                    WidthInfo::EMOJI_PRESENTATION
+                    | WidthInfo::SEVERAL_REGIONAL_INDICATOR
+                    | WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION
+                    | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION
+                    | WidthInfo::EMOJI_MODIFIER,
+                    '\\u{{200D}}',
+                ) => return (0, WidthInfo::ZWJ_EMOJI_PRESENTATION),
+                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{20E3}}') => {{
+                    return (0, WidthInfo::KEYCAP_ZWJ_EMOJI_PRESENTATION);
+                }}
+                (WidthInfo::VS16_ZWJ_EMOJI_PRESENTATION, _) if starts_emoji_presentation_seq(c) => {{
+                    return (0, WidthInfo::EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::VS16_KEYCAP_ZWJ_EMOJI_PRESENTATION, '0'..='9' | '#' | '*') => {{
+                    return (0, WidthInfo::EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F1E6}}'..='\\u{{1F1FF}}') => {{
+                    return (1, WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION);
+                }}
+                (
+                    WidthInfo::REGIONAL_INDICATOR_ZWJ_PRESENTATION
+                    | WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
+                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
+                ) => return (-1, WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
+                (
+                    WidthInfo::EVEN_REGIONAL_INDICATOR_ZWJ_PRESENTATION,
+                    '\\u{{1F1E6}}'..='\\u{{1F1FF}}',
+                ) => return (3, WidthInfo::ODD_REGIONAL_INDICATOR_ZWJ_PRESENTATION),
+                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{1F3FB}}'..='\\u{{1F3FF}}') => {{
+                    return (0, WidthInfo::EMOJI_MODIFIER);
+                }}
+                (WidthInfo::ZWJ_EMOJI_PRESENTATION, '\\u{{E007F}}') => {{
+                    return (0, WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION);
+                }}
+                (WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION);
+                }}
+                (WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION)
+                }}
+                (WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0061}}'..='\\u{{E007A}}') => {{
+                    return (0, WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION)
+                }}
+                (
+                    WidthInfo::TAG_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A1_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A2_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION,
+                    '\\u{{E0030}}'..='\\u{{E0039}}',
+                ) => return (0, WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION),
+                (WidthInfo::TAG_D1_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
+                    return (0, WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION);
+                }}
+                (WidthInfo::TAG_D2_END_ZWJ_EMOJI_PRESENTATION, '\\u{{E0030}}'..='\\u{{E0039}}') => {{
+                    return (0, WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION);
+                }}
+                (
+                    WidthInfo::TAG_A3_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A4_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A5_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_A6_END_ZWJ_EMOJI_PRESENTATION
+                    | WidthInfo::TAG_D3_END_ZWJ_EMOJI_PRESENTATION,
+                    '\\u{{1F3F4}}',
+                ) => return (0, WidthInfo::EMOJI_PRESENTATION),
+                (WidthInfo::ZWJ_EMOJI_PRESENTATION, _)
+                    if lookup_width{cjk_lo}(c).1 == WidthInfo::EMOJI_PRESENTATION =>
+                {{
+                    return (0, WidthInfo::EMOJI_PRESENTATION)
+                }}
+
+                // Fallback
+                _ => {{}}
+            }}
+        }}
+
+        let ret = lookup_width{cjk_lo}(c);
+        (ret.0 as i8, ret.1)
+    }}
+}}
+
+{cfg}#[inline]
+pub fn str_width{cjk_lo}(s: &str) -> usize {{
+    s.chars()
+        .rfold(
+            (0, WidthInfo::DEFAULT),
+            |(sum, next_info), c| -> (usize, WidthInfo) {{
+                let (add, info) = width_in_str{cjk_lo}(c, next_info);
+                (sum.wrapping_add_signed(isize::from(add)), info)
+            }},
+        )
+        .0
+}}
+"""
+
+    return s
+
+
+def emit_module(
+    out_name: str,
+    unicode_version: tuple[int, int, int],
+    tables: list[Table],
+    special_ranges: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
+    special_ranges_cjk: list[tuple[tuple[Codepoint, Codepoint], WidthState]],
+    emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
+    text_presentation_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]],
+    emoji_modifier_table: tuple[list[tuple[int, int]], list[list[tuple[int, int]]]],
+    joining_group_lam: list[tuple[Codepoint, Codepoint]],
+    non_transparent_zero_widths: list[tuple[Codepoint, Codepoint]],
+    ligature_transparent: list[tuple[Codepoint, Codepoint]],
+    solidus_transparent: list[tuple[Codepoint, Codepoint]],
+    normalization_tests: list[tuple[str, str, str, str, str]],
+):
+    """Outputs a Rust module to `out_name` using table data from `tables`.
+    If `TABLE_SPLITS` is edited, you may need to edit the included code for `lookup_width`.
+    """
+    if os.path.exists(out_name):
+        os.remove(out_name)
+    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
+        module.write(
+            """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
+// file at the top-level directory of this distribution and at
+// http://rust-lang.org/COPYRIGHT.
+//
+// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
+// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
+// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
+// option. This file may not be copied, modified, or distributed
+// except according to those terms.
+
+// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
+
+use core::cmp::Ordering;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+struct WidthInfo(u16);
+
+impl WidthInfo {
+    /// No special handling necessary
+    const DEFAULT: Self = Self(0);
+"""
+        )
+
+        for variant in WidthState:
+            if variant.is_carried():
+                if variant.is_cjk_only():
+                    module.write('    #[cfg(feature = "cjk")]\n')
+                module.write(
+                    f"    const {variant.name}: Self = Self(0b{variant.value:016b});\n"
+                )
+
+        module.write(
+            f"""
+    /// Whether this width mode is ligature_transparent
+    /// (has 5th MSB set.)
+    fn is_ligature_transparent(self) -> bool {{
+        (self.0 & 0b0000_1000_0000_0000) == 0b0000_1000_0000_0000
+    }}
+
+    /// Sets 6th MSB.
+    fn set_zwj_bit(self) -> Self {{
+        Self(self.0 | 0b0000_0100_0000_0000)
+    }}
+
+    /// Has top bit set
+    fn is_emoji_presentation(self) -> bool {{
+        (self.0 & 0b1000_0000_0000_0000) == 0b1000_0000_0000_0000
+    }}
+
+    /// Has top bit set, and is a ZWJ mode (4th bit set, 3rd clear)
+    fn is_zwj_emoji_presentation(self) -> bool {{
+        (self.0 & 0b1011_0000_0000_0000) == 0b1001_0000_0000_0000
+    }}
+
+    /// Set top bit
+    fn set_emoji_presentation(self) -> Self {{
+        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000
+            || (self.0 & 0b1001_0000_0000_0000) == 0b0001_0000_0000_0000
+        {{
+            Self(self.0 | 0b1000_0000_0000_0000)
+        }} else {{
+            Self::VARIATION_SELECTOR_16
+        }}
+    }}
+
+    /// Clear top bit
+    fn unset_emoji_presentation(self) -> Self {{
+        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
+            Self(self.0 & 0b0111_1111_1111_1111)
+        }} else {{
+            Self::DEFAULT
+        }}
+    }}
+
+    /// Has 2nd bit set
+    fn is_text_presentation(self) -> bool {{
+        (self.0 & 0b0100_0000_0000_0000) == 0b0100_0000_0000_0000
+    }}
+
+    /// Set 2nd bit
+    fn set_text_presentation(self) -> Self {{
+        if (self.0 & 0b0010_0000_0000_0000) == 0b0010_0000_0000_0000 {{
+            Self(self.0 | 0b0100_0000_0000_0000)
+        }} else {{
+            Self(0b0100_0000_0000_0000)
+        }}
+    }}
+
+    /// Clear 2nd bit
+    fn unset_text_presentation(self) -> Self {{
+        Self(self.0 & 0b1011_1111_1111_1111)
+    }}
+}}
+
+/// The version of [Unicode](http://www.unicode.org/)
+/// that this version of unicode-width is based on.
+pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
+"""
+        )
+
+        module.write(lookup_fns(False, special_ranges, joining_group_lam))
+        module.write(lookup_fns(True, special_ranges_cjk, joining_group_lam))
+
+        emoji_presentation_idx, emoji_presentation_leaves = emoji_presentation_table
+        text_presentation_idx, text_presentation_leaves = text_presentation_table
+        emoji_modifier_idx, emoji_modifier_leaves = emoji_modifier_table
+
+        module.write(
+            """
+/// Whether this character is a zero-width character with
+/// `Joining_Type=Transparent`. Used by the Alef-Lamed ligatures.
+/// See also [`is_ligature_transparent`], a near-subset of this (only ZWJ is excepted)
+/// which is transparent for non-Arabic ligatures.
+fn is_transparent_zero_width(c: char) -> bool {
+    if lookup_width(c).0 != 0 {
+        // Not zero-width
+        false
+    } else {
+        let cp: u32 = c.into();
+        NON_TRANSPARENT_ZERO_WIDTHS
+            .binary_search_by(|&(lo, hi)| {
+                let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]);
+                let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]);
+                if cp < lo {
+                    Ordering::Greater
+                } else if cp > hi {
+                    Ordering::Less
+                } else {
+                    Ordering::Equal
+                }
+            })
+            .is_err()
+    }
+}
+
+/// Whether this character is a default-ignorable combining mark
+/// or ZWJ. These characters won't interrupt non-Arabic ligatures.
+fn is_ligature_transparent(c: char) -> bool {
+    matches!(c, """
+        )
+
+        tail = False
+        for lo, hi in ligature_transparent:
+            if tail:
+                module.write(" | ")
+            tail = True
+            module.write(f"'\\u{{{lo:X}}}'")
+            if hi != lo:
+                module.write(f"..='\\u{{{hi:X}}}'")
+
+        module.write(
+            """)
+}
+
+/// Whether this character is transparent wrt the effect of
+/// U+0338 COMBINING LONG SOLIDUS OVERLAY
+/// on its base character.
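+/// (Illustrative example, per the table-generation notes: U+003D '=' followed
+/// by U+0338 forms '≠', which the CJK tables treat as two columns wide.)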
+#[cfg(feature = "cjk")] +fn is_solidus_transparent(c: char) -> bool { + let cp: u32 = c.into(); + is_ligature_transparent(c) + || SOLIDUS_TRANSPARENT + .binary_search_by(|&(lo, hi)| { + let lo = u32::from_le_bytes([lo[0], lo[1], lo[2], 0]); + let hi = u32::from_le_bytes([hi[0], hi[1], hi[2], 0]); + if cp < lo { + Ordering::Greater + } else if cp > hi { + Ordering::Less + } else { + Ordering::Equal + } + }) + .is_ok() +} + +/// Whether this character forms an [emoji presentation sequence] +/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence) +/// when followed by `'\\u{FEOF}'`. +/// Emoji presentation sequences are considered to have width 2. +#[inline] +pub fn starts_emoji_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + // First level of lookup uses all but 10 LSB + let top_bits = cp >> 10; + let idx_of_leaf: usize = match top_bits { +""" + ) + + for msbs, i in emoji_presentation_idx: + module.write(f" 0x{msbs:X} => {i},\n") + + module.write( + """ _ => return false, + }; + // Extract the 3-9th (0-indexed) least significant bits of `cp`, + // and use them to index into `leaf_row`. + let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap(); + let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf]; + // Use the 3 LSB of `cp` to index into `leaf_byte`. + ((leaf_byte >> (cp & 7)) & 1) == 1 +} + +/// Returns `true` if `c` has default emoji presentation, but forms a [text presentation sequence] +/// (https://www.unicode.org/reports/tr51/#def_text_presentation_sequence) +/// when followed by `'\\u{FEOE}'`, and is not ideographic. +/// Such sequences are considered to have width 1. +#[inline] +pub fn starts_non_ideographic_text_presentation_seq(c: char) -> bool { + let cp: u32 = c.into(); + // First level of lookup uses all but 8 LSB + let top_bits = cp >> 8; + let leaf: &[(u8, u8)] = match top_bits { +""" + ) + + for msbs, i in text_presentation_idx: + module.write(f" 0x{msbs:X} => &TEXT_PRESENTATION_LEAF_{i},\n") + + module.write( + """ _ => return false, + }; + + let bottom_bits = (cp & 0xFF) as u8; + leaf.binary_search_by(|&(lo, hi)| { + if bottom_bits < lo { + Ordering::Greater + } else if bottom_bits > hi { + Ordering::Less + } else { + Ordering::Equal + } + }) + .is_ok() +} + +/// Returns `true` if `c` is an `Emoji_Modifier_Base`. +#[inline] +pub fn is_emoji_modifier_base(c: char) -> bool { + let cp: u32 = c.into(); + // First level of lookup uses all but 8 LSB + let top_bits = cp >> 8; + let leaf: &[(u8, u8)] = match top_bits { +""" + ) + + for msbs, i in emoji_modifier_idx: + module.write(f" 0x{msbs:X} => &EMOJI_MODIFIER_LEAF_{i},\n") + + module.write( + """ _ => return false, + }; + + let bottom_bits = (cp & 0xFF) as u8; + leaf.binary_search_by(|&(lo, hi)| { + if bottom_bits < lo { + Ordering::Greater + } else if bottom_bits > hi { + Ordering::Less + } else { + Ordering::Equal + } + }) + .is_ok() +} + +#[repr(align(32))] +struct Align32<T>(T); + +#[repr(align(64))] +struct Align64<T>(T); + +#[repr(align(128))] +struct Align128<T>(T); +""" + ) + + subtable_count = 1 + for i, table in enumerate(tables): + new_subtable_count = len(table.buckets()) + if i == len(tables) - 1: + table.indices_to_widths() # for the last table, indices == widths + byte_array = table.to_bytes() + + if table.bytes_per_row is None: + module.write( + f"/// Autogenerated. {subtable_count} sub-table(s). 
+            )
+            if table.cfged:
+                module.write('#[cfg(feature = "cjk")]\n')
+            module.write(
+                f"static {table.name}: Align{table.align}<[u8; {len(byte_array)}]> = Align{table.align}(["
+            )
+            for j, byte in enumerate(byte_array):
+                # Add a line break before every 16th entry (chosen to match what rustfmt does)
+                if j % 16 == 0:
+                    module.write("\n   ")
+                module.write(f" 0x{byte:02X},")
+            module.write("\n]);\n")
+        else:
+            num_rows = len(byte_array) // table.bytes_per_row
+            num_primary_rows = (
+                table.primary_len
+                // (8 // int(table.offset_type))
+                // table.bytes_per_row
+            )
+            module.write(
+                f"""
+#[cfg(feature = "cjk")]
+const {table.name}_LEN: usize = {num_rows};
+#[cfg(not(feature = "cjk"))]
+const {table.name}_LEN: usize = {num_primary_rows};
+/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
+static {table.name}: Align{table.align}<[[u8; {table.bytes_per_row}]; {table.name}_LEN]> = Align{table.align}([\n"""
+            )
+            for row_num in range(0, num_rows):
+                if row_num >= num_primary_rows:
+                    module.write('    #[cfg(feature = "cjk")]\n')
+                module.write("    [\n")
+                row = byte_array[
+                    row_num * table.bytes_per_row : (row_num + 1) * table.bytes_per_row
+                ]
+                for subrow in batched(row, 15):
+                    module.write("       ")
+                    for entry in subrow:
+                        module.write(f" 0x{entry:02X},")
+                    module.write("\n")
+                module.write("    ],\n")
+            module.write("]);\n")
+        subtable_count = new_subtable_count
+
+    # non-transparent zero-width table
+
+    module.write(
+        f"""
+/// Sorted list of codepoint ranges (inclusive)
+/// that are zero-width but not `Joining_Type=Transparent`
+/// FIXME: can we get better compression?
+static NON_TRANSPARENT_ZERO_WIDTHS: [([u8; 3], [u8; 3]); {len(non_transparent_zero_widths)}] = [
+"""
+    )
+
+    for lo, hi in non_transparent_zero_widths:
+        module.write(
+            f"    ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
+        )
+
+    # solidus-transparent table
+
+    module.write(
+        f"""];
+
+/// Sorted list of codepoint ranges (inclusive)
+/// that don't affect how the combining solidus applies
+/// (mostly ccc > 1).
+/// FIXME: can we get better compression?
+#[cfg(feature = "cjk")]
+static SOLIDUS_TRANSPARENT: [([u8; 3], [u8; 3]); {len(solidus_transparent)}] = [
+"""
+    )
+
+    for lo, hi in solidus_transparent:
+        module.write(
+            f"    ([0x{lo & 0xFF:02X}, 0x{lo >> 8 & 0xFF:02X}, 0x{lo >> 16:02X}], [0x{hi & 0xFF:02X}, 0x{hi >> 8 & 0xFF:02X}, 0x{hi >> 16:02X}]),\n"
+        )
+
+    # emoji table
+
+    module.write(
+        f"""];
+
+/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
+/// to get whether it can start an emoji presentation sequence.
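+/// Each bitmap is stored least-significant-bit first: bit `cp & 7` of byte
+/// `(cp >> 3) & 0x7F` within the selected bitmap holds the answer for `cp`.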
+static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
+"""
+    )
+    for leaf in emoji_presentation_leaves:
+        module.write("    [\n")
+        for row in batched(leaf, 15):
+            module.write("       ")
+            for entry in row:
+                module.write(f" 0x{entry:02X},")
+            module.write("\n")
+        module.write("    ],\n")
+
+    module.write("]);\n")
+
+    # text table
+
+    for leaf_idx, leaf in enumerate(text_presentation_leaves):
+        module.write(
+            f"""
+#[rustfmt::skip]
+static TEXT_PRESENTATION_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
+"""
+        )
+        for lo, hi in leaf:
+            module.write(f"    (0x{lo:02X}, 0x{hi:02X}),\n")
+        module.write("];\n")
+
+    # emoji modifier table
+
+    for leaf_idx, leaf in enumerate(emoji_modifier_leaves):
+        module.write(
+            f"""
+#[rustfmt::skip]
+static EMOJI_MODIFIER_LEAF_{leaf_idx}: [(u8, u8); {len(leaf)}] = [
+"""
+        )
+        for lo, hi in leaf:
+            module.write(f"    (0x{lo:02X}, 0x{hi:02X}),\n")
+        module.write("];\n")
+
+    test_width_variants = []
+    test_width_variants_cjk = []
+    for variant in WidthState:
+        if variant.is_carried():
+            if not variant.is_cjk_only():
+                test_width_variants.append(variant)
+            if not variant.is_non_cjk_only():
+                test_width_variants_cjk.append(variant)
+
+    module.write(
+        f"""
+#[cfg(test)]
+mod tests {{
+    use super::*;
+
+    fn str_width_test(s: &str, init: WidthInfo) -> isize {{
+        s.chars()
+            .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
+                let (add, info) = width_in_str(c, next_info);
+                (sum.checked_add(isize::from(add)).unwrap(), info)
+            }})
+            .0
+    }}
+
+    #[cfg(feature = "cjk")]
+    fn str_width_test_cjk(s: &str, init: WidthInfo) -> isize {{
+        s.chars()
+            .rfold((0, init), |(sum, next_info), c| -> (isize, WidthInfo) {{
+                let (add, info) = width_in_str_cjk(c, next_info);
+                (sum.checked_add(isize::from(add)).unwrap(), info)
+            }})
+            .0
+    }}
+
+    #[test]
+    fn test_normalization() {{
+        for &(orig, nfc, nfd, nfkc, nfkd) in &NORMALIZATION_TEST {{
+            for init in NORMALIZATION_TEST_WIDTHS {{
+                assert_eq!(
+                    str_width_test(orig, init),
+                    str_width_test(nfc, init),
+                    "width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
+                );
+                assert_eq!(
+                    str_width_test(orig, init),
+                    str_width_test(nfd, init),
+                    "width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
+                );
+                assert_eq!(
+                    str_width_test(nfkc, init),
+                    str_width_test(nfkd, init),
+                    "width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}",
+                );
+            }}
+
+            #[cfg(feature = "cjk")]
+            for init in NORMALIZATION_TEST_WIDTHS_CJK {{
+                assert_eq!(
+                    str_width_test_cjk(orig, init),
+                    str_width_test_cjk(nfc, init),
+                    "CJK width of X = {{orig:?}} differs from toNFC(X) = {{nfc:?}} with mode {{init:X?}}",
+                );
+                assert_eq!(
+                    str_width_test_cjk(orig, init),
+                    str_width_test_cjk(nfd, init),
+                    "CJK width of X = {{orig:?}} differs from toNFD(X) = {{nfd:?}} with mode {{init:X?}}",
+                );
+                assert_eq!(
+                    str_width_test_cjk(nfkc, init),
+                    str_width_test_cjk(nfkd, init),
+                    "CJK width of toNFKC(X) = {{nfkc:?}} differs from toNFKD(X) = {{nfkd:?}} with mode {{init:X?}}",
+                );
+            }}
+        }}
+    }}
+
+    static NORMALIZATION_TEST_WIDTHS: [WidthInfo; {len(test_width_variants) + 1}] = [
+        WidthInfo::DEFAULT,\n"""
+    )
+
+    for variant in WidthState:
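+        # Keep this filter in sync with the `test_width_variants` collection above.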
+        if variant.is_carried() and not variant.is_cjk_only():
+            module.write(f"        WidthInfo::{variant.name},\n")
+
+    module.write(
+        f"""    ];
+
+    #[cfg(feature = "cjk")]
+    static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; {len(test_width_variants_cjk) + 1}] = [
+        WidthInfo::DEFAULT,\n"""
+    )
+
+    for variant in WidthState:
+        if variant.is_carried() and not variant.is_non_cjk_only():
+            module.write(f"        WidthInfo::{variant.name},\n")
+
+    module.write(
+        f"""    ];
+
+    #[rustfmt::skip]
+    static NORMALIZATION_TEST: [(&str, &str, &str, &str, &str); {len(normalization_tests)}] = [\n"""
+    )
+    for orig, nfc, nfd, nfkc, nfkd in normalization_tests:
+        module.write(
+            f'        (r#"{orig}"#, r#"{nfc}"#, r#"{nfd}"#, r#"{nfkc}"#, r#"{nfkd}"#),\n'
+        )
+
+    module.write("    ];\n}\n")
+
+
+def main(module_path: str):
+    """Obtain character data from the latest version of Unicode, transform it into a multi-level
+    lookup table for character width, and write a Rust module utilizing that table to
+    `module_path`.
+
+    See `lib.rs` for documentation of the exact width rules.
+    """
+    version = load_unicode_version()
+    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")
+
+    (width_map, cjk_width_map) = load_width_maps()
+
+    tables = make_tables(width_map, cjk_width_map)
+
+    special_ranges = make_special_ranges(width_map)
+    cjk_special_ranges = make_special_ranges(cjk_width_map)
+
+    emoji_presentations = load_emoji_presentation_sequences()
+    emoji_presentation_table = make_presentation_sequence_table(emoji_presentations)
+
+    text_presentations = load_text_presentation_sequences()
+    text_presentation_table = make_ranges_table(text_presentations)
+
+    emoji_modifier_bases = load_emoji_modifier_bases()
+    emoji_modifier_table = make_ranges_table(emoji_modifier_bases)
+
+    joining_group_lam = load_joining_group_lam()
+    non_transparent_zero_widths = load_non_transparent_zero_widths(width_map)
+    ligature_transparent = load_ligature_transparent()
+    solidus_transparent = load_solidus_transparent(ligature_transparent, cjk_width_map)
+
+    normalization_tests = load_normalization_tests()
+
+    # Fetched for the crate's integration tests; the return value is unused here.
+    fetch_open("emoji-test.txt", "../tests", emoji=True)
+
+    print("------------------------")
+    total_size = 0
+    for i, table in enumerate(tables):
+        size_bytes = len(table.to_bytes())
+        print(f"Table {i} size: {size_bytes} bytes")
+        total_size += size_bytes
+
+    for s, table in [
+        ("Emoji presentation", emoji_presentation_table),
+    ]:
+        index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 8)
+        print(f"{s} index size: {index_size} bytes")
+        total_size += index_size
+        leaves_size = len(table[1]) * len(table[1][0])
+        print(f"{s} leaves size: {leaves_size} bytes")
+        total_size += leaves_size
+
+    for s, table in [
+        ("Text presentation", text_presentation_table),
+        ("Emoji modifier", emoji_modifier_table),
+    ]:
+        index_size = len(table[0]) * (math.ceil(math.log(table[0][-1][0], 256)) + 16)
+        print(f"{s} index size: {index_size} bytes")
+        total_size += index_size
+        leaves_size = 2 * sum(map(len, table[1]))
+        print(f"{s} leaves size: {leaves_size} bytes")
+        total_size += leaves_size
+
+    for s, table in [
+        ("Non transparent zero width", non_transparent_zero_widths),
+        ("Solidus transparent", solidus_transparent),
+    ]:
+        table_size = 6 * len(table)
+        print(f"{s} table size: {table_size} bytes")
+        total_size += table_size
+    print("------------------------")
+    print(f"  Total size: {total_size} bytes")
+
+    emit_module(
+        out_name=module_path,
+        unicode_version=version,
+        tables=tables,
+        special_ranges=special_ranges,
+        special_ranges_cjk=cjk_special_ranges,
+        emoji_presentation_table=emoji_presentation_table,
+        text_presentation_table=text_presentation_table,
+        emoji_modifier_table=emoji_modifier_table,
+        joining_group_lam=joining_group_lam,
+        non_transparent_zero_widths=non_transparent_zero_widths,
+        ligature_transparent=ligature_transparent,
+        solidus_transparent=solidus_transparent,
+        normalization_tests=normalization_tests,
+    )
+    print(f'Wrote to "{module_path}"')
+
+
+if __name__ == "__main__":
+    main(MODULE_PATH)
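+
+
+# Illustrative sketch (not used by the generator): the 3-byte little-endian
+# packing emitted for `NON_TRANSPARENT_ZERO_WIDTHS` and `SOLIDUS_TRANSPARENT`
+# above is the inverse of the `u32::from_le_bytes([lo[0], lo[1], lo[2], 0])`
+# reassembly in the generated Rust. `_demo_pack_cp` and
+# `_demo_emoji_presentation` are hypothetical helpers added purely as
+# documentation; U+1D165 is an arbitrary example codepoint.
+def _demo_pack_cp(cp: Codepoint) -> list[int]:
+    """Split a codepoint into the three little-endian bytes stored in the tables."""
+    return [cp & 0xFF, (cp >> 8) & 0xFF, cp >> 16]
+
+
+def _demo_emoji_presentation(
+    cp: Codepoint, idx: list[tuple[int, int]], leaves: list
+) -> bool:
+    """Python mirror of the generated `starts_emoji_presentation_seq` lookup:
+    the bits above the 10 LSB pick a leaf, the next 7 bits a byte, the 3 LSB a bit."""
+    for msbs, i in idx:
+        if cp >> 10 == msbs:
+            return (leaves[i][(cp >> 3) & 0x7F] >> (cp & 7)) & 1 == 1
+    return False
+
+
+assert _demo_pack_cp(0x1D165) == [0x65, 0xD1, 0x01]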
