generate.py

#!/usr/bin/env python

""" Outputs the width file to stdout. """

import datetime
import hashlib
import os.path
import re
import sys

try:
    # Python 3: urlretrieve lives in urllib.request.
    from urllib.request import urlretrieve
except ImportError:
    # Python 2 fallback: urlretrieve is exposed directly by urllib.
    from urllib import urlretrieve
try:
    # Python 2 provides a builtin xrange; on Python 3 this bare reference
    # raises NameError and we fall through to the alias below.
    xrange
except NameError:
    # Python 3: range is already lazy, so use it under the old name.
    xrange = range

# Unicode release whose data files are downloaded.
VERSION = "14.0.0"
UNICODE_DATA_URL = 'https://unicode.org/Public/{}/ucd/UnicodeData.txt'.format(VERSION)
EAW_URL = 'https://unicode.org/Public/{}/ucd/EastAsianWidth.txt'.format(VERSION)
EMOJI_DATA_URL = 'https://unicode.org/Public/{}/ucd/emoji/emoji-data.txt'.format(VERSION)

# Indices of the semicolon-separated UnicodeData.txt fields used here.
# See https://www.unicode.org/L2/L1999/UnicodeData.html
FIELD_CODEPOINT, FIELD_NAME, FIELD_CATEGORY = 0, 1, 2

# General category given to codepoints that no data file assigns.
CAT_UNASSIGNED = "Cn"

# General category of private use codepoints.
CAT_PRIVATE_USE = "Co"

# General category of surrogates.
CAT_SURROGATE = "Cs"

# Pseudo-category for non-characters.
# This value never appears in UnicodeData.txt; it is only assigned by this script.
# See https://www.unicode.org/faq/private_use.html
CAT_NON_CHARACTERS = "non-characters"

# Upper bound of the codepoint table.
# NOTE(review): the last valid Unicode codepoint is U+10FFFF, so this value is
# one past it, and generate() allocates MAX_CODEPOINT + 1 entries -- confirm
# that including 0x110000 is intended.
MAX_CODEPOINT = 0x110000

# Prefix substituted for "{p}" in front of every generated identifier.
CPP_PREFIX = "widechar_"

# Names of the two generated files.
OUTPUT_FILENAME = "widechar_width.h"
OUTPUT_FILENAME_JS = "widechar_width.js"

# Opening and closing delimiter wrapped around each emitted range.
RANGE_CHARS = ("{", "}")

# Template for the generated C++ header.
# It is rendered with str.format(): "{p}" receives the identifier prefix, the
# remaining "{name}" placeholders receive the header metadata (filename, date,
# file hashes) and the pre-formatted range tables.  Because of str.format(),
# every literal C++ brace in the template is written doubled ("{{" / "}}").
OUTPUT_TEMPLATE = r"""
/**
 * {filename}, generated on {today}.
 * See https://github.com/ridiculousfish/widecharwidth/
 *
 * SHA1 file hashes:
 *  UnicodeData.txt:     {unicode_hash}
 *  EastAsianWidth.txt:  {eaw_hash}
 *  emoji-data.txt:      {emoji_hash}
 */

#ifndef WIDECHAR_WIDTH_H
#define WIDECHAR_WIDTH_H

#include <algorithm>
#include <iterator>
#include <cstddef>
#include <cstdint>

namespace {{

/* Special width values */
enum {{
  {p}nonprint = -1,     // The character is not printable.
  {p}combining = -2,    // The character is a zero-width combiner.
  {p}ambiguous = -3,    // The character is East-Asian ambiguous width.
  {p}private_use = -4,  // The character is for private use.
  {p}unassigned = -5,   // The character is unassigned.
  {p}widened_in_9 = -6, // Width is 1 in Unicode 8, 2 in Unicode 9+.
  {p}non_character = -7 // The character is a noncharacter.
}};

/* An inclusive range of characters. */
struct {p}range {{
  uint32_t lo;
  uint32_t hi;
}};

/* Simple ASCII characters - used a lot, so we check them first. */
static const struct {p}range {p}ascii_table[] = {{
    {ascii}
}};

/* Private usage range. */
static const struct {p}range {p}private_table[] = {{
    {private}
}};

/* Nonprinting characters. */
static const struct {p}range {p}nonprint_table[] = {{
    {nonprint}
}};

/* Width 0 combining marks. */
static const struct {p}range {p}combining_table[] = {{
    {combining}
}};

/* Width 2 characters. */
static const struct {p}range {p}doublewide_table[] = {{
    {doublewide}
}};

/* Ambiguous-width characters. */
static const struct {p}range {p}ambiguous_table[] = {{
    {ambiguous}
}};

/* Unassigned characters. */
static const struct {p}range {p}unassigned_table[] = {{
    {unassigned}
}};

/* Non-characters. */
static const struct {p}range {p}nonchar_table[] = {{
    {noncharacters}
}};

/* Characters that were widened from with 1 to 2 in Unicode 9. */
static const struct {p}range {p}widened_table[] = {{
    {widenedin9}
}};

template<typename Collection>
bool {p}in_table(const Collection &arr, uint32_t c) {{
    auto where = std::lower_bound(std::begin(arr), std::end(arr), c,
        []({p}range p, uint32_t c) {{ return p.hi < c; }});
    return where != std::end(arr) && where->lo <= c;
}}

/* Return the width of character c, or a special negative value. */
int {p}wcwidth(uint32_t c) {{
    if ({p}in_table({p}ascii_table, c))
        return 1;
    if ({p}in_table({p}private_table, c))
        return {p}private_use;
    if ({p}in_table({p}nonprint_table, c))
        return {p}nonprint;
    if ({p}in_table({p}nonchar_table, c))
        return {p}non_character;
    if ({p}in_table({p}combining_table, c))
        return {p}combining;
    if ({p}in_table({p}doublewide_table, c))
        return 2;
    if ({p}in_table({p}ambiguous_table, c))
        return {p}ambiguous;
    if ({p}in_table({p}unassigned_table, c))
        return {p}unassigned;
    if ({p}in_table({p}widened_table, c))
        return {p}widened_in_9;
    return 1;
}}

}} // namespace
#endif // WIDECHAR_WIDTH_H
"""

# Template for the generated JavaScript file.
# It is rendered with str.format(): "{p}" receives the identifier prefix, the
# remaining "{name}" placeholders receive the header metadata and the
# pre-formatted range tables.  Literal JavaScript braces are written doubled
# ("{{" / "}}") so that str.format() leaves them alone.
# Every table is declared as a plain `const name = [ ... ];` -- a C-style
# `name[]` declarator is a syntax error in JavaScript.
OUTPUT_TEMPLATE_JS = r"""
/*
 * {filename}, generated on {today}.
 * See https://github.com/ridiculousfish/widecharwidth/
 *
 * SHA1 file hashes:
 *  UnicodeData.txt:     {unicode_hash}
 *  EastAsianWidth.txt:  {eaw_hash}
 *  emoji-data.txt:      {emoji_hash}
 */

/* Special width values */
const {p}nonprint = -1;     // The character is not printable.
const {p}combining = -2;    // The character is a zero-width combiner.
const {p}ambiguous = -3;    // The character is East-Asian ambiguous width.
const {p}private_use = -4;  // The character is for private use.
const {p}unassigned = -5;   // The character is unassigned.
const {p}widened_in_9 = -6; // Width is 1 in Unicode 8, 2 in Unicode 9+.
const {p}non_character = -7; // The character is a noncharacter.

/* Simple ASCII characters - used a lot, so we check them first. */
const {p}ascii_table = [
    {ascii}
];

/* Private usage range. */
const {p}private_table = [
    {private}
];

/* Nonprinting characters. */
const {p}nonprint_table = [
    {nonprint}
];

/* Width 0 combining marks. */
const {p}combining_table = [
    {combining}
];

/* Width.2 characters. */
const {p}doublewide_table = [
    {doublewide}
];

/* Ambiguous-width characters. */
const {p}ambiguous_table = [
    {ambiguous}
];

/* Unassigned characters. */
const {p}unassigned_table = [
    {unassigned}
];

/* Non-characters. */
const {p}nonchar_table = [
    {noncharacters}
];

/* Characters that were widened from with 1 to 2 in Unicode 9. */
const {p}widened_table = [
    {widenedin9}
];

function {p}in_table(data, ucs) {{
    let min = 0;
    let max = data.length - 1;
    let mid;
    if (ucs < data[0][0] || ucs > data[max][1])
        return false;

    while (max >= min) {{
        mid = (min + max) >> 1;
        if (ucs > data[mid][1]) {{
            min = mid + 1;
        }}
        else if (ucs < data[mid][0]) {{
            max = mid - 1;
        }}
        else {{
            return true;
        }}
    }}
    return false;
}}

/* Return the width of character c, or a special negative value. */
function {p}wcwidth(c) {{
    if ({p}in_table({p}ascii_table, c))
        return 1;
    if ({p}in_table({p}private_table, c))
        return {p}private_use;
    if ({p}in_table({p}nonprint_table, c))
        return {p}nonprint;
    if ({p}in_table({p}nonchar_table, c))
        return {p}non_character;
    if ({p}in_table({p}combining_table, c))
        return {p}combining;
    if ({p}in_table({p}doublewide_table, c))
        return 2;
    if ({p}in_table({p}ambiguous_table, c))
        return {p}ambiguous;
    if ({p}in_table({p}unassigned_table, c))
        return {p}unassigned;
    if ({p}in_table({p}widened_table, c))
        return {p}widened_in_9;
    return 1;
}}
"""

# Marker values stored in CodePoint.width while the tables are built.
# -3 and -6 match the "ambiguous" and "widened_in_9" special values emitted by
# the output templates.

# Ambiguous East Asian characters
WIDTH_AMBIGUOUS_EASTASIAN = -3

# Width changed from 1 to 2 in Unicode 9.0
WIDTH_WIDENED_IN_9 = -6

# Private use characters.
# NOTE(review): this name is not referenced in the visible part of this file,
# and the output templates use -4 for private_use and -7 for non_character --
# confirm whether -7 is intentional here.
WIDTH_PRIVATE_USE = -7


class CodePoint(object):  # pylint: disable=too-few-public-methods
    """One Unicode codepoint plus the properties this script assigns to it.

    Instance fields:
      codepoint -- the value handed to the constructor.
      width     -- None until a width is assigned.
      category  -- starts out as CAT_UNASSIGNED.
    """

    def __init__(self, codepoint):
        # A fresh codepoint is unassigned and has no known width.
        self.category = CAT_UNASSIGNED
        self.width = None
        self.codepoint = codepoint

    def hex(self):
        """Format the codepoint as "0x" plus at least five uppercase hex digits."""
        return "0x{:05X}".format(self.codepoint)


def log(msg):
    """Write str(msg) and a trailing newline to standard error."""
    line = "".join((str(msg), "\n"))
    sys.stderr.write(line)


def read_datafile(url):
    """Load a data file, fetching it from url first when no local copy exists.

    The local file name is the last path component of url, relative to the
    current working directory.

    Returns a tuple (lines, sha1): lines is the UTF-8 decoded content split on
    "\\n" with every line that starts with "#" dropped, and sha1 is the hex
    SHA-1 digest of the raw file bytes.
    """
    local_name = url.rsplit("/", 1)[-1]
    have_local_copy = os.path.isfile(local_name)
    if not have_local_copy:
        log("Downloading " + local_name)
        urlretrieve(url, local_name)

    with open(local_name, "rb") as handle:
        raw = handle.read()

    # The digest covers the file exactly as stored, comments included.
    digest = hashlib.sha1(raw).hexdigest()

    kept = []
    for text_line in raw.decode("utf-8").split("\n"):
        if text_line.startswith("#"):
            continue
        kept.append(text_line)
    return (kept, digest)


def set_general_categories(unicode_data, cps):
    """Copy the general category from each UnicodeData.txt line onto cps.

    unicode_data is an iterable of lines; cps is indexed by codepoint.  Lines
    with too few semicolon-separated fields to hold a category are ignored.
    """
    for record in unicode_data:
        columns = record.strip().split(";")
        if len(columns) <= FIELD_CATEGORY:
            continue
        category = columns[FIELD_CATEGORY]
        for codepoint in hexrange_to_range(columns[FIELD_CODEPOINT]):
            cps[codepoint].category = category


def merged_codepoints(cps):
    """Collapse codepoints into a list of inclusive (first, last) runs.

    The input is sorted by its .codepoint value; each returned pair holds the
    first and last object of a run of consecutive codepoints.  An empty input
    gives an empty list.
    """
    if not cps:
        return []
    ordered = sorted(cps, key=lambda item: item.codepoint)
    head = ordered[0]
    spans = [(head, head)]
    for point in ordered[1:]:
        span_start, span_end = spans[-1]
        if point.codepoint != span_end.codepoint + 1:
            # Not adjacent to the current run: start a new one.
            spans.append((point, point))
        else:
            # Adjacent: stretch the current run to end at this codepoint.
            spans[-1] = (span_start, point)
    return spans


def gen_seps(length):
    """Yield the separator that follows each of length table entries.

    The last entry gets an empty separator.  An entry that completes a row of
    table_columns entries gets a comma plus a newline and indent; any other
    entry gets a comma and a space.
    """
    table_columns = 1
    position = 0
    while position < length:
        position += 1
        if position == length:
            yield ""
        elif position % table_columns:
            yield ", "
        else:
            yield ",\n    "


def codepoints_to_carray_str(cps):
    """Given a list of codepoints, return a C array string representing their inclusive ranges.

    Consecutive codepoints are merged with merged_codepoints().  Every range is
    rendered as ``<open>0xLO, 0xHI<close>``, where the open and close
    delimiters are the two entries of the module-level RANGE_CHARS at call
    time, and is followed by the separator gen_seps() yields for its position.
    An empty input produces an empty string.
    """
    # RANGE_CHARS is only read, so no "global" declaration is needed; keeping
    # the docstring as the first statement is what makes it a real docstring.
    open_char, close_char = RANGE_CHARS[0], RANGE_CHARS[1]
    ranges = merged_codepoints(cps)
    seps = gen_seps(len(ranges))
    # Join once instead of growing a string with += for every range.
    return "".join(
        "%s%s, %s%s%s" % (open_char, start.hex(), end.hex(), close_char, sep)
        for (start, end), sep in zip(ranges, seps)
    )


def hexrange_to_range(hexrange):
    """Turn a hex codepoint spec into a range of integers.

    "1F300..1F320" describes an inclusive range and yields every codepoint
    from 0x1F300 through 0x1F320; a lone value such as "1F321" yields a range
    holding just that codepoint.
    """
    values = [int(piece, 16) for piece in hexrange.split("..")]
    first = values[0]
    # With no ".." present the single value is both ends of the range.
    last = values[1] if len(values) > 1 else first
    return range(first, last + 1)


def parse_eaw_line(eaw_line):
    """Return a list of tuples (codepoint, width) from an EastAsianWidth.txt line.

    The line has the form ``<codepoint or range>;<width type>  # comment``.
    Anything after the first "#" is ignored, and whitespace around either
    field is ignored as well.  Lines that do not contain exactly two fields
    (blank lines, comment-only lines) yield an empty list.

    Width types map to widths as follows:
      A            -> WIDTH_AMBIGUOUS_EASTASIAN
      F, W         -> 2  (fullwidth, wide)
      anything else (H halfwidth, N neutral, Na narrow, ...) -> 1
    """
    # Remove the trailing comment.
    data = eaw_line.split("#", 1)[0]
    # Strip each field on its own: a line padded around the semicolon
    # ("0000..001F ; A") must still be recognised as type "A" rather than
    # silently falling through to width 1.
    fields = [field.strip() for field in data.split(";")]
    if len(fields) != 2:
        return []
    cps, width_type = fields
    if width_type == "A":
        width = WIDTH_AMBIGUOUS_EASTASIAN
    elif width_type in ("F", "W"):
        width = 2
    else:
        width = 1
    return [(cp, width) for cp in hexrange_to_range(cps)]


def set_eaw_widths(eaw_data_lines, cps):
    """Assign widths to cps from the lines of EastAsianWidth.txt.

    Every (codepoint, width) pair produced by parse_eaw_line() is written to
    cps first.  Afterwards, codepoints in the default-wide blocks below that
    still have no width are given width 2.
    """
    for raw_line in eaw_data_lines:
        for codepoint, width in parse_eaw_line(raw_line):
            cps[codepoint].width = width

    # Special cases:
    #  - Unassigned code points in these blocks default to "W".
    #  - All undesignated code points in Planes 2 and 3, whether inside or
    #    outside of allocated blocks, default to "W".
    default_wide_blocks = (
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x20000, 0x2FFFD),  # Plane 2
        (0x30000, 0x3FFFD),  # Plane 3
    )
    for first, last in default_wide_blocks:
        for codepoint in xrange(first, last + 1):
            entry = cps[codepoint]
            if entry.width is not None:
                # An explicit width from the data file wins over the default.
                continue
            entry.width = 2


def parse_emoji_line(line):
    """Return a list of (codepoint, version) tuples for an emoji-data.txt line.

    The part before the first "#" must be ``<codepoint or range> ; <property>``.
    The version is taken from an ``E<major>.<minor>`` token at the start of the
    comment; when there is no such token the version is 0.0.  Lines without a
    "#" yield an empty list.
    """
    data, marker, comment = line.partition("#")
    if not marker:
        return []
    cps, _prop = data.split(";")
    # Not every comment starts with an "E<major>.<minor>" token.
    found = re.search(r"^\s*E\d+\.\d+", comment)
    if found is None:
        version = 0.0
    else:
        # Drop surrounding whitespace and the leading "E" before converting.
        version = float(found.group(0).strip()[1:])
    return [(cp, version) for cp in hexrange_to_range(cps)]


def set_emoji_widths(emoji_data_lines, cps):
    """Adjust widths in cps using the lines of emoji-data.txt.

    For every (codepoint, version) pair from parse_emoji_line() that is not
    filtered out, the width becomes 2 when version >= 9.0 and
    WIDTH_WIDENED_IN_9 otherwise.
    """
    for raw_line in emoji_data_lines:
        for codepoint, version in parse_emoji_line(raw_line):
            # Three filters, checked in this order:
            #  1. Codepoints below 0x1F000 are not treated as emoji here; they
            #     can only become emoji through the variation selector, which
            #     interacts terribly with wcwidth().
            #  2. Versions <= 1.0 are skipped; this includes the 0.0 that
            #     parse_emoji_line() reports when it finds no version token.
            #  3. Codepoints already at width 1 are explicitly not wide.  For
            #     example U+1F336 ("Hot Pepper") is neutral in EAW.
            # NOTE(review): the version comes from the "E<major>.<minor>"
            # token in the comment -- confirm that the 1.0 and 9.0 thresholds
            # are meant to be compared against that number.
            if codepoint < 0x1F000 or version <= 1.0 or cps[codepoint].width == 1:
                continue

            if version >= 9.0:
                cps[codepoint].width = 2
            else:
                cps[codepoint].width = WIDTH_WIDENED_IN_9


def set_hardcoded_ranges(cps):
    """Overwrite the category of private use, surrogate and non-character codepoints.

    The ranges are hard-coded, not read from the data files, and are applied
    in that order.
    """

    def assign(spans, category):
        """Set category on every codepoint of each inclusive (first, last) span."""
        for first, last in spans:
            for idx in xrange(first, last + 1):
                cps[idx].category = category

    # Private use could be derived (awkwardly) from UnicodeData.txt, but the
    # ranges are simply listed here.  "Private use high surrogates" are not
    # counted as private use, to match wcwidth9().
    assign(
        [(0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD)],
        CAT_PRIVATE_USE,
    )

    assign([(0xD800, 0xDBFF), (0xDC00, 0xDFFF)], CAT_SURROGATE)

    # See the "noncharacters" discussion at
    # https://www.unicode.org/faq/private_use.html
    # Besides U+FDD0..U+FDEF, the last two code points of the BMP (plane 0)
    # and of each of the 16 supplementary planes are non-characters.
    nonchar_spans = [(0xFDD0, 0xFDEF)]
    nonchar_spans.extend(
        (plane_base + 0xFFFE, plane_base + 0xFFFF)
        for plane_base in xrange(0, 0x110000, 0x10000)
    )
    assign(nonchar_spans, CAT_NON_CHARACTERS)


def generate():
    """Build the substitution fields for the output templates.

    Reads the three Unicode data files (read_datafile() downloads them when
    they are not present locally), classifies every codepoint and returns a
    dict mapping each template placeholder name to its string value.  The
    "filename" entry is always OUTPUT_FILENAME, and the range tables are
    formatted by codepoints_to_carray_str().
    """
    # Load the three data files along with their hashes.
    unicode_data, unicode_hash = read_datafile(UNICODE_DATA_URL)
    eaw_data, eaw_hash = read_datafile(EAW_URL)
    emoji_data, emoji_hash = read_datafile(EMOJI_DATA_URL)

    log("Thinking...")

    # One CodePoint per table index, then let each pass annotate them.
    cps = [CodePoint(i) for i in xrange(MAX_CODEPOINT + 1)]

    set_general_categories(unicode_data, cps)
    set_eaw_widths(eaw_data, cps)
    set_emoji_widths(emoji_data, cps)
    set_hardcoded_ranges(cps)

    def table_for(selected):
        """Return the range table for the codepoints accepted by selected."""
        return codepoints_to_carray_str([cp for cp in cps if selected(cp)])

    def in_categories(*names):
        """Return the range table of codepoints whose category is one of names."""
        wanted = frozenset(names)
        return table_for(lambda cp: cp.category in wanted)

    def having_width(width):
        """Return the range table of codepoints whose width equals width."""
        return table_for(lambda cp: cp.width == width)

    return {
        "p": CPP_PREFIX,
        "filename": OUTPUT_FILENAME,
        "today": str(datetime.date.today()),
        "unicode_hash": unicode_hash,
        "eaw_hash": eaw_hash,
        "emoji_hash": emoji_hash,
        # Codepoints 0x20 through 0x7E.
        "ascii": table_for(lambda cp: 0x20 <= cp.codepoint < 0x7F),
        "private": in_categories(CAT_PRIVATE_USE),
        "noncharacters": in_categories(CAT_NON_CHARACTERS),
        "nonprint": in_categories("Cc", "Cf", "Zl", "Zp", CAT_SURROGATE),
        "combining": in_categories("Mn", "Mc", "Me"),
        "doublewide": having_width(2),
        "unassigned": in_categories(CAT_UNASSIGNED),
        "ambiguous": having_width(WIDTH_AMBIGUOUS_EASTASIAN),
        "widenedin9": having_width(WIDTH_WIDENED_IN_9),
    }


if __name__ == '__main__':
    # C++ header: ranges are formatted with the default RANGE_CHARS.
    fields = generate()
    with open(OUTPUT_FILENAME, 'w') as fd:
        fd.write(OUTPUT_TEMPLATE.strip().format(**fields))
        fd.write('\n')
    log("Output " + OUTPUT_FILENAME)

    # JavaScript file: rebind the module-level RANGE_CHARS so that
    # codepoints_to_carray_str() emits [lo, hi] arrays, then regenerate.
    RANGE_CHARS = ('[', ']')
    fields = generate()
    # generate() always reports OUTPUT_FILENAME; name the file actually being
    # written so the header comment of the .js output is correct.
    fields["filename"] = OUTPUT_FILENAME_JS
    with open(OUTPUT_FILENAME_JS, 'w') as fd:
        fd.write(OUTPUT_TEMPLATE_JS.strip().format(**fields))
        fd.write('\n')
    log("Output " + OUTPUT_FILENAME_JS)