glibc/localedata/unicode-gen/unicode_utils.py

# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''

import sys
import re


# Common locale header.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file.  The foregoing does not
% affect the license of the GNU C Library as a whole.  It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}

def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    UNICODE_ATTRIBUTES[code_point] =  {
        'name': fields[1],          # Character name
        'category': fields[2],      # General category
        'combining': fields[3],     # Canonical combining classes
        'bidi': fields[4],          # Bidirectional category
        'decomposition': fields[5], # Character decomposition mapping
        'decdigit': fields[6],      # Decimal digit value
        'digit': fields[7],         # Digit value
        'numeric': fields[8],       # Numeric value
        'mirrored': fields[9],      # mirrored
        'oldname': fields[10],      # Old Unicode 1.0 name
        'comment': fields[11],      # comment
        # Uppercase mapping
        'upper': int(fields[12], 16) if fields[12] else None,
        # Lowercase mapping
        'lower': int(fields[13], 16) if fields[13] else None,
        # Titlecase mapping
        'title': int(fields[14], 16) if fields[14] else None,
    }

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []

def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                prop = match.group('property')
                if code_point in DERIVED_CORE_PROPERTIES:
                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
                else:
                    DERIVED_CORE_PROPERTIES[code_point] = [prop]

def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU
    '''
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                end = start
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = match.group('property')

def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['upper']):
        return UNICODE_ATTRIBUTES[code_point]['upper']
    else:
        return code_point

def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['lower']):
        return UNICODE_ATTRIBUTES[code_point]['lower']
    else:
        return code_point

def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point'''
    if code_point == 0x0069:
        return 0x0130
    return to_upper(code_point)

def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point'''
    if code_point == 0x0049:
        return 0x0131
    return to_lower(code_point)

def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    if (UNICODE_ATTRIBUTES[code_point]['name']
        and UNICODE_ATTRIBUTES[code_point]['title']):
        return UNICODE_ATTRIBUTES[code_point]['title']
    else:
        return code_point

def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    return (to_lower(code_point) != code_point
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return (to_upper(code_point) != code_point
            # <U00DF> is lowercase, but without simple to_upper mapping.
            or code_point == 0x00DF
            or (code_point in DERIVED_CORE_PROPERTIES
                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))

def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    return ((code_point in DERIVED_CORE_PROPERTIES
             and
             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
            or
            # Consider all the non-ASCII digits as alphabetic.
            # ISO C 99 forbids us to have them in category “digit”,
            # but we want iswalnum to return true on them.
            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
             and not (code_point >= 0x0030 and code_point <= 0x0039)))

def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
        # a zero.  Must add <0> in front of them by hand.
    else:
        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.5:
        #    The iswdigit function tests for any wide character that
        #    corresponds to a decimal-digit character (as defined in 5.2.1).
        # 5.2.1:
        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
        return (code_point >= 0x0030 and code_point <= 0x0039)

def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    return (code_point >= 0x0030 and code_point <= 0x0039)

def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    return (code_point == 0x0009 # '\t'
            # Category Zs without mention of '<noBreak>'
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
                and '<noBreak>' not in
                UNICODE_ATTRIBUTES[code_point]['decomposition']))

def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    return (code_point == 0x0020 # ' '
            or code_point == 0x000C # '\f'
            or code_point == 0x000A # '\n'
            or code_point == 0x000D # '\r'
            or code_point == 0x0009 # '\t'
            or code_point == 0x000B # '\v'
            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
            or (UNICODE_ATTRIBUTES[code_point]['name']
                and
                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
                 or
                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
                  and
                  '<noBreak>' not in
                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))

def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
                 or
                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))

def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    if False:
        return (is_digit(code_point)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))
    else:
        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
        # takes it away:
        # 7.25.2.1.12:
        #    The iswxdigit function tests for any wide character that
        #    corresponds to a hexadecimal-digit character (as defined
        #    in 6.4.4.1).
        # 6.4.4.1:
        #    hexadecimal-digit: one of
        #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
        return ((code_point >= 0x0030 and code_point  <= 0x0039)
                or (code_point >= 0x0041 and code_point <= 0x0046)
                or (code_point >= 0x0061 and code_point <= 0x0066))

def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and not is_space(code_point))

def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])

def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    if False:
        return (UNICODE_ATTRIBUTES[code_point]['name']
                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
    else:
        # The traditional POSIX definition of punctuation is every graphic,
        # non-alphanumeric character.
        return (is_graph(code_point)
                and not is_alpha(code_point)
                and not is_digit(code_point))

def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    return (UNICODE_ATTRIBUTES[code_point]['name']
            and
            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])

def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    return (is_combining(code_point)
            and
            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))

def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    if code_point < 0x10000:
        return '<U{:04X}>'.format(code_point)
    else:
        return '<U{:08X}>'.format(code_point)

def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)

def verifications():
    '''Tests whether the is_* functions observe the known restrictions'''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if ((is_lower(code_point) or is_upper(code_point))
             and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': unicode_utils.ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': unicode_utils.ucs_symbol(code_point)})