mirror of https://sourceware.org/git/glibc.git
commit 25c9c3789e
= `Default_Ignorable_Code_Point`s should have width 0 =

Unicode specifies (https://www.unicode.org/faq/unsup_char.html#3) that
characters with the `Default_Ignorable_Code_Point` property should be
rendered as completely invisible (and non-advancing, i.e. “zero width”)
if not explicitly supported in rendering. Hence, `wcwidth()` should give
them all a width of 0, with two exceptions:

- the soft hyphen (U+00AD SOFT HYPHEN) is assigned width 1 by
  longstanding precedent
- U+115F HANGUL CHOSEONG FILLER needs a carveout due to the unique
  behavior of the conjoining Korean jamo characters.

  One composed Hangul “syllable block” like 퓛 is made up of two to
  three individual component characters, or “jamo”. These are all
  assigned an `East_Asian_Width` of `Wide` by Unicode, which would
  normally mean they would all be assigned width 2 by glibc; a
  combination of (leading choseong jamo) + (medial jungseong jamo) +
  (trailing jongseong jamo) would then have width 2 + 2 + 2 = 6.
  However, glibc (and other wcwidth implementations) special-cases
  jungseong and jongseong, assigning them all width 0, to ensure that
  the complete block has width 2 + 0 + 0 = 2, as it should.

  U+115F is meant for use in syllable blocks that are intentionally
  missing a leading jamo; it must be assigned a width of 2, even though
  it has no visible display, to ensure that the complete block has
  width 2.

However, `wcwidth()` currently (before this patch) incorrectly assigns
non-zero width to U+3164 HANGUL FILLER and U+FFA0 HALFWIDTH HANGUL
FILLER; this commit fixes that.

Unicode spec references:

- Hangul: §3.12 https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G24646
  and §18.6 https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G31028
- `Default_Ignorable_Code_Point`: §5.21
  https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095

= Non-`Default_Ignorable_Code_Point` format controls should be visible =

The Unicode Standard, §5.21 “Characters Ignored for Display”
(https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095), says
the following:

> A small number of format characters (General_Category = Cf)
> are also not given the Default_Ignorable_Code_Point property.
> This may surprise implementers, who often assume
> that all format characters are generally ignored in fallback display.
> The exact list of these exceptional format characters
> can be found in the Unicode Character Database.
> There are, however, three important sets of such format characters to note:
>
> - prepended concatenation marks
> - interlinear annotation characters
> - Egyptian hieroglyph format controls
>
> The prepended concatenation marks always have a visible display.
> See “Prepended Concatenation Marks” in [*Section 23.2, Layout Controls*](https://www.unicode.org/versions/Unicode15.1.0/ch23.pdf#M9.35858.HeadingBreak.132.Layout.Controls)
> for more discussion of the use and display of these signs.
>
> The other two notable sets of format characters that exceptionally are not ignored
> in fallback display consist of the interlinear annotation characters,
> U+FFF9 INTERLINEAR ANNOTATION ANCHOR through
> U+FFFB INTERLINEAR ANNOTATION TERMINATOR,
> and the Egyptian hieroglyph format controls,
> U+13430 EGYPTIAN HIEROGLYPH VERTICAL JOINER through
> U+1343F EGYPTIAN HIEROGLYPH END WALLED ENCLOSURE.
> These characters should have a visible glyph display for fallback rendering,
> because if they are not displayed,
> it is too easy to misread the resulting displayed text.
> See “Annotation Characters” in [*Section 23.8, Specials*](https://www.unicode.org/versions/Unicode15.1.0/ch23.pdf#M9.21335.Heading.133.Specials),
> as well as [*Section 11.4, Egyptian Hieroglyphs*](https://www.unicode.org/versions/Unicode15.1.0/ch11.pdf#M9.73291.Heading.1418.Egyptian.Hieroglyphs)
> for more discussion of the use and display of these characters.

glibc currently correctly assigns non-zero width to the prepended
concatenation marks, but it incorrectly gives zero width to the
interlinear annotation characters (which a generic terminal cannot
interpret) and the Egyptian hieroglyph format controls (which are not
widely supported in rendering implementations at present). This commit
fixes both of these issues as well.

= Derive Hangul syllable type from Unicode data =

Previously, the jungseong and jongseong jamo ranges were hard-coded
into the script. With this commit, they are instead parsed from the
HangulSyllableType.txt data file published by Unicode. This does not
affect the end result.

Signed-off-by: Jules Bertholet <julesbertholet@quoi.xyz>
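As an illustration (not part of the commit), the intended post-patch
behavior can be spot-checked by calling glibc's `wcwidth()` through
`ctypes`. This is a minimal sketch, assuming a Linux system with
`libc.so.6` and an available `C.UTF-8` locale:

```python
import ctypes
import locale

# wcwidth() consults the active locale's charmap, so a UTF-8 locale
# must be selected first.
locale.setlocale(locale.LC_ALL, 'C.UTF-8')
libc = ctypes.CDLL('libc.so.6')
libc.wcwidth.argtypes = [ctypes.c_wchar]

checks = [
    (0x00AD, 1),  # SOFT HYPHEN: width 1 by longstanding precedent
    (0x115F, 2),  # HANGUL CHOSEONG FILLER: width 2 despite no visible display
    (0x3164, 0),  # HANGUL FILLER: Default_Ignorable, width 0 after this patch
    (0xFFA0, 0),  # HALFWIDTH HANGUL FILLER: likewise width 0 after this patch
    (0xFFF9, 1),  # INTERLINEAR ANNOTATION ANCHOR: visible fallback, width 1
]
for code_point, expected in checks:
    print('U+{:04X}: wcwidth() = {}, expected {}'.format(
        code_point, libc.wcwidth(chr(code_point)), expected))
```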
403 lines
17 KiB
Python
Executable File
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2024 Free Software Foundation, Inc.
# Copyright The GNU Toolchain Authors.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py --unicode_version 15.1.0

It reads UnicodeData.txt, DerivedCoreProperties.txt, EastAsianWidth.txt,
and HangulSyllableType.txt (the file names can be overridden with
command line options) and writes the UTF-8 file to the current
directory.
'''

import argparse
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #       * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
        #       ranges, so they become printable and carry a width. Comment
        #       out surrogate ranges. Add a WIDTH table.
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
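            # For example, U+D4DB: divmod(0xD4DB - 0xAC00, 28) == (373, 15)
            # and divmod(373, 21) == (17, 16), which picks 'P' + 'WI' + 'LH'
            # from the tables above, i.e. HANGUL SYLLABLE PWILH.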
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + JAMO_INITIAL_SHORT_NAME[index1] \
                + JAMO_MEDIAL_SHORT_NAME[index2] \
                + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and 'Surrogate,' not in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and 'Surrogate,' not in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
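    # For example, convert_to_hex(0x20AC) returns '/xe2/x82/xac',
    # the three UTF-8 bytes of U+20AC EURO SIGN.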
    # Getting UTF-8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode {:s}.\n'.format(unicode_version))
    outfile.write('% Width is determined by the following rules, in order of decreasing precedence:\n')
    outfile.write('% - U+00AD SOFT HYPHEN has width 1, as a special case for compatibility (https://archive.is/b5Ck).\n')
    outfile.write('% - U+115F HANGUL CHOSEONG FILLER has width 2.\n')
    outfile.write('% This character stands in for an intentionally omitted leading consonant\n')
    outfile.write('% in a Hangul syllable block; as such it must be assigned width 2 despite its lack\n')
    outfile.write('% of visible display to ensure that the complete block has the correct width.\n')
    outfile.write('% (See below for more information on Hangul syllables.)\n')
    outfile.write('% - Combining jungseong and jongseong Hangul jamo have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;[VT]\' HangulSyllableType.txt".\n')
    outfile.write('% One composed Hangul "syllable block" like 퓛 is made up of\n')
    outfile.write('% two to three individual component characters called "jamo".\n')
    outfile.write('% The complete block must have total width 2;\n')
    outfile.write('% to achieve this, we assign a width of 2 to leading "choseong" jamo,\n')
    outfile.write('% and of 0 to medial vowel "jungseong" and trailing "jongseong" jamo.\n')
    outfile.write('% - Non-spacing and enclosing marks have width 0; generated from\n')
    outfile.write('% "grep -E \'^[^;]*;[^;]*;(Mn|Me);\' UnicodeData.txt".\n')
    outfile.write('% - "Default_Ignorable_Code_Point"s have width 0; generated from\n')
    outfile.write('% "grep \'^[^;]*;\\s*Default_Ignorable_Code_Point\' DerivedCoreProperties.txt".\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt".\n')
    outfile.write('% - Default width for all other characters is 1.\n')
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, dlines, elines, klines):
    '''ulines are lines from UnicodeData.txt.

    dlines are lines from DerivedCoreProperties.txt which contain
    characters with the property “Default_Ignorable_Code_Point”.

    elines are lines from EastAsianWidth.txt containing characters with width
    “W” or “F”.

    klines are lines from HangulSyllableType.txt which contain characters
    with syllable type “V” or “T”.
    '''
    # Wide and fullwidth characters have width 2
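    # (Data lines in EastAsianWidth.txt look like “3400..4DBF;W” or,
    # for single code points, “00A1;A”, possibly with extra spaces
    # around the “;” in newer Unicode versions.)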
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    # Nonspacing and enclosing marks have width 0
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    # Conjoining vowel and trailing jamo have width 0
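    # (Data lines in HangulSyllableType.txt look like
    # “1160..11A7 ; V # Lo [72] HANGUL JUNGSEONG FILLER..HANGUL JUNGSEONG O-YAE”.)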
    for line in klines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 0

    # “Default_Ignorable_Code_Point”s have width 0
    for line in dlines:
        fields = line.split(";")
        if '..' not in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 0

    # Special case: U+00AD SOFT HYPHEN keeps the default width of 1,
    # so remove the width-0 entry added above.
    del width_dict[0x00AD]

    # Special case: U+115F HANGUL CHOSEONG FILLER
    width_dict[0x115F] = 2

    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

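    # Coalesce runs of consecutive code points that share the same width
    # into single ranges, e.g. “<U1100>...<U115F> 2”, to keep the WIDTH
    # table short.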
    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)

    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, DerivedCoreProperties.txt,
        EastAsianWidth.txt, and HangulSyllableType.txt
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-k', '--hangul_syllable_type_file',
        nargs='?',
        type=str,
        default='HangulSyllableType.txt',
        help=('The HangulSyllableType.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.derived_core_properties_file, mode='r') as DERIVED_CORE_PROPERTIES_FILE:
        DERIVED_CORE_PROPERTIES_LINES = []
        for LINE in DERIVED_CORE_PROPERTIES_FILE:
            # If characters which are from reserved ranges
            # (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points.
            if re.match(r'.*<reserved-.+>', LINE):
                continue
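            # Keep lines like:
            # 00AD ; Default_Ignorable_Code_Point # Cf SOFT HYPHEN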
            if re.match(r'^[^;]*;\s*Default_Ignorable_Code_Point', LINE):
                DERIVED_CORE_PROPERTIES_LINES.append(LINE.strip())
    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.hangul_syllable_type_file, mode='r') as HANGUL_SYLLABLE_TYPE_FILE:
        HANGUL_SYLLABLE_TYPE_LINES = []
        for LINE in HANGUL_SYLLABLE_TYPE_FILE:
            if re.match(r'.*<reserved-.+>', LINE):
                continue
            if re.match(r'^[^;]*;\s*[VT]', LINE):
                HANGUL_SYLLABLE_TYPE_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section
        # to the UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section
        # to the UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      DERIVED_CORE_PROPERTIES_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      HANGUL_SYLLABLE_TYPE_LINES)
        OUTFILE.write("END WIDTH\n")