2016-01-21 04:34:33 +00:00

2137 lines
75 KiB
Executable File

# -*- coding: utf-8 -*-
# Copyright (c) 2009-2016 International Business Machines
# Corporation and others. All Rights Reserved.
# file name: preparseucd.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation:4
# created on: 2011nov03 (forked from ucdcopy.py)
# created by: Markus W. Scherer
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) the Unicode Consortium's public data site
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
# Invoke with three command-line parameters:
# 1. source folder with UCD & idna files
# 2. ICU source root folder
# 3. ICU tools root folder
# Sample invocation:
# ~/$ py/ ~/uni61/20120118 ~/ ~/
import array
import bisect
import codecs
import datetime
import os
import os.path
import re
import shutil
import sys
# Unicode version ---------------------------------------------------------- ***
_ucd_version = "?"
_copyright = ""
_terms_of_use = ""
_current_year ="%Y")
# ISO 15924 script codes --------------------------------------------------- ***
# Script codes from ISO 15924
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
"Afak", "Blis", "Cirt", "Cyrs",
"Egyd", "Egyh", "Geok",
"Hans", "Hant",
"Inds", "Jpan", "Jurc", "Kore", "Kpel", "Latf", "Latg", "Loma",
"Maya", "Moon", "Nkgb", "Nshu", "Phlv", "Roro",
"Sara", "Syre", "Syrj", "Syrn",
"Tang", "Teng", "Visp", "Wole", "Zmth", "Zsym", "Zxxx"
# Properties --------------------------------------------------------------- ***
_ignored_properties = set((
# Other_Xyz only contribute to Xyz, store only the latter.
# Further properties that just contribute to others.
"CE", # Composition_Exclusion just contributes to Full_Composition_Exclusion.
# These properties just don't seem useful.
# They are deprecated since Unicode 6.0.
# ICU does not use Unihan properties.
# Dictionary of properties.
# Keyed by property names and aliases, both as spelled in the UCD
# and in normalized form (see NormPropName()).
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
# For Catalog & Enumerated properties,
# maps each value name to a list of aliases.
# Empty for other types of properties.
_properties = {}
# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}
# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}
# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))
# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {}
# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}
# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
# - enum: the ICU property value's enum constant string
# - vname: the UCD short property value name
_icu_properties = []
# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}
# Matches every character that is not an ASCII letter or digit.
_non_alnum_re = re.compile(r"[^a-zA-Z0-9]")


def NormPropName(pname):
    """Returns a normalized form of pname.

    Removes non-ASCII-alphanumeric characters and lowercases letters."""
    return _non_alnum_re.sub("", pname).lower()
def GetProperty(pname):
    """Returns the _properties value for the pname.

    Returns None if the property is ignored.
    Caches alternate spellings of the property name.
    Raises NameError if the name is neither a known nor an ignored property."""
    # Try the input name.
    prop = _properties.get(pname)
    if prop is not None: return prop
    if pname in _ignored_properties: return None
    # Try the normalized input name.
    norm_name = NormPropName(pname)
    prop = _properties.get(norm_name)
    if prop is not None:
        _properties[pname] = prop  # Cache prop under this new name spelling.
        return prop
    # The exact spelling was already checked above, so test the normalized
    # form here; otherwise this branch could never be taken.
    if norm_name in _ignored_properties:
        _ignored_properties.add(pname)  # Remember to ignore this new name spelling.
        return None
    raise NameError("unknown property %s\n" % pname)
def GetShortPropertyName(pname):
    """Returns the short name of the property, or its long name if it has none.

    Returns "" for ignored properties.
    Raises NameError (from GetProperty()) for unknown property names."""
    if pname in _null_values: return pname  # pname is already the short name.
    prop = GetProperty(pname)
    if not prop: return ""  # For ignored properties.
    return prop[1][0] or prop[1][1]  # Long name if no short name.
def GetShortPropertyValueName(prop, vname):
    """Returns the short name for a property value name or alias.

    prop is a _properties tuple; prop[2] is its set of short value names
    and prop[3] maps value names to alias lists.
    Caches a new spelling of vname in prop[3] once it has been resolved
    via its normalized form.
    Raises NameError if vname is not a known value of the property."""
    if vname in prop[2]: return vname
    values = prop[3]
    aliases = values.get(vname)
    if aliases is None:
        norm_name = NormPropName(vname)
        aliases = values.get(norm_name)
        if aliases is None:
            raise NameError("unknown value name %s for property %s\n" %
                            (vname, prop[1][0]))
        values[vname] = aliases
    return aliases[0] or aliases[1]  # Long name if no short name.
def NormalizePropertyValue(prop, vname):
    """Returns the value to store for the property value name vname.

    For a property with a set of short value names (prop[2] is non-empty),
    vname is mapped to its short name; Binary values then become
    True/False ("Y" is True), and values of properties whose short name
    ends with "ccc" become integers.
    For every other property, vname is returned unchanged."""
    if prop[2]:  # Binary/Catalog/Enumerated property.
        value = GetShortPropertyValueName(prop, vname)
        if prop[0] == "Binary":
            value = value == "Y"
        if prop[1][0].endswith("ccc"):
            value = int(value)
    else:
        value = vname
    return value
# Character data ----------------------------------------------------------- ***
# Lists of NamesList h1 and h2 headings, filled in by ParseNamesList().
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []
# List of Unicode blocks, filled in by ParseBlocks().
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []
# List of ranges with algorithmic names, filled in by ParseUnicodeData().
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []
# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000]) # array of int32_t
_props = [{}, {}] # props for 0 and 110000
def FindRange(x):
""" Binary search for x in the inversion map.
Returns the smallest i where x < _starts[i]"""
return bisect.bisect(_starts, x) - 1
def GetProps(c):
    """Returns the props dictionary of the range that contains code point c.

    The returned dictionary is the stored object, not a copy."""
    return _props[FindRange(c)]
def UpdateProps(start, end, update):
    """Conditionally updates the properties of the code points start..end.

    update is a (need_to_update, do_update, u) tuple. For each existing
    range that intersects start..end, need_to_update(u, s, e, c_props) is
    asked whether the sub-range s..e with properties c_props needs to
    change; if so, the range is split where necessary so that only s..e is
    affected, and do_update(u, s, e, c_props) modifies c_props in place."""
    assert 0 <= start <= end <= 0x10ffff
    (need_to_update, do_update, u) = (update[0], update[1], update[2])
    # Find the index i of the range in _starts that contains start.
    i = FindRange(start)
    limit = end + 1
    # Intersect [start, limit[ with ranges in _starts.
    c_start = _starts[i]
    c_limit = _starts[i + 1]
    c_props = _props[i]
    # c_start <= start < c_limit
    if c_start < start:
        update_limit = c_limit if c_limit <= limit else limit
        if need_to_update(u, start, update_limit - 1, c_props):
            # Split off [c_start, start[ with a copy of c_props.
            i += 1
            c_props = c_props.copy()
            _starts.insert(i, start)
            _props.insert(i, c_props)
            c_start = start
    # Modify all ranges that are fully inside [start, limit[.
    while c_limit <= limit:
        # start <= c_start < c_limit <= limit
        if need_to_update(u, c_start, c_limit - 1, c_props):
            do_update(u, c_start, c_limit - 1, c_props)
        if c_limit == 0x110000: return
        i += 1
        c_start = c_limit
        c_limit = _starts[i + 1]
        c_props = _props[i]
    if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
        # Split off [limit, c_limit[ with a copy of c_props.
        _starts.insert(i + 1, limit)
        _props.insert(i + 1, c_props.copy())
        # Modify [c_start, limit[ c_props.
        do_update(u, c_start, limit - 1, c_props)
def NeedToSetProps(props, start, end, c_props):
    """Returns True if props is not a sub-dict of c_props.

    start and end are unused; they are part of the callback signature
    that UpdateProps() expects."""
    return any(pname not in c_props or value != c_props[pname]
               for (pname, value) in props.items())
def DoSetProps(props, start, end, c_props):
    """Copies all of props into c_props, overwriting existing values.

    start and end are unused; they are part of the callback signature
    that UpdateProps() expects."""
    c_props.update(props)
def SetProps(start, end, props):
    """Sets the props values on all code points start..end.

    Ranges that already have all of these values are left untouched."""
    UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))
def NeedToSetAlways(nv, start, end, c_props):
    """UpdateProps() callback that requests an update for every range."""
    return True
# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
    """Ensure that there is a range start/limit at x.

    If x is inside a range, that range is split at x and the new
    upper part gets a copy of the range's props."""
    assert 0 <= x <= 0x10ffff
    i = FindRange(x)
    if _starts[i] == x: return
    # Split the range at x.
    # _starts[i] < x < _starts[i + 1]
    i += 1
    _starts.insert(i, x)
    _props.insert(i, _props[i - 1].copy())
def SetDefaultValue(pname, value):
    """Sets the property's default value. Ignores null values.

    Does nothing for ignored properties and for the null value names in
    _null_names. A default that differs from the property's null value is
    recorded in _defaults and also set on all code points."""
    prop = GetProperty(pname)
    if prop and value not in _null_names:
        short_name = prop[1][0]
        value = NormalizePropertyValue(prop, value)
        if value != _null_values[short_name]:
            _defaults[short_name] = value
            SetProps(0, 0x10ffff, {short_name: value})
def SetBinaryPropertyToTrue(pname, start, end):
    """Sets the binary property pname to True for code points start..end.

    Does nothing if the property is ignored."""
    prop = GetProperty(pname)
    if prop:
        assert prop[0] == "Binary"
        SetProps(start, end, {prop[1][0]: True})
def SetPropValue(prop, vname, start, end):
    """Sets the normalized form of vname for code points start..end.

    prop is a _properties tuple; the value is stored under prop[1][0]."""
    value = NormalizePropertyValue(prop, vname)
    SetProps(start, end, {prop[1][0]: value})
def SetPropertyValue(pname, vname, start, end):
    """Looks up the property pname and sets vname for code points start..end.

    Does nothing if the property is ignored."""
    prop = GetProperty(pname)
    if prop:
        SetPropValue(prop, vname, start, end)
# Parsing ------------------------------------------------------------------ ***
# First field that is a single code point / a code point range /
# an "@missing" comment line that covers all of Unicode.
_stripped_cp_re = re.compile(r"([0-9a-fA-F]+)$")
_stripped_range_re = re.compile(r"([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
_missing_re = re.compile(r"# *@missing: *0000\.\.10FFFF *; *(.+)$")


def ReadUCDLines(in_file, want_ranges=True, want_other=False,
                 want_comments=False, want_missing=False):
    """Parses lines from a semicolon-delimited UCD text file.

    Strips comments, ignores empty and all-comment lines.
    Generator; yields one tuple (type, line, ...) per reported line:
      ("range", line, start, end, fields)  if want_ranges and the first
          field is a code point or a start..end range
      ("other", line, fields)  if want_other, for other data lines
      ("comment", line)  if want_comments, for whole-line comments
      ("missing", line, fields)  if want_missing, for @missing lines
    fields is the list of whitespace-stripped fields.
    As a side effect, remembers the first "# Copyright" line that contains
    the current year and the first "terms of use" comment line.
    Raises SyntaxError for a data line that is not reported."""
    global _copyright, _terms_of_use
    for line in in_file:
        line = line.strip()
        if not line: continue
        if line.startswith("#"):  # whole-line comment
            if want_missing:
                match = _missing_re.match(line)
                if match:
                    fields = [f.strip() for f in match.group(1).split(";")]
                    yield ("missing", line, fields)
                    continue
            if want_comments: yield ("comment", line)
            if line.startswith("# Copyright"):
                if not _copyright and _current_year in line:
                    _copyright = line
            elif "terms of use" in line and not _terms_of_use:
                _terms_of_use = line
            continue
        comment_start = line.find("#")  # inline comment
        if comment_start >= 0:
            line = line[:comment_start].rstrip()
            if not line: continue
        fields = [f.strip() for f in line.split(";")]
        if want_ranges:
            first = fields[0]
            match = _stripped_range_re.match(first)
            if match:
                start = int(match.group(1), 16)
                end = int(match.group(2), 16)
                yield ("range", line, start, end, fields)
                continue
            match = _stripped_cp_re.match(first)
            if match:
                c = int(match.group(1), 16)
                yield ("range", line, c, c, fields)
                continue
        if want_other:
            yield ("other", line, fields)
        else:
            raise SyntaxError("unable to parse line\n %s\n" % line)
def AddBinaryProperty(short_name, long_name):
    """Registers an additional binary property with a short and a long name.

    The new property shares its value-name set and values dictionary with
    the existing "Math" property; its null value is False."""
    _null_values[short_name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
    for name in (short_name, long_name):
        _properties[name] = prop
    for name in (short_name, long_name):
        _properties[NormPropName(name)] = prop
def AddPOSIXBinaryProperty(name):
    """Registers an ICU-specific POSIX binary property that has only a long name.

    The property is also reachable under "posix" + its normalized name.
    It shares its value-name set and values dictionary with "Math"."""
    # We only define a long name for ICU-specific (non-UCD) POSIX properties.
    _null_values[name] = False
    bin_prop = _properties["Math"]
    prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
    norm_name = NormPropName(name)
    _properties[name] = prop
    _properties[norm_name] = prop
    # This is to match UProperty UCHAR_POSIX_ALNUM etc.
    _properties["posix" + norm_name] = prop
# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
"-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
def ParsePropertyAliases(in_file):
    """Parses PropertyAliases.txt and registers all properties.

    Extracts the Unicode version from the file name comment, tracks the
    current property type from "# <Type> Properties" heading comments,
    and adds each non-ignored property to _properties and _null_values.
    All aliases of ignored properties are added to _ignored_properties.
    Afterwards registers the provisional and ICU-specific properties."""
    global _ucd_version
    prop_type_nulls = {
        "Binary": False,
        "Catalog": "??",  # Must be specified, e.g., in @missing line.
        "Enumerated": "??",  # Must be specified.
        "Numeric": "NaN",
        "String": "",
        "Miscellaneous": ""
    }
    # NOTE(review): prop_type & null_value are only bound once a
    # "# <Type> Properties" heading has been seen; this assumes that every
    # alias line is preceded by such a heading.
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_comments=True):
        if data[0] == "comment":
            line = data[1]
            match = _ucd_version_re.match(line)
            if match:
                _ucd_version = match.group(1)
            else:
                words = line[1:].lstrip().split()
                if len(words) == 2 and words[1] == "Properties":
                    prop_type = words[0]
                    null_value = prop_type_nulls[prop_type]
        else:
            # type == "other"
            aliases = data[2]
            name = aliases[0]
            if name in _ignored_properties:
                for alias in aliases:
                    _ignored_properties.add(alias)
                    _ignored_properties.add(NormPropName(alias))
            else:
                if name.endswith("ccc"):
                    _null_values[name] = 0
                else:
                    _null_values[name] = null_value
                prop = (prop_type, aliases, set(), {})
                for alias in aliases:
                    _properties[alias] = prop
                    _properties[NormPropName(alias)] = prop
    # Add provisional and ICU-specific properties we need.
    # We add some in support of runtime API, even if we do not write
    # data for them to ppucd.txt (e.g., lccc & tccc).
    # We add others just to represent UCD data that contributes to
    # some functionality, although Unicode has not "blessed" them
    # as separate properties (e.g., Turkic_Case_Folding).

    # Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
    name = "Turkic_Case_Folding"
    _null_values[name] = ""
    prop = ("String", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
    name = "Conditional_Case_Mappings"
    _null_values[name] = ""
    prop = ("Miscellaneous", [name, name], set(), {})
    _properties[name] = prop
    _properties[NormPropName(name)] = prop
    # lccc = ccc of first cp in canonical decomposition.
    _null_values["lccc"] = 0
    ccc_prop = list(_properties["ccc"])
    ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["lccc"] = prop
    _properties["Lead_Canonical_Combining_Class"] = prop
    _properties["leadcanonicalcombiningclass"] = prop
    # tccc = ccc of last cp in canonical decomposition.
    _null_values["tccc"] = 0
    ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
    prop = tuple(ccc_prop)
    _properties["tccc"] = prop
    _properties["Trail_Canonical_Combining_Class"] = prop
    _properties["trailcanonicalcombiningclass"] = prop
    # Script_Extensions
    if "scx" not in _properties:
        _null_values["scx"] = ""
        prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
        _properties["scx"] = prop
        _properties["Script_Extensions"] = prop
        _properties["scriptextensions"] = prop
    # General Category as a bit mask.
    _null_values["gcm"] = "??"
    gc_prop = _properties["gc"]
    prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
    _properties["gcm"] = prop
    _properties["General_Category_Mask"] = prop
    _properties["generalcategorymask"] = prop
    # Various binary properties.
    AddBinaryProperty("Sensitive", "Case_Sensitive")
    AddBinaryProperty("nfdinert", "NFD_Inert")
    AddBinaryProperty("nfkdinert", "NFKD_Inert")
    AddBinaryProperty("nfcinert", "NFC_Inert")
    AddBinaryProperty("nfkcinert", "NFKC_Inert")
    AddBinaryProperty("segstart", "Segment_Starter")
    AddBinaryProperty("Emoji", "Emoji")
    AddBinaryProperty("Emoji_Presentation", "Emoji_Presentation")
    AddBinaryProperty("Emoji_Modifier", "Emoji_Modifier")
    AddBinaryProperty("Emoji_Modifier_Base", "Emoji_Modifier_Base")
    # C/POSIX character classes that do not have Unicode property [value] aliases.
    # See uchar.h.
    # NOTE(review): these registrations were missing from this copy; the
    # names are presumed to match the UCHAR_POSIX_* constants -- confirm.
    AddPOSIXBinaryProperty("alnum")
    AddPOSIXBinaryProperty("blank")
    AddPOSIXBinaryProperty("graph")
    AddPOSIXBinaryProperty("print")
    AddPOSIXBinaryProperty("xdigit")
def ParsePropertyValueAliases(in_file):
    """Parses PropertyValueAliases.txt.

    Records the value aliases of each non-ignored property, sets default
    values from @missing lines, supplies defaults that are only given in
    files which are not parsed, and adds the ISO 15924-only script codes
    as values of the Script property.
    Raises ValueError if one of those script codes is already in the UCD."""
    global _binary_values
    for data in ReadUCDLines(in_file, want_ranges=False,
                             want_other=True, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "other"
            fields = data[2]
            pname = fields[0]
            prop = GetProperty(pname)
            if prop:
                del fields[0]  # Only the list of aliases remains.
                short_name = fields[0]
                if short_name == "n/a":  # no short name
                    fields[0] = ""
                    short_name = fields[1]
                prop[2].add(short_name)
                values = prop[3]
                for alias in fields:
                    if alias:
                        values[alias] = fields
                        values[NormPropName(alias)] = fields
                if prop[0] == "Binary" and not _binary_values:
                    _binary_values = values
    # Some of the @missing lines with non-null default property values
    # are in files that we do not parse;
    # either because the @missing line would be the only reason
    # to parse such a file,
    # or because we compute the property at runtime,
    # such as the Hangul_Syllable_Type.
    if "dt" not in _defaults:  # DerivedDecompositionType.txt
        _defaults["dt"] = "None"
    if "nt" not in _defaults:  # DerivedNumericType.txt
        _defaults["nt"] = "None"
    if "hst" not in _defaults:  # HangulSyllableType.txt
        _defaults["hst"] = "NA"
    if "gc" not in _defaults:  # No @missing line in any .txt file?
        _defaults["gc"] = "Cn"
    # Copy the gc default value to gcm.
    _defaults["gcm"] = _defaults["gc"]
    # Add ISO 15924-only script codes.
    # Only for the ICU script code API, not necessary for parsing the UCD.
    script_prop = _properties["sc"]
    short_script_names = script_prop[2]  # set
    script_values = script_prop[3]  # dict
    remove_scripts = []
    for script in _scripts_only_in_iso15924:
        if script in short_script_names:
            # The UCD has this script now; report it below.
            remove_scripts.append(script)
        else:
            short_script_names.add(script)
            # Do not invent a Unicode long script name before the UCD adds the script.
            script_list = [script, script]  # [short, long]
            script_values[script] = script_list
            # Probably not necessary because
            # we will not parse these scripts from the UCD:
            script_values[NormPropName(script)] = script_list
    if remove_scripts:
        raise ValueError(
            "remove %s from _scripts_only_in_iso15924" % remove_scripts)
def ParseBlocks(in_file):
    """Parses Blocks.txt.

    Appends a (start, end, {"blk": name}) tuple to _blocks for each block,
    sets the blk property on the block's code points,
    and takes the blk default value from the @missing line.
    Raises ValueError if two blocks overlap."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("blk", data[2][0])
        else:
            # type == "range"
            (start, end, name) = (data[2], data[3], data[4][1])
            _blocks.append((start, end, {"blk": name}))
            SetPropertyValue("blk", name, start, end)
    # Sort by code point range only; the props dictionaries are not orderable.
    _blocks.sort(key=lambda b: (b[0], b[1]))
    # Check for overlapping blocks.
    prev_end = -1
    for b in _blocks:
        start = b[0]
        end = b[1]
        if prev_end >= start:
            # Format the whole message at once: "%" binds more tightly
            # than "+", so the pieces must not be joined with "+" first.
            raise ValueError(
                "block %04lX..%04lX %s overlaps with another "
                "ending at %04lX\n" %
                (start, end, b[2]["blk"], prev_end))
        prev_end = end
def ParseUnicodeData(in_file):
    """Parses UnicodeData.txt and sets the properties of each code point.

    Pairs of "<..., First>" / "<..., Last>" lines are combined into one
    range; ranges with algorithmic names are recorded in _alg_names_ranges.
    Sets na, gc, ccc, bc, dt, dm, nv, nt, Bidi_M, suc, slc and stc.
    Raises SyntaxError for malformed First/Last pairs and for
    inconsistent numeric value fields."""
    # Improper fractions and the proper ones they are mapped to.
    proper_fractions = {
        "2/12": "1/6",
        "3/12": "1/4",
        "4/12": "1/3",
        "6/12": "1/2",
        "8/12": "2/3",
        "9/12": "3/4",
        "10/12": "5/6",
    }
    dt_prop = GetProperty("dt")
    range_first_line = ""
    range_first = -1
    for data in ReadUCDLines(in_file, want_missing=True):
        # type == "range"
        (line, c, end, fields) = (data[1], data[2], data[3], data[4])
        assert c == end
        name = fields[1]
        if name.startswith("<"):
            if name.endswith(", First>"):
                if range_first >= 0:
                    raise SyntaxError(
                        "error: unterminated range started at\n %s\n" %
                        range_first_line)
                range_first = c
                range_first_line = line
                # Wait for the matching Last line.
                continue
            elif name.endswith(", Last>"):
                if range_first < 0:
                    raise SyntaxError(
                        "error: range end without start at\n %s\n" %
                        line)
                elif range_first > c:
                    raise SyntaxError(
                        "error: range start/end out of order at\n %s\n %s\n" %
                        (range_first_line, line))
                # Strip "<" & ", First>" and "<" & ", Last>".
                first_name = range_first_line.split(";")[1][1:-8]
                name = name[1:-7]
                if first_name != name:
                    raise SyntaxError(
                        "error: range start/end name mismatch at\n %s\n %s\n" %
                        (range_first_line, line))
                end = c
                c = range_first
                range_first = -1
                # Remember algorithmic name ranges.
                if "Ideograph" in name:
                    _alg_names_ranges.append([c, end, "han", "CJK UNIFIED IDEOGRAPH-"])
                elif name == "Hangul Syllable":
                    _alg_names_ranges.append([c, end, "hangul"])
                name = ""
            else:
                # Ignore non-names like <control>.
                name = ""
        props = {}
        if name: props["na"] = name
        props["gc"] = fields[2]
        ccc = int(fields[3])
        if ccc: props["ccc"] = ccc
        props["bc"] = fields[4]
        # Decomposition type & mapping.
        dm = fields[5]
        if dm:
            if dm.startswith("<"):
                dt_limit = dm.index(">")
                dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
                dm = dm[dt_limit + 1:].lstrip()
            else:
                dt = "Can"
            props["dt"] = dt
            props["dm"] = dm
        # Numeric type & value.
        decimal = fields[6]
        digit = fields[7]
        nv = fields[8]
        if (decimal and decimal != nv) or (digit and digit != nv):
            raise SyntaxError("error: numeric values differ at\n %s\n" % line)
        if nv:
            # Map improper fractions to proper ones.
            nv = proper_fractions.get(nv, nv)
            props["nv"] = nv
            props["nt"] = "De" if decimal else "Di" if digit else "Nu"
        if fields[9] == "Y": props["Bidi_M"] = True
        # ICU 49 and above does not support Unicode_1_Name any more.
        # See ticket #9013.
        # ISO_Comment is deprecated and has no values.
        # Simple case mappings.
        suc = fields[12]
        slc = fields[13]
        stc = fields[14]
        if suc: props["suc"] = suc
        if slc: props["slc"] = slc
        if stc: props["stc"] = stc
        SetProps(c, end, props)
    if range_first >= 0:
        raise SyntaxError(
            "error: unterminated range started at\n %s\n" %
            range_first_line)
    # Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
    SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
# "@@<tab>start<tab>heading<tab>end": block-level (h1) heading.
_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
# "@<tab><tab>heading": subheading (h2).
_names_h2_re = re.compile("@\t\t(.+)")
# "codepoint<tab>name": character line.
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")


def ParseNamesList(in_file):
    """Parses the h1 and h2 headings from NamesList.txt.

    Appends (start, end, comment) tuples to _h1.
    Appends (cp, comment) tuples to _h2, where cp is the first character
    listed after the h2 heading. Sorts both lists at the end.
    No-break spaces in headings are replaced with regular spaces."""
    pending_h2 = ""
    for line in in_file:
        line = line.strip()
        if not line: continue
        match = _names_h1_re.match(line)
        if match:
            pending_h2 = ""  # Drop a pending h2 when we get to an h1.
            start = int(match.group(1), 16)
            end = int(match.group(3), 16)
            comment = match.group(2).replace(u"\xa0", " ")
            _h1.append((start, end, comment))
            continue
        match = _names_h2_re.match(line)
        if match:
            pending_h2 = match.group(1).replace(u"\xa0", " ")
            continue
        if pending_h2:
            match = _names_char_re.match(line)
            if match:
                c = int(match.group(1), 16)
                _h2.append((c, pending_h2))
                pending_h2 = ""
    _h1.sort()
    _h2.sort()
def ParseNamedProperties(in_file):
    """Parses a .txt file where the first column is a code point range
    and the second column is a property name.

    Sets binary properties to True,
    and other properties to the values in the third column.
    Default values are taken from @missing lines."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(data[2][0], data[2][1])
        else:
            # type == "range"
            (start, end, fields) = (data[2], data[3], data[4])
            if len(fields) == 2:
                # Only range & property name: a binary property.
                SetBinaryPropertyToTrue(fields[1], start, end)
            else:
                SetPropertyValue(fields[1], fields[2], start, end)
def ParseOneProperty(in_file, pname):
    """Parses a .txt file where the first column is a code point range
    and the second column is the value of a known property.

    pname names that property. Its default value is taken from the
    first field of an @missing line."""
    prop = GetProperty(pname)
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue(pname, data[2][0])
        else:
            # type == "range"
            SetPropValue(prop, data[4][1], data[2], data[3])
# Per-file parsers for single-property data files.
# Each one delegates to ParseOneProperty() with the short name
# of the property whose values the file lists.
def ParseBidiMirroring(in_file):
    ParseOneProperty(in_file, "bmg")


def ParseDerivedAge(in_file):
    ParseOneProperty(in_file, "age")


def ParseDerivedBidiClass(in_file):
    ParseOneProperty(in_file, "bc")


def ParseDerivedJoiningGroup(in_file):
    ParseOneProperty(in_file, "jg")


def ParseDerivedJoiningType(in_file):
    ParseOneProperty(in_file, "jt")


def ParseEastAsianWidth(in_file):
    ParseOneProperty(in_file, "ea")


def ParseGraphemeBreakProperty(in_file):
    ParseOneProperty(in_file, "GCB")


def ParseIndicPositionalCategory(in_file):
    ParseOneProperty(in_file, "InPC")


def ParseIndicSyllabicCategory(in_file):
    ParseOneProperty(in_file, "InSC")


def ParseLineBreak(in_file):
    ParseOneProperty(in_file, "lb")


def ParseScripts(in_file):
    ParseOneProperty(in_file, "sc")


def ParseScriptExtensions(in_file):
    ParseOneProperty(in_file, "scx")


def ParseSentenceBreak(in_file):
    ParseOneProperty(in_file, "SB")


def ParseWordBreak(in_file):
    ParseOneProperty(in_file, "WB")
def DoSetNameAlias(alias, start, end, c_props):
    """UpdateProps() callback that adds alias to the Name_Alias value.

    Multiple aliases are stored as one comma-separated string,
    in the order in which they are added."""
    if "Name_Alias" in c_props:
        c_props["Name_Alias"] += ',' + alias
    else:
        c_props["Name_Alias"] = alias
def ParseNameAliases(in_file):
    """Parses Name_Alias from NameAliases.txt.

    A character can have multiple aliases.

    In Unicode 6.0, there are two columns,
    with a name correction in the second column.

    In Unicode 6.1, there are three columns.
    The second contains an alias, the third its type.
    The documented types are:
      correction, control, alternate, figment, abbreviation

    This function does not sort the types, assuming they appear in this order.
    Each alias is stored as "type=alias".
    Raises ValueError if a line covers a range rather than one code point."""
    for data in ReadUCDLines(in_file):
        start = data[2]
        end = data[3]
        if start != end:
            raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
                             (start, end))
        fields = data[4]
        if len(fields) == 2:
            # Two-column format: the alias is always a correction.
            alias = "correction=" + fields[1]
        else:
            alias = fields[2] + '=' + fields[1]
        update = (NeedToSetAlways, DoSetNameAlias, alias)
        UpdateProps(start, end, update)
def NeedToSetNumericValue(nv, start, end, c_props):
    """UpdateProps() callback that checks whether nv must be added.

    Returns True if c_props has no numeric value yet,
    False if it already has the same value.
    Raises ValueError if c_props has a different numeric value."""
    c_nv = c_props.get("nv")
    if c_nv is None:
        # DerivedNumericValues.txt adds a Numeric_Value.
        assert "nt" not in c_props
        return True
    if nv != c_nv:
        raise ValueError(("UnicodeData.txt has nv=%s for %04lX..%04lX " +
                          "but DerivedNumericValues.txt has nv=%s") %
                         (c_nv, start, end, nv))
    return False
def DoSetNumericValue(nv, start, end, c_props):
    """UpdateProps() callback that sets nv together with numeric type "Nu"."""
    c_props["nt"] = "Nu"
    c_props["nv"] = nv
def ParseDerivedNumericValues(in_file):
    """Parses DerivedNumericValues.txt.

    For most characters, the numeric type & value were parsed previously
    from UnicodeData.txt but that does not show the values for Han characters.
    Here we check that values match those from UnicodeData.txt
    and add new ones.
    Raises ValueError (from NeedToSetNumericValue()) on a mismatch."""
    # Ignore the @missing line which has an incorrect number of fields,
    # and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
    # Also, "NaN" is just the Numeric null value anyway.
    for data in ReadUCDLines(in_file):
        (start, end, fields) = (data[2], data[3], data[4])
        # Conditional update to the numeric value in the 4th field.
        update = (NeedToSetNumericValue, DoSetNumericValue, fields[3])
        UpdateProps(start, end, update)
def ParseCaseFolding(in_file):
    """Parses CaseFolding.txt.

    The status field selects the target properties:
    C sets scf & cf, S sets scf, F sets cf, and T sets Turkic_Case_Folding.
    The @missing line provides the default for both scf and cf."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            assert data[2][0] == "C"  # common to scf & cf
            SetDefaultValue("scf", data[2][1])
            SetDefaultValue("cf", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            status = data[4][1]
            mapping = data[4][2]
            assert status in "CSFT"
            if status == "C":
                SetProps(start, end, {"scf": mapping, "cf": mapping})
            elif status == "S":
                SetPropertyValue("scf", mapping, start, end)
            elif status == "F":
                SetPropertyValue("cf", mapping, start, end)
            else:  # status == "T"
                SetPropertyValue("Turkic_Case_Folding", mapping, start, end)
def DoSetConditionalCaseMappings(ccm, start, end, c_props):
    """UpdateProps() callback that adds ccm to Conditional_Case_Mappings.

    Multiple mappings are stored as one comma-separated string,
    in the order in which they are added."""
    if "Conditional_Case_Mappings" in c_props:
        c_props["Conditional_Case_Mappings"] += ',' + ccm
    else:
        c_props["Conditional_Case_Mappings"] = ccm
def ParseSpecialCasing(in_file):
    """Parses SpecialCasing.txt.

    Lines without a condition in the fifth field set lc, tc and uc.
    Lines with a condition are collected in Conditional_Case_Mappings
    as "condition:lc=...&tc=...&uc=...".
    The @missing line provides the lc, tc and uc defaults."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("lc", data[2][0])
            SetDefaultValue("tc", data[2][1])
            SetDefaultValue("uc", data[2][2])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            fields = data[4]
            if len(fields) < 5 or not fields[4]:
                # Unconditional mappings.
                SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
            else:
                # Conditional_Case_Mappings
                ccm = (fields[4] + ":lc=" + fields[1] +
                       "&tc=" + fields[2] + "&uc=" + fields[3])
                update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
                UpdateProps(start, end, update)
def ParseBidiBrackets(in_file):
    """Parses BidiBrackets.txt.

    Sets bpb (second field) and bpt (third field) for each listed
    code point. The @missing line provides the bpt default."""
    for data in ReadUCDLines(in_file, want_missing=True):
        if data[0] == "missing":
            SetDefaultValue("bpt", data[2][1])
        else:
            # type == "range"
            start = data[2]
            end = data[3]
            assert start == end
            mapping = data[4][1]
            bracket_type = data[4][2]
            SetProps(start, end, {"bpb": mapping, "bpt": bracket_type})
# Postprocessing ----------------------------------------------------------- ***
def CompactBlock(b, i):
    """Moves the most common property values of a block into the block.

    b is a _blocks item (start, end, props); i is the _starts index of the
    range that starts at the block start.
    For each property that occurs in the block, the value used by the most
    ranges becomes a block property if it differs from the null/default
    value. Values that equal the resulting block-or-default value are then
    removed from the ranges; ranges that used to inherit a default which
    the block now overrides get that default set explicitly.
    Returns the _starts index of the first range after this block."""
    assert b[0] == _starts[i]
    orig_i = i
    # Count the number of occurrences of each property's value in this block.
    # To minimize the output, count the number of ranges,
    # not the number of code points.
    num_ranges_so_far = 0
    prop_counters = {}
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        for (pname, value) in props.items():
            if pname in prop_counters:
                counter = prop_counters[pname]
            else:
                # First occurrence of this property: all earlier ranges
                # implicitly had its default value.
                counter = {_null_or_defaults[pname]: num_ranges_so_far}
                prop_counters[pname] = counter
            if value in counter:
                counter[value] += 1
            else:
                counter[value] = 1
        # Also count default values for properties that do not occur in a range.
        for pname in prop_counters:
            if pname not in props:
                counter = prop_counters[pname]
                value = _null_or_defaults[pname]
                counter[value] += 1
        num_ranges_so_far += 1
        # Invariant: For each counter, the sum of counts must equal num_ranges_so_far.
        i += 1
    # For each property that occurs within this block,
    # set the most common value as a block property value.
    b_props = b[2]
    for (pname, counter) in prop_counters.items():
        max_value = None
        max_count = 0
        num_unique = 0
        for (value, count) in counter.items():
            if count > max_count:
                max_value = value
                max_count = count
            if count == 1: num_unique += 1
        if max_value != _null_or_defaults[pname]:
            # Avoid picking randomly among several unique values.
            if (max_count > 1 or num_unique == 1):
                b_props[pname] = max_value
    # For each range and property, remove the default+block value
    # but set the default value if that property was not set
    # (i.e., it used to inherit the default value).
    b_defaults = _null_or_defaults.copy()
    b_defaults.update(b_props)
    i = orig_i
    while True:
        start = _starts[i]
        if start > b[1]: break
        props = _props[i]
        for pname in prop_counters:
            if pname in props:
                if props[pname] == b_defaults[pname]: del props[pname]
            elif pname in b_props:
                # b_props only has non-default values.
                # Set the default value if it used to be inherited.
                props[pname] = _null_or_defaults[pname]
        i += 1
    # Return the _starts index of the first range after this block.
    return i
def CompactNonBlock(limit, i):
    """Remove default property values from between-block ranges.

    Starts at _starts index i and stops at the first range
    that starts at or after limit.
    Returns the _starts index of that range."""
    while True:
        start = _starts[i]
        if start >= limit: break
        props = _props[i]
        # Iterate over a copy of the keys so we can del props[pname].
        for pname in list(props):
            if props[pname] == _null_or_defaults[pname]: del props[pname]
        i += 1
    # Return the _starts index of the first range after this block.
    return i
def CompactBlocks():
    """Optimizes block properties.

    Sets properties on blocks to the most commonly used values,
    and removes default+block values from code point properties.
    Ranges outside of blocks only have default values removed."""
    # Ensure that there is a boundary in _starts for each block
    # so that the simple mixing method below works.
    for b in _blocks: AddBoundary(b[0])
    # Walk through ranges and blocks together.
    i = 0
    for b in _blocks:
        b_start = b[0]
        if _starts[i] < b_start:
            i = CompactNonBlock(b_start, i)
        i = CompactBlock(b, i)
    CompactNonBlock(0x110000, i)
# Output ------------------------------------------------------------------- ***
def AppendRange(fields, start, end):
    """Appends the code point or the start..end range to the fields list.

    Code points are written as uppercase hex with at least four digits."""
    if start == end:
        fields.append("%04lX" % start)
    else:
        fields.append("%04lX..%04lX" % (start, end))
def AppendProps(fields, props):
    """Appends one field per property to the fields list.

    True binary values are written as "pname", False ones as "-pname",
    and all other values as "pname=value"."""
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    for pname in sorted(props, key=NormPropName):
        value = props[pname]
        if isinstance(value, bool):
            if not value: pname = "-" + pname
            fields.append(pname)
        else:
            fields.append("%s=%s" % (pname, value))
def WriteFieldsRangeProps(fields, start, end, props, out_file):
    """Writes one semicolon-separated line to out_file.

    The line consists of the given leading fields, the start..end range,
    and the properties in props. The fields list is extended in place."""
    AppendRange(fields, start, end)
    AppendProps(fields, props)
    out_file.write(";".join(fields))
    out_file.write("\n")
def EscapeNonASCII(s):
    """Returns s with each non-ASCII character replaced by an escape.

    Characters up to U+FFFF become \\uXXXX,
    higher ones become \\UXXXXXXXX (uppercase hex digits)."""
    i = 0
    while i < len(s):
        c = ord(s[i])
        if c <= 0x7f:
            i = i + 1
        else:
            if c <= 0xffff:
                esc = u"\\u%04X" % c
            else:
                esc = u"\\U%08X" % c
            s = s[:i] + esc + s[i+1:]
            # Continue after the escape sequence that was just inserted.
            i = i + len(esc)
    return s
def WritePreparsedUCD(out_file):
    """Writes the parsed UCD data to out_file in the preparsed (ppucd) syntax.

    Output order: header comments and "ucd" version line, "property" lines,
    the two "binary" value lines, "value" lines for non-binary properties,
    the "defaults" line, and then "block", "algnamesrange" and "cp" lines
    mixed with NamesList headings as comments.

    Sets the module-level _copyright to a default if it is still empty.

    Args:
      out_file: Writable text file object.
    """
    global _copyright, _terms_of_use

    def write_record(fields):
        # Each record is one line of semicolon-separated fields.
        out_file.write(";".join(fields) + "\n")

    out_file.write("# Preparsed UCD generated by ICU\n")
    if not _copyright:
        _copyright = "# Copyright (c) 1991-" + _current_year + " Unicode, Inc."
    out_file.write(_copyright + "\n")
    if _terms_of_use:
        out_file.write(_terms_of_use + "\n")
    out_file.write("ucd;%s\n\n" % _ucd_version)
    # Sort property names (props keys) by their normalized forms
    # and output properties in that order.
    pnames = sorted(_null_values, key=NormPropName)
    for pname in pnames:
        prop = _properties[pname]
        write_record(["property", prop[0]] + prop[1])
    out_file.write("\n")
    write_record(["binary"] + _binary_values["N"])
    write_record(["binary"] + _binary_values["Y"])
    for pname in pnames:
        prop = _properties[pname]
        short_names = prop[2]
        if short_names and prop[0] != "Binary":
            for name in sorted(short_names):
                write_record(["value", prop[1][0]] + prop[3][name])
    out_file.write("\n")
    # Ensure that there is a boundary in _starts for each
    # range of data we mix into the output,
    # so that the simple mixing method below works.
    for b in _blocks:
        AddBoundary(b[0])
    for r in _alg_names_ranges:
        AddBoundary(r[0])
    for h in _h1:
        AddBoundary(h[0])
    for h in _h2:
        AddBoundary(h[0])
    # Write the preparsed data.
    # TODO: doc syntax
    # - ppucd.txt = preparsed UCD
    # - Only whole-line comments starting with #, no inline comments.
    # - defaults must precede any block or cp lines
    # - block;a..b must precede any cp lines with code points in a..b
    # - Some code may require that all cp lines with code points in a..b
    #   appear between block;a..b and the next block line.
    # - block lines are not required; cp lines can have data for
    #   ranges outside of blocks.
    WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
    i_blocks = 0
    i_alg = 0
    i_h1 = 0
    i_h2 = 0
    for i in range(len(_starts) - 1):
        start = _starts[i]
        end = _starts[i + 1] - 1
        # Block with default properties.
        if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
            b = _blocks[i_blocks]
            WriteFieldsRangeProps(["\nblock"], b[0], b[1], b[2], out_file)
            i_blocks += 1
        # NamesList h1 heading (for [most of] a block).
        if i_h1 < len(_h1) and start == _h1[i_h1][0]:
            h = _h1[i_h1]
            out_file.write("# %04lX..%04lX %s\n" %
                           (h[0], h[1], EscapeNonASCII(h[2])))
            i_h1 += 1
        # Algorithmic-names range.
        if (i_alg < len(_alg_names_ranges) and
                start == _alg_names_ranges[i_alg][0]):
            r = _alg_names_ranges[i_alg]
            fields = ["algnamesrange"]
            AppendRange(fields, r[0], r[1])
            # NOTE(review): assumes that any items after (start, end) in a
            # range entry are further string fields for this line -- confirm
            # against the code that fills _alg_names_ranges.
            fields.extend(r[2:])
            write_record(fields)
            i_alg += 1
        # NamesList h2 heading.
        if i_h2 < len(_h2) and start == _h2[i_h2][0]:
            out_file.write("# %s\n" % EscapeNonASCII(_h2[i_h2][1]))
            i_h2 += 1
        # Code point/range data.
        props = _props[i]
        # Omit ranges with only default+block properties.
        if props:
            WriteFieldsRangeProps(["cp"], start, end, props, out_file)
# Write Normalizer2 input files -------------------------------------------- ***
# Ported from gennorm/store.c.
def WriteAllCC(out_file):
    """Writes all non-zero Canonical_Combining_Class values to out_file.

    Adjacent ranges with the same "ccc" value are merged. Each run is
    written as "XXXX:ccc" for a single code point or "XXXX..YYYY:ccc" for a
    range. A missing or false "ccc" value is treated as 0 and not written.
    A run is written when a range with a different value follows it.

    Args:
      out_file: Writable text file object.
    """
    out_file.write("# Canonical_Combining_Class (ccc) values\n")
    prev_start = 0
    prev_cc = 0
    for start, props in zip(_starts, _props):
        cc = props.get("ccc") or 0
        if cc == prev_cc:
            continue
        if prev_cc != 0:
            # The run of prev_cc ends just before this range.
            last_code_point = start - 1
            if prev_start == last_code_point:
                out_file.write("%04X:%d\n" % (last_code_point, prev_cc))
            else:
                out_file.write("%04X..%04X:%d\n" %
                               (prev_start, last_code_point, prev_cc))
        prev_start = start
        prev_cc = cc
def HasMapping(c):
    """Returns a true value if c has a Decomposition_Type other than "None".

    The result is the "dt" value itself when that is missing or empty,
    otherwise a bool.
    """
    decomp_type = GetProps(c).get("dt")
    return decomp_type and decomp_type != "None"
def HasOneWayMapping(c):
    """Returns True if c has a decomposition mapping that does not round-trip.

    Returns False if c has no mapping at all. A compatibility mapping is
    always one-way. For a canonical mapping, the first code point of the
    decomposition is followed in a loop.
    """
    while True:
        props = GetProps(c)
        dt = props.get("dt")
        if not dt or dt == "None":
            return False  # no mapping
        if dt != "Can":
            # c has a compatibility mapping.
            return True
        # The canonical decomposition is a one-way mapping if
        # - it does not map to exactly two code points
        # - c has ccc!=0
        # - c has the Composition_Exclusion property
        # - its starter has a one-way mapping (loop for this)
        # - its non-starter decomposes
        nfd = props["dm"].split()
        if (len(nfd) != 2 or
                props.get("ccc") or
                props.get("Comp_Ex") or
                HasMapping(int(nfd[1], 16))):
            return True
        c = int(nfd[0], 16)  # continue with the starter
def WriteNorm2NFCTextFile(path):
    """Writes the gennorm2 input file nfc.txt into the folder path.

    The file gets a header with the Unicode version, the ccc values, and
    one line per canonical decomposition mapping: "XXXX=mapping" for a
    round-trip mapping, "XXXX>mapping" when Comp_Ex is set.

    Args:
      path: Destination folder.
    """
    header = (
        "# Copyright (C) 1999-" + _current_year +
        ", International Business Machines\n"
        "# Corporation and others. All Rights Reserved.\n"
        "# file name: nfc.txt\n"
        "# machine-generated by ICU\n"
        "# Complete data for Unicode NFC normalization.\n"
        "* Unicode " + _ucd_version + "\n"
        "\n")
    with open(os.path.join(path, "nfc.txt"), "w") as out_file:
        out_file.write(header)
        # NOTE(review): the ccc section is assumed to belong here, between
        # the header and the mappings -- confirm.
        WriteAllCC(out_file)
        out_file.write("\n# Canonical decomposition mappings\n")
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if dm and dm[0] != '<' and props["dt"] == "Can":
                # A mapping applies to a single code point, not to a range.
                assert start == end
                # The Comp_Ex=Full_Composition_Exclusion property tells us
                # whether the canonical decomposition round-trips.
                separator = '>' if props.get("Comp_Ex") else '='
                out_file.write("%04X%s%s\n" % (start, separator, dm))
def WriteNorm2NFKCTextFile(path):
    """Writes the gennorm2 input file nfkc.txt into the folder path.

    Writes a header with the Unicode version, then a one-way "XXXX>mapping"
    line for each compatibility decomposition and for each canonical
    decomposition that round-trips in NFC but is one-way in NFKC.

    Args:
      path: Destination folder.
    """
    header = (
        "# Copyright (C) 1999-" + _current_year +
        ", International Business Machines\n"
        "# Corporation and others. All Rights Reserved.\n"
        "# file name: nfkc.txt\n"
        "# machine-generated by ICU\n"
        "# Data for Unicode NFKC normalization.\n"
        "# This file contains only compatibility decomposition mappings,\n"
        "# plus those canonical decompositions that change from NFC "
        "round-trip mappings\n"
        "# to NFKC one-way mappings.\n"
        "# Use this file as the second gennorm2 input file after nfc.txt.\n"
        "* Unicode " + _ucd_version + "\n"
        "\n")
    with open(os.path.join(path, "nfkc.txt"), "w") as out_file:
        out_file.write(header)
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            props = _props[i]
            dm = props.get("dm")
            if not dm or dm[0] == '<':
                continue
            # A mapping applies to a single code point, not to a range.
            assert start == end
            if props["dt"] != "Can":
                # Compatibility decomposition.
                out_file.write("%04X>%s\n" % (start, dm))
            elif not props.get("Comp_Ex") and HasOneWayMapping(start):
                # NFC round-trip mapping turns into NFKC one-way mapping.
                out_file.write("%04X>%s # NFC round-trip, NFKC one-way\n" %
                               (start, dm))
def WriteNorm2NFKC_CFTextFile(path):
    """Writes the gennorm2 input file nfkc_cf.txt into the folder path.

    Adjacent ranges with the same "NFKC_CF" value are merged and written as
    "XXXX>mapping" or "XXXX..YYYY>mapping". Ranges without an NFKC_CF
    value, and values that start with '<', are not written.

    Args:
      path: Destination folder.
    """
    header = (
        "# Unicode Character Database\n"
        "# Copyright (c) 1991-" + _current_year + " Unicode, Inc.\n"
        "# For terms of use, see\n"
        "# For documentation, see\n"
        "# file name: nfkc_cf.txt\n"
        "# machine-generated by ICU\n"
        "# This file contains the Unicode NFKC_CF mappings,\n"
        "# extracted from the UCD file DerivedNormalizationProps.txt,\n"
        "# and reformatted into syntax for the gennorm2 Normalizer2 "
        "data generator tool.\n"
        "# Use this file as the third gennorm2 input file "
        "after nfc.txt and nfkc.txt.\n"
        "\n")
    with open(os.path.join(path, "nfkc_cf.txt"), "w") as out_file:

        def write_range(first, last, mapping):
            # An empty mapping is valid output; None and '<...' are not.
            if mapping is None or (mapping and mapping[0] == '<'):
                return
            if first == last:
                out_file.write("%04X>%s\n" % (first, mapping))
            else:
                out_file.write("%04X..%04X>%s\n" % (first, last, mapping))

        out_file.write(header)
        out_file.write("* Unicode " + _ucd_version + "\n\n")
        prev_start = 0
        prev_end = 0
        prev_nfkc_cf = None
        for i in range(len(_starts) - 1):
            start = _starts[i]
            end = _starts[i + 1] - 1
            nfkc_cf = _props[i].get("NFKC_CF")
            # Merge with the previous range if possible,
            # or remember this range for merging.
            if nfkc_cf == prev_nfkc_cf and (prev_end + 1) == start:
                prev_end = end
            else:
                write_range(prev_start, prev_end, prev_nfkc_cf)
                prev_start = start
                prev_end = end
                prev_nfkc_cf = nfkc_cf
        # Flush the final pending range; otherwise a mapping on the last
        # range(s) of the code point space would be lost.
        write_range(prev_start, prev_end, prev_nfkc_cf)
def WriteNorm2(path):
    """Writes the Normalizer2 input files nfc.txt, nfkc.txt and nfkc_cf.txt.

    Args:
      path: Destination folder.
    """
    WriteNorm2NFCTextFile(path)
    WriteNorm2NFKCTextFile(path)
    WriteNorm2NFKC_CFTextFile(path)
# UTS #46 Normalizer2 input file ------------------------------------------- ***
# List of (compiled regex, replacement) pairs,
# applied in this order to each data line of the IDNA mapping table.
_idna_replacements = [
    # Several versions of avoiding circular FFFD>FFFD mappings,
    # depending on the version of the input file.
    (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
    (re.compile(r"\.\.FFFD"), "..FFFC"),
    # The replacement must be a raw string:
    # a plain "\1" is the control character U+0001, not a group reference.
    (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
    # Since we switch between checking and not checking for STD3 character
    # restrictions at runtime, checking the non-LDH ASCII characters in code,
    # we treat these values here like their regular siblings.
    (re.compile(r"^([^;]+) ; disallowed_STD3_valid"),
     r"# \1disallowed_STD3_valid"),
    (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
    # For UTS #46, we do not care about "not valid in IDNA2008".
    (re.compile(r"; *; NV8 +"), ""),
    # Normal transformations.
    (re.compile(r"; disallowed"), ">FFFD"),
    (re.compile(r"; ignored"), ">"),
    (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
    (re.compile(r"; mapped +; "), ">"),
    (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >"),
]
def _AlignInlineComment(line):
    """Returns line with its inline comment moved to column 40 if possible."""
    comment_pos = line.find("#", 1)
    if comment_pos < 0:
        # No inline comment: nothing to align (and no padding to add).
        return line
    if comment_pos < 40:
        return (line[:comment_pos] + ((40 - comment_pos) * ' ') +
                line[comment_pos:])
    if comment_pos > 40:
        space_pos = comment_pos
        while space_pos > 0 and line[space_pos - 1] == ' ':
            space_pos -= 1
        if space_pos < 40:
            # Fewer than 40 characters before the comment:
            # Align comments at column 40.
            return line[:40] + line[comment_pos:]
        # 40 or more characters before the comment:
        # Keep one space between contents and comment.
        return line[:space_pos] + " " + line[comment_pos:]
    return line


def IdnaToUTS46TextFile(s, t):
    """Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format.

    Comment and empty lines are copied. Data lines are rewritten with
    _idna_replacements and their inline comments are aligned.

    Args:
      s: Path of the input file.
      t: Nominal destination path; only its folder is used.

    Returns:
      The path of the file written: "uts46.txt" in the folder of t.
    """
    reformat_note = (
        "# ================================================\n"
        "# This file has been reformatted into syntax for the\n"
        "# gennorm2 Normalizer2 data generator tool.\n"
        '# "valid", "disallowed_STD3_valid" and "deviation" lines '
        "are commented out.\n"
        '# "mapped" and "disallowed_STD3_mapped" are changed to use '
        'the ">" mapping syntax.\n'
        '# "disallowed" lines map to U+FFFD.\n'
        '# "ignored" lines map to an empty string.\n'
        "# Characters disallowed under STD3 rules are treated as "
        "valid or mapped;\n"
        "# they are handled in code.\n"
        "# Deviation characters are also handled in code.\n"
        "# Use this file as the second gennorm2 input file after nfc.txt.\n"
        "# ================================================\n")
    # Different input/output file names.
    dest_path = os.path.dirname(t)
    t = os.path.join(dest_path, "uts46.txt")
    # TODO: With Python 2.7+, combine the two with statements into one.
    with open(s, "r") as in_file:
        with open(t, "w") as out_file:
            out_file.write("# Original file:\n")
            for line in in_file:
                orig_line = line
                if line.startswith("# For documentation, see"):
                    # Keep the original line, then explain the reformatting.
                    out_file.write(line)
                    out_file.write(reformat_note)
                    continue
                if line[0] in "#\r\n":
                    # Comment or empty line: copy as is.
                    out_file.write(line)
                    continue
                for rep in _idna_replacements:
                    line = rep[0].sub(rep[1], line)
                # Align inline comments at column 40.
                line = _AlignInlineComment(line)
                # Write the modified line.
                out_file.write(line)
                if "..FFFF" in orig_line and "..FFFC" in line:
                    # The range was cut short before FFFD;
                    # map the two noncharacters after it separately.
                    out_file.write("FFFE..FFFF >FFFD\n")
    return t
# Preprocessing ------------------------------------------------------------ ***
# Matches a data line (starting with a hex digit) with a trailing comment;
# group 1 is the data without the comment and the spaces before it.
_strip_re = re.compile(r"([0-9a-fA-F]+.+?) *#.*")
# Matches the leading code point field of a data line, through its semicolon.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteMergedRange(out_file, first, last, data):
    """Writes one merged line for the code points first..last with data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies the file s to t, stripping comments behind data lines.

    Trailing whitespace is removed from all other lines.
    If do_merge is true, then consecutive single-code point lines with
    adjacent code points and identical data are merged into one line with
    range syntax.

    Args:
      s: Source file path.
      t: Destination file path.
      do_merge: Whether to merge adjacent lines with identical data.

    Returns:
      t
    """
    # TODO: We do not seem to need the do_merge argument and logic any more.
    # TODO: With Python 2.7+, combine the two with statements into one.
    with open(s, "r") as in_file:
        with open(t, "w") as out_file:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if not do_merge:
                    # Only strip, don't merge: output the stripped line.
                    out_file.write(line + "\n")
                    continue
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    # Keep the data including its leading semicolon.
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # output the current range
                    _WriteMergedRange(out_file, first, last, first_data)
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # no data on this line, output as is
                    out_file.write(line + "\n")
                elif last < 0:
                    # data on this line, store for possible range compaction:
                    # set as the first line in a possible range
                    first = c
                    last = c
                    first_data = data
                else:
                    # must be c == (last + 1) and data == first_data
                    # because of previous conditions
                    # continue with the current range
                    last = c
            if do_merge and last >= 0:
                # output the last range in the file
                _WriteMergedRange(out_file, first, last, first_data)
    return t
def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others.

    Returns the destination path t.
    """
    merge_lines = False
    return CopyAndStripWithOptionalMerge(s, t, merge_lines)
def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.

    Returns the destination path t.
    """
    return CopyAndStripWithOptionalMerge(s, t, True)
def PrependBOM(s, t):
    """Copies the file s to t with a UTF-8 byte order mark prepended.

    Both files are handled as bytes, so the contents are copied unchanged
    and the BOM is written as exactly three bytes.

    Returns:
      t
    """
    # TODO: With Python 2.7+, combine the two with statements into one.
    with open(s, "rb") as in_file:
        with open(t, "wb") as out_file:
            out_file.write(b"\xef\xbb\xbf")  # UTF-8 BOM for ICU svn
            shutil.copyfileobj(in_file, out_file)
    return t
def CopyOnly(s, t):
    """Preprocessor that copies the file s to t unchanged; returns t."""
    source_path, dest_path = s, t
    shutil.copy(source_path, dest_path)
    return dest_path
def DontCopy(s, t):
    """Preprocessor that copies nothing.

    Returns the source path s, so that the caller works with the original
    file; the destination path t is ignored.
    """
    return s
# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
    "BidiBrackets.txt": (DontCopy, ParseBidiBrackets),
    "BidiMirroring.txt": (DontCopy, ParseBidiMirroring),
    "BidiTest.txt": (CopyOnly, "testdata"),
    "Blocks.txt": (DontCopy, ParseBlocks),
    "CaseFolding.txt": (CopyOnly, ParseCaseFolding),
    "DerivedAge.txt": (DontCopy, ParseDerivedAge),
    "DerivedBidiClass.txt": (DontCopy, ParseDerivedBidiClass),
    "DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedJoiningGroup.txt": (DontCopy, ParseDerivedJoiningGroup),
    "DerivedJoiningType.txt": (DontCopy, ParseDerivedJoiningType),
    "DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
    "DerivedNumericValues.txt": (DontCopy, ParseDerivedNumericValues),
    "EastAsianWidth.txt": (DontCopy, ParseEastAsianWidth),
    "emoji-data.txt": (DontCopy, ParseNamedProperties),
    "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
    "GraphemeBreakTest.txt": (PrependBOM, "testdata"),
    "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
    "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
    "LineBreak.txt": (DontCopy, ParseLineBreak),
    "LineBreakTest.txt": (PrependBOM, "testdata"),
    "NameAliases.txt": (DontCopy, ParseNameAliases),
    "NamesList.txt": (DontCopy, ParseNamesList),
    "NormalizationCorrections.txt": (CopyOnly,),  # Only used in gensprep.
    "NormalizationTest.txt": (CopyAndStrip,),
    # Explicit order numbers: these files must be parsed before the others.
    "PropertyAliases.txt": (DontCopy, ParsePropertyAliases, 0),
    "PropertyValueAliases.txt": (DontCopy, ParsePropertyValueAliases, 1),
    "PropList.txt": (DontCopy, ParseNamedProperties),
    "SentenceBreakProperty.txt": (DontCopy, ParseSentenceBreak),
    "SentenceBreakTest.txt": (PrependBOM, "testdata"),
    "Scripts.txt": (DontCopy, ParseScripts),
    "ScriptExtensions.txt": (DontCopy, ParseScriptExtensions),
    "SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
    "UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
    "WordBreakProperty.txt": (DontCopy, ParseWordBreak),
    "WordBreakTest.txt": (PrependBOM, "testdata"),
    # From<version>/
    "IdnaMappingTable.txt": (IdnaToUTS46TextFile, "norm2"),
}
# List of lists of files to be parsed in order.
# Inner lists contain (basename, path, parser) tuples.
# The outer index is the parsing order, 0..9.
_files_to_parse = [[], [], [], [], [], [], [], [], [], []]
# Get the standard basename from a versioned filename.
# For example, match "UnicodeData-6.1.0d8.txt"
# so we can turn it into "UnicodeData.txt".
# Group 1 is the name before the version suffix, group 2 the file extension.
# NOTE(review): the extension group is inferred from the example above and
# from the code that joins two groups -- confirm.
_file_version_re = re.compile(r"([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
def PreprocessFiles(source_files, icu_src_root):
    """Preprocesses the known UCD files and queues them for parsing.

    A file with a version suffix in its name is first renamed (moved) to its
    standard basename. Each file listed in _files is then passed to its
    preprocessor, and, if a parser is specified, a
    (basename, path to parse, parser) tuple is appended to
    _files_to_parse[order].

    Args:
      source_files: Iterable of paths of the downloaded files.
      icu_src_root: ICU source root folder.

    Raises:
      Exception: if two source files have the same standard basename.
    """
    unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
    norm2_path = os.path.join(unidata_path, "norm2")
    testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
    folder_to_path = {
        "unidata": unidata_path,
        "norm2": norm2_path,
        "testdata": testdata_path,
    }
    files_processed = set()
    for source_file in source_files:
        (folder, basename) = os.path.split(source_file)
        match = _file_version_re.match(basename)
        if match:
            new_basename = match.group(1) + match.group(2)
            if new_basename != basename:
                print("Removing version suffix from " + source_file)
                # ... so that we can easily compare UCD files.
                new_source_file = os.path.join(folder, new_basename)
                shutil.move(source_file, new_source_file)
                basename = new_basename
                source_file = new_source_file
        if basename not in _files:
            continue
        print("Preprocessing %s" % basename)
        if basename in files_processed:
            raise Exception("duplicate file basename %s!" % basename)
        # Remember the basename so that the duplicate check above can fire.
        files_processed.add(basename)
        value = _files[basename]
        preprocessor = value[0]
        if len(value) >= 2 and isinstance(value[1], (str, unicode)):
            # The value was [preprocessor, dest_folder, ...], leave [...].
            dest_folder = value[1]
            value = value[2:]
        else:
            # The value was [preprocessor, ...], leave [...].
            dest_folder = "unidata"
            value = value[1:]
        dest_path = folder_to_path[dest_folder]
        if not os.path.exists(dest_path):
            os.makedirs(dest_path)
        dest_file = os.path.join(dest_path, basename)
        parse_file = preprocessor(source_file, dest_file)
        if value:
            # value is now (parser,) or (parser, order).
            order = 9 if len(value) < 2 else value[1]
            _files_to_parse[order].append((basename, parse_file, value[0]))
# Character names ---------------------------------------------------------- ***
# TODO: Turn this script into a module that
# a) gives access to the parsed data
# b) has a PreparseUCD(ucd_root, icu_src_root) function
# c) has a ParsePreparsedUCD(filename) function
# d) has a WritePreparsedUCD(filename) function
# and then use it from a new script for names.
# Some more API:
# - generator GetRangesAndProps() -> (start, end, props)*
def IncCounter(counters, key, inc=1):
    """Adds inc to counters[key], starting from 0 for a new key.

    Args:
      counters: Dict of counters; modified in place.
      key: Counter key.
      inc: Increment; defaults to 1.
    """
    counters[key] = counters.get(key, 0) + inc
# Substrings of character names, tried in order when splitting a name:
# the part of a name up to and including the first ending that is found
# is counted as one token.
# NOTE(review): this tuple looks truncated in this copy of the file.
# The comments below refer to SIGN, VOWEL and math symbol entries that are
# not present, and the closing parenthesis is missing.
# Restore the full list before using this code.
endings = (
"TILE ", "CARD ", "FACE ",
# List SIGN before VOWEL to catch "vowel sign".
# For names of math symbols,
def SplitName(name, tokens):
    """Splits a character name into tokens and counts them in tokens.

    If the name contains one of the endings, then the name up to and
    including the first such ending (in the order of the endings tuple) is
    one token. The rest of the name is split after each space or hyphen;
    the delimiter stays at the end of its token. The remainder after the
    last delimiter is counted as well, even if it is empty.

    Args:
      name: Character name string.
      tokens: Dict of token counters; modified in place.
    """
    start = 0
    for e in endings:
        i = name.find(e)
        if i >= 0:
            start = i + len(e)
            IncCounter(tokens, name[:start])
            # The order of the endings matters: only the first match counts.
            break
    for i in range(start, len(name)):
        if name[i] in (' ', '-'):
            IncCounter(tokens, name[start:i + 1])
            start = i + 1
    IncCounter(tokens, name[start:])
def PrintNameStats():
    """Prints statistics about character names to stdout.

    Counts names, name lengths, name characters and name tokens for the
    "na", "na1" and "Name_Alias" properties in _props, and then estimates
    how many bytes could be saved by encoding frequent tokens with 1-byte
    codes and with 2-byte codes (lead byte + trail byte).
    """
    # TODO: This name analysis code is out of date.
    # It needs to consider the multi-type Name_Alias values.
    name_pnames = ("na", "na1", "Name_Alias")
    counts = dict.fromkeys(name_pnames, 0)
    total_lengths = counts.copy()
    max_length = 0
    max_per_cp = 0
    name_chars = set()
    num_digits = 0
    token_counters = {}
    char_counters = {}
    for i in range(len(_starts) - 1):
        props = _props[i]
        per_cp = 0
        for pname in name_pnames:
            if pname not in props:
                continue
            counts[pname] += 1
            name = props[pname]
            total_lengths[pname] += len(name)
            name_chars |= set(name)
            max_length = max(max_length, len(name))
            per_cp += len(name) + 1
            max_per_cp = max(max_per_cp, per_cp)
            SplitName(name, token_counters)
            for c in name:
                if c in "0123456789":
                    num_digits += 1
                IncCounter(char_counters, c)
    for pname in name_pnames:
        print("'%s' character names: %d / %d bytes" %
              (pname, counts[pname], total_lengths[pname]))
    print("%d total bytes in character names" % sum(total_lengths.values()))
    print("%d name-characters: %s" %
          (len(name_chars), "".join(sorted(name_chars))))
    print("%d digits 0-9" % num_digits)
    # NOTE(review): sorting most-frequent-first is assumed here -- confirm.
    count_chars = sorted(((count, c) for (c, count) in char_counters.items()),
                         reverse=True)
    for cc in count_chars:
        print("name-chars: %6d * '%s'" % cc)
    print("max. name length: %d" % max_length)
    print("max. length of all (names+NUL) per cp: %d" % max_per_cp)

    token_lengths = sum([len(t) + 1 for t in token_counters])
    print("%d total tokens, %d bytes with NUL" %
          (len(token_counters), token_lengths))

    counts_tokens = []
    for (token, count) in token_counters.items():
        # If we encode a token with a 1-byte code, then we save len(t)-1 bytes
        # each time but have to store the token string itself with a length or
        # terminator byte, plus a 2-byte entry in an token index table.
        savings = count * (len(token) - 1) - (len(token) + 1 + 2)
        if savings > 0:
            counts_tokens.append((savings, count, token))
    # Highest savings first, so that the slices below pick the best tokens.
    # NOTE(review): this sort order is assumed -- confirm.
    counts_tokens.sort(reverse=True)
    print("%d tokens might save space with 1-byte codes" % len(counts_tokens))

    # Codes=bytes, 40 byte values for name_chars.
    # That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
    # Make each 2-byte token the token string index itself, rather than
    # and index into a string index table.
    # More lead bytes but also more savings.
    num_units = 256
    max_lead = (token_lengths + 255) // 256  # integer division
    max_token_units = num_units - len(name_chars)
    results = []
    for num_lead in range(min(max_lead, max_token_units) + 1):
        max1 = max_token_units - num_lead
        ct = counts_tokens[:max1]
        tokens1 = set([t for (s, c, t) in ct])
        for (token, count) in token_counters.items():
            if token in tokens1:
                continue
            # If we encode a token with a 2-byte code, then we save len(t)-2
            # bytes each time but have to store the token string itself with a
            # length or terminator byte.
            savings = count * (len(token) - 2) - (len(token) + 1)
            if savings > 0:
                ct.append((savings, count, token))
        # NOTE(review): assumed; this also reorders the 1-byte tokens -- confirm.
        ct.sort(reverse=True)
        # A 2-byte-code-token index cannot be limit_t_lengths or higher.
        limit_t_lengths = num_lead * 256
        token2_index = 0
        for i in range(max1, len(ct)):
            if token2_index >= limit_t_lengths:
                del ct[i:]
                # Stop: the remaining indexes no longer exist.
                break
            token2_index += len(ct[i][2]) + 1
        cumul_savings = sum([s for (s, c, t) in ct])
        # print ("%2d 1-byte codes: %4d tokens might save %6d bytes" %
        #        (max1, len(ct), cumul_savings))
        results.append((cumul_savings, max1, ct))
    best = max(results)  # (cumul_savings, max1, ct)

    max1 = best[1]
    print("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
          (best[0], max1, max_token_units - max1))
    counts_tokens = best[2]
    cumul_savings = 0
    for i, t in enumerate(counts_tokens):
        n = 1 if i < max1 else 2
        i1 = i + 1
        cumul_savings += t[0]
        # Print the first 250 tokens, then every 100th, and the last one.
        if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
            print(("%04d. cumul. %6d bytes save %6d bytes from " +
                   "%5d * %d-byte token for %2d='%s'") %
                  (i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
# ICU API ------------------------------------------------------------------ ***
# Sample line to match, e.g.:
#    UCHAR_BINARY_LIMIT=57,
_uchar_re = re.compile(
    r" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")
# Sample line to match:
#    /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(r" */\*\* *([A-Z][a-z]) ")
# Matches an enum constant line of the form U_NAME = number,
_gc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")
# Sample line to match:
#    /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(r" */\*\* *([A-Z]{1,3}) ")
# Matches an enum constant line of the form U_NAME = number,
_bc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")
# Matches an enum constant line of the form UBLOCK_NAME = number,
_ublock_re = re.compile(r" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")
# Matches an enum constant whose name contains a property prefix:
# group 1 is the whole constant, group 2 the property part (e.g. EA),
# group 3 the value part.
_prop_and_value_re = re.compile(
    r" *(U_(BPT|DT|EA|GCB|HST|LB|JG|JT|NT|SB|WB)_([0-9A-Z_]+))")
# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases): a constant defined as another U... constant.
_prop_and_alias_re = re.compile(r" *U_[0-9A-Z_]+ *= *U")
def _ParseCommentedEnumLine(line, mode, comment_value, comment_re, enum_re):
    """Handles one line inside an enum whose values are named in comments.

    Args:
      line: The header file line.
      mode: Current mode; also the short name of the property.
      comment_value: Value name from the preceding comment line, or "".
      comment_re: Regex for a comment line; group 1 is the value name.
      enum_re: Regex for an enum constant line; group 1 is the constant.

    Returns:
      The new (mode, comment_value) pair.
    """
    if line.startswith("}"):
        # End of the enum.
        return ("", comment_value)
    match = comment_re.match(line)
    if match:
        # Remember the value name for the enum constant that follows.
        return (mode, match.group(1))
    match = enum_re.match(line)
    if match and comment_value:
        prop = _properties[mode]
        vname = GetShortPropertyValueName(prop, comment_value)
        _pname_to_icu_prop[mode][2].append((match.group(1), vname))
    return (mode, "")


def ParseUCharHeader(icu_src_root):
    """Parses ICU property and value enum constants from uchar.h.

    Adds a (property enum, short property name, values list) tuple per
    UCHAR_ constant to _pname_to_icu_prop (and _icu_properties), and
    appends (value enum, short value name) pairs to the values lists.
    Afterwards adds values for ccc/lccc/tccc, gcm and the normalization
    quick check properties, which are not parsed from the header.

    Args:
      icu_src_root: ICU source root folder.
    """
    uchar_path = os.path.join(icu_src_root, "source",
                              "common", "unicode", "uchar.h")
    with open(uchar_path, "r") as uchar_file:
        mode = ""  # Mode string (=pname) during context-sensitive parsing.
        comment_value = ""  # Property value from a comment preceding an enum.
        # Note: The enum UProperty is first in uchar.h,
        # before the enums for values.
        for line in uchar_file:
            # Parse some enums via context-sensitive "modes".
            # Necessary because the enum constant names do not contain
            # enough information.
            if "enum UCharCategory" in line:
                mode = "gc"
                comment_value = ""
                continue
            if mode == "gc":
                (mode, comment_value) = _ParseCommentedEnumLine(
                    line, mode, comment_value, _gc_comment_re, _gc_re)
                continue
            if "enum UCharDirection {" in line:
                mode = "bc"
                comment_value = ""
                continue
            if mode == "bc":
                (mode, comment_value) = _ParseCommentedEnumLine(
                    line, mode, comment_value, _bc_comment_re, _bc_re)
                continue
            # No mode, parse enum constants whose names contain
            # enough information to parse without requiring context.
            match = _uchar_re.match(line)
            if match:
                prop_enum = match.group(1)
                if prop_enum.endswith("_LIMIT"):
                    # Ignore "UCHAR_BINARY_LIMIT=57," etc.
                    continue
                # Strip the "UCHAR_" prefix.
                pname = GetShortPropertyName(prop_enum[6:])
                icu_prop = (prop_enum, pname, [])
                # NOTE(review): storing the tuple in _icu_properties, in
                # header order, is assumed from how that list is read by
                # the checking/writing code -- confirm.
                _icu_properties.append(icu_prop)
                _pname_to_icu_prop[pname] = icu_prop
                continue
            match = _ublock_re.match(line)
            if match:
                prop_enum = match.group(1)
                if prop_enum == "UBLOCK_COUNT":
                    continue
                prop = _properties["blk"]
                # Strip the "UBLOCK_" prefix.
                vname = GetShortPropertyValueName(prop, prop_enum[7:])
                icu_values = _pname_to_icu_prop["blk"][2]
                icu_values.append((prop_enum, vname))
                continue
            match = _prop_and_value_re.match(line)
            if match:
                (prop_enum, vname) = match.group(1, 3)
                if vname == "COUNT" or _prop_and_alias_re.match(line):
                    continue
                pname = GetShortPropertyName(match.group(2))
                prop = _properties[pname]
                vname = GetShortPropertyValueName(prop, vname)
                icu_values = _pname_to_icu_prop[pname][2]
                icu_values.append((prop_enum, vname))
    # ccc, lccc, tccc use their numeric values as "enum" values.
    # In the UCD data, these numeric values are the first value names,
    # followed by the short & long value names.
    # List the ccc values in numeric order.
    prop = _properties["ccc"]
    icu_values = _pname_to_icu_prop["ccc"][2]
    for ccc in sorted([int(name) for name in prop[2]]):
        icu_values.append((ccc, str(ccc)))
    _pname_to_icu_prop["lccc"][2].extend(icu_values)  # Copy ccc -> lccc.
    _pname_to_icu_prop["tccc"][2].extend(icu_values)  # Copy ccc -> tccc.
    # No need to parse predictable General_Category_Mask enum constants.
    # Just define them in ASCII order.
    prop = _properties["gcm"]
    icu_values = _pname_to_icu_prop["gcm"][2]
    for vname in sorted(prop[2]):
        icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
    # Hardcode known values for the normalization quick check properties,
    # see unorm2.h for the UNormalizationCheckResult enum.
    icu_values = _pname_to_icu_prop["NFC_QC"][2]
    icu_values.append(("UNORM_NO", "N"))
    icu_values.append(("UNORM_YES", "Y"))
    icu_values.append(("UNORM_MAYBE", "M"))
    _pname_to_icu_prop["NFKC_QC"][2].extend(icu_values)  # Copy NFC -> NFKC.
    # No "maybe" values for NF[K]D.
    icu_values = _pname_to_icu_prop["NFD_QC"][2]
    icu_values.append(("UNORM_NO", "N"))
    icu_values.append(("UNORM_YES", "Y"))
    _pname_to_icu_prop["NFKD_QC"][2].extend(icu_values)  # Copy NFD -> NFKD.
# Sample line to match:
#    USCRIPT_LOMA = 139,/* Loma */
# Group 1 is the enum constant, group 2 the 4-letter script code.
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")


def ParseUScriptHeader(icu_src_root):
    """Parses the script enum constants from uscript.h.

    Appends a (script enum, script code) pair to the values list of the
    "sc" entry in _pname_to_icu_prop for each matching line,
    in the order of the header file.

    Args:
      icu_src_root: ICU source root folder.
    """
    uscript_path = os.path.join(icu_src_root, "source",
                                "common", "unicode", "uscript.h")
    icu_values = _pname_to_icu_prop["sc"][2]
    with open(uscript_path, "r") as uscript_file:
        for line in uscript_file:
            match = _uscript_re.match(line)
            if match:
                (script_enum, script_code) = match.group(1, 2)
                icu_values.append((script_enum, script_code))
def CheckPNamesData():
    """Checks that every ICU property has a full set of value enum constants,
    and that the _icu_properties value names map back to the UCD.

    Raises:
      ValueError: if an ICU value name is not a short value name of its
        property, or if some property values have no enum constants.
    """
    missing_enums = []
    for (p_enum, pname, values) in _icu_properties:
        prop = _properties[pname]
        vnames = set(prop[2])  # Modifiable copy of the set of short value names.
        for (v_enum, vname) in values:
            if vname not in vnames:
                raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
                                 (pname, vname, v_enum))
            # What remains in vnames afterwards has no enum constant.
            vnames.remove(vname)
        # Exceptions to the all-values check:
        # - ICU does not have specific enum values for binary No/Yes.
        # - ICU represents Age values via UVersionInfo rather than enum constants.
        # - gc: ICU enum UCharCategory only has the single-category values.
        #       (ICU's gcm property has all of the UCD gc property values.)
        if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
            missing_enums.append((pname, vnames))
    if missing_enums:
        raise ValueError(
            "missing uchar.h enum constants for some property values: %s" %
            missing_enums)
def WritePNamesDataHeader(out_path):
    """Writes the C++ header with property and value name data.

    The file contains a UNICODE_VERSION macro, one array of Value
    initializers per property with named values (plus one shared by all
    binary properties), the PROPERTIES array, and MAX_ALIASES.

    Args:
      out_path: Path of the header file to write.
    """
    with open(out_path, "w") as out_file:
        # NOTE(review): the comment delimiters around the header text are
        # assumed from the " * " line prefixes -- confirm.
        out_file.write(
            "/**\n"
            " * Copyright (C) 2002-" + _current_year +
            ", International Business Machines Corporation and\n"
            " * others. All Rights Reserved.\n"
            " * machine-generated by: icu/tools/unicode/py/\n"
            " */\n"
            "\n")

        # Note: The uchar.h & uscript.h parsers store the ICU Unicode
        # properties and values in the order of their definition,
        # and this function writes them in that order.
        # Since the ICU API constants are stable and new values are only
        # appended at the end
        # (new properties are added at the end of each binary/enum/... range),
        # the output is stable as well.
        # When a property or value constant is renamed,
        # it only changes the name itself in the output;
        # it does not move in the output since there is no sorting.
        # This minimizes diffs and assists with reviewing and evaluating updates.

        version = _ucd_version.split('.')
        while len(version) < 4:
            version.append("0")
        out_file.write("#define UNICODE_VERSION { %s }\n\n" %
                       ", ".join(version))

        # Count the maximum number of aliases for any property or value.
        # We write the final value at the end.
        max_aliases = max(len(_binary_values["N"]), len(_binary_values["Y"]))

        # Write an array of "binprop" Value object initializers
        # with the value aliases shared among all binary properties.
        out_file.write("static const Value VALUES_binprop[2] = {\n")
        out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
        out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
        out_file.write("};\n\n")  # Close the array initializer.

        # For each property with named values, write an array of
        # Value object initializers with the value enum and the aliases.
        for (p_enum, pname, values) in _icu_properties:
            prop = _properties[pname]
            aliases = prop[1]
            max_aliases = max(max_aliases, len(aliases))
            if not values:
                continue
            out_file.write("static const Value VALUES_%s[%d] = {\n" %
                           (pname, len(values)))
            for (v_enum, vname) in values:
                aliases = _properties[pname][3][vname]
                # ccc, lccc, tccc: Omit the numeric strings from the aliases.
                # (See the comment about ccc in the
                # PropertyValueAliases.txt header.)
                if pname.endswith("ccc"):
                    aliases = aliases[1:]
                max_aliases = max(max_aliases, len(aliases))
                cast = "(int32_t)" if pname == "gcm" else ""
                out_file.write(' Value(%s%s, "%s"),\n' %
                               (cast, v_enum, " ".join(aliases)))
            out_file.write("};\n\n")  # Close the array initializer.

        # For each property, write a Property object initializer
        # with the property enum, its aliases, and a reference to its values.
        out_file.write("static const Property PROPERTIES[%d] = {\n" %
                       len(_icu_properties))
        for (enum, pname, values) in _icu_properties:
            prop = _properties[pname]
            aliases = " ".join(prop[1])
            if prop[0] != "Binary" and values:
                # Property with named values.
                out_file.write(' Property(%s, "%s", VALUES_%s, %d),\n' %
                               (enum, aliases, pname, len(values)))
            else:
                # Binary property, or property without named values.
                out_file.write(' Property(%s, "%s"),\n' % (enum, aliases))
        out_file.write("};\n\n")  # Close the array initializer.

        out_file.write("const int32_t MAX_ALIASES = %d;\n" % max_aliases)
# main() ------------------------------------------------------------------- ***
def main():
# Script entry point. Takes three positional command-line arguments
# (UCD source root, ICU source root, ICU tools root), collects every file
# below the UCD root, passes the list to PreprocessFiles(), walks the
# _files_to_parse table, rebinds the module-level _null_or_defaults, and
# builds the output paths below the ICU source and tools trees.
# Raises Exception if any property is still mapped to the "??" placeholder
# in _null_or_defaults.
# NOTE(review): several statements in this function look incomplete as they
# appear here (see the notes below) -- confirm against the upstream file.
global _null_or_defaults
# sys.argv[0] is the script name, so three arguments mean len(sys.argv) == 4.
# NOTE(review): no early exit is visible after the usage message; with fewer
# than three arguments the tuple unpacking below raises ValueError -- confirm.
if len(sys.argv) < 4:
print ("Usage: %s path/to/UCD/root path/to/ICU/src/root "
"path/to/ICU/tools/root" % sys.argv[0])
# The [1:4] slice ignores any arguments after the third.
(ucd_root, icu_src_root, icu_tools_root) = sys.argv[1:4]
# Collect the full path of every file below ucd_root, recursively.
# The directory names yielded by os.walk() (dirs) are not used.
source_files = []
for root, dirs, files in os.walk(ucd_root):
for file in files:
source_files.append(os.path.join(root, file))
PreprocessFiles(source_files, icu_src_root)
# Parse the processed files in a particular order.
# Each item of _files_to_parse is iterated as (basename, path, parser) triples.
for files in _files_to_parse:
for (basename, path, parser) in files:
# Python 2 print statement: progress output on stdout.
print "Parsing %s" % basename
# NOTE(review): value is not referenced again in the visible part of this
# function; presumably the lookup doubles as a check that the file is known
# -- TODO confirm.
value = _files[basename]
# Unicode data files are in UTF-8.
charset = "UTF-8"
if basename == "NamesList.txt":
# The NamesList used to be in Latin-1 before Unicode 6.2.
# The version string is split on '.' into a list of ints; lists compare
# element by element, so for example [6, 1, 0] < [6, 2].
numeric_ucd_version = [int(field) for field in _ucd_version.split('.')]
if numeric_ucd_version < [6, 2]: charset = "ISO-8859-1"
# NOTE(review): the right-hand side of this assignment looks truncated (the
# callable and its first argument are missing); presumably it opens path for
# reading with the chosen charset -- confirm.
in_file =, "r", charset)
# NOTE(review): no statement that uses in_file or parser is visible under
# this "with" -- confirm.
with in_file:
# Work on a shallow copy of _null_values.
_null_or_defaults = _null_values.copy()
# Every Catalog and Enumerated property must have a default value,
# from a @missing line. "nv" = "null value".
# pnv lists the property names whose value is still the "??" placeholder.
# dict.iteritems() is the Python 2 dictionary iteration API.
pnv = [pname for (pname, nv) in _null_or_defaults.iteritems() if nv == "??"]
if pnv:
raise Exception("no default values (@missing lines) for " +
"some Catalog or Enumerated properties: %s " % pnv)
# Write Normalizer2 input text files.
# Do this before compacting the data so that we need not handle fallbacks.
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
norm2_path = os.path.join(unidata_path, "norm2")
# Create the norm2 folder (and any missing parents) if it does not exist yet.
if not os.path.exists(norm2_path): os.makedirs(norm2_path)
# NOTE(review): no call that writes into norm2_path or performs the
# optimization named below is visible here -- confirm.
# Optimize block vs. cp properties.
# Write the ppucd.txt output file.
# Use US-ASCII so that ICU tests can parse it in the platform charset,
# which may be EBCDIC.
# Fix up non-ASCII data (NamesList.txt headings) to fit.
out_path = os.path.join(unidata_path, "ppucd.txt")
# NOTE(review): this "with" header looks truncated (the callable and its
# first argument are missing) and its body is not visible; presumably it
# opens out_path for writing as US-ASCII -- confirm.
with, "w", "US-ASCII") as out_file:
# TODO: PrintNameStats()
# ICU data for property & value names API
genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
# Create the genprops folder (and any missing parents) if it does not exist yet.
if not os.path.exists(genprops_path): os.makedirs(genprops_path)
# out_path is rebound to the pnames_data.h location.
# NOTE(review): the code that writes this file is not visible here -- confirm.
out_path = os.path.join(genprops_path, "pnames_data.h")
if __name__ == "__main__":