scuffed-code/tools/unicode/py/preparseucd.py

1785 lines
62 KiB
Python
Raw Normal View History

#!/usr/bin/python2.6
# -*- coding: utf-8 -*-
# Copyright (c) 2009-2011 International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: preparseucd.py
# encoding: US-ASCII
# tab size: 8 (not used)
# indentation:4
#
# created on: 2011nov03 (forked from ucdcopy.py)
# created by: Markus W. Scherer
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with three command-line parameters:
# 1. source folder with UCD files
# 2. ICU source root folder
# 3. ICU tools root folder
#
# Sample invocation:
# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
import array
import bisect
import codecs
import datetime
import os
import os.path
import re
import shutil
import sys
# Unicode version ---------------------------------------------------------- ***
_ucd_version = "?"
_copyright = ""
_terms_of_use = ""
# ISO 15924 script codes --------------------------------------------------- ***
# Script codes from ISO 15924 http://www.unicode.org/iso15924/codechanges.html
# that are not yet in the UCD.
_scripts_only_in_iso15924 = (
"Blis", "Cirt", "Cyrs",
"Egyd", "Egyh", "Geok",
"Hans", "Hant", "Hmng", "Hung",
"Inds", "Jpan", "Latf", "Latg", "Lina",
"Maya", "Moon", "Perm", "Roro",
"Sara", "Sgnw", "Syre", "Syrj", "Syrn",
"Teng", "Visp", "Zxxx",
"Kore", "Mani", "Phlp", "Phlv", "Zmth", "Zsym",
"Nkgb",
"Bass", "Dupl", "Elba", "Gran",
"Kpel", "Loma", "Mend", "Narb", "Nbat",
"Palm", "Sind", "Wara",
"Afak", "Jurc", "Mroo", "Nshu", "Tang", "Wole",
"Khoj", "Tirh"
)
# Properties --------------------------------------------------------------- ***
_ignored_properties = set((
# Other_Xyz only contribute to Xyz, store only the latter.
"OAlpha",
"ODI",
"OGr_Ext",
"OIDC",
"OIDS",
"OLower",
"OMath",
"OUpper",
# Further properties that just contribute to others.
"JSN",
# These properties just don't seem useful.
"XO_NFC",
"XO_NFD",
"XO_NFKC",
"XO_NFKD",
# ICU does not use Unihan properties.
"cjkAccountingNumeric",
"cjkOtherNumeric",
"cjkPrimaryNumeric",
"cjkCompatibilityVariant",
"cjkIICore",
"cjkIRG_GSource",
"cjkIRG_HSource",
"cjkIRG_JSource",
"cjkIRG_KPSource",
"cjkIRG_KSource",
"cjkIRG_MSource",
"cjkIRG_TSource",
"cjkIRG_USource",
"cjkIRG_VSource",
"cjkRSUnicode"
))
# Dictionary of properties.
# Keyed by normalized property names and aliases.
# Each value is a tuple with
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 3: Dictionary of property values.
# For Catalog & Enumerated properties,
# maps each value name to a list of aliases.
# Empty for other types of properties.
_properties = {}
# Dictionary of binary-property values which we store as False/True.
# Same as the values dictionary of one of the binary properties.
_binary_values = {}
# Dictionary of null values.
# Keyed by short property names.
# These are type-specific values for properties that occur in the data.
# They are overridden by _defaults, block and code point properties.
_null_values = {}
# Property value names for null values.
# We do not store these in _defaults.
_null_names = frozenset(("<none>", "NaN"))
# Dictionary of explicit default property values.
# Keyed by short property names.
_defaults = {}
# _null_values overridden by explicit _defaults.
# Initialized after parsing is done.
_null_or_defaults = {}
# List of properties with an ICU UProperty enum.
# Each item is an (enum, pname, values) tuple.
# - enum: the ICU enum UProperty constant string
# - pname: the UCD short property name
# - values: list of (enum, vname) pairs per property value
# - enum: the ICU property value's enum constant string
# - vname: the UCD short property value name
_icu_properties = []
# Dictionary of short property names mapped to _icu_properties items.
_pname_to_icu_prop = {}
_non_alnum_re = re.compile("[^a-zA-Z0-9]")
def NormPropName(pname):
"""Returns a normalized form of pname.
Removes non-ASCII-alphanumeric characters and lowercases letters."""
return _non_alnum_re.sub("", pname).lower()
def GetProperty(pname):
"""Returns the _properties value for the pname.
Returns null if the property is ignored.
Caches alternate spellings of the property name."""
# Try the input name.
prop = _properties.get(pname)
if prop != None: return prop
if pname in _ignored_properties: return None
# Try the normalized input name.
norm_name = NormPropName(pname)
prop = _properties.get(norm_name)
if prop != None:
_properties[pname] = prop # Cache prop under this new name spelling.
return prop
elif pname in _ignored_properties:
_ignored_properties.add(pname) # Remember to ignore this new name spelling.
return None
else:
raise NameError("unknown property %s\n" % pname)
def GetShortPropertyName(pname):
if pname in _null_values: return pname # pname is already the short name.
prop = GetProperty(pname)
if not prop: return "" # For ignored properties.
return prop[1][0] or prop[1][1] # Long name if no short name.
def GetShortPropertyValueName(prop, vname):
if vname in prop[2]: return vname
values = prop[3]
aliases = values.get(vname)
if aliases == None:
norm_name = NormPropName(vname)
aliases = values.get(norm_name)
if aliases == None:
raise NameError("unknown value name %s for property %s\n" %
(vname, prop[1][0]))
values[vname] = aliases
return aliases[0] or aliases[1] # Long name if no short name.
def NormalizePropertyValue(prop, vname):
if prop[2]: # Binary/Catalog/Enumerated property.
value = GetShortPropertyValueName(prop, vname)
if prop[0] == "Binary":
value = value == "Y"
if prop[1][0].endswith("ccc"):
value = int(value)
else:
value = vname
return value
# Character data ----------------------------------------------------------- ***
# Lists of NamesList h1 and h2 headings.
# Each h1 value is a (start, end, comment) tuple.
# Each h2 value is a (cp, comment) tuple.
_h1 = []
_h2 = []
# List of Unicode blocks.
# Each item is a tuple of start & end code point integers
# and a dictionary of default property values.
_blocks = []
# List of ranges with algorithmic names.
# Each value is a list of [start, end, type, prefix]
# where prefix is optional.
_alg_names_ranges = []
# List of Unicode character ranges and their properties,
# stored as an inversion map with range_start & props dictionary.
# Starts with one range for all of Unicode without any properties.
# Setting values subdivides ranges.
_starts = array.array('l', [0, 0x110000]) # array of int32_t
_props = [{}] # props for 0 but not 110000
def FindRange(x):
""" Binary search for x in the inversion map.
Returns the smallest i where x < _starts[i]"""
return bisect.bisect(_starts, x) - 1
def UpdateProps(start, end, update):
assert 0 <= start <= end <= 0x10ffff
(need_to_update, do_update, u) = (update[0], update[1], update[2])
# Find the index i of the range in _starts that contains start.
i = FindRange(start)
limit = end + 1
# Intersect [start, limit[ with ranges in _starts.
c_start = _starts[i]
c_limit = _starts[i + 1]
c_props = _props[i]
# c_start <= start < c_limit
if c_start < start:
update_limit = c_limit if c_limit <= limit else limit
if need_to_update(u, start, update_limit - 1, c_props):
# Split off [c_start, start[ with a copy of c_props.
i += 1
c_props = c_props.copy()
_starts.insert(i, start)
_props.insert(i, c_props)
c_start = start
# Modify all ranges that are fully inside [start, limit[.
while c_limit <= limit:
# start <= c_start < c_limit <= limit
if need_to_update(u, c_start, c_limit - 1, c_props):
do_update(u, c_start, c_limit - 1, c_props)
if c_limit == 0x110000: return
i += 1
c_start = c_limit
c_limit = _starts[i + 1]
c_props = _props[i]
if c_start < limit and need_to_update(u, c_start, limit - 1, c_props):
# Split off [limit, c_limit[ with a copy of c_props.
_starts.insert(i + 1, limit)
_props.insert(i + 1, c_props.copy())
# Modify [c_start, limit[ c_props.
do_update(u, c_start, limit - 1, c_props)
def NeedToSetProps(props, start, end, c_props):
"""Returns True if props is not a sub-dict of c_props."""
for (pname, value) in props.iteritems():
if pname not in c_props or value != c_props[pname]: return True
return False
def DoSetProps(props, start, end, c_props):
c_props.update(props)
def SetProps(start, end, props):
UpdateProps(start, end, (NeedToSetProps, DoSetProps, props))
def NeedToSetAlways(nv, start, end, c_props):
return True
# For restoring boundaries after merging adjacent same-props ranges.
def AddBoundary(x):
"""Ensure that there is a range start/limit at x."""
assert 0 <= x <= 0x10ffff
i = FindRange(x)
if _starts[i] == x: return
# Split the range at x.
c_start = _starts[i]
c_limit = _starts[i + 1]
c_props = _props[i]
# c_start < x < c_limit
i += 1
_starts.insert(i, x)
_props.insert(i, c_props.copy())
def SetDefaultValue(pname, value):
"""Sets the property's default value. Ignores null values."""
prop = GetProperty(pname)
if prop and value not in _null_names:
value = NormalizePropertyValue(prop, value)
if value != _null_values[prop[1][0]]:
_defaults[prop[1][0]] = value
SetProps(0, 0x10ffff, {prop[1][0]: value})
def SetBinaryPropertyToTrue(pname, start, end):
prop = GetProperty(pname)
if prop:
assert prop[0] == "Binary"
SetProps(start, end, {prop[1][0]: True})
def SetPropValue(prop, vname, start, end):
value = NormalizePropertyValue(prop, vname)
SetProps(start, end, {prop[1][0]: value})
def SetPropertyValue(pname, vname, start, end):
prop = GetProperty(pname)
if prop: SetPropValue(prop, vname, start, end)
# Parsing ------------------------------------------------------------------ ***
_stripped_cp_re = re.compile("([0-9a-fA-F]+)$")
_stripped_range_re = re.compile("([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)$")
_missing_re = re.compile("# *@missing: *0000\.\.10FFFF *; *(.+)$")
def ReadUCDLines(in_file, want_ranges=True, want_other=False,
want_comments=False, want_missing=False):
"""Parses lines from a semicolon-delimited UCD text file.
Strips comments, ignores empty and all-comment lines.
Returns a tuple (type, line, ...).
"""
for line in in_file:
line = line.strip()
if not line: continue
if line.startswith("#"): # whole-line comment
if want_missing:
match = _missing_re.match(line)
if match:
fields = match.group(1).split(";")
for i in xrange(len(fields)): fields[i] = fields[i].strip()
yield ("missing", line, fields)
continue
if want_comments: yield ("comment", line)
continue
comment_start = line.find("#") # inline comment
if comment_start >= 0:
line = line[:comment_start].rstrip()
if not line: continue
fields = line.split(";")
for i in xrange(len(fields)): fields[i] = fields[i].strip()
if want_ranges:
first = fields[0]
match = _stripped_range_re.match(first)
if match:
start = int(match.group(1), 16)
end = int(match.group(2), 16)
yield ("range", line, start, end, fields)
continue
match = _stripped_cp_re.match(first)
if match:
c = int(match.group(1), 16)
yield ("range", line, c, c, fields)
continue
if want_other:
yield ("other", line, fields)
else:
raise SyntaxError("unable to parse line\n %s\n" % line)
def AddBinaryProperty(short_name, long_name):
_null_values[short_name] = False
bin_prop = _properties["Math"]
prop = ("Binary", [short_name, long_name], bin_prop[2], bin_prop[3])
_properties[short_name] = prop
_properties[long_name] = prop
_properties[NormPropName(short_name)] = prop
_properties[NormPropName(long_name)] = prop
def AddPOSIXBinaryProperty(name):
# We only define a long name for ICU-specific (non-UCD) POSIX properties.
_null_values[name] = False
bin_prop = _properties["Math"]
prop = ("Binary", ["", name], bin_prop[2], bin_prop[3])
_properties[name] = prop
_properties[NormPropName(name)] = prop
# This is to match UProperty UCHAR_POSIX_ALNUM etc.
_properties["posix" + NormPropName(name)] = prop
# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
_ucd_version_re = re.compile("# *PropertyAliases" +
"-([0-9]+(?:\\.[0-9]+)*)(?:d[0-9]+)?" +
"\\.txt")
def ParsePropertyAliases(in_file):
global _copyright, _terms_of_use, _ucd_version
prop_type_nulls = {
"Binary": False,
"Catalog": "??", # Must be specified, e.g., in @missing line.
"Enumerated": "??", # Must be specified.
"Numeric": "NaN",
"String": "",
"Miscellaneous": ""
}
for data in ReadUCDLines(in_file, want_ranges=False,
want_other=True, want_comments=True):
if data[0] == "comment":
line = data[1]
match = _ucd_version_re.match(line)
if match:
_ucd_version = match.group(1)
elif line.startswith("# Copyright"):
_copyright = line
elif "terms of use" in line:
_terms_of_use = line
else:
words = line[1:].lstrip().split()
if len(words) == 2 and words[1] == "Properties":
prop_type = words[0]
null_value = prop_type_nulls[prop_type]
else:
# type == "other"
aliases = data[2]
name = aliases[0]
if name in _ignored_properties:
for alias in aliases:
_ignored_properties.add(alias)
_ignored_properties.add(NormPropName(alias))
else:
if name.endswith("ccc"):
_null_values[name] = 0
else:
_null_values[name] = null_value
prop = (prop_type, aliases, set(), {})
for alias in aliases:
_properties[alias] = prop
_properties[NormPropName(alias)] = prop
# Add provisional and ICU-specific properties we need.
# We add some in support of runtime API, even if we do not write
# data for them to ppucd.txt (e.g., lccc & tccc).
# We add others just to represent UCD data that contributes to
# some functionality, although Unicode has not "blessed" them
# as separate properties (e.g., Turkic_Case_Folding).
# Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
name = "Turkic_Case_Folding"
_null_values[name] = ""
prop = ("String", [name, name], set(), {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
name = "Conditional_Case_Mappings"
_null_values[name] = ""
prop = ("Miscellaneous", [name, name], set(), {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# lccc = ccc of first cp in canonical decomposition.
_null_values["lccc"] = 0
ccc_prop = list(_properties["ccc"])
ccc_prop[1] = ["lccc", "Lead_Canonical_Combining_Class"]
prop = tuple(ccc_prop)
_properties["lccc"] = prop
_properties["Lead_Canonical_Combining_Class"] = prop
_properties["leadcanonicalcombiningclass"] = prop
# tccc = ccc of last cp in canonical decomposition.
_null_values["tccc"] = 0
ccc_prop[1] = ["tccc", "Trail_Canonical_Combining_Class"]
prop = tuple(ccc_prop)
_properties["tccc"] = prop
_properties["Trail_Canonical_Combining_Class"] = prop
_properties["trailcanonicalcombiningclass"] = prop
# Script_Extensions
if "scx" not in _properties:
_null_values["scx"] = ""
prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
_properties["scx"] = prop
_properties["Script_Extensions"] = prop
_properties["scriptextensions"] = prop
# General Category as a bit mask.
_null_values["gcm"] = "??"
gc_prop = _properties["gc"]
prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
_properties["gcm"] = prop
_properties["General_Category_Mask"] = prop
_properties["generalcategorymask"] = prop
# Various binary properties.
AddBinaryProperty("Sensitive", "Case_Sensitive")
AddBinaryProperty("nfdinert", "NFD_Inert")
AddBinaryProperty("nfkdinert", "NFKD_Inert")
AddBinaryProperty("nfcinert", "NFC_Inert")
AddBinaryProperty("nfkcinert", "NFKC_Inert")
AddBinaryProperty("segstart", "Segment_Starter")
# C/POSIX character classes that do not have Unicode property [value] aliases.
# See uchar.h.
AddPOSIXBinaryProperty("alnum")
AddPOSIXBinaryProperty("blank")
AddPOSIXBinaryProperty("graph")
AddPOSIXBinaryProperty("print")
AddPOSIXBinaryProperty("xdigit")
def ParsePropertyValueAliases(in_file):
global _binary_values
for data in ReadUCDLines(in_file, want_ranges=False,
want_other=True, want_missing=True):
if data[0] == "missing":
SetDefaultValue(data[2][0], data[2][1])
else:
# type == "other"
fields = data[2]
pname = fields[0]
prop = GetProperty(pname)
if prop:
del fields[0] # Only the list of aliases remains.
short_name = fields[0]
if short_name == "n/a": # no short name
fields[0] = ""
short_name = fields[1]
prop[2].add(short_name)
values = prop[3]
for alias in fields:
if alias:
values[alias] = fields
values[NormPropName(alias)] = fields
if prop[0] == "Binary" and not _binary_values:
_binary_values = values
# Some of the @missing lines with non-null default property values
# are in files that we do not parse;
# either because the data for that property is easily
# (i.e., the @missing line would be the only reason to parse such a file)
# or because we compute the property at runtime,
# such as the Hangul_Syllable_Type.
if "dt" not in _defaults: # DerivedDecompositionType.txt
_defaults["dt"] = "None"
if "nt" not in _defaults: # DerivedNumericType.txt
_defaults["nt"] = "None"
if "hst" not in _defaults: # HangulSyllableType.txt
_defaults["hst"] = "NA"
if "gc" not in _defaults: # No @missing line in any .txt file?
_defaults["gc"] = "Cn"
# Copy the gc default value to gcm.
_defaults["gcm"] = _defaults["gc"]
# Add ISO 15924-only script codes.
# Only for the ICU script code API, not necessary for parsing the UCD.
script_prop = _properties["sc"]
short_script_names = script_prop[2] # set
script_values = script_prop[3] # dict
remove_scripts = []
for script in _scripts_only_in_iso15924:
if script in short_script_names:
remove_scripts.append(script)
else:
short_script_names.add(script)
# Do not invent a Unicode long script name before the UCD adds the script.
script_list = [script, script] # [short, long]
script_values[script] = script_list
# Probably not necessary because
# we will not parse these scripts from the UCD:
script_values[NormPropName(script)] = script_list
if remove_scripts:
raise ValueError(
"remove %s from _scripts_only_in_iso15924" % remove_scripts)
def ParseBlocks(in_file):
for data in ReadUCDLines(in_file, want_missing=True):
if data[0] == "missing":
SetDefaultValue("blk", data[2][0])
else:
# type == "range"
(start, end, name) = (data[2], data[3], data[4][1])
_blocks.append((start, end, {"blk": name}))
SetPropertyValue("blk", name, start, end)
_blocks.sort()
# Check for overlapping blocks.
prev_end = -1
for b in _blocks:
start = b[0]
end = b[1]
if prev_end >= start:
raise ValueError(
"block %04lX..%04lX %s overlaps with another " +
"ending at %04lX\n %s\n" %
(start, end, b[2]["blk"], prev_end))
prev_end = end
def ParseUnicodeData(in_file):
dt_prop = GetProperty("dt")
range_first_line = ""
range_first = -1
for data in ReadUCDLines(in_file, want_missing=True):
# type == "range"
(line, c, end, fields) = (data[1], data[2], data[3], data[4])
assert c == end
name = fields[1]
if name.startswith("<"):
if name.endswith(", First>"):
if range_first >= 0:
raise SyntaxError(
"error: unterminated range started at\n %s\n" %
range_first_line)
range_first = c
range_first_line = line
continue
elif name.endswith(", Last>"):
if range_first < 0:
raise SyntaxError(
"error: range end without start at\n %s\n" %
line)
elif range_first > c:
raise SyntaxError(
"error: range start/end out of order at\n %s\n %s\n" %
(range_first_line, line))
first_name = range_first_line.split(";")[1][1:-8]
name = name[1:-7]
if first_name != name:
raise SyntaxError(
"error: range start/end name mismatch at\n %s\n %s\n" %
(range_first_line, line))
end = c
c = range_first
range_first = -1
# Remember algorithmic name ranges.
if "Ideograph" in name:
_alg_names_ranges.append([c, end, "han", "CJK UNIFIED IDEOGRAPH-"])
elif name == "Hangul Syllable":
_alg_names_ranges.append([c, end, "hangul"])
name = ""
else:
# Ignore non-names like <control>.
name = ""
props = {}
if name: props["na"] = name
props["gc"] = fields[2]
ccc = int(fields[3])
if ccc: props["ccc"] = ccc
props["bc"] = fields[4]
# Decomposition type & mapping.
dm = fields[5]
if dm:
if dm.startswith("<"):
dt_limit = dm.index(">")
dt = NormalizePropertyValue(dt_prop, dm[1:dt_limit])
dm = dm[dt_limit + 1:].lstrip()
else:
dt = "Can"
props["dt"] = dt
props["dm"] = dm
# Numeric type & value.
decimal = fields[6]
digit = fields[7]
nv = fields[8]
if (decimal and decimal != nv) or (digit and digit != nv):
raise SyntaxError("error: numeric values differ at\n %s\n" % line)
if nv:
props["nv"] = nv
props["nt"] = "De" if decimal else "Di" if digit else "Nu"
if fields[9] == "Y": props["Bidi_M"] = True
# ICU 49 and above does not support Unicode_1_Name any more.
# See ticket #9013.
# na1 = fields[10]
# if na1: props["na1"] = na1
# ISO_Comment is deprecated and has no values.
# isc = fields[11]
# if isc: props["isc"] = isc
# Simple case mappings.
suc = fields[12]
slc = fields[13]
stc = fields[14]
if suc: props["suc"] = suc
if slc: props["slc"] = slc
if stc: props["stc"] = stc
SetProps(c, end, props)
if range_first >= 0:
raise SyntaxError(
"error: unterminated range started at\n %s\n" %
range_first_line)
# Hangul syllables have canonical decompositions which are not listed in UnicodeData.txt.
SetPropertyValue("dt", "Can", 0xac00, 0xd7a3)
_alg_names_ranges.sort()
_names_h1_re = re.compile("@@\t([0-9a-fA-F]+)\t(.+?)\t([0-9a-fA-F]+)$")
_names_h2_re = re.compile("@\t\t(.+)")
_names_char_re = re.compile("([0-9a-fA-F]+)\t.+")
def ParseNamesList(in_file):
pending_h2 = ""
for line in in_file:
line = line.strip()
if not line: continue
match = _names_h1_re.match(line)
if match:
pending_h2 = "" # Drop a pending h2 when we get to an h1.
start = int(match.group(1), 16)
end = int(match.group(3), 16)
comment = match.group(2).replace(u"\xa0", " ")
_h1.append((start, end, comment))
continue
match = _names_h2_re.match(line)
if match:
pending_h2 = match.group(1).replace(u"\xa0", " ")
continue
if pending_h2:
match = _names_char_re.match(line)
if match:
c = int(match.group(1), 16)
_h2.append((c, pending_h2))
pending_h2 = ""
_h1.sort()
_h2.sort()
def ParseNamedProperties(in_file):
"""Parses a .txt file where the first column is a code point range
and the second column is a property name.
Sets binary properties to True,
and other properties to the values in the third column."""
for data in ReadUCDLines(in_file, want_missing=True):
if data[0] == "missing":
SetDefaultValue(data[2][0], data[2][1])
else:
# type == "range"
if len(data[4]) == 2:
SetBinaryPropertyToTrue(data[4][1], data[2], data[3])
else:
SetPropertyValue(data[4][1], data[4][2], data[2], data[3])
def ParseOneProperty(in_file, pname):
"""Parses a .txt file where the first column is a code point range
and the second column is the value of a known property."""
prop = GetProperty(pname)
for data in ReadUCDLines(in_file, want_missing=True):
if data[0] == "missing":
SetDefaultValue(pname, data[2][0])
else:
# type == "range"
SetPropValue(prop, data[4][1], data[2], data[3])
def ParseBidiMirroring(in_file): ParseOneProperty(in_file, "bmg")
def ParseDerivedAge(in_file): ParseOneProperty(in_file, "age")
def ParseDerivedBidiClass(in_file): ParseOneProperty(in_file, "bc")
def ParseDerivedJoiningGroup(in_file): ParseOneProperty(in_file, "jg")
def ParseDerivedJoiningType(in_file): ParseOneProperty(in_file, "jt")
def ParseEastAsianWidth(in_file): ParseOneProperty(in_file, "ea")
def ParseGraphemeBreakProperty(in_file): ParseOneProperty(in_file, "GCB")
def ParseIndicMatraCategory(in_file): ParseOneProperty(in_file, "InMC")
def ParseIndicSyllabicCategory(in_file): ParseOneProperty(in_file, "InSC")
def ParseLineBreak(in_file): ParseOneProperty(in_file, "lb")
def ParseScripts(in_file): ParseOneProperty(in_file, "sc")
def ParseScriptExtensions(in_file): ParseOneProperty(in_file, "scx")
def ParseSentenceBreak(in_file): ParseOneProperty(in_file, "SB")
def ParseWordBreak(in_file): ParseOneProperty(in_file, "WB")
def DoSetNameAlias(alias, start, end, c_props):
if "Name_Alias" in c_props:
c_props["Name_Alias"] += ',' + alias
else:
c_props["Name_Alias"] = alias
def ParseNameAliases(in_file):
"""Parses Name_Alias from NameAliases.txt.
A character can have multiple aliases.
In Unicode 6.0, there are two columns,
with a name correction in the second column.
In Unicode 6.1, there are three columns.
The second contains an alias, the third its type.
The documented types are:
correction, control, alternate, figment, abbreviation
This function does not sort the types, assuming they appear in this order."""
for data in ReadUCDLines(in_file):
start = data[2]
end = data[3]
if start != end:
raise ValueError("NameAliases.txt has an alias for a range %04lX..%04lX" %
(start, end))
fields = data[4]
if len(fields) == 2:
alias = "correction=" + fields[1]
else:
alias = fields[2] + '=' + fields[1]
update = (NeedToSetAlways, DoSetNameAlias, alias)
UpdateProps(start, end, update)
def NeedToSetNumericValue(nv, start, end, c_props):
c_nv = c_props.get("nv")
if c_nv == None:
# DerivedNumericValues.txt adds a Numeric_Value.
assert "nt" not in c_props
return True
if nv != c_nv:
raise ValueError("UnicodeData.txt has nv=%s for %04lX..%04lX " +
"but DerivedNumericValues.txt has nv=%s" %
(c_nv, start, end, nv))
return False
def DoSetNumericValue(nv, start, end, c_props):
c_props.update({"nt": "Nu", "nv": nv})
def ParseDerivedNumericValues(in_file):
"""Parses DerivedNumericValues.txt.
For most characters, the numeric type & value were parsed previously
from UnicodeData.txt but that does not show the values for Han characters.
Here we check that values match those from UnicodeData.txt
and add new ones."""
# Ignore the @missing line which has an incorrect number of fields,
# and the "NaN" in the wrong field (at least in Unicode 5.1..6.1).
# Also, "NaN" is just the Numeric null value anyway.
for data in ReadUCDLines(in_file):
# Conditional update to the numeric value in the 4th field.
update = (NeedToSetNumericValue, DoSetNumericValue, data[4][3])
UpdateProps(data[2], data[3], update)
def ParseCaseFolding(in_file):
for data in ReadUCDLines(in_file, want_missing=True):
if data[0] == "missing":
assert data[2][0] == "C" # common to scf & cf
SetDefaultValue("scf", data[2][1])
SetDefaultValue("cf", data[2][1])
else:
# type == "range"
start = data[2]
end = data[3]
status = data[4][1]
mapping = data[4][2]
assert status in "CSFT"
if status == "C":
SetProps(start, end, {"scf": mapping, "cf": mapping})
elif status == "S":
SetPropertyValue("scf", mapping, start, end)
elif status == "F":
SetPropertyValue("cf", mapping, start, end)
else: # status == "T"
SetPropertyValue("Turkic_Case_Folding", mapping, start, end)
def DoSetConditionalCaseMappings(ccm, start, end, c_props):
if "Conditional_Case_Mappings" in c_props:
c_props["Conditional_Case_Mappings"] += ',' + ccm
else:
c_props["Conditional_Case_Mappings"] = ccm
def ParseSpecialCasing(in_file):
for data in ReadUCDLines(in_file, want_missing=True):
if data[0] == "missing":
SetDefaultValue("lc", data[2][0])
SetDefaultValue("tc", data[2][1])
SetDefaultValue("uc", data[2][2])
else:
# type == "range"
start = data[2]
end = data[3]
fields = data[4]
if len(fields) < 5 or not fields[4]:
# Unconditional mappings.
SetProps(start, end, {"lc": fields[1], "tc": fields[2], "uc": fields[3]})
else:
# Conditional_Case_Mappings
ccm = (fields[4] + ":lc=" + fields[1] +
"&tc=" + fields[2] + "&uc=" + fields[3])
update = (NeedToSetAlways, DoSetConditionalCaseMappings, ccm)
UpdateProps(start, end, update)
# Postprocessing ----------------------------------------------------------- ***
def CompactBlock(b, i):
assert b[0] == _starts[i]
orig_i = i
# Count the number of occurrences of each property's value in this block.
num_cp_so_far = 0
prop_counters = {}
while True:
start = _starts[i]
if start > b[1]: break
num_cp_in_this_range = _starts[i + 1] - start
props = _props[i]
for (pname, value) in props.iteritems():
if pname in prop_counters:
counter = prop_counters[pname]
else:
counter = {_null_or_defaults[pname]: num_cp_so_far}
prop_counters[pname] = counter
if value in counter:
counter[value] += num_cp_in_this_range
else:
counter[value] = num_cp_in_this_range
# Also count default values for properties that do not occur in a range.
for pname in prop_counters:
if pname not in props:
counter = prop_counters[pname]
value = _null_or_defaults[pname]
counter[value] += num_cp_in_this_range
num_cp_so_far += num_cp_in_this_range
# Invariant: For each counter, the sum of counts must equal num_cp_so_far.
i += 1
# For each property that occurs within this block,
# set the most common value as a block property value.
b_props = b[2]
for (pname, counter) in prop_counters.iteritems():
max_value = None
max_count = 0
num_unique = 0
for (value, count) in counter.iteritems():
if count > max_count:
max_value = value
max_count = count
if count == 1: num_unique += 1
if max_value != _null_or_defaults[pname]:
# Avoid picking randomly among several unique values.
if (max_count > 1 or num_unique == 1):
b_props[pname] = max_value
# For each range and property, remove the default+block value
# but set the default value if that property was not set
# (i.e., it used to inherit the default value).
b_defaults = _null_or_defaults.copy()
b_defaults.update(b_props)
i = orig_i
while True:
start = _starts[i]
if start > b[1]: break
props = _props[i]
for pname in prop_counters:
if pname in props:
if props[pname] == b_defaults[pname]: del props[pname]
elif pname in b_props:
# b_props only has non-default values.
# Set the default value if it used to be inherited.
props[pname] = _null_or_defaults[pname]
i += 1
# Return the _starts index of the first range after this block.
return i
def CompactNonBlock(limit, i):
"""Remove default property values from between-block ranges."""
while True:
start = _starts[i]
if start >= limit: break
props = _props[i]
for pname in props.keys(): # .keys() is a copy so we can del props[pname].
if props[pname] == _null_or_defaults[pname]: del props[pname]
i += 1
# Return the _starts index of the first range after this block.
return i
def CompactBlocks():
"""Optimizes block properties.
Sets properties on blocks to the most commonly used values,
and removes default+block values from code point properties."""
# Ensure that there is a boundary in _starts for each block
# so that the simple mixing method below works.
for b in _blocks: AddBoundary(b[0])
# Walk through ranges and blocks together.
i = 0
for b in _blocks:
b_start = b[0]
if _starts[i] < b_start:
i = CompactNonBlock(b_start, i)
i = CompactBlock(b, i)
CompactNonBlock(0x110000, i)
# Output ------------------------------------------------------------------- ***
def AppendRange(fields, start, end):
if start == end:
fields.append("%04lX" % start)
else:
fields.append("%04lX..%04lX" % (start, end))
def AppendProps(fields, props):
# Sort property names (props keys) by their normalized forms
# and output properties in that order.
for pname in sorted(props, key=NormPropName):
value = props[pname]
if isinstance(value, bool):
if not value: pname = "-" + pname
fields.append(pname)
else:
fields.append("%s=%s" % (pname, value))
def WriteFieldsRangeProps(fields, start, end, props, out_file):
AppendRange(fields, start, end)
AppendProps(fields, props)
out_file.write(";".join(fields))
out_file.write("\n")
def WritePreparsedUCD(out_file):
out_file.write("# Preparsed UCD generated by ICU preparseucd.py\n");
if _copyright: out_file.write(_copyright + "\n")
if _terms_of_use: out_file.write(_terms_of_use + "\n")
out_file.write("ucd;%s\n\n" % _ucd_version)
# Sort property names (props keys) by their normalized forms
# and output properties in that order.
pnames = sorted(_null_values, key=NormPropName)
for pname in pnames:
prop = _properties[pname]
out_file.write(";".join(["property", prop[0]] + prop[1]))
out_file.write("\n")
out_file.write("\n")
out_file.write(";".join(["binary"] + _binary_values["N"]))
out_file.write("\n")
out_file.write(";".join(["binary"] + _binary_values["Y"]))
out_file.write("\n")
for pname in pnames:
prop = _properties[pname]
short_names = prop[2]
if short_names and prop[0] != "Binary":
for name in sorted(short_names):
out_file.write(";".join(["value", prop[1][0]] + prop[3][name]))
out_file.write("\n")
out_file.write("\n")
# Ensure that there is a boundary in _starts for each
# range of data we mix into the output,
# so that the simple mixing method below works.
for b in _blocks: AddBoundary(b[0])
for r in _alg_names_ranges: AddBoundary(r[0])
for h in _h1: AddBoundary(h[0])
for h in _h2: AddBoundary(h[0])
# Write the preparsed data.
# TODO: doc syntax
# - ppucd.txt = preparsed UCD
# - Only whole-line comments starting with #, no inline comments.
# - defaults must precede any block or cp lines
# - block;a..b must precede any cp lines with code points in a..b
# - Some code may require that all cp lines with code points in a..b
# appear between block;a..b and the next block line.
# - block lines are not required; cp lines can have data for
# ranges outside of blocks.
WriteFieldsRangeProps(["defaults"], 0, 0x10ffff, _defaults, out_file)
i_blocks = 0
i_alg = 0
i_h1 = 0
i_h2 = 0
for i in xrange(len(_starts) - 1):
start = _starts[i]
end = _starts[i + 1] - 1
# Block with default properties.
if i_blocks < len(_blocks) and start == _blocks[i_blocks][0]:
b = _blocks[i_blocks]
WriteFieldsRangeProps(["\nblock"], b[0], b[1], b[2], out_file)
i_blocks += 1
# NamesList h1 heading (for [most of] a block).
if i_h1 < len(_h1) and start == _h1[i_h1][0]:
h = _h1[i_h1]
out_file.write("# %04lX..%04lX %s\n" % (h[0], h[1], h[2]))
i_h1 += 1
# Algorithmic-names range.
if i_alg < len(_alg_names_ranges) and start == _alg_names_ranges[i_alg][0]:
r = _alg_names_ranges[i_alg]
fields = ["algnamesrange"]
AppendRange(fields, r[0], r[1])
fields.extend(r[2:])
out_file.write(";".join(fields))
out_file.write("\n")
i_alg += 1
# NamesList h2 heading.
if i_h2 < len(_h2) and start == _h2[i_h2][0]:
out_file.write("# %s\n" % (_h2[i_h2][1]))
i_h2 += 1
# Code point/range data.
props = _props[i]
# Omit ranges with only default+block properties.
if props:
WriteFieldsRangeProps(["cp"], start, end, props, out_file)
# Preprocessing ------------------------------------------------------------ ***
_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
_code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
def CopyAndStripWithOptionalMerge(s, t, do_merge):
# TODO: With Python 2.7+, combine the two with statements into one.
with open(s, "r") as in_file:
with open(t, "w") as out_file:
first = -1 # First code point with first_data.
last = -1 # Last code point with first_data.
first_data = "" # Common data for code points [first..last].
for line in in_file:
match = _strip_re.match(line)
if match:
line = match.group(1)
else:
line = line.rstrip()
if do_merge:
match = _code_point_re.match(line)
if match:
c = int(match.group(1), 16)
data = line[match.end() - 1:]
else:
c = -1
data = ""
if last >= 0 and (c != (last + 1) or data != first_data):
# output the current range
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
if c < 0:
# no data on this line, output as is
out_file.write(line)
out_file.write("\n")
else:
# data on this line, store for possible range compaction
if last < 0:
# set as the first line in a possible range
first = c
last = c
first_data = data
else:
# must be c == (last + 1) and data == first_data
# because of previous conditions
# continue with the current range
last = c
else:
# Only strip, don't merge: just output the stripped line.
out_file.write(line)
out_file.write("\n")
if do_merge and last >= 0:
# output the last range in the file
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
out_file.flush()
return t
def CopyAndStrip(s, t):
"""Copies a file and removes comments behind data lines but not in others."""
return CopyAndStripWithOptionalMerge(s, t, False)
def CopyAndStripAndMerge(s, t):
"""Copies and strips a file and merges lines.
Copies a file, removes comments, and
merges lines with adjacent code point ranges and identical per-code point
data lines into one line with range syntax.
"""
return CopyAndStripWithOptionalMerge(s, t, True)
def PrependBOM(s, t):
# TODO: With Python 2.7+, combine the two with statements into one.
with open(s, "r") as in_file:
with open(t, "w") as out_file:
out_file.write("\xef\xbb\xbf") # UTF-8 BOM for ICU svn
shutil.copyfileobj(in_file, out_file)
return t
def CopyOnly(s, t):
shutil.copy(s, t)
return t
def DontCopy(s, t):
return s
# Each _files value is a
# (preprocessor, dest_folder, parser, order) tuple
# where all fields except the preprocessor are optional.
# After the initial preprocessing (copy/strip/merge),
# if a parser is specified, then a tuple is added to _files_to_parse
# at index "order" (default order 9).
# An explicit order number is set only for files that must be parsed
# before others.
_files = {
"BidiMirroring.txt": (CopyOnly, ParseBidiMirroring),
"BidiTest.txt": (CopyOnly, "testdata"),
"Blocks.txt": (CopyOnly, ParseBlocks),
"CaseFolding.txt": (CopyOnly, ParseCaseFolding),
"DerivedAge.txt": (CopyOnly, ParseDerivedAge),
"DerivedBidiClass.txt": (CopyOnly, ParseDerivedBidiClass),
"DerivedCoreProperties.txt": (CopyAndStrip, ParseNamedProperties),
"DerivedJoiningGroup.txt": (CopyOnly, ParseDerivedJoiningGroup),
"DerivedJoiningType.txt": (CopyOnly, ParseDerivedJoiningType),
"DerivedNormalizationProps.txt": (CopyAndStrip, ParseNamedProperties),
"DerivedNumericValues.txt": (CopyOnly, ParseDerivedNumericValues),
"EastAsianWidth.txt": (CopyAndStripAndMerge, ParseEastAsianWidth),
"GraphemeBreakProperty.txt": (CopyAndStrip, ParseGraphemeBreakProperty),
"GraphemeBreakTest.txt": (PrependBOM, "testdata"),
"IndicMatraCategory.txt": (DontCopy, ParseIndicMatraCategory),
"IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
"LineBreak.txt": (CopyAndStripAndMerge, ParseLineBreak),
"LineBreakTest.txt": (PrependBOM, "testdata"),
"NameAliases.txt": (CopyOnly, ParseNameAliases),
"NamesList.txt": (DontCopy, ParseNamesList),
"NormalizationCorrections.txt": (CopyOnly,), # Only used in gensprep.
"NormalizationTest.txt": (CopyAndStrip,),
"PropertyAliases.txt": (CopyOnly, ParsePropertyAliases, 0),
"PropertyValueAliases.txt": (CopyOnly, ParsePropertyValueAliases, 1),
"PropList.txt": (CopyAndStrip, ParseNamedProperties),
"SentenceBreakProperty.txt": (CopyAndStrip, ParseSentenceBreak),
"SentenceBreakTest.txt": (PrependBOM, "testdata"),
"Scripts.txt": (CopyAndStrip, ParseScripts),
"ScriptExtensions.txt": (CopyOnly, ParseScriptExtensions),
"SpecialCasing.txt": (CopyOnly, ParseSpecialCasing),
"UnicodeData.txt": (CopyOnly, ParseUnicodeData, 2),
"WordBreakProperty.txt": (CopyAndStrip, ParseWordBreak),
"WordBreakTest.txt": (PrependBOM, "testdata")
}
# List of lists of files to be parsed in order.
# Inner lists contain (basename, path, parser) tuples.
_files_to_parse = [[], [], [], [], [], [], [], [], [], []]
# Get the standard basename from a versioned filename.
# For example, match "UnicodeData-6.1.0d8.txt"
# so we can turn it into "UnicodeData.txt".
_file_version_re = re.compile("([a-zA-Z0-9]+)" +
"-[0-9]+(?:\\.[0-9]+)*(?:d[0-9]+)?" +
"(\\.[a-z]+)$")
def PreprocessFiles(source_files, icu_src_root):
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
testdata_path = os.path.join(icu_src_root, "source", "test", "testdata")
folder_to_path = {
"unidata": unidata_path,
"testdata": testdata_path
}
files_processed = set()
for source_file in source_files:
basename = os.path.basename(source_file)
match = _file_version_re.match(basename)
if match:
basename = match.group(1) + match.group(2)
print "Preprocessing %s" % basename
if basename in _files:
if basename in files_processed:
raise Exception("duplicate file basename %s!" % basename)
files_processed.add(basename)
value = _files[basename]
preprocessor = value[0]
if len(value) >= 2 and isinstance(value[1], (str, unicode)):
# The value was [preprocessor, dest_folder, ...], leave [...].
dest_folder = value[1]
value = value[2:]
else:
# The value was [preprocessor, ...], leave [...].
dest_folder = "unidata"
value = value[1:]
dest_path = folder_to_path[dest_folder]
if not os.path.exists(dest_path): os.makedirs(dest_path)
dest_file = os.path.join(dest_path, basename)
parse_file = preprocessor(source_file, dest_file)
if value:
order = 9 if len(value) < 2 else value[1]
_files_to_parse[order].append((basename, parse_file, value[0]))
# Character names ---------------------------------------------------------- ***
# TODO: Turn this script into a module that
# a) gives access to the parsed data
# b) has a PreparseUCD(ucd_root, icu_src_root) function
# c) has a ParsePreparsedUCD(filename) function
# d) has a WritePreparsedUCD(filename) function
# and then use it from a new script for names.
# Some more API:
# - generator GetRangesAndProps() -> (start, end, props)*
def IncCounter(counters, key, inc=1):
if key in counters:
counters[key] += inc
else:
counters[key] = inc
endings = (
# List PHASE- before LETTER for BAMUM LETTER PHASE-xyz.
"PHASE-",
"LETTER ", "LIGATURE ", "CHARACTER ", "SYLLABLE ",
"CHOSEONG ", "JUNGSEONG ", "JONGSEONG ",
"SYLLABICS ", "IDEOGRAPH ", "IDEOGRAPH-", "IDEOGRAM ", "MONOGRAM ",
"ACROPHONIC ", "HIEROGLYPH ",
"DIGIT ", "NUMBER ", "NUMERAL ", "FRACTION ",
"PUNCTUATION ", "SIGN ", "SYMBOL ",
"TILE ", "CARD ", "FACE ",
"ACCENT ", "POINT ",
# List SIGN before VOWEL to catch "vowel sign".
"VOWEL ", "TONE ", "RADICAL ",
# For names of math symbols,
# e.g., MATHEMATICAL BOLD ITALIC CAPITAL A
"SCRIPT ", "FRAKTUR ", "MONOSPACE ",
"ITALIC ", "BOLD ", "DOUBLE-STRUCK ", "SANS-SERIF ",
"INITIAL ", "TAILED ", "STRETCHED ", "LOOPED ",
# BRAILLE PATTERN DOTS-xyz
"DOTS-",
"SELECTOR ", "SELECTOR-"
)
def SplitName(name, tokens):
start = 0
for e in endings:
i = name.find(e)
if i >= 0:
start = i + len(e)
token = name[:start]
IncCounter(tokens, token)
break
for i in xrange(start, len(name)):
c = name[i]
if c == ' ' or c == '-':
token = name[start:i + 1]
IncCounter(tokens, token)
start = i + 1
IncCounter(tokens, name[start:])
def PrintNameStats():
# TODO: This name analysis code is out of date.
# It needs to consider the multi-type Name_Alias values.
name_pnames = ("na", "na1", "Name_Alias")
counts = {}
for pname in name_pnames:
counts[pname] = 0
total_lengths = counts.copy()
max_length = 0
max_per_cp = 0
name_chars = set()
num_digits = 0
token_counters = {}
char_counters = {}
for i in xrange(len(_starts) - 1):
start = _starts[i]
# end = _starts[i + 1] - 1
props = _props[i]
per_cp = 0
for pname in name_pnames:
if pname in props:
counts[pname] += 1
name = props[pname]
total_lengths[pname] += len(name)
name_chars |= set(name)
if len(name) > max_length: max_length = len(name)
per_cp += len(name) + 1
if per_cp > max_per_cp: max_per_cp = per_cp
tokens = SplitName(name, token_counters)
for c in name:
if c in "0123456789": num_digits += 1
IncCounter(char_counters, c)
print
for pname in name_pnames:
print ("'%s' character names: %d / %d bytes" %
(pname, counts[pname], total_lengths[pname]))
print "%d total bytes in character names" % sum(total_lengths.itervalues())
print ("%d name-characters: %s" %
(len(name_chars), "".join(sorted(name_chars))))
print "%d digits 0-9" % num_digits
count_chars = [(count, c) for (c, count) in char_counters.iteritems()]
count_chars.sort(reverse=True)
for cc in count_chars:
print "name-chars: %6d * '%s'" % cc
print "max. name length: %d" % max_length
print "max. length of all (names+NUL) per cp: %d" % max_per_cp
token_lengths = sum([len(t) + 1 for t in token_counters])
print ("%d total tokens, %d bytes with NUL" %
(len(token_counters), token_lengths))
counts_tokens = []
for (token, count) in token_counters.iteritems():
# If we encode a token with a 1-byte code, then we save len(t)-1 bytes each time
# but have to store the token string itself with a length or terminator byte,
# plus a 2-byte entry in an token index table.
savings = count * (len(token) - 1) - (len(token) + 1 + 2)
if savings > 0:
counts_tokens.append((savings, count, token))
counts_tokens.sort(reverse=True)
print "%d tokens might save space with 1-byte codes" % len(counts_tokens)
# Codes=bytes, 40 byte values for name_chars.
# That leaves 216 units for 1-byte tokens or lead bytes of 2-byte tokens.
# Make each 2-byte token the token string index itself, rather than
# and index into a string index table.
# More lead bytes but also more savings.
num_units = 256
max_lead = (token_lengths + 255) / 256
max_token_units = num_units - len(name_chars)
results = []
for num_lead in xrange(min(max_lead, max_token_units) + 1):
max1 = max_token_units - num_lead
ct = counts_tokens[:max1]
tokens1 = set([t for (s, c, t) in ct])
for (token, count) in token_counters.iteritems():
if token in tokens1: continue
# If we encode a token with a 2-byte code, then we save len(t)-2 bytes each time
# but have to store the token string itself with a length or terminator byte.
savings = count * (len(token) - 2) - (len(token) + 1)
if savings > 0:
ct.append((savings, count, token))
ct.sort(reverse=True)
# A 2-byte-code-token index cannot be limit_t_lengths or higher.
limit_t_lengths = num_lead * 256
token2_index = 0
for i in xrange(max1, len(ct)):
if token2_index >= limit_t_lengths:
del ct[i:]
break
token2_index += len(ct[i][2]) + 1
cumul_savings = sum([s for (s, c, t) in ct])
# print ("%2d 1-byte codes: %4d tokens might save %6d bytes" %
# (max1, len(ct), cumul_savings))
results.append((cumul_savings, max1, ct))
best = max(results) # (cumul_savings, max1, ct)
max1 = best[1]
print ("maximum savings: %d bytes with %d 1-byte codes & %d lead bytes" %
(best[0], max1, max_token_units - max1))
counts_tokens = best[2]
cumul_savings = 0
for i in xrange(len(counts_tokens)):
n = 1 if i < max1 else 2
i1 = i + 1
t = counts_tokens[i]
cumul_savings += t[0]
if i1 <= 250 or (i1 % 100) == 0 or i1 == len(counts_tokens):
print (("%04d. cumul. %6d bytes save %6d bytes from " +
"%5d * %d-byte token for %2d='%s'") %
(i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
# ICU API ------------------------------------------------------------------ ***
# Sample line to match:
# UCHAR_UNIFIED_IDEOGRAPH=29,
_uchar_re = re.compile(
" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")
# Sample line to match:
# /** Zs @stable ICU 2.0 */
_gc_comment_re = re.compile(" */\*\* *([A-Z][a-z]) ")
# Sample line to match:
# U_SPACE_SEPARATOR = 12,
_gc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")
# Sample line to match:
# /** L @stable ICU 2.0 */
_bc_comment_re = re.compile(" */\*\* *([A-Z]{1,3}) ")
# Sample line to match:
# U_LEFT_TO_RIGHT = 0,
_bc_re = re.compile(" *(U_[A-Z_]+) *= *[0-9]+,")
# Sample line to match:
# UBLOCK_CYRILLIC =9, /*[0400]*/
_ublock_re = re.compile(" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")
# Sample line to match:
# U_EA_AMBIGUOUS, /*[A]*/
_prop_and_value_re = re.compile(
" *(U_(DT|EA|GCB|HST|LB|JG|JT|NT|SB|WB)_([0-9A-Z_]+))")
# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
# U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(" *U_[0-9A-Z_]+ *= *U")
def ParseUCharHeader(icu_src_root):
uchar_path = os.path.join(icu_src_root, "source",
"common", "unicode", "uchar.h")
with open(uchar_path, "r") as uchar_file:
mode = "" # Mode string (=pname) during context-sensitive parsing.
comment_value = "" # Property value from a comment preceding an enum.
# Note: The enum UProperty is first in uchar.h, before the enums for values.
for line in uchar_file:
# Parse some enums via context-sensitive "modes".
# Necessary because the enum constant names do not contain
# enough information.
if "enum UCharCategory" in line:
mode = "gc"
comment_value = ""
continue
if mode == "gc":
if line.startswith("}"):
mode = ""
continue
match = _gc_comment_re.match(line)
if match:
comment_value = match.group(1)
continue
match = _gc_re.match(line)
if match and comment_value:
gc_enum = match.group(1)
prop = _properties["gc"]
vname = GetShortPropertyValueName(prop, comment_value)
icu_values = _pname_to_icu_prop["gc"][2]
icu_values.append((gc_enum, vname))
comment_value = ""
continue
if "enum UCharDirection {" in line:
mode = "bc"
comment_value = ""
continue
if mode == "bc":
if line.startswith("}"):
mode = ""
continue
match = _bc_comment_re.match(line)
if match:
comment_value = match.group(1)
continue
match = _bc_re.match(line)
if match and comment_value:
bc_enum = match.group(1)
prop = _properties["bc"]
vname = GetShortPropertyValueName(prop, comment_value)
icu_values = _pname_to_icu_prop["bc"][2]
icu_values.append((bc_enum, vname))
comment_value = ""
continue
# No mode, parse enum constants whose names contain
# enough information to parse without requiring context.
match = _uchar_re.match(line)
if match:
prop_enum = match.group(1)
if prop_enum.endswith("_LIMIT"):
# Ignore "UCHAR_BINARY_LIMIT=57," etc.
continue
pname = GetShortPropertyName(prop_enum[6:])
icu_prop = (prop_enum, pname, [])
_icu_properties.append(icu_prop)
_pname_to_icu_prop[pname] = icu_prop
continue
match = _ublock_re.match(line)
if match:
prop_enum = match.group(1)
if prop_enum == "UBLOCK_COUNT":
continue
prop = _properties["blk"]
vname = GetShortPropertyValueName(prop, prop_enum[7:])
icu_values = _pname_to_icu_prop["blk"][2]
icu_values.append((prop_enum, vname))
continue
match = _prop_and_value_re.match(line)
if match:
(prop_enum, vname) = match.group(1, 3)
if vname == "COUNT" or _prop_and_alias_re.match(line):
continue
pname = GetShortPropertyName(match.group(2))
prop = _properties[pname]
vname = GetShortPropertyValueName(prop, vname)
icu_values = _pname_to_icu_prop[pname][2]
icu_values.append((prop_enum, vname))
# ccc, lccc, tccc use their numeric values as "enum" values.
# In the UCD data, these numeric values are the first value names,
# followed by the short & long value names.
# List the ccc values in numeric order.
prop = _properties["ccc"]
icu_values = _pname_to_icu_prop["ccc"][2]
for ccc in sorted([int(name) for name in prop[2]]):
icu_values.append((ccc, str(ccc)))
_pname_to_icu_prop["lccc"][2].extend(icu_values) # Copy ccc -> lccc.
_pname_to_icu_prop["tccc"][2].extend(icu_values) # Copy ccc -> tccc.
# No need to parse predictable General_Category_Mask enum constants.
# Just define them in ASCII order.
prop = _properties["gcm"]
icu_values = _pname_to_icu_prop["gcm"][2]
for vname in sorted(prop[2]):
icu_values.append(("U_GC_" + vname.upper() + "_MASK", vname))
# Hardcode known values for the normalization quick check properties,
# see unorm2.h for the UNormalizationCheckResult enum.
icu_values = _pname_to_icu_prop["NFC_QC"][2]
icu_values.append(("UNORM_NO", "N"))
icu_values.append(("UNORM_YES", "Y"))
icu_values.append(("UNORM_MAYBE", "M"))
_pname_to_icu_prop["NFKC_QC"][2].extend(icu_values) # Copy NFC -> NFKC.
# No "maybe" values for NF[K]D.
icu_values = _pname_to_icu_prop["NFD_QC"][2]
icu_values.append(("UNORM_NO", "N"))
icu_values.append(("UNORM_YES", "Y"))
_pname_to_icu_prop["NFKD_QC"][2].extend(icu_values) # Copy NFD -> NFKD.
# Sample line to match:
# USCRIPT_LOMA = 139,/* Loma */
_uscript_re = re.compile(
" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
def ParseUScriptHeader(icu_src_root):
uscript_path = os.path.join(icu_src_root, "source",
"common", "unicode", "uscript.h")
icu_values = _pname_to_icu_prop["sc"][2]
with open(uscript_path, "r") as uscript_file:
for line in uscript_file:
match = _uscript_re.match(line)
if match:
(script_enum, script_code) = match.group(1, 2)
icu_values.append((script_enum, script_code))
def CheckPNamesData():
"""Checks that every ICU property has a full set of value enum constants,
and that the _icu_properties value names map back to the UCD."""
missing_enums = []
for (p_enum, pname, values) in _icu_properties:
prop = _properties[pname]
vnames = set(prop[2]) # Modifiable copy of the set of short value names.
for (v_enum, vname) in values:
if vname not in vnames:
raise ValueError("%s = %s (uchar.h %s) not in the UCD\n" %
(pname, vname, v_enum))
vnames.remove(vname)
# Exceptions to the all-values check:
# - ICU does not have specific enum values for binary No/Yes.
# - ICU represents Age values via UVersionInfo rather than enum constants.
# - gc: ICU enum UCharCategory only has the single-category values.
# (ICU's gcm property has all of the UCD gc property values.)
if vnames and not (prop[0] == "Binary" or pname in ("age", "gc")):
missing_enums.append((pname, vnames))
if missing_enums:
raise ValueError(
"missing uchar.h enum constants for some property values: %s" %
missing_enums)
def WritePNamesDataHeader(out_path):
year = datetime.date.today().strftime("%Y")
with open(out_path, "w") as out_file:
out_file.write("""/**
* Copyright (C) 2002-""" + year +
""", International Business Machines Corporation and
* others. All Rights Reserved.
*
* machine-generated by: icu/tools/unicode/py/preparseucd.py
*
* Unicode """ + _ucd_version + """
*/
""")
# Write an array of "binprop" Value object initializers
# with the value aliases shared among all binary properties.
out_file.write("const int32_t VALUES_binprop_COUNT = 2;\n\n")
out_file.write("const Value VALUES_binprop[] = {\n")
out_file.write(' Value(0, "%s"),\n' % " ".join(_binary_values["N"]))
out_file.write(' Value(1, "%s"),\n' % " ".join(_binary_values["Y"]))
out_file.write("};\n\n")
# For each property with named values, write an array of
# Value object initializers with the value enum and the aliases.
for (p_enum, pname, values) in _icu_properties:
if not values: continue
out_file.write("const int32_t VALUES_%s_COUNT = %d;\n\n" %
(pname, len(values)))
out_file.write("const Value VALUES_%s[] = {\n" % pname)
for (v_enum, vname) in values:
aliases = _properties[pname][3][vname]
# ccc, lccc, tccc: Omit the numeric strings from the aliases.
# (See the comment about ccc in the PropertyValueAliases.txt header.)
if pname.endswith("ccc"): aliases = aliases[1:]
cast = "(int32_t)" if pname == "gcm" else ""
out_file.write(' Value(%s%s, "%s"),\n' %
(cast, v_enum, " ".join(aliases)))
out_file.write("};\n\n")
# For each property, write a Property object initializer
# with the property enum, its aliases, and a reference to its values.
out_file.write("const int32_t PROPERTIES_COUNT = %d;\n\n" %
len(_icu_properties))
out_file.write("const Property PROPERTIES[] = {\n")
for (enum, pname, values) in _icu_properties:
prop = _properties[pname]
aliases = prop[1]
if values: # Property with named values.
if prop[0] == "Binary": pname = "binprop"
out_file.write(
' Property(%s, "%s", VALUES_%s, VALUES_%s_COUNT),\n' %
(enum, " ".join(aliases), pname, pname))
else:
out_file.write(' Property(%s, "%s", NULL, 0),\n' %
(enum, " ".join(aliases)))
out_file.write("};\n")
# main() ------------------------------------------------------------------- ***
def main():
global _null_or_defaults
if len(sys.argv) < 4:
print ("Usage: %s path/to/UCD/root path/to/ICU/src/root "
"path/to/ICU/tools/root" % sys.argv[0])
return
(ucd_root, icu_src_root, icu_tools_root) = sys.argv[1:4]
source_files = []
for root, dirs, files in os.walk(ucd_root):
for file in files:
source_files.append(os.path.join(root, file))
PreprocessFiles(source_files, icu_src_root)
# Parse the processed files in a particular order.
for files in _files_to_parse:
for (basename, path, parser) in files:
print "Parsing %s" % basename
value = _files[basename]
if basename == "NamesList.txt":
in_file = codecs.open(path, "r", "ISO-8859-1")
else:
in_file = open(path, "r")
with in_file:
parser(in_file)
_null_or_defaults = _null_values.copy()
_null_or_defaults.update(_defaults)
# Every Catalog and Enumerated property must have a default value,
# from a @missing line. "nv" = "null value".
pnv = [pname for (pname, nv) in _null_or_defaults.iteritems() if nv == "??"]
if pnv:
raise Exception("no default values (@missing lines) for " +
"some Catalog or Enumerated properties: %s " % pnv)
# Optimize block vs. cp properties.
CompactBlocks()
# Write the ppucd.txt output file.
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
if not os.path.exists(unidata_path): os.makedirs(unidata_path)
out_path = os.path.join(unidata_path, "ppucd.txt")
with open(out_path, "w") as out_file:
WritePreparsedUCD(out_file)
out_file.flush()
# TODO: PrintNameStats()
# ICU data for property & value names API
ParseUCharHeader(icu_src_root)
ParseUScriptHeader(icu_src_root)
CheckPNamesData()
genprops_path = os.path.join(icu_tools_root, "unicode", "c", "genprops")
if not os.path.exists(genprops_path): os.makedirs(genprops_path)
out_path = os.path.join(genprops_path, "pnames_data.h")
WritePNamesDataHeader(out_path)
if __name__ == "__main__":
main()