ICU-8972 progress towards preparseucd.py also parsing uchar.h & uscript.h and writing pnames_data.h; add remaining ICU-specific properties that used to be in SyntheticPropertyAliases.txt

X-SVN-Rev: 31157
This commit is contained in:
Markus Scherer 2011-12-19 05:21:15 +00:00
parent 7c797527dc
commit 5597c3ef96

View File

@ -13,16 +13,17 @@
#
# Copies Unicode Character Database (UCD) files from a tree
# of files downloaded from (for example) ftp://www.unicode.org/Public/6.1.0/
# to a folder like ICU's source/data/unidata/
# to ICU's source/data/unidata/ and source/test/testdata/
# and modifies some of the files to make them more compact.
# Parses them an writes a ppucd.txt file (PreParsed UCD) with simple syntax.
# Parses them and writes unidata/ppucd.txt (PreParsed UCD) with simple syntax.
#
# Invoke with two command-line parameters:
# Invoke with three command-line parameters:
# 1. source folder with UCD files
# 2. destination folder for processed output files
# 2. ICU source root folder
# 3. ICU tools root folder
#
# Sample invocation:
# ~/svn.icu/tools/trunk/src/unicode/py$ ./preparseucd.py ~/uni61/20111205/ucd /tmp/ucd
# ~/svn.icu/tools/trunk/src/unicode$ py/preparseucd.py ~/uni61/20111205mod/ucd ~/svn.icu/trunk/src ~/svn.icu/tools/trunk/src
import array
import bisect
@ -108,7 +109,8 @@ _ignored_properties = set((
# 0: Type of property (binary, enum, ...)
# 1: List of aliases; short & long name followed by other aliases.
# The short name is "" if it is listed as "n/a" in PropertyValueAliases.txt.
# 2: Set of short property value names.
# 2: Dictionary, maps short property value names
# initially to None, later to ICU4C API enum constants.
# 3: Dictionary of property values.
# For Catalog & Enumerated properties,
# maps each value name to a list of aliases.
@ -137,6 +139,9 @@ _defaults = {}
# Initialized after parsing is done.
_null_or_defaults = {}
# Dictionary of short property names mapped to ICU4C UProperty enum constants.
_property_name_to_enum = {}
_non_alnum_re = re.compile("[^a-zA-Z0-9]")
def NormPropName(pname):
@ -378,6 +383,22 @@ def ReadUCDLines(in_file, want_ranges=True, want_other=False,
raise SyntaxError("unable to parse line\n %s\n" % line)
def AddBinaryProperty(short_name, long_name):
    """Registers an extra binary property under all of its lookup keys.

    The new property tuple has type "Binary" and reuses items 2 and 3 (the
    value-name dictionaries) of the property stored under "Math" instead of
    building its own.  The null value for short_name is set to False.

    Args:
      short_name: Short property alias; also the key written to _null_values.
      long_name: Long property alias.

    Raises:
      KeyError: if no property is registered under "Math" yet.
    """
    _null_values[short_name] = False
    template = _properties["Math"]
    new_prop = ("Binary", [short_name, long_name], template[2], template[3])
    aliases = (short_name, long_name)
    # First the aliases as spelled, then their normalized forms.
    for alias in aliases:
        _properties[alias] = new_prop
    for alias in aliases:
        _properties[NormPropName(alias)] = new_prop
def AddPOSIXBinaryProperty(short_name, long_name):
    """Registers a binary property plus an extra "posix"-prefixed lookup key.

    Does everything AddBinaryProperty() does and then also files the same
    property under "posix" + the normalized short name, so that it can be
    found from UProperty constants such as UCHAR_POSIX_ALNUM.

    Args:
      short_name: Short property alias.
      long_name: Long property alias.
    """
    AddBinaryProperty(short_name, long_name)
    posix_key = "posix" + NormPropName(short_name)
    _properties[posix_key] = _properties[short_name]
# Match a comment line like
# PropertyAliases-6.1.0.txt
# and extract the Unicode version.
@ -424,7 +445,7 @@ def ParsePropertyAliases(in_file):
_null_values[name] = 0
else:
_null_values[name] = null_value
prop = (prop_type, aliases, set(), {})
prop = (prop_type, aliases, {}, {})
for alias in aliases:
_properties[alias] = prop
_properties[NormPropName(alias)] = prop
@ -438,13 +459,13 @@ def ParsePropertyAliases(in_file):
# Turkic_Case_Folding: The 'T' mappings in CaseFolding.txt.
name = "Turkic_Case_Folding"
_null_values[name] = ""
prop = ("String", [name, name], set(), {})
prop = ("String", [name, name], {}, {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# Conditional_Case_Mappings: SpecialCasing.txt lines with conditions.
name = "Conditional_Case_Mappings"
_null_values[name] = ""
prop = ("Miscellaneous", [name, name], set(), {})
prop = ("Miscellaneous", [name, name], {}, {})
_properties[name] = prop
_properties[NormPropName(name)] = prop
# lccc = ccc of first cp in canonical decomposition.
@ -465,10 +486,31 @@ def ParsePropertyAliases(in_file):
# Script_Extensions
if "scx" not in _properties:
_null_values["scx"] = ""
prop = ("Miscellaneous", ["scx", "Script_Extensions"], set(), {})
prop = ("Miscellaneous", ["scx", "Script_Extensions"], {}, {})
_properties["scx"] = prop
_properties["Script_Extensions"] = prop
_properties["scriptextensions"] = prop
# General Category as a bit mask.
_null_values["gcm"] = "??"
gc_prop = _properties["gc"]
prop = ("Bitmask", ["gcm", "General_Category_Mask"], gc_prop[2], gc_prop[3])
_properties["gcm"] = prop
_properties["General_Category_Mask"] = prop
_properties["generalcategorymask"] = prop
# Various binary properties.
AddBinaryProperty("Sensitive", "Case_Sensitive")
AddBinaryProperty("nfdinert", "NFD_Inert")
AddBinaryProperty("nfkdinert", "NFKD_Inert")
AddBinaryProperty("nfcinert", "NFC_Inert")
AddBinaryProperty("nfkcinert", "NFKC_Inert")
AddBinaryProperty("segstart", "Segment_Starter")
# C/POSIX character classes that do not have Unicode property [value] aliases.
# See uchar.h.
AddPOSIXBinaryProperty("alnum", "alnum")
AddPOSIXBinaryProperty("blank", "blank")
AddPOSIXBinaryProperty("graph", "graph")
AddPOSIXBinaryProperty("print", "print")
AddPOSIXBinaryProperty("xdigit", "xdigit")
def ParsePropertyValueAliases(in_file):
@ -488,7 +530,7 @@ def ParsePropertyValueAliases(in_file):
if short_name == "n/a": # no short name
fields[0] = ""
short_name = fields[1]
prop[2].add(short_name)
prop[2][short_name] = None
values = prop[3]
for alias in fields:
if alias:
@ -510,17 +552,19 @@ def ParsePropertyValueAliases(in_file):
_defaults["hst"] = "NA"
if "gc" not in _defaults: # No @missing line in any .txt file?
_defaults["gc"] = "Cn"
# Copy the gc default value to gcm.
_defaults["gcm"] = _defaults["gc"]
# Add ISO 15924-only script codes.
# Only for the ICU script code API, not necessary for parsing the UCD.
script_prop = _properties["sc"]
short_script_names = script_prop[2] # set
short_script_names = script_prop[2] # dict
script_values = script_prop[3] # dict
remove_scripts = []
for script in _scripts_only_in_iso15924:
if script in short_script_names:
remove_scripts.append(script)
else:
short_script_names.add(script)
short_script_names[script] = None
# Do not invent a Unicode long script name before the UCD adds the script.
script_list = [script, script] # [short, long]
script_values[script] = script_list
@ -1413,15 +1457,136 @@ def PrintNameStats():
"%5d * %d-byte token for %2d='%s'") %
(i1, cumul_savings, t[0], t[1], n, len(t[2]), t[2]))
# ICU API ------------------------------------------------------------------ ***
# Sample line to match:
#    USCRIPT_LOMA                  = 139,/* Loma */
# Group 1 captures the USCRIPT_ constant name; group 2 captures the
# four-letter script code from the trailing C comment.
# Written as a raw string because the pattern contains \* sequences,
# which are not valid escapes in an ordinary string literal.
_uscript_re = re.compile(
    r" *(USCRIPT_[A-Z_]+) *= *[0-9]+ *, */\* *([A-Z][a-z]{3}) *\*/")
def ParseUScriptHeader(icu_src_root):
    """Reads uscript.h and records the USCRIPT_ constant for each script code.

    Every header line that matches _uscript_re yields a constant name and a
    four-letter script code.  Codes already present as keys of the "sc"
    property's short-value-name dictionary get the constant name stored as
    their value; all other codes are collected and reported together.

    Args:
      icu_src_root: ICU source root folder; the header is opened at
          <icu_src_root>/source/common/unicode/uscript.h.

    Raises:
      ValueError: if the header has constants for script codes that are not
          keys of the "sc" short-value-name dictionary.
    """
    header_path = os.path.join(
        icu_src_root, "source", "common", "unicode", "uscript.h")
    enum_by_script = _properties["sc"][2]
    unknown_codes = set()
    with open(header_path, "r") as header:
        for line in header:
            found = _uscript_re.match(line)
            if not found:
                continue
            enum_name, code = found.group(1, 2)
            if code in enum_by_script:
                enum_by_script[code] = enum_name
            else:
                unknown_codes.add(code)
    # Report all unknown codes at once, after the whole file has been read.
    if unknown_codes:
        raise ValueError("uscript.h has UScript constants for scripts "
                         "not in the UCD nor in ISO 15924: %s" % unknown_codes)
# The patterns below are raw strings so that regex escapes such as \*
# are not interpreted (and warned about) as string-literal escapes.

# Sample line to match:
#    UCHAR_UNIFIED_IDEOGRAPH=29,
# The value may be decimal or 0x-prefixed hexadecimal.
_uchar_re = re.compile(
    r" *(UCHAR_[0-9A-Z_]+) *= *(?:[0-9]+|0x[0-9a-fA-F]+),")

# Sample line to match:
#    /** L @stable ICU 2.0 */
# Group 1 is the run of capital letters right after the comment opener.
_bc_comment_re = re.compile(r" */\*\* *([A-Z]+) *")

# Sample line to match:
#    U_LEFT_TO_RIGHT               = 0,
_bc_re = re.compile(r" *(U_[A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    UBLOCK_CYRILLIC =9, /*[0400]*/
_ublock_re = re.compile(r" *(UBLOCK_[0-9A-Z_]+) *= *[0-9]+,")

# Sample line to match:
#    U_EA_AMBIGUOUS,   /*[A]*/
# Group 1 is the whole constant, group 2 the property abbreviation,
# group 3 the value part of the name.
_prop_and_value_re = re.compile(
    r" *(U_(DT|EA|GCB|HST|LB|JG|JT|NT|SB|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
#    U_JG_HAMZA_ON_HEH_GOAL=U_JG_TEH_MARBUTA_GOAL,
_prop_and_alias_re = re.compile(r" *U_[0-9A-Z_]+ *= *U")
def ParseUCharHeader(icu_src_root):
    """Reads uchar.h and records ICU4C enum constants for properties and values.

    Scans the header line by line and fills in:
      - _property_name_to_enum: short property name -> UCHAR_ constant.
      - item 2 of the "bc" property: value name -> U_ constant, using the
        capital-letter tag of the preceding /** ... */ comment inside
        "enum UCharDirection" as the value name to look up.
      - item 2 of the "blk" property: value name -> UBLOCK_ constant.
      - item 2 of the property named by the DT/EA/GCB/... infix of a
        U_<prop>_<value> constant: value name -> that constant.
    Finally sets every entry of item 2 of the "gcm" property to
    "U_GC_" + upper-cased value name + "_MASK" without consulting the header.

    Args:
      icu_src_root: ICU source root folder; the header is opened at
          <icu_src_root>/source/common/unicode/uchar.h.
    """
    header_path = os.path.join(
        icu_src_root, "source", "common", "unicode", "uchar.h")
    with open(header_path, "r") as header:
        in_bc_enum = False
        # One variable serves all branches below, as a scratch reference to
        # the property currently being filled in.
        prop = None
        bc_value_tag = "??"
        for line in header:
            if "enum UCharDirection {" in line:
                in_bc_enum = True
                prop = _properties["bc"]
                bc_value_tag = "??"
                continue
            if in_bc_enum:
                if line.startswith("}"):
                    in_bc_enum = False
                    continue
                found = _bc_comment_re.match(line)
                if found:
                    # Remember the tag; it names the next enum constant.
                    bc_value_tag = found.group(1)
                    continue
                found = _bc_re.match(line)
                if found:
                    bc_constant = found.group(1)
                    short_value = GetShortPropertyValueName(prop, bc_value_tag)
                    prop[2][short_value] = bc_constant
                    continue
                # Any other line inside the enum falls through to the
                # generic checks below.
            found = _uchar_re.match(line)
            if found:
                constant = found.group(1)
                # Ignore "UCHAR_BINARY_LIMIT=57," etc.
                if not constant.endswith("_LIMIT"):
                    # [6:] drops the "UCHAR_" prefix.
                    short_prop = GetShortPropertyName(constant[6:])
                    _property_name_to_enum[short_prop] = constant
                continue
            found = _ublock_re.match(line)
            if found:
                constant = found.group(1)
                if constant != "UBLOCK_COUNT":
                    prop = _properties["blk"]
                    # [7:] drops the "UBLOCK_" prefix.
                    short_value = GetShortPropertyValueName(prop, constant[7:])
                    prop[2][short_value] = constant
                continue
            found = _prop_and_value_re.match(line)
            if found:
                constant = found.group(1)
                value_part = found.group(3)
                # Skip the _COUNT terminator and "X = U_..." alias lines.
                if value_part == "COUNT" or _prop_and_alias_re.match(line):
                    continue
                prop = GetProperty(found.group(2))
                short_value = GetShortPropertyValueName(prop, value_part)
                prop[2][short_value] = constant
    # No need to parse predictable General_Category_Mask enum constants.
    # NOTE(review): ParsePropertyAliases builds the "gcm" tuple from gc_prop[2],
    # so this dictionary appears to be the same object as the one for "gc" and
    # the loop rewrites gc's entries too -- confirm that this is intended.
    gcm_enum_by_value = _properties["gcm"][2]
    for short_value in gcm_enum_by_value:
        gcm_enum_by_value[short_value] = "U_GC_" + short_value.upper() + "_MASK"
def WritePNamesDataHeader(icu_tools_root):
    """Work-in-progress stub: dumps two value-name-to-enum dictionaries.

    Currently only prints item 2 of the "ea" and "gcm" properties to stdout.
    Nothing is written to a file yet.

    Args:
      icu_tools_root: ICU tools root folder; not used by the current body.
    """
    short_script_name_to_enum = _properties["sc"][2]
    # print short_script_name_to_enum
    # print _property_name_to_enum
    # Parenthesized single-argument print behaves like the print statement.
    for pname in ("ea", "gcm"):
        print(_properties[pname][2])
# main() ------------------------------------------------------------------- ***
def main():
global _null_or_defaults
if len(sys.argv) < 3:
print """Usage: preparseucd path/to/UCD/root path/to/ICU/src/root"""
if len(sys.argv) < 4:
print ("Usage: %s path/to/UCD/root path/to/ICU/src/root "
"path/to/ICU/tools/root" % sys.argv[0])
return
ucd_root = sys.argv[1]
icu_src_root = sys.argv[2]
(ucd_root, icu_src_root, icu_tools_root) = sys.argv[1:4]
source_files = []
for root, dirs, files in os.walk(ucd_root):
for file in files:
@ -1456,6 +1621,10 @@ def main():
WritePreparsedUCD(out_file)
out_file.flush()
# TODO: PrintNameStats()
ParseUScriptHeader(icu_src_root)
ParseUCharHeader(icu_src_root)
WritePNamesDataHeader(icu_tools_root)
if __name__ == "__main__":
main()