e4f1a48919
X-SVN-Rev: 28649
183 lines
5.6 KiB
Python
Executable File
183 lines
5.6 KiB
Python
Executable File
#!/usr/bin/python2.4
|
|
# Copyright (c) 2009-2010 International Business Machines
|
|
# Corporation and others. All Rights Reserved.
|
|
#
|
|
# file name: ucdcopy.py
|
|
# encoding: US-ASCII
|
|
# tab size: 8 (not used)
|
|
# indentation:4
|
|
#
|
|
# created on: 2009aug04
|
|
# created by: Markus W. Scherer
|
|
#
|
|
# Copy Unicode Character Database (ucd) files from a tree
|
|
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
|
|
# to a folder like ICU's source/data/unidata/
|
|
# and modify some of the files to make them more compact.
|
|
#
|
|
# Invoke with two command-line parameters, for the source
|
|
# and destination folders.
|
|
|
|
import os
|
|
import os.path
|
|
import re
|
|
import shutil
|
|
import sys
|
|
|
|
# Matches a data line (one that starts with hex digits) that carries a
# trailing "#" comment. Group 1 is the data part without the comment and
# without the spaces in front of the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")

# Matches a single code point at the start of a line, up to and including the
# ";" field separator. Group 1 is the hex code point. Lines that start with a
# range ("0041..005A;") do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")
|
|
|
|
def _WriteRange(out_file, first, last, data):
    """Writes one data line for the code points [first..last] sharing data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies file s to file t, stripping comments behind data lines.

    A line that starts with hex digits and has a trailing "#" comment is
    reduced to its data part; every other line only loses trailing
    whitespace. Each output line is terminated with "\n".

    If do_merge is true, consecutive single-code point lines whose code
    points are adjacent and whose remaining data (from the ";" on) is
    identical are merged into one "first..last" range line. Lines that do
    not start with a single code point are written as they are and end any
    range being collected.

    Args:
        s: Path of the source file.
        t: Path of the destination file; it is overwritten.
        do_merge: Whether to merge adjacent code points into ranges.

    Both files are closed even if reading or writing fails.
    """
    in_file = open(s, "r")
    try:
        out_file = open(t, "w")
        try:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if not do_merge:
                    # Only strip, don't merge: just output the stripped line.
                    out_file.write(line)
                    out_file.write("\n")
                    continue
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    # Keep the ";" so that the data can be appended directly
                    # behind the code point or range.
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # This line does not continue the current range.
                    _WriteRange(out_file, first, last, first_data)
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # No data on this line, output as is.
                    out_file.write(line)
                    out_file.write("\n")
                elif last < 0:
                    # Set as the first line in a possible range.
                    first = c
                    last = c
                    first_data = data
                else:
                    # Must be c == (last + 1) and data == first_data because
                    # of the previous conditions: continue the current range.
                    last = c
            if do_merge and last >= 0:
                # Output the last range in the file.
                _WriteRange(out_file, first, last, first_data)
        finally:
            out_file.close()
    finally:
        in_file.close()
|
|
|
|
|
|
def CopyAndStrip(s, t):
    """Copies file s to file t and drops the comments behind data lines.

    Lines that are not data lines, such as whole-line comments, are kept.
    No lines are merged.
    """
    CopyAndStripWithOptionalMerge(s, t, do_merge=False)
|
|
|
|
|
|
def CopyAndStripAndMerge(s, t):
    """Copies file s to file t, strips comments and merges adjacent lines.

    In addition to removing the comments behind data lines, data lines for
    adjacent code points that carry identical per-code point data are
    combined into a single line that uses range syntax.
    """
    CopyAndStripWithOptionalMerge(s, t, do_merge=True)
|
|
|
|
|
|
# Maps the name of each known UCD file (without version suffix) to the action
# that copies it. The value is either a callable taking (source, destination),
# or a tuple of such a callable and the name of the destination subfolder.
_files = {}

# Simply copy these files.
_files.update(dict.fromkeys((
    "BidiMirroring.txt",
    "Blocks.txt",
    "CaseFolding.txt",
    "DerivedAge.txt",
    "DerivedBidiClass.txt",
    "DerivedJoiningGroup.txt",
    "DerivedJoiningType.txt",
    "DerivedNumericValues.txt",
    "NameAliases.txt",
    "NormalizationCorrections.txt",
    "PropertyAliases.txt",
    "PropertyValueAliases.txt",
    "ScriptExtensions.txt",
    "SpecialCasing.txt",
    "UnicodeData.txt",
), shutil.copy))

# Simply copy these test files as well, but into the "testdata" folder.
_files.update(dict.fromkeys((
    "BidiTest.txt",
    "GraphemeBreakTest.txt",
    "LineBreakTest.txt",
    "SentenceBreakTest.txt",
    "WordBreakTest.txt",
), (shutil.copy, "testdata")))

# Copy these files and remove comments behind data lines but not in others.
_files.update(dict.fromkeys((
    "DerivedCoreProperties.txt",
    "DerivedNormalizationProps.txt",
    "GraphemeBreakProperty.txt",
    "NormalizationTest.txt",
    "PropList.txt",
    "Scripts.txt",
    "SentenceBreakProperty.txt",
    "WordBreakProperty.txt",
), CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_files.update(dict.fromkeys((
    "EastAsianWidth.txt",
    "LineBreak.txt",
), CopyAndStripAndMerge))
|
|
|
|
# Matches a UCD file name that carries a version suffix, for example
# "LineBreak-5.2.0.txt" or "DerivedAge-10.0.0d3.txt".
# Group 1 is the name in front of the "-", group 2 is the file extension
# including its dot. Every version component and the optional draft number
# ("d3") may consist of several digits.
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
|
|
|
|
def main():
    """Copies the known UCD files from a source tree to a destination tree.

    Takes the source folder from sys.argv[1] and the destination folder from
    sys.argv[2]. Every file below the source folder whose name, after removing
    a version suffix, is a key of _files is passed to the action registered
    there. The result is written to the "unidata" subfolder of the
    destination, or to the subfolder named in the _files entry. Destination
    folders are created as needed.

    Exits with status 2 if the two folders are not given, and with status 1
    if two source files map to the same file name.
    """
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s <source folder> <destination folder>\n" %
                         sys.argv[0])
        sys.exit(2)
    source_root = sys.argv[1]
    dest_root = sys.argv[2]
    source_files = []
    for root, _, names in os.walk(source_root):
        for name in names:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            # Drop the version suffix: "Name-5.2.0.txt" becomes "Name.txt".
            basename = match.group(1) + match.group(2)
            print(basename)
        if basename not in _files:
            continue
        if basename in files_processed:
            print("duplicate file basename %s!" % basename)
            sys.exit(1)
        files_processed.add(basename)
        action = _files[basename]
        if isinstance(action, tuple):
            # (action, destination subfolder)
            dest_folder = action[1]
            action = action[0]
        else:
            dest_folder = "unidata"
        dest_path = os.path.join(dest_root, dest_folder)
        if not os.path.exists(dest_path):
            os.makedirs(dest_path)
        dest_file = os.path.join(dest_path, basename)
        action(source_file, dest_file)
|
|
|
|
|
|
# Run the copy only when this file is executed as a script.
if __name__ == "__main__":
    main()
|