ef5aeb05a3
X-SVN-Rev: 30880
92 lines
3.2 KiB
Python
Executable File
92 lines
3.2 KiB
Python
Executable File
#!/usr/bin/python2.4
|
|
# Copyright (C) 2010-2011, International Business Machines
|
|
# Corporation and others. All Rights Reserved.
|
|
#
|
|
# file name: idna2nrm.py
|
|
# encoding: US-ASCII
|
|
# tab size: 8 (not used)
|
|
# indentation:4
|
|
#
|
|
# created on: 2010jan28
|
|
# created by: Markus W. Scherer
|
|
|
|
"""Turn Unicode IdnaMappingTable.txt into ICU gennorm2 source file format."""
|
|
|
|
__author__ = "Markus Scherer"
|
|
|
|
import re
|
|
|
|
# Ordered (compiled pattern, replacement) pairs that rewrite one data line of
# IdnaMappingTable.txt into gennorm2 syntax.  Every pair is applied to every
# data line, in list order, so the more specific patterns (FFFD handling,
# disallowed_STD3_*) must stay ahead of the generic ones.
replacements = [
    # Several versions of avoiding circular FFFD>FFFD mappings,
    # depending on the version of the input file.
    (re.compile(r"FFFD ; disallowed"), "# FFFD (avoid circular mapping)"),
    (re.compile(r"\.\.FFFD"), "..FFFC"),
    # The replacement must be a raw string: in a normal string literal "\1"
    # is the control character U+0001, not a reference to group 1, and the
    # start of the range would be lost.
    (re.compile(r"(FFF[^E])\.\.FFFF"), r"\1..FFFC"),
    # Since we switch between checking and not checking for STD3 character
    # restrictions at runtime, checking the non-LDH ASCII characters in code,
    # we treat these values here like their regular siblings.
    (re.compile(r"^([^;]+) ; disallowed_STD3_valid"), r"# \1disallowed_STD3_valid"),
    (re.compile(r"; disallowed_STD3_mapped +; "), ">"),
    # For UTS #46, we do not care about "not valid in IDNA2008".
    (re.compile(r"; *; NV8 +"), ""),
    # Normal transformations.
    (re.compile(r"; disallowed"), ">FFFD"),
    (re.compile(r"; ignored"), ">"),
    (re.compile(r"^([^;]+) ; valid"), r"# \1valid"),
    (re.compile(r"; mapped +; "), ">"),
    (re.compile(r"^([^;]+) ; deviation +; "), r"# \1deviation >"),
]
|
|
|
|
# Banner written right after the "# For documentation, see" header line of the
# input file; it explains how the data lines below were rewritten.
REFORMAT_NOTE = r"""
# ================================================
# This file has been reformatted into syntax for the
# gennorm2 Normalizer2 data generator tool.
#
# "valid", "disallowed_STD3_valid" and "deviation" lines are commented out.
# "mapped" and "disallowed_STD3_mapped" are changed to use the ">" mapping syntax.
# "disallowed" lines map to U+FFFD.
# "ignored" lines map to an empty string.
#
# Characters disallowed under STD3 rules are treated as valid or mapped;
# they are handled in code.
# Deviation characters are also handled in code.
#
# Use this file as the second gennorm2 input file after nfc.txt.
# ================================================
"""


def align_comment(line):
    """Return line with its inline comment aligned at column 40.

    The search for "#" starts at index 1 so that a "#" in column 0 (a data
    line that has been commented out) is not mistaken for the inline comment.

    - Comment before column 40: pad with spaces so it starts at column 40.
    - Comment after column 40 with fewer than 40 characters of content:
      drop the surplus spaces so it starts at column 40.
    - Comment after column 40 with 40 or more characters of content:
      keep exactly one space between content and comment.
    - No inline comment at all: the line is returned unchanged.
    """
    comment_pos = line.find("#", 1)
    if comment_pos < 0:
        # find() returned -1; slicing with it would splice padding in front
        # of the last character of the line.
        return line
    if comment_pos < 40:
        return line[:comment_pos] + (40 - comment_pos) * " " + line[comment_pos:]
    if comment_pos == 40:
        return line
    # Index just past the last non-space character before the comment.
    content_end = len(line[:comment_pos].rstrip(" "))
    if content_end < 40:
        # Fewer than 40 characters before the comment:
        # Align comments at column 40.
        return line[:40] + line[comment_pos:]
    # 40 or more characters before the comment:
    # Keep one space between contents and comment.
    return line[:content_end] + " " + line[comment_pos:]


def convert(in_file, out_file):
    """Rewrite the lines of in_file into gennorm2 syntax on out_file.

    in_file is iterated line by line and out_file only needs a write()
    method.  Comment and blank lines are copied through; data lines are run
    through every entry of the module-level replacements table and then have
    their inline comment aligned.
    """
    out_file.write("# Original file:\n")
    for orig_line in in_file:
        if orig_line.startswith("# For documentation, see"):
            out_file.write(orig_line)
            out_file.write(REFORMAT_NOTE)
            continue
        if orig_line[0] in "#\r\n":
            # Comment or empty line: copy verbatim.
            out_file.write(orig_line)
            continue
        line = orig_line
        for pattern, replacement in replacements:
            line = pattern.sub(replacement, line)
        line = align_comment(line)
        # Write the modified line.
        out_file.write(line)
        if "..FFFF" in orig_line and "..FFFC" in line:
            # The range was cut short at FFFC to step around FFFD;
            # map the remaining FFFE..FFFF explicitly.
            out_file.write("FFFE..FFFF >FFFD\n")


def main():
    """Convert IdnaMappingTable.txt in the current directory into uts46.txt."""
    # try/finally rather than "with": the shebang targets Python 2.4.
    in_file = open("IdnaMappingTable.txt", "r")
    try:
        out_file = open("uts46.txt", "w")
        try:
            convert(in_file, out_file)
        finally:
            out_file.close()
    finally:
        in_file.close()


if __name__ == "__main__":
    main()
|