Initial checkin; creates ICU rules from ICU4J rules

X-SVN-Rev: 1442
This commit is contained in:
Alan Liu 2000-05-23 16:50:38 +00:00
parent 721dca33fc
commit 4576173828
2 changed files with 476 additions and 0 deletions

View File

@ -0,0 +1,238 @@
@rem = '--*-Perl-*--
@echo off
if "%OS%" == "Windows_NT" goto WinNT
perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
goto endofperl
perl -x -S "%0" %*
if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
if %errorlevel% == 9009 echo You do not have Perl in your PATH.
goto endofperl
@rem ';
#line 14
# This perl script creates ICU transliterator data files, which live
# in icu/data, from ICU4J Java transliterator data files, which live in
# icu4j/src/com/ibm/text/resources.
# The transformation that is done is very minimal. The script assumes
# that the Java input files use only // comments (no /**/ comments)
# and that they follow a rigid format. Leading or trailing '+' (but not both)
# concatenation operators are stripped from each line.
# The output files are named according to ICU conventions (see NAME_MAP
# below) and created in the current directory. They should be manually
# checked and then copied into the icu/data directory. An ICU build must
# then be initiated, and the standard suite of ICU transliterator tests
# should be run after that.
# Alan Liu 5/19/00
# Command-line handling: exactly one argument is expected. It is stored in
# $DIR and must name an existing directory; $DIR is later globbed for
# TransliterationRule_*.java files.
# NOTE(review): the bodies and closing braces of both 'if' statements are not
# present in this view; presumably each reports usage and stops -- confirm
# against the complete script.
if (scalar @ARGV != 1) {
$DIR = shift;
if (! -d $DIR) {
# Print a usage summary for this script on standard output.
# The script name shown is $0 with any leading directory path removed.
# The message names the TransliterationRule_*.java files, which is the prefix
# the main loop globs for and matches in <dir>.
# NOTE(review): the end of this sub (at least its closing brace) is not
# present in this view, so nothing is added after the last print.
sub usage {
my $me = $0;
# Strip everything up to and including the last '/' or '\'
$me =~ s|.+[/\\]||;
print "Usage: $me <dir>\n";
print " where <dir> contains the TransliterationRule_*.java\n";
print " files.\n";
print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
# Mapping from Java IDs to ICU file names
# Copied from icu/data/translit_index.txt, with long lines folded into 1 line
$NAME_MAP = <<'END';
{ "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
{ "Latin-Arabic", "Arabic-Latin", "larabic" }
{ "Latin-Cyrillic", "Cyrillic-Latin", "lcyril" }
{ "Latin-Devanagari", "Devanagari-Latin", "ldevan" }
{ "Latin-Greek", "Greek-Latin", "lgreek" }
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
{ "Latin-Kana", "Kana-Latin", "lkana" }
// Other miscellaneous rules
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
{ "KeyboardEscape-Latin1", "", "kbdescl1" }
{ "UnicodeName-UnicodeChar", "", "ucname" }
# Build the %NAME_MAP hash from the $NAME_MAP text, one line at a time.
# A line holding three double-quoted fields maps the first field (the Java ID)
# to the third (the ICU file name); the second field is captured but not used.
# Any other line containing non-whitespace is reported on STDERR and ignored.
# NOTE(review): the closing braces of this loop are not present in this view.
foreach (split(/\n/, $NAME_MAP)) {
if (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
$NAME_MAP{$1} = $3;
} elsif (/\S/) {
print STDERR "Ignoring $_\n";
# Header blocks of text written at start of ICU output files
// Copyright (c) 1999-2000, International Business Machines
// Corporation and others. All Rights Reserved.
$TOOL = $0;
# Iterate over all Java RBT resource files. Process those with a mapping to
# an ICU name.
foreach (<$DIR/TransliterationRule_*.java>) {
# Skip names ending in '~' (presumably editor backup files)
next if (/~$/);
my $id;
# The part of the file name between "TransliterationRule_" and ".java"
# becomes the ID; a name that does not fit the pattern is fatal.
if (m|TransliterationRule_(.+)\.java$|) {
$id = $1;
} else { die; }
# NAME_MAP keys are written with '-', so replace every '_' in the ID
$id =~ s/_/-/g;
# IDs without a NAME_MAP entry have no ICU output name and are reported.
# NOTE(review): the rest of this 'if' body and the closing braces of the
# 'if' and the loop are not present in this view; presumably unmapped IDs
# are skipped before file() is reached -- confirm.
if (!exists $NAME_MAP{$id}) {
print STDERR "$id: skipping, no ICU file name\n";
# Convert this file: ID, input path, ICU output base name
file($id, $_, $NAME_MAP{$id});
# Process one file
# Param: ID, e.g. Fullwidth-Halfwidth
# Param: Java input file name, e.g.
# f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
# Param: ICU output file name, e.g. fullhalf
# The output is written to "<ICU output file name>.txt"; no directory is
# prepended, so the path is relative to the current directory.
# Progress ("<id> (<input size>) -> <output file> (<output size>)") is printed
# on standard output.
# NOTE(review): this view of the sub is missing lines (closing braces, loop
# exits, and the statements that some comments below describe), so the
# comments added here cover only the statements that are visible.
sub file {
my $id = shift;
my $IN = shift;
my $out = shift;
my $OUT = "$out.txt";
# Show input size. Show output size later -- useful for quick sanity check.
print "$id (", -s $IN, ") -> $OUT (";
# Write output file header
# $HEADER1, $HEADER2 and $TOOL are file-level variables; the assignments of
# $HEADER1 and $HEADER2 are not present in this view.
# NOTE(review): two-argument open on a bareword handle, and die without a
# message; a failure here gives no file name or reason.
open(OUT, ">$OUT") or die;
print OUT $HEADER1;
print OUT "// Tool: $TOOL\n// Source: $IN\n";
print OUT "// Date: ", scalar localtime, "\n";
print OUT $HEADER2;
print OUT "\n";
print OUT "// $id\n";
print OUT "\n";
# Open the outer block, named after the ICU file, and the "Rule" block in it
print OUT "$out {\n";
print OUT " Rule {\n";
# Open input file and skip over everything before "Rule" RB key
open(IN, $IN) or die;
while (<IN>) {
last if (/\"Rule\"/);
# Process each line by deleting leading or trailing '+' (but not both)
# and by normalizing leading space.
# Recognize these kinds of lines:
# "9>\u0669;"+ // optional comment
# +"9>\u0669;" // optional comment
# // comment
# + "Zh>$ZH;" + "Zh<$ZH}$lower;"
# "'account of%'>\u2100", -- this occurs in a String[] resource
while (<IN>) {
last if (/^\s*\}/); # Any line starting with '}' ends the rule set
# NOTE: We have to handle a rule like this:
# "a", "b", "c",
# that fails to terminate statements with separators.
# Trim leading and trailing space
# $raw keeps a copy of the line as it stands here; it is quoted in the
# error messages below.
my $raw = $_;
# Transform escaped characters
# Process double-quoted strings
# $q accumulates the quoted strings removed from the front of the line
my $q;
for (;;) {
if (s|^\s*,\s*||) { # Trim leading ','
# Add separator between comma-separated rules
# if it isn't there already:
# "a>b", "c>d" -> "a>b;" "c>d"
# NOTE(review): this message has no trailing newline, unlike the
# "left over" message further down.
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
# Insert ';' before the closing quote of $q unless $q already ends in ;"
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
} else {
s|^\s*\+\s*||; # Trim leading '+'
# Remove the next double-quoted string from the front of the line
# (non-greedy: up to the next '"') and append it to $q, separated
# from any earlier string by a single space.
if (s|^(\".*?\")||) {
$q .= ' ' if ($q);
$q .= $1;
} else {
# Same separator handling as in the loop above, applied to what follows
# the last quoted string on the line.
if (s|^\s*,\s*||) { # Trim final ','
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
} else {
s|^\s*\+\s*||; # Trim final '+'
# Remove and save trailing // comment
# The saved comment gets a leading space only when strings precede it.
my $cmt;
if (s|^\s*(//.*)$||) {
$cmt = ' ' if ($q);
$cmt .= $1;
# Anything other than whitespace still left in $_ was not recognized
if (/\S/) {
print STDERR "Error: left over \"$_\" in \"$raw\"\n";
# Rebuild the output line from the collected strings and the comment
$_ = " " . $q . $cmt . "\n";
# Restore escaped characters
print OUT;
# Finish up
# Close the "Rule" block and the outer block opened in the header
print OUT " }\n";
print OUT "}\n";
# Write output file size for sanity check
print -s $OUT, ")\n";
# Replace backslash escapes in $_ with <<...>> placeholder tokens:
#   backslash-u plus four characters -> <<uXXXX>>
#   backslash plus double quote      -> <<dq>>
#   backslash plus any other char c  -> <<qc>>
# Works on $_ in place and reads no arguments.
# NOTE(review): only the first substitution carries /g, so at most one
# backslash-quote and one other backslash escape are replaced per call --
# confirm whether that is intended. The Unicode pattern also accepts any
# alphanumeric character, not just hex digits.
# NOTE(review): the closing brace of this sub is not present in this view.
sub hideEscapes {
# Transform escaped characters
s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
s|\\\"|<<dq>>|; # Transform backslash double quote
s|\\(.)|<<q$1>>|; # Transform backslash escapes
# Turn <<...>> placeholder tokens in $_ back into backslash escapes.
# Works on $_ in place and reads no arguments.
# NOTE(review): only the U+0000 case is present in this view; the handling of
# the other placeholders and the closing brace of the sub are not visible.
sub restoreEscapes {
# Restore escaped characters
# Every <<u0000>> becomes two backslashes followed by u0000
s|<<u0000>>|\\\\u0000|g; # Double escape U+0000

View File

@ -0,0 +1,238 @@
@rem = '--*-Perl-*--
@echo off
if "%OS%" == "Windows_NT" goto WinNT
perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
goto endofperl
perl -x -S "%0" %*
if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
if %errorlevel% == 9009 echo You do not have Perl in your PATH.
goto endofperl
@rem ';
#line 14
# This perl script creates ICU transliterator data files, which live
# in icu/data, from ICU4J Java transliterator data files, which live in
# icu4j/src/com/ibm/text/resources.
# The transformation that is done is very minimal. The script assumes
# that the Java input files use only // comments (no /**/ comments)
# and that they follow a rigid format. Leading or trailing '+' (but not both)
# concatenation operators are stripped from each line.
# The output files are named according to ICU conventions (see NAME_MAP
# below) and created in the current directory. They should be manually
# checked and then copied into the icu/data directory. An ICU build must
# then be initiated, and the standard suite of ICU transliterator tests
# should be run after that.
# Alan Liu 5/19/00
# Command-line handling: exactly one argument is expected. It is stored in
# $DIR and must name an existing directory; $DIR is later globbed for
# TransliterationRule_*.java files.
# NOTE(review): the bodies and closing braces of both 'if' statements are not
# present in this view; presumably each reports usage and stops -- confirm
# against the complete script.
if (scalar @ARGV != 1) {
$DIR = shift;
if (! -d $DIR) {
# Print a usage summary for this script on standard output.
# The script name shown is $0 with any leading directory path removed.
# The message names the TransliterationRule_*.java files, which is the prefix
# the main loop globs for and matches in <dir>.
# NOTE(review): the end of this sub (at least its closing brace) is not
# present in this view, so nothing is added after the last print.
sub usage {
my $me = $0;
# Strip everything up to and including the last '/' or '\'
$me =~ s|.+[/\\]||;
print "Usage: $me <dir>\n";
print " where <dir> contains the TransliterationRule_*.java\n";
print " files.\n";
print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
# Mapping from Java IDs to ICU file names
# Copied from icu/data/translit_index.txt, with long lines folded into 1 line
$NAME_MAP = <<'END';
{ "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
{ "Latin-Arabic", "Arabic-Latin", "larabic" }
{ "Latin-Cyrillic", "Cyrillic-Latin", "lcyril" }
{ "Latin-Devanagari", "Devanagari-Latin", "ldevan" }
{ "Latin-Greek", "Greek-Latin", "lgreek" }
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
{ "Latin-Kana", "Kana-Latin", "lkana" }
// Other miscellaneous rules
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
{ "KeyboardEscape-Latin1", "", "kbdescl1" }
{ "UnicodeName-UnicodeChar", "", "ucname" }
# Build the %NAME_MAP hash from the $NAME_MAP text, one line at a time.
# A line holding three double-quoted fields maps the first field (the Java ID)
# to the third (the ICU file name); the second field is captured but not used.
# Any other line containing non-whitespace is reported on STDERR and ignored.
# NOTE(review): the closing braces of this loop are not present in this view.
foreach (split(/\n/, $NAME_MAP)) {
if (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
$NAME_MAP{$1} = $3;
} elsif (/\S/) {
print STDERR "Ignoring $_\n";
# Header blocks of text written at start of ICU output files
// Copyright (c) 1999-2000, International Business Machines
// Corporation and others. All Rights Reserved.
$TOOL = $0;
# Iterate over all Java RBT resource files. Process those with a mapping to
# an ICU name.
foreach (<$DIR/TransliterationRule_*.java>) {
# Skip names ending in '~' (presumably editor backup files)
next if (/~$/);
my $id;
# The part of the file name between "TransliterationRule_" and ".java"
# becomes the ID; a name that does not fit the pattern is fatal.
if (m|TransliterationRule_(.+)\.java$|) {
$id = $1;
} else { die; }
# NAME_MAP keys are written with '-', so replace every '_' in the ID
$id =~ s/_/-/g;
# IDs without a NAME_MAP entry have no ICU output name and are reported.
# NOTE(review): the rest of this 'if' body and the closing braces of the
# 'if' and the loop are not present in this view; presumably unmapped IDs
# are skipped before file() is reached -- confirm.
if (!exists $NAME_MAP{$id}) {
print STDERR "$id: skipping, no ICU file name\n";
# Convert this file: ID, input path, ICU output base name
file($id, $_, $NAME_MAP{$id});
# Process one file
# Param: ID, e.g. Fullwidth-Halfwidth
# Param: Java input file name, e.g.
# f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
# Param: ICU output file name, e.g. fullhalf
# The output is written to "<ICU output file name>.txt"; no directory is
# prepended, so the path is relative to the current directory.
# Progress ("<id> (<input size>) -> <output file> (<output size>)") is printed
# on standard output.
# NOTE(review): this view of the sub is missing lines (closing braces, loop
# exits, and the statements that some comments below describe), so the
# comments added here cover only the statements that are visible.
sub file {
my $id = shift;
my $IN = shift;
my $out = shift;
my $OUT = "$out.txt";
# Show input size. Show output size later -- useful for quick sanity check.
print "$id (", -s $IN, ") -> $OUT (";
# Write output file header
# $HEADER1, $HEADER2 and $TOOL are file-level variables; the assignments of
# $HEADER1 and $HEADER2 are not present in this view.
# NOTE(review): two-argument open on a bareword handle, and die without a
# message; a failure here gives no file name or reason.
open(OUT, ">$OUT") or die;
print OUT $HEADER1;
print OUT "// Tool: $TOOL\n// Source: $IN\n";
print OUT "// Date: ", scalar localtime, "\n";
print OUT $HEADER2;
print OUT "\n";
print OUT "// $id\n";
print OUT "\n";
# Open the outer block, named after the ICU file, and the "Rule" block in it
print OUT "$out {\n";
print OUT " Rule {\n";
# Open input file and skip over everything before "Rule" RB key
open(IN, $IN) or die;
while (<IN>) {
last if (/\"Rule\"/);
# Process each line by deleting leading or trailing '+' (but not both)
# and by normalizing leading space.
# Recognize these kinds of lines:
# "9>\u0669;"+ // optional comment
# +"9>\u0669;" // optional comment
# // comment
# + "Zh>$ZH;" + "Zh<$ZH}$lower;"
# "'account of%'>\u2100", -- this occurs in a String[] resource
while (<IN>) {
last if (/^\s*\}/); # Any line starting with '}' ends the rule set
# NOTE: We have to handle a rule like this:
# "a", "b", "c",
# that fails to terminate statements with separators.
# Trim leading and trailing space
# $raw keeps a copy of the line as it stands here; it is quoted in the
# error messages below.
my $raw = $_;
# Transform escaped characters
# Process double-quoted strings
# $q accumulates the quoted strings removed from the front of the line
my $q;
for (;;) {
if (s|^\s*,\s*||) { # Trim leading ','
# Add separator between comma-separated rules
# if it isn't there already:
# "a>b", "c>d" -> "a>b;" "c>d"
# NOTE(review): this message has no trailing newline, unlike the
# "left over" message further down.
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
# Insert ';' before the closing quote of $q unless $q already ends in ;"
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
} else {
s|^\s*\+\s*||; # Trim leading '+'
# Remove the next double-quoted string from the front of the line
# (non-greedy: up to the next '"') and append it to $q, separated
# from any earlier string by a single space.
if (s|^(\".*?\")||) {
$q .= ' ' if ($q);
$q .= $1;
} else {
# Same separator handling as in the loop above, applied to what follows
# the last quoted string on the line.
if (s|^\s*,\s*||) { # Trim final ','
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
} else {
s|^\s*\+\s*||; # Trim final '+'
# Remove and save trailing // comment
# The saved comment gets a leading space only when strings precede it.
my $cmt;
if (s|^\s*(//.*)$||) {
$cmt = ' ' if ($q);
$cmt .= $1;
# Anything other than whitespace still left in $_ was not recognized
if (/\S/) {
print STDERR "Error: left over \"$_\" in \"$raw\"\n";
# Rebuild the output line from the collected strings and the comment
$_ = " " . $q . $cmt . "\n";
# Restore escaped characters
print OUT;
# Finish up
# Close the "Rule" block and the outer block opened in the header
print OUT " }\n";
print OUT "}\n";
# Write output file size for sanity check
print -s $OUT, ")\n";
# Replace backslash escapes in $_ with <<...>> placeholder tokens:
#   backslash-u plus four characters -> <<uXXXX>>
#   backslash plus double quote      -> <<dq>>
#   backslash plus any other char c  -> <<qc>>
# Works on $_ in place and reads no arguments.
# NOTE(review): only the first substitution carries /g, so at most one
# backslash-quote and one other backslash escape are replaced per call --
# confirm whether that is intended. The Unicode pattern also accepts any
# alphanumeric character, not just hex digits.
# NOTE(review): the closing brace of this sub is not present in this view.
sub hideEscapes {
# Transform escaped characters
s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
s|\\\"|<<dq>>|; # Transform backslash double quote
s|\\(.)|<<q$1>>|; # Transform backslash escapes
# Turn <<...>> placeholder tokens in $_ back into backslash escapes.
# Works on $_ in place and reads no arguments.
# NOTE(review): only the U+0000 case is present in this view; the handling of
# the other placeholders and the closing brace of the sub are not visible.
sub restoreEscapes {
# Restore escaped characters
# Every <<u0000>> becomes two backslashes followed by u0000
s|<<u0000>>|\\\\u0000|g; # Double escape U+0000