Initial checkin; creates ICU rules from ICU4J rules
X-SVN-Rev: 1442
This commit is contained in:
parent
721dca33fc
commit
4576173828
238
icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
Executable file
238
icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
Executable file
@ -0,0 +1,238 @@
|
|||||||
|
@rem = '--*-Perl-*--
|
||||||
|
@echo off
|
||||||
|
if "%OS%" == "Windows_NT" goto WinNT
|
||||||
|
perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||||
|
goto endofperl
|
||||||
|
:WinNT
|
||||||
|
perl -x -S "%0" %*
|
||||||
|
if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
|
||||||
|
if %errorlevel% == 9009 echo You do not have Perl in your PATH.
|
||||||
|
goto endofperl
|
||||||
|
@rem ';
|
||||||
|
#!perl
|
||||||
|
#line 14
|
||||||
|
|
||||||
|
# This perl script creates ICU transliterator data files, that live
|
||||||
|
# in icu/data, from ICU4J Java transliterator data files, in
|
||||||
|
# icu4j/src/com/ibm/text/resources.
|
||||||
|
#
|
||||||
|
# The transformation that is done is very minimal. The script assumes
|
||||||
|
# that the Java input files use only // comments (no /**/ comments)
|
||||||
|
# and that they follow a rigid format. Leading or trailing '+' (but not both)
|
||||||
|
# concatenation operators are stripped from each line.
|
||||||
|
#
|
||||||
|
# The output files are named according to ICU conventions (see NAME_MAP
|
||||||
|
# below) and created in the current directory. They should be manually
|
||||||
|
# checked and then copied into the icu/data directory. An ICU build must
|
||||||
|
# then be initiated, and the standard suite of ICU transliterator tests
|
||||||
|
# should be run after that.
|
||||||
|
#
|
||||||
|
# Alan Liu 5/19/00
|
||||||
|
|
||||||
|
if (scalar @ARGV != 1) {
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
$DIR = shift;
|
||||||
|
if (! -d $DIR) {
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
sub usage {
|
||||||
|
my $me = $0;
|
||||||
|
$me =~ s|.+[/\\]||;
|
||||||
|
print "Usage: $me <dir>\n";
|
||||||
|
print " where <dir> contains the TransliteratorRule_*.java\n";
|
||||||
|
print " files.\n";
|
||||||
|
print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
|
||||||
|
die;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Mapping from Java IDs to ICU file names
|
||||||
|
# Copied from icu/data/translit_index.txt, with long lines folded into 1 line
|
||||||
|
$NAME_MAP = <<'END';
|
||||||
|
{ "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
|
||||||
|
{ "Latin-Arabic", "Arabic-Latin", "larabic" }
|
||||||
|
{ "Latin-Cyrillic", "Cyrillic-Latin", "lcyril" }
|
||||||
|
{ "Latin-Devanagari", "Devanagari-Latin", "ldevan" }
|
||||||
|
{ "Latin-Greek", "Greek-Latin", "lgreek" }
|
||||||
|
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
|
||||||
|
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
|
||||||
|
{ "Latin-Kana", "Kana-Latin", "lkana" }
|
||||||
|
|
||||||
|
// Other miscellaneous rules
|
||||||
|
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
|
||||||
|
{ "KeyboardEscape-Latin1", "", "kbdescl1" }
|
||||||
|
{ "UnicodeName-UnicodeChar", "", "ucname" }
|
||||||
|
END
|
||||||
|
|
||||||
|
foreach (split(/\n/, $NAME_MAP)) {
|
||||||
|
s|//.+||;
|
||||||
|
if (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
|
||||||
|
$NAME_MAP{$1} = $3;
|
||||||
|
} elsif (/\S/) {
|
||||||
|
print STDERR "Ignoring $_\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Header blocks of text written at start of ICU output files
|
||||||
|
$HEADER1 = <<END;
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Copyright (c) 1999-2000, International Business Machines
|
||||||
|
// Corporation and others. All Rights Reserved.
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// THIS IS A MACHINE-GENERATED FILE
|
||||||
|
END
|
||||||
|
$HEADER2 = <<END;
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
END
|
||||||
|
|
||||||
|
$TOOL = $0;
|
||||||
|
|
||||||
|
# Iterate over all Java RBT resource files. Process those with a mapping to
|
||||||
|
# an ICU name.
|
||||||
|
foreach (<$DIR/TransliterationRule_*.java>) {
|
||||||
|
next if (/~$/);
|
||||||
|
my $id;
|
||||||
|
if (m|TransliterationRule_(.+)\.java$|) {
|
||||||
|
$id = $1;
|
||||||
|
} else { die; }
|
||||||
|
$id =~ s/_/-/g;
|
||||||
|
if (!exists $NAME_MAP{$id}) {
|
||||||
|
print STDERR "$id: skipping, no ICU file name\n";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
file($id, $_, $NAME_MAP{$id});
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process one file
|
||||||
|
# Param: ID, e.g. Fullwidth-Halfwidth
|
||||||
|
# Param: Java input file name, e.g.
|
||||||
|
# f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
|
||||||
|
# Param: ICU output file name, e.g. fullhalf
|
||||||
|
sub file {
|
||||||
|
my $id = shift;
|
||||||
|
my $IN = shift;
|
||||||
|
my $out = shift;
|
||||||
|
|
||||||
|
my $OUT = "$out.txt";
|
||||||
|
|
||||||
|
# Show input size. Show output size later -- useful for quick sanity check.
|
||||||
|
print "$id (", -s $IN, ") -> $OUT (";
|
||||||
|
|
||||||
|
# Write output file header
|
||||||
|
open(OUT, ">$OUT") or die;
|
||||||
|
print OUT $HEADER1;
|
||||||
|
print OUT "// Tool: $TOOL\n// Source: $IN\n";
|
||||||
|
print OUT "// Date: ", scalar localtime, "\n";
|
||||||
|
print OUT $HEADER2;
|
||||||
|
print OUT "\n";
|
||||||
|
print OUT "// $id\n";
|
||||||
|
print OUT "\n";
|
||||||
|
print OUT "$out {\n";
|
||||||
|
print OUT " Rule {\n";
|
||||||
|
|
||||||
|
# Open input file and skip over everything before "Rule" RB key
|
||||||
|
open(IN, $IN) or die;
|
||||||
|
while (<IN>) {
|
||||||
|
last if (/\"Rule\"/);
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process each line by deleting leading or trailing '+' (but not both)
|
||||||
|
# and by normalizing leading space.
|
||||||
|
|
||||||
|
# Recognize these kinds of lines:
|
||||||
|
# "9>\u0669;"+ // optional comment
|
||||||
|
# +"9>\u0669;" // optional comment
|
||||||
|
# // comment
|
||||||
|
# + "Zh>$ZH;" + "Zh<$ZH}$lower;"
|
||||||
|
# "'account of%'>\u2100", -- this occurs in a String[] resource
|
||||||
|
while (<IN>) {
|
||||||
|
last if (/^\s*\}/); # Any line starting with '}' ends the rule set
|
||||||
|
|
||||||
|
# NOTE: We have to handle a rule like this:
|
||||||
|
# "a", "b", "c",
|
||||||
|
# that fails to terminate statements with separators.
|
||||||
|
|
||||||
|
# Trim leading and trailing space
|
||||||
|
s|^\s+||;
|
||||||
|
s|\s+$||;
|
||||||
|
|
||||||
|
my $raw = $_;
|
||||||
|
|
||||||
|
# Transform escaped characters
|
||||||
|
hideEscapes();
|
||||||
|
|
||||||
|
# Process double-quoted strings
|
||||||
|
my $q;
|
||||||
|
for (;;) {
|
||||||
|
if (s|^\s*,\s*||) { # Trim leading ','
|
||||||
|
# Add separator between comma-separated rules
|
||||||
|
# if it isn't there already:
|
||||||
|
# "a>b", "c>d" -> "a>b;" "c>d"
|
||||||
|
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
|
||||||
|
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
|
||||||
|
} else {
|
||||||
|
s|^\s*\+\s*||; # Trim leading '+'
|
||||||
|
}
|
||||||
|
if (s|^(\".*?\")||) {
|
||||||
|
$q .= ' ' if ($q);
|
||||||
|
$q .= $1;
|
||||||
|
} else {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s|^\s*,\s*||) { # Trim final ','
|
||||||
|
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
|
||||||
|
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
|
||||||
|
} else {
|
||||||
|
s|^\s*\+\s*||; # Trim final '+'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Remove and save trailing // comment
|
||||||
|
my $cmt;
|
||||||
|
if (s|^\s*(//.*)$||) {
|
||||||
|
$cmt = ' ' if ($q);
|
||||||
|
$cmt .= $1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/\S/) {
|
||||||
|
chomp($raw);
|
||||||
|
print STDERR "Error: left over \"$_\" in \"$raw\"\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
$_ = " " . $q . $cmt . "\n";
|
||||||
|
|
||||||
|
# Restore escaped characters
|
||||||
|
restoreEscapes();
|
||||||
|
|
||||||
|
print OUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Finish up
|
||||||
|
close(IN);
|
||||||
|
print OUT " }\n";
|
||||||
|
print OUT "}\n";
|
||||||
|
close(OUT);
|
||||||
|
|
||||||
|
# Write output file size for sanity check
|
||||||
|
print -s $OUT, ")\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
sub hideEscapes {
|
||||||
|
# Transform escaped characters
|
||||||
|
s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
|
||||||
|
s|\\\"|<<dq>>|; # Transform backslash double quote
|
||||||
|
s|\\(.)|<<q$1>>|; # Transform backslash escapes
|
||||||
|
}
|
||||||
|
|
||||||
|
sub restoreEscapes {
|
||||||
|
# Restore escaped characters
|
||||||
|
s|<<dq>>|\\\"|g;
|
||||||
|
s|<<q(.)>>|\\$1|g;
|
||||||
|
s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
|
||||||
|
s|<<u(....)>>|\\u$1|g;
|
||||||
|
}
|
||||||
|
|
||||||
|
__END__
|
||||||
|
:endofperl
|
238
icu4j/src/com/ibm/tools/translit/dumpICUrules.bat
Executable file
238
icu4j/src/com/ibm/tools/translit/dumpICUrules.bat
Executable file
@ -0,0 +1,238 @@
|
|||||||
|
@rem = '--*-Perl-*--
|
||||||
|
@echo off
|
||||||
|
if "%OS%" == "Windows_NT" goto WinNT
|
||||||
|
perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
|
||||||
|
goto endofperl
|
||||||
|
:WinNT
|
||||||
|
perl -x -S "%0" %*
|
||||||
|
if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
|
||||||
|
if %errorlevel% == 9009 echo You do not have Perl in your PATH.
|
||||||
|
goto endofperl
|
||||||
|
@rem ';
|
||||||
|
#!perl
|
||||||
|
#line 14
|
||||||
|
|
||||||
|
# This perl script creates ICU transliterator data files, that live
|
||||||
|
# in icu/data, from ICU4J Java transliterator data files, in
|
||||||
|
# icu4j/src/com/ibm/text/resources.
|
||||||
|
#
|
||||||
|
# The transformation that is done is very minimal. The script assumes
|
||||||
|
# that the Java input files use only // comments (no /**/ comments)
|
||||||
|
# and that they follow a rigid format. Leading or trailing '+' (but not both)
|
||||||
|
# concatenation operators are stripped from each line.
|
||||||
|
#
|
||||||
|
# The output files are named according to ICU conventions (see NAME_MAP
|
||||||
|
# below) and created in the current directory. They should be manually
|
||||||
|
# checked and then copied into the icu/data directory. An ICU build must
|
||||||
|
# then be initiated, and the standard suite of ICU transliterator tests
|
||||||
|
# should be run after that.
|
||||||
|
#
|
||||||
|
# Alan Liu 5/19/00
|
||||||
|
|
||||||
|
if (scalar @ARGV != 1) {
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
$DIR = shift;
|
||||||
|
if (! -d $DIR) {
|
||||||
|
usage();
|
||||||
|
}
|
||||||
|
|
||||||
|
sub usage {
|
||||||
|
my $me = $0;
|
||||||
|
$me =~ s|.+[/\\]||;
|
||||||
|
print "Usage: $me <dir>\n";
|
||||||
|
print " where <dir> contains the TransliteratorRule_*.java\n";
|
||||||
|
print " files.\n";
|
||||||
|
print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
|
||||||
|
die;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Mapping from Java IDs to ICU file names
|
||||||
|
# Copied from icu/data/translit_index.txt, with long lines folded into 1 line
|
||||||
|
$NAME_MAP = <<'END';
|
||||||
|
{ "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
|
||||||
|
{ "Latin-Arabic", "Arabic-Latin", "larabic" }
|
||||||
|
{ "Latin-Cyrillic", "Cyrillic-Latin", "lcyril" }
|
||||||
|
{ "Latin-Devanagari", "Devanagari-Latin", "ldevan" }
|
||||||
|
{ "Latin-Greek", "Greek-Latin", "lgreek" }
|
||||||
|
{ "Latin-Hebrew", "Hebrew-Latin", "lhebrew" }
|
||||||
|
{ "Latin-Jamo", "Jamo-Latin", "ljamo" }
|
||||||
|
{ "Latin-Kana", "Kana-Latin", "lkana" }
|
||||||
|
|
||||||
|
// Other miscellaneous rules
|
||||||
|
{ "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
|
||||||
|
{ "KeyboardEscape-Latin1", "", "kbdescl1" }
|
||||||
|
{ "UnicodeName-UnicodeChar", "", "ucname" }
|
||||||
|
END
|
||||||
|
|
||||||
|
foreach (split(/\n/, $NAME_MAP)) {
|
||||||
|
s|//.+||;
|
||||||
|
if (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
|
||||||
|
$NAME_MAP{$1} = $3;
|
||||||
|
} elsif (/\S/) {
|
||||||
|
print STDERR "Ignoring $_\n";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Header blocks of text written at start of ICU output files
|
||||||
|
$HEADER1 = <<END;
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// Copyright (c) 1999-2000, International Business Machines
|
||||||
|
// Corporation and others. All Rights Reserved.
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
// THIS IS A MACHINE-GENERATED FILE
|
||||||
|
END
|
||||||
|
$HEADER2 = <<END;
|
||||||
|
//--------------------------------------------------------------------
|
||||||
|
END
|
||||||
|
|
||||||
|
$TOOL = $0;
|
||||||
|
|
||||||
|
# Iterate over all Java RBT resource files. Process those with a mapping to
|
||||||
|
# an ICU name.
|
||||||
|
foreach (<$DIR/TransliterationRule_*.java>) {
|
||||||
|
next if (/~$/);
|
||||||
|
my $id;
|
||||||
|
if (m|TransliterationRule_(.+)\.java$|) {
|
||||||
|
$id = $1;
|
||||||
|
} else { die; }
|
||||||
|
$id =~ s/_/-/g;
|
||||||
|
if (!exists $NAME_MAP{$id}) {
|
||||||
|
print STDERR "$id: skipping, no ICU file name\n";
|
||||||
|
next;
|
||||||
|
}
|
||||||
|
file($id, $_, $NAME_MAP{$id});
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process one file
|
||||||
|
# Param: ID, e.g. Fullwidth-Halfwidth
|
||||||
|
# Param: Java input file name, e.g.
|
||||||
|
# f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
|
||||||
|
# Param: ICU output file name, e.g. fullhalf
|
||||||
|
sub file {
|
||||||
|
my $id = shift;
|
||||||
|
my $IN = shift;
|
||||||
|
my $out = shift;
|
||||||
|
|
||||||
|
my $OUT = "$out.txt";
|
||||||
|
|
||||||
|
# Show input size. Show output size later -- useful for quick sanity check.
|
||||||
|
print "$id (", -s $IN, ") -> $OUT (";
|
||||||
|
|
||||||
|
# Write output file header
|
||||||
|
open(OUT, ">$OUT") or die;
|
||||||
|
print OUT $HEADER1;
|
||||||
|
print OUT "// Tool: $TOOL\n// Source: $IN\n";
|
||||||
|
print OUT "// Date: ", scalar localtime, "\n";
|
||||||
|
print OUT $HEADER2;
|
||||||
|
print OUT "\n";
|
||||||
|
print OUT "// $id\n";
|
||||||
|
print OUT "\n";
|
||||||
|
print OUT "$out {\n";
|
||||||
|
print OUT " Rule {\n";
|
||||||
|
|
||||||
|
# Open input file and skip over everything before "Rule" RB key
|
||||||
|
open(IN, $IN) or die;
|
||||||
|
while (<IN>) {
|
||||||
|
last if (/\"Rule\"/);
|
||||||
|
}
|
||||||
|
|
||||||
|
# Process each line by deleting leading or trailing '+' (but not both)
|
||||||
|
# and by normalizing leading space.
|
||||||
|
|
||||||
|
# Recognize these kinds of lines:
|
||||||
|
# "9>\u0669;"+ // optional comment
|
||||||
|
# +"9>\u0669;" // optional comment
|
||||||
|
# // comment
|
||||||
|
# + "Zh>$ZH;" + "Zh<$ZH}$lower;"
|
||||||
|
# "'account of%'>\u2100", -- this occurs in a String[] resource
|
||||||
|
while (<IN>) {
|
||||||
|
last if (/^\s*\}/); # Any line starting with '}' ends the rule set
|
||||||
|
|
||||||
|
# NOTE: We have to handle a rule like this:
|
||||||
|
# "a", "b", "c",
|
||||||
|
# that fails to terminate statements with separators.
|
||||||
|
|
||||||
|
# Trim leading and trailing space
|
||||||
|
s|^\s+||;
|
||||||
|
s|\s+$||;
|
||||||
|
|
||||||
|
my $raw = $_;
|
||||||
|
|
||||||
|
# Transform escaped characters
|
||||||
|
hideEscapes();
|
||||||
|
|
||||||
|
# Process double-quoted strings
|
||||||
|
my $q;
|
||||||
|
for (;;) {
|
||||||
|
if (s|^\s*,\s*||) { # Trim leading ','
|
||||||
|
# Add separator between comma-separated rules
|
||||||
|
# if it isn't there already:
|
||||||
|
# "a>b", "c>d" -> "a>b;" "c>d"
|
||||||
|
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
|
||||||
|
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
|
||||||
|
} else {
|
||||||
|
s|^\s*\+\s*||; # Trim leading '+'
|
||||||
|
}
|
||||||
|
if (s|^(\".*?\")||) {
|
||||||
|
$q .= ' ' if ($q);
|
||||||
|
$q .= $1;
|
||||||
|
} else {
|
||||||
|
last;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (s|^\s*,\s*||) { # Trim final ','
|
||||||
|
print STDERR "Error: Can't parse \"$raw\"" unless ($q);
|
||||||
|
$q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
|
||||||
|
} else {
|
||||||
|
s|^\s*\+\s*||; # Trim final '+'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Remove and save trailing // comment
|
||||||
|
my $cmt;
|
||||||
|
if (s|^\s*(//.*)$||) {
|
||||||
|
$cmt = ' ' if ($q);
|
||||||
|
$cmt .= $1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/\S/) {
|
||||||
|
chomp($raw);
|
||||||
|
print STDERR "Error: left over \"$_\" in \"$raw\"\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
$_ = " " . $q . $cmt . "\n";
|
||||||
|
|
||||||
|
# Restore escaped characters
|
||||||
|
restoreEscapes();
|
||||||
|
|
||||||
|
print OUT;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Finish up
|
||||||
|
close(IN);
|
||||||
|
print OUT " }\n";
|
||||||
|
print OUT "}\n";
|
||||||
|
close(OUT);
|
||||||
|
|
||||||
|
# Write output file size for sanity check
|
||||||
|
print -s $OUT, ")\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
sub hideEscapes {
|
||||||
|
# Transform escaped characters
|
||||||
|
s|\\u([a-zA-Z0-9]{4})|<<u$1>>|g; # Transform Unicode escapes
|
||||||
|
s|\\\"|<<dq>>|; # Transform backslash double quote
|
||||||
|
s|\\(.)|<<q$1>>|; # Transform backslash escapes
|
||||||
|
}
|
||||||
|
|
||||||
|
sub restoreEscapes {
|
||||||
|
# Restore escaped characters
|
||||||
|
s|<<dq>>|\\\"|g;
|
||||||
|
s|<<q(.)>>|\\$1|g;
|
||||||
|
s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
|
||||||
|
s|<<u(....)>>|\\u$1|g;
|
||||||
|
}
|
||||||
|
|
||||||
|
__END__
|
||||||
|
:endofperl
|
Loading…
Reference in New Issue
Block a user