Initial checkin; creates ICU rules from ICU4J rules

X-SVN-Rev: 1442
2000-05-23 16:50:38 +00:00 · 2000-05-23 16:50:38 +00:00 · 4576173828
commit 4576173828
parent 721dca33fc
2 changed files with 476 additions and 0 deletions
--- a/icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
+++ b/icu4j/src/com/ibm/icu/dev/tool/translit/dumpICUrules.bat
@ -0,0 +1,238 @@
+@rem = '--*-Perl-*--
+@echo off
+if "%OS%" == "Windows_NT" goto WinNT
+perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
+goto endofperl
+:WinNT
+perl -x -S "%0" %*
+if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
+if %errorlevel% == 9009 echo You do not have Perl in your PATH.
+goto endofperl
+@rem ';
+#!perl
+#line 14
+
+# This perl script creates ICU transliterator data files, that live
+# in icu/data, from ICU4J Java transliterator data files, in
+# icu4j/src/com/ibm/text/resources.
+#
+# The transformation that is done is very minimal.  The script assumes
+# that the Java input files use only // comments (no /**/ comments)
+# and that they follow a rigid format.  Leading or trailing '+' (but not both)
+# concatenation operators are stripped from each line.
+#
+# The output files are named according to ICU conventions (see NAME_MAP
+# below) and created in the current directory.  They should be manually
+# checked and then copied into the icu/data directory.  An ICU build must
+# then be initiated, and the standard suite of ICU transliterator tests
+# should be run after that.
+#
+# Alan Liu 5/19/00
+
+# Exactly one argument is required, and it must name an existing
+# directory; anything else prints the usage text and aborts.
+usage() if (scalar @ARGV != 1);
+$DIR = shift;
+usage() unless (-d $DIR);
+
+# Print a usage summary on standard output and abort the script.
+# The program name shown is $0 with any leading directory part removed.
+# Takes no parameters and never returns: it always ends in die.
+sub usage {
+    my $me = $0;
+    $me =~ s|.+[/\\]||;
+    print "Usage: $me <dir>\n";
+    # The name pattern here must match the files the script actually
+    # looks for (TransliterationRule_*.java).
+    print " where <dir> contains the TransliterationRule_*.java\n";
+    print " files.\n";
+    print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
+    die;
+}
+
+# Table relating Java transliterator IDs to ICU data file base names.
+# Each row reads { "forward ID", "reverse ID", "ICU file name" }; the text
+# was copied from icu/data/translit_index.txt, with long lines folded
+# into 1 line.
+$NAME_MAP = <<'END';
+        { "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
+        { "Latin-Arabic",        "Arabic-Latin",        "larabic"  }
+        { "Latin-Cyrillic",      "Cyrillic-Latin",      "lcyril"   }
+        { "Latin-Devanagari",    "Devanagari-Latin",    "ldevan"   }
+        { "Latin-Greek",         "Greek-Latin",         "lgreek"   }
+        { "Latin-Hebrew",        "Hebrew-Latin",        "lhebrew"  }
+        { "Latin-Jamo",          "Jamo-Latin",          "ljamo"    }
+        { "Latin-Kana",          "Kana-Latin",          "lkana"    }
+
+        // Other miscellaneous rules
+        { "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
+        { "KeyboardEscape-Latin1", "", "kbdescl1" }
+        { "UnicodeName-UnicodeChar", "", "ucname" }
+END
+
+# Fill %NAME_MAP (forward ID -> ICU file name) from the table text.
+for (split(/\n/, $NAME_MAP)) {
+    # Drop // comments before looking for a table row
+    s|//.+||;
+    unless (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
+        # Neither a table row nor blank: report it and move on
+        print STDERR "Ignoring $_\n" if (/\S/);
+        next;
+    }
+    $NAME_MAP{$1} = $3;
+}
+
+# Header blocks of text written at start of ICU output files
+# file() prints HEADER1 (banner and copyright notice), then its own
+# Tool/Source/Date comment lines, then HEADER2 (the closing rule line).
+$HEADER1 = <<END;
+//--------------------------------------------------------------------
+// Copyright (c) 1999-2000, International Business Machines
+// Corporation and others.  All Rights Reserved.
+//--------------------------------------------------------------------
+// THIS IS A MACHINE-GENERATED FILE
+END
+$HEADER2 = <<END;
+//--------------------------------------------------------------------
+END
+
+# Script path as invoked; file() records it in the "// Tool:" line of
+# every output file.
+$TOOL = $0;
+
+# Iterate over all Java RBT resource files.  Process those with a mapping to
+# an ICU name.
+#
+# The directory is read with opendir/readdir instead of glob(): glob()
+# splits its pattern on whitespace, so a <dir> whose path contains a space
+# would silently match no files.  Names are sorted so that files are
+# processed in a stable, alphabetical order.
+opendir(RULEDIR, $DIR) or die "Can't read directory $DIR: $!";
+my @entries = sort readdir(RULEDIR);
+closedir(RULEDIR);
+foreach my $name (@entries) {
+    # Only TransliterationRule_<ID>.java files are of interest
+    next unless ($name =~ m|^TransliterationRule_(.+)\.java$|);
+    my $id = $1;
+    # File names use '_' where transliterator IDs use '-'
+    $id =~ s/_/-/g;
+    if (!exists $NAME_MAP{$id}) {
+        print STDERR "$id: skipping, no ICU file name\n";
+        next;
+    }
+    file($id, "$DIR/$name", $NAME_MAP{$id});
+}
+
+# Process one file: convert a single Java RBT resource file into an ICU
+# rule file created in the current directory.
+# Param: ID, e.g. Fullwidth-Halfwidth
+# Param: Java input file name, e.g.
+#  f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
+# Param: ICU output file name, e.g. fullhalf
+# Dies, naming the file and the system error, if the input cannot be
+# opened or the output cannot be created or closed.  Lines that cannot be
+# parsed are reported on STDERR and processing continues.
+# A progress line "ID (input size) -> name.txt (output size)" is printed
+# on standard output.
+sub file {
+    my $id = shift;
+    my $IN = shift;
+    my $out = shift;
+
+    my $OUT = "$out.txt";
+
+    # Show input size. Show output size later -- useful for quick sanity check.
+    print "$id (", -s $IN, ") -> $OUT (";
+
+    # Write output file header.  Three-argument open keeps unusual
+    # characters in a file name from being read as part of the open mode.
+    open(OUT, '>', $OUT) or die "Can't create $OUT: $!";
+    print OUT $HEADER1;
+    print OUT "// Tool: $TOOL\n// Source: $IN\n";
+    print OUT "// Date: ", scalar localtime, "\n";
+    print OUT $HEADER2;
+    print OUT "\n";
+    print OUT "// $id\n";
+    print OUT "\n";
+    print OUT "$out {\n";
+    print OUT "  Rule {\n";
+
+    # Open input file and skip over everything before "Rule" RB key
+    open(IN, '<', $IN) or die "Can't open $IN: $!";
+    while (<IN>) {
+        last if (/\"Rule\"/);
+    }
+
+    # Process each line by deleting leading or trailing '+' (but not both)
+    # and by normalizing leading space.
+
+    # Recognize these kinds of lines:
+    #  "9>\u0669;"+ // optional comment
+    #  +"9>\u0669;" // optional comment
+    #  // comment
+    #  + "Zh>$ZH;" + "Zh<$ZH}$lower;"
+    #  "'account of%'>\u2100",  -- this occurs in a String[] resource
+    while (<IN>) {
+        last if (/^\s*\}/); # Any line starting with '}' ends the rule set
+
+        # NOTE: We have to handle a rule like this:
+        #   "a", "b", "c",
+        # that fails to terminate statements with separators.
+
+        # Trim leading and trailing space
+        s|^\s+||;
+        s|\s+$||;
+
+        # Keep the untouched text for error messages
+        my $raw = $_;
+
+        # Transform escaped characters, so that an escaped '"' cannot be
+        # mistaken for the end of a string below
+        hideEscapes();
+
+        # Process double-quoted strings: peel them off the front of $_
+        # one at a time, collecting them (space separated) in $q
+        my $q;
+        for (;;) {
+            if (s|^\s*,\s*||) { # Trim leading ','
+                # Add separator between comma-separated rules
+                # if it isn't there already:
+                # "a>b", "c>d" -> "a>b;" "c>d"
+                print STDERR "Error: Can't parse \"$raw\"\n" unless ($q);
+                $q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
+            } else {
+                s|^\s*\+\s*||; # Trim leading '+'
+            }
+            if (s|^(\".*?\")||) {
+                $q .= ' ' if ($q);
+                $q .= $1;
+            } else {
+                last;
+            }
+        }
+
+        if (s|^\s*,\s*||) { # Trim final ','
+            print STDERR "Error: Can't parse \"$raw\"\n" unless ($q);
+            $q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
+        } else {
+            s|^\s*\+\s*||; # Trim final '+'
+        }
+
+        # Remove and save trailing // comment
+        my $cmt;
+        if (s|^\s*(//.*)$||) {
+            $cmt = ' ' if ($q);
+            $cmt .= $1;
+        }
+
+        # Whatever is still left in $_ was not understood
+        if (/\S/) {
+            chomp($raw);
+            print STDERR "Error: left over \"$_\" in \"$raw\"\n";
+        }
+
+        $_ = "    " . $q . $cmt . "\n";
+
+        # Restore escaped characters
+        restoreEscapes();
+
+        print OUT;
+    }
+
+    # Finish up
+    close(IN);
+    print OUT "  }\n";
+    print OUT "}\n";
+    # Buffered write errors only surface when the handle is closed
+    close(OUT) or die "Error writing $OUT: $!";
+
+    # Write output file size for sanity check
+    print -s $OUT, ")\n";
+}
+
+# Replace every backslash escape in $_ with a placeholder token, so that
+# the string matching in file() never sees an escaped '"':
+#   \uXXXX -> <<uXXXX>>     \" -> <<dq>>     \c -> <<qc>>  (any other c)
+# Operates on $_ in place; restoreEscapes() reverses the mapping.
+#
+# The line is scanned once, left to right, and ALL escapes are replaced.
+# A single pass matters: in "\\" the second backslash is the escaped
+# character, so the '"' that follows it must stay a real string end.
+sub hideEscapes {
+    # Transform escaped characters
+    s{\\(?:u([a-zA-Z0-9]{4})|(\")|(.))}{
+        defined($1) ? "<<u$1>>" : defined($2) ? "<<dq>>" : "<<q$3>>"
+    }ge;
+}
+
+# Undo hideEscapes(): turn the placeholder tokens in $_ back into
+# backslash escapes.  Operates on $_ in place.
+sub restoreEscapes {
+    # Restore escaped characters
+    # <<dq>> becomes backslash + double quote
+    s|<<dq>>|\\\"|g;
+    # <<qc>> becomes backslash + c
+    s|<<q(.)>>|\\$1|g;
+    # U+0000 gets two backslashes.  This must run before the general
+    # rule below, which would otherwise match <<u0000>> as well.
+    s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
+    # <<uXXXX>> becomes \uXXXX
+    s|<<u(....)>>|\\u$1|g;
+}
+
+__END__
+:endofperl
--- a/icu4j/src/com/ibm/tools/translit/dumpICUrules.bat
+++ b/icu4j/src/com/ibm/tools/translit/dumpICUrules.bat
@ -0,0 +1,238 @@
+@rem = '--*-Perl-*--
+@echo off
+if "%OS%" == "Windows_NT" goto WinNT
+perl -x -S "%0" %1 %2 %3 %4 %5 %6 %7 %8 %9
+goto endofperl
+:WinNT
+perl -x -S "%0" %*
+if NOT "%COMSPEC%" == "%SystemRoot%\system32\cmd.exe" goto endofperl
+if %errorlevel% == 9009 echo You do not have Perl in your PATH.
+goto endofperl
+@rem ';
+#!perl
+#line 14
+
+# This perl script creates ICU transliterator data files, that live
+# in icu/data, from ICU4J Java transliterator data files, in
+# icu4j/src/com/ibm/text/resources.
+#
+# The transformation that is done is very minimal.  The script assumes
+# that the Java input files use only // comments (no /**/ comments)
+# and that they follow a rigid format.  Leading or trailing '+' (but not both)
+# concatenation operators are stripped from each line.
+#
+# The output files are named according to ICU conventions (see NAME_MAP
+# below) and created in the current directory.  They should be manually
+# checked and then copied into the icu/data directory.  An ICU build must
+# then be initiated, and the standard suite of ICU transliterator tests
+# should be run after that.
+#
+# Alan Liu 5/19/00
+
+# Exactly one argument is required, and it must name an existing
+# directory; anything else prints the usage text and aborts.
+usage() if (scalar @ARGV != 1);
+$DIR = shift;
+usage() unless (-d $DIR);
+
+# Print a usage summary on standard output and abort the script.
+# The program name shown is $0 with any leading directory part removed.
+# Takes no parameters and never returns: it always ends in die.
+sub usage {
+    my $me = $0;
+    $me =~ s|.+[/\\]||;
+    print "Usage: $me <dir>\n";
+    # The name pattern here must match the files the script actually
+    # looks for (TransliterationRule_*.java).
+    print " where <dir> contains the TransliterationRule_*.java\n";
+    print " files.\n";
+    print "e.g., $me F:/icu4j/src/com/ibm/text/resources\n";
+    die;
+}
+
+# Table relating Java transliterator IDs to ICU data file base names.
+# Each row reads { "forward ID", "reverse ID", "ICU file name" }; the text
+# was copied from icu/data/translit_index.txt, with long lines folded
+# into 1 line.
+$NAME_MAP = <<'END';
+        { "Fullwidth-Halfwidth", "Halfwidth-Fullwidth", "fullhalf" }
+        { "Latin-Arabic",        "Arabic-Latin",        "larabic"  }
+        { "Latin-Cyrillic",      "Cyrillic-Latin",      "lcyril"   }
+        { "Latin-Devanagari",    "Devanagari-Latin",    "ldevan"   }
+        { "Latin-Greek",         "Greek-Latin",         "lgreek"   }
+        { "Latin-Hebrew",        "Hebrew-Latin",        "lhebrew"  }
+        { "Latin-Jamo",          "Jamo-Latin",          "ljamo"    }
+        { "Latin-Kana",          "Kana-Latin",          "lkana"    }
+
+        // Other miscellaneous rules
+        { "StraightQuotes-CurlyQuotes", "CurlyQuotes-StraightQuotes", "quotes" }
+        { "KeyboardEscape-Latin1", "", "kbdescl1" }
+        { "UnicodeName-UnicodeChar", "", "ucname" }
+END
+
+# Fill %NAME_MAP (forward ID -> ICU file name) from the table text.
+for (split(/\n/, $NAME_MAP)) {
+    # Drop // comments before looking for a table row
+    s|//.+||;
+    unless (m|\"(.+)\".+\"(.*)\".+\"(.+)\".+|) {
+        # Neither a table row nor blank: report it and move on
+        print STDERR "Ignoring $_\n" if (/\S/);
+        next;
+    }
+    $NAME_MAP{$1} = $3;
+}
+
+# Header blocks of text written at start of ICU output files
+# file() prints HEADER1 (banner and copyright notice), then its own
+# Tool/Source/Date comment lines, then HEADER2 (the closing rule line).
+$HEADER1 = <<END;
+//--------------------------------------------------------------------
+// Copyright (c) 1999-2000, International Business Machines
+// Corporation and others.  All Rights Reserved.
+//--------------------------------------------------------------------
+// THIS IS A MACHINE-GENERATED FILE
+END
+$HEADER2 = <<END;
+//--------------------------------------------------------------------
+END
+
+# Script path as invoked; file() records it in the "// Tool:" line of
+# every output file.
+$TOOL = $0;
+
+# Iterate over all Java RBT resource files.  Process those with a mapping to
+# an ICU name.
+#
+# The directory is read with opendir/readdir instead of glob(): glob()
+# splits its pattern on whitespace, so a <dir> whose path contains a space
+# would silently match no files.  Names are sorted so that files are
+# processed in a stable, alphabetical order.
+opendir(RULEDIR, $DIR) or die "Can't read directory $DIR: $!";
+my @entries = sort readdir(RULEDIR);
+closedir(RULEDIR);
+foreach my $name (@entries) {
+    # Only TransliterationRule_<ID>.java files are of interest
+    next unless ($name =~ m|^TransliterationRule_(.+)\.java$|);
+    my $id = $1;
+    # File names use '_' where transliterator IDs use '-'
+    $id =~ s/_/-/g;
+    if (!exists $NAME_MAP{$id}) {
+        print STDERR "$id: skipping, no ICU file name\n";
+        next;
+    }
+    file($id, "$DIR/$name", $NAME_MAP{$id});
+}
+
+# Process one file: convert a single Java RBT resource file into an ICU
+# rule file created in the current directory.
+# Param: ID, e.g. Fullwidth-Halfwidth
+# Param: Java input file name, e.g.
+#  f:/icu4j/src/com/ibm/text/resources/TransliterationRule_Fullwidth_Halfwidth.java
+# Param: ICU output file name, e.g. fullhalf
+# Dies, naming the file and the system error, if the input cannot be
+# opened or the output cannot be created or closed.  Lines that cannot be
+# parsed are reported on STDERR and processing continues.
+# A progress line "ID (input size) -> name.txt (output size)" is printed
+# on standard output.
+sub file {
+    my $id = shift;
+    my $IN = shift;
+    my $out = shift;
+
+    my $OUT = "$out.txt";
+
+    # Show input size. Show output size later -- useful for quick sanity check.
+    print "$id (", -s $IN, ") -> $OUT (";
+
+    # Write output file header.  Three-argument open keeps unusual
+    # characters in a file name from being read as part of the open mode.
+    open(OUT, '>', $OUT) or die "Can't create $OUT: $!";
+    print OUT $HEADER1;
+    print OUT "// Tool: $TOOL\n// Source: $IN\n";
+    print OUT "// Date: ", scalar localtime, "\n";
+    print OUT $HEADER2;
+    print OUT "\n";
+    print OUT "// $id\n";
+    print OUT "\n";
+    print OUT "$out {\n";
+    print OUT "  Rule {\n";
+
+    # Open input file and skip over everything before "Rule" RB key
+    open(IN, '<', $IN) or die "Can't open $IN: $!";
+    while (<IN>) {
+        last if (/\"Rule\"/);
+    }
+
+    # Process each line by deleting leading or trailing '+' (but not both)
+    # and by normalizing leading space.
+
+    # Recognize these kinds of lines:
+    #  "9>\u0669;"+ // optional comment
+    #  +"9>\u0669;" // optional comment
+    #  // comment
+    #  + "Zh>$ZH;" + "Zh<$ZH}$lower;"
+    #  "'account of%'>\u2100",  -- this occurs in a String[] resource
+    while (<IN>) {
+        last if (/^\s*\}/); # Any line starting with '}' ends the rule set
+
+        # NOTE: We have to handle a rule like this:
+        #   "a", "b", "c",
+        # that fails to terminate statements with separators.
+
+        # Trim leading and trailing space
+        s|^\s+||;
+        s|\s+$||;
+
+        # Keep the untouched text for error messages
+        my $raw = $_;
+
+        # Transform escaped characters, so that an escaped '"' cannot be
+        # mistaken for the end of a string below
+        hideEscapes();
+
+        # Process double-quoted strings: peel them off the front of $_
+        # one at a time, collecting them (space separated) in $q
+        my $q;
+        for (;;) {
+            if (s|^\s*,\s*||) { # Trim leading ','
+                # Add separator between comma-separated rules
+                # if it isn't there already:
+                # "a>b", "c>d" -> "a>b;" "c>d"
+                print STDERR "Error: Can't parse \"$raw\"\n" unless ($q);
+                $q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
+            } else {
+                s|^\s*\+\s*||; # Trim leading '+'
+            }
+            if (s|^(\".*?\")||) {
+                $q .= ' ' if ($q);
+                $q .= $1;
+            } else {
+                last;
+            }
+        }
+
+        if (s|^\s*,\s*||) { # Trim final ','
+            print STDERR "Error: Can't parse \"$raw\"\n" unless ($q);
+            $q =~ s|\"$|;\"| unless ($q =~ m|;\"$|);
+        } else {
+            s|^\s*\+\s*||; # Trim final '+'
+        }
+
+        # Remove and save trailing // comment
+        my $cmt;
+        if (s|^\s*(//.*)$||) {
+            $cmt = ' ' if ($q);
+            $cmt .= $1;
+        }
+
+        # Whatever is still left in $_ was not understood
+        if (/\S/) {
+            chomp($raw);
+            print STDERR "Error: left over \"$_\" in \"$raw\"\n";
+        }
+
+        $_ = "    " . $q . $cmt . "\n";
+
+        # Restore escaped characters
+        restoreEscapes();
+
+        print OUT;
+    }
+
+    # Finish up
+    close(IN);
+    print OUT "  }\n";
+    print OUT "}\n";
+    # Buffered write errors only surface when the handle is closed
+    close(OUT) or die "Error writing $OUT: $!";
+
+    # Write output file size for sanity check
+    print -s $OUT, ")\n";
+}
+
+# Replace every backslash escape in $_ with a placeholder token, so that
+# the string matching in file() never sees an escaped '"':
+#   \uXXXX -> <<uXXXX>>     \" -> <<dq>>     \c -> <<qc>>  (any other c)
+# Operates on $_ in place; restoreEscapes() reverses the mapping.
+#
+# The line is scanned once, left to right, and ALL escapes are replaced.
+# A single pass matters: in "\\" the second backslash is the escaped
+# character, so the '"' that follows it must stay a real string end.
+sub hideEscapes {
+    # Transform escaped characters
+    s{\\(?:u([a-zA-Z0-9]{4})|(\")|(.))}{
+        defined($1) ? "<<u$1>>" : defined($2) ? "<<dq>>" : "<<q$3>>"
+    }ge;
+}
+
+# Undo hideEscapes(): turn the placeholder tokens in $_ back into
+# backslash escapes.  Operates on $_ in place.
+sub restoreEscapes {
+    # Restore escaped characters
+    # <<dq>> becomes backslash + double quote
+    s|<<dq>>|\\\"|g;
+    # <<qc>> becomes backslash + c
+    s|<<q(.)>>|\\$1|g;
+    # U+0000 gets two backslashes.  This must run before the general
+    # rule below, which would otherwise match <<u0000>> as well.
+    s|<<u0000>>|\\\\u0000|g; # Double escape U+0000
+    # <<uXXXX>> becomes \uXXXX
+    s|<<u(....)>>|\\u$1|g;
+}
+
+__END__
+:endofperl