ICU-1426 add el-Latin and Latin-el rules to locale resource

X-SVN-Rev: 6599
2001-11-03 02:04:37 +00:00 · 2001-11-03 02:04:37 +00:00 · 8886275ce8
commit 8886275ce8
parent 2820286119
6 changed files with 473 additions and 0 deletions
--- a/icu4c/data/el.txt
+++ b/icu4c/data/el.txt
@ -164,4 +164,220 @@ el {
            "1,000,000,000: << \u03b4\u03b9\u03c3\u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n"
            "1,000,000,000,000: =#,##0="
     }           
+
+    TransliterateLATIN {
+        "UNGEGN",
+
+        "# Rules are predicated on running NFD first, and NFC afterwards\n"
+        "::NFD (NFC) ; "
+
+        "# For modern Greek.\n"
+
+        "# Useful variables\n"
+
+        "$lower = [:Ll:] ; "
+        "$upper = [:Lu:] ; "
+        "$accent = [:M:] ; "
+
+        "$macron = \u0304 ;"
+        "$ddot = \u0308 ;"
+
+        "$lcgvowel = [\u03b1\u03b5\u03b7\u03b9\u03bf\u03c5\u03c9] ; "
+        "$ucgvowel = [\u0391\u0395\u0397\u0399\u039f\u03a5\u03a9] ; "
+        "$gvowel = [$lcgvowel $ucgvowel] ; "
+        "$lcgvowelC = [$lcgvowel $accent] ; "
+
+        "$evowel = [aeiouyAEIOUY];"
+        "$vowel = [ $evowel $gvowel] ; "
+
+        "$beforeLower = $accent * $lower ; "
+
+        "$gammaLike = [\u0393\u039a\u039e\u03a7\u03b3\u03ba\u03be\u03c7\u03f0] ; "
+        "$egammaLike = [GKXCgkxc] ; "
+        "$smooth = \u0313 ; "
+        "$rough = \u0314 ; "
+        "$iotasub = \u0345 ; "
+
+        "$softener = [\u03b2\u0392\u03b3\u0393\u03b4\u0394\u03b6\u0396\u03bb\u039b\u03bc\u039c\u03bd\u039d\u03c1\u03a1$gvowel] ;"
+
+        "$under = \u0331;"
+
+        "$caron = \u030C;"
+        "# letter context: an adjacent letter, skipping apostrophes and marks\n"
+        "$afterLetter = [:L:] [\'[:M:]]* ;"
+        "$beforeLetter = [\'[:M:]]* [:L:] ;"
+
+
+        "# Fix punctuation\n"
+
+        "\; <> \? ;"
+        "\u00b7 <> \: ;"
+
+        "# Fix any ancient characters that creep in\n"
+
+        "\u0342 > \u0301 ;"
+        "\u0302 > \u0301 ;"
+        "\u0300 > \u0301 ;"
+        "$smooth > ;"
+        "$rough > ;"
+        "$iotasub > ;"
+        "\u037A > ;"
+
+        "# need to have these up here so the rules don't mask\n"
+
+        "\u03b7 <> i $under ;"
+        "\u0397 <> I $under ;"
+
+        "\u03a8 } $beforeLower <> Ps ; "
+        "\u03a8 <> PS ; "
+        "\u03c8 <> ps ; "
+
+        "\u03c9 <> o $under ;"
+        "\u03a9 <>  O $under;"
+
+        "# at beginning or end of word, convert mp to b\n"
+
+        "[^[:L:][:M:]] } \u03bc\u03c0 > b ; "
+        "\u03bc\u03c0 } [^[:L:][:M:]] > b ; "
+        "[^[:L:][:M:]] } [\u039c\u03bc][\u03a0\u03c0] > B ; "
+        "[\u039c\u03bc][\u03a0\u03c0] } [^[:L:][:M:]] > B ;"
+
+        "\u03bc\u03c0 < b ; "
+        "\u039c\u03c0 < B { $beforeLower ; "
+        "\u039c\u03a0 < B ; "
+
+        "# handle diphthongs ending with upsilon\n"
+
+        "$vowel { \u03c5 } $softener <> v $under ; "
+        "$vowel { \u03c5 } <> f $under; "
+        "\u03c5 <> y ; "
+        "$vowel { \u03a5 } $softener <> V $under ; "
+        "$vowel { \u03a5 <> U $under ; "
+        "\u03a5 <> Y ; "
+
+        "# NORMAL\n"
+
+        "\u03b1 <> a ; "
+        "\u0391 <> A ; "
+
+        "\u03b2 <> v ; "
+        "\u0392 <> V ; "
+
+        "\u03b3 } $gammaLike <> n } $egammaLike ; "
+        "\u03b3 <> g ; "
+        "\u0393 } $gammaLike <> N } $egammaLike ; "
+        "\u0393 <> G ; "
+
+        "\u03b4 <> d ; "
+        "\u0394 <> D ; "
+
+        "\u03b5 <> e ; "
+        "\u0395 <> E ; "
+
+        "\u03b6 <> z ; "
+        "\u0396 <> Z ; "
+
+        "\u03b8 <> th ; "
+        "\u0398 } $beforeLower <> Th ; "
+        "\u0398 <> TH ; "
+
+        "\u03b9 <> i ; "
+        "\u0399 <> I ; "
+
+        "\u03ba <> k ;"
+        "\u039a <> K ; "
+
+        "\u03bb <> l ; "
+        "\u039b <> L ; "
+
+        "\u03bc <> m ; "
+        "\u039c <> M ; "
+
+        "\u03bd } $gammaLike > n\' ; "
+        "\u03bd <> n ; "
+        "\u039d } $gammaLike <> N\' ; "
+        "\u039d <> N ; "
+
+        "\u03be <> x ; "
+        "\u039e <> X ; "
+
+        "\u03bf <> o ; "
+        "\u039f <> O ; "
+
+        "\u03c0 <> p ; "
+        "\u03a0 <> P ; "
+
+        "\u03c1 <> r ; "
+        "\u03a1 <> R ; "
+
+        "[Pp] { } \u03c2 > \' ; "
+        "[Pp] { } \u03c3 > \' ;"
+
+        "# Caron means exception\n"
+
+        "# before a letter, initial\n"
+        "\u03c2 } $beforeLetter <> s $caron } $beforeLetter;"
+        "\u03c3 } $beforeLetter <> s } $beforeLetter;"
+
+        "# otherwise, after a letter = final\n"
+        "$afterLetter { \u03c3 <> $afterLetter { s $caron;"
+        "$afterLetter { \u03c2 <> $afterLetter { s ;"
+
+        "# otherwise (isolated) = initial\n"
+        "\u03c2 <> s $caron;"
+        "\u03c3 <> s ;"
+
+        "[Pp] { \u03a3 <> \'S ; "
+        "\u03a3 <> S ; "
+
+        "\u03c4 <> t ; "
+        "\u03a4 <> T ; "
+
+        "\u03c6 <> f ; "
+        "\u03a6 <> F ;"
+
+        "\u03c7 <> ch ; "
+        "\u03a7 } $beforeLower <> Ch ; "
+        "\u03a7 <> CH ; "
+
+        "# Completeness for ASCII\n"
+
+        "$ignore = [[:Mark:]''] * ;"
+
+        "| ch < h ;"
+        "| k  < c ;"
+        "| i  < j ;"
+        "| k < q ;"
+        "| y < u ;"
+        "| y < w ;"
+
+        "| Ch < H ;"
+        "| K < C ;"
+        "| I < J ;"
+        "| K < Q ;"
+        "| Y < W ;"
+        "| Y < U ;"
+
+        "# Completeness for Greek\n"
+
+        "\u03d0 > | \u03b2 ;"
+        "\u03d1 > | \u03b8 ;"
+        "\u03d2 > | \u03a5 ;"
+        "\u03d5 > | \u03c6 ;"
+        "\u03d6 > | \u03c0 ;"
+
+        "\u03f0 > | \u03ba ;"
+        "\u03f1 > | \u03c1 ;"
+        "\u03f2 > | \u03c3 ;"
+        "\u03f3 > j ;"
+        "\u03f4 > | \u0398 ;"
+        "\u03f5 > | \u03b5 ;"
+
+        "# delete any trailing ' marks used for roundtripping\n"
+
+        " < [\u03a0\u03c0] { \' } [Ss] ;"
+        " < [\u039d\u03bd] { \' } $egammaLike ;"
+
+        "::NFC (NFD) ; "
+     }
 }
--- a/icu4c/data/translit_index.txt
+++ b/icu4c/data/translit_index.txt
@ -70,6 +70,9 @@ translit_index {
 { "Latin-Greek", "file", "translit_Greek_Latin", "REVERSE" },
 { "Greek-Latin", "file", "translit_Greek_Latin", "FORWARD" },

+{ "Latin-Greek/UNGEGN", "alias", "Latin-el/UNGEGN", "" },
+{ "Greek-Latin/UNGEGN", "alias", "el-Latin/UNGEGN", "" },
+
 { "LowerLatin-Jamo", "internal", "translit_Latin_Jamo", "FORWARD" },
 { "Latin-Jamo", "alias", "Any-Lower;LowerLatin-Jamo", "" },
 { "Jamo-Latin", "file", "translit_Latin_Jamo", "REVERSE" },
--- a/icu4c/source/data/locales/el.txt
+++ b/icu4c/source/data/locales/el.txt
@ -164,4 +164,220 @@ el {
            "1,000,000,000: << \u03b4\u03b9\u03c3\u03b5\u03ba\u03b1\u03c4\u03bf\u03bc\u03bc\u03b9\u03cc\u03c1\u03b9\u03bf[ >>];\n"
            "1,000,000,000,000: =#,##0="
     }           
+
+    TransliterateLATIN {
+        "UNGEGN",
+
+        "# Rules are predicated on running NFD first, and NFC afterwards\n"
+        "::NFD (NFC) ; "
+
+        "# For modern Greek.\n"
+
+        "# Useful variables\n"
+
+        "$lower = [:Ll:] ; "
+        "$upper = [:Lu:] ; "
+        "$accent = [:M:] ; "
+
+        "$macron = \u0304 ;"
+        "$ddot = \u0308 ;"
+
+        "$lcgvowel = [\u03b1\u03b5\u03b7\u03b9\u03bf\u03c5\u03c9] ; "
+        "$ucgvowel = [\u0391\u0395\u0397\u0399\u039f\u03a5\u03a9] ; "
+        "$gvowel = [$lcgvowel $ucgvowel] ; "
+        "$lcgvowelC = [$lcgvowel $accent] ; "
+
+        "$evowel = [aeiouyAEIOUY];"
+        "$vowel = [ $evowel $gvowel] ; "
+
+        "$beforeLower = $accent * $lower ; "
+
+        "$gammaLike = [\u0393\u039a\u039e\u03a7\u03b3\u03ba\u03be\u03c7\u03f0] ; "
+        "$egammaLike = [GKXCgkxc] ; "
+        "$smooth = \u0313 ; "
+        "$rough = \u0314 ; "
+        "$iotasub = \u0345 ; "
+
+        "$softener = [\u03b2\u0392\u03b3\u0393\u03b4\u0394\u03b6\u0396\u03bb\u039b\u03bc\u039c\u03bd\u039d\u03c1\u03a1$gvowel] ;"
+
+        "$under = \u0331;"
+
+        "$caron = \u030C;"
+        "# letter context: an adjacent letter, skipping apostrophes and marks\n"
+        "$afterLetter = [:L:] [\'[:M:]]* ;"
+        "$beforeLetter = [\'[:M:]]* [:L:] ;"
+
+
+        "# Fix punctuation\n"
+
+        "\; <> \? ;"
+        "\u00b7 <> \: ;"
+
+        "# Fix any ancient characters that creep in\n"
+
+        "\u0342 > \u0301 ;"
+        "\u0302 > \u0301 ;"
+        "\u0300 > \u0301 ;"
+        "$smooth > ;"
+        "$rough > ;"
+        "$iotasub > ;"
+        "\u037A > ;"
+
+        "# need to have these up here so the rules don't mask\n"
+
+        "\u03b7 <> i $under ;"
+        "\u0397 <> I $under ;"
+
+        "\u03a8 } $beforeLower <> Ps ; "
+        "\u03a8 <> PS ; "
+        "\u03c8 <> ps ; "
+
+        "\u03c9 <> o $under ;"
+        "\u03a9 <>  O $under;"
+
+        "# at beginning or end of word, convert mp to b\n"
+
+        "[^[:L:][:M:]] } \u03bc\u03c0 > b ; "
+        "\u03bc\u03c0 } [^[:L:][:M:]] > b ; "
+        "[^[:L:][:M:]] } [\u039c\u03bc][\u03a0\u03c0] > B ; "
+        "[\u039c\u03bc][\u03a0\u03c0] } [^[:L:][:M:]] > B ;"
+
+        "\u03bc\u03c0 < b ; "
+        "\u039c\u03c0 < B { $beforeLower ; "
+        "\u039c\u03a0 < B ; "
+
+        "# handle diphthongs ending with upsilon\n"
+
+        "$vowel { \u03c5 } $softener <> v $under ; "
+        "$vowel { \u03c5 } <> f $under; "
+        "\u03c5 <> y ; "
+        "$vowel { \u03a5 } $softener <> V $under ; "
+        "$vowel { \u03a5 <> U $under ; "
+        "\u03a5 <> Y ; "
+
+        "# NORMAL\n"
+
+        "\u03b1 <> a ; "
+        "\u0391 <> A ; "
+
+        "\u03b2 <> v ; "
+        "\u0392 <> V ; "
+
+        "\u03b3 } $gammaLike <> n } $egammaLike ; "
+        "\u03b3 <> g ; "
+        "\u0393 } $gammaLike <> N } $egammaLike ; "
+        "\u0393 <> G ; "
+
+        "\u03b4 <> d ; "
+        "\u0394 <> D ; "
+
+        "\u03b5 <> e ; "
+        "\u0395 <> E ; "
+
+        "\u03b6 <> z ; "
+        "\u0396 <> Z ; "
+
+        "\u03b8 <> th ; "
+        "\u0398 } $beforeLower <> Th ; "
+        "\u0398 <> TH ; "
+
+        "\u03b9 <> i ; "
+        "\u0399 <> I ; "
+
+        "\u03ba <> k ;"
+        "\u039a <> K ; "
+
+        "\u03bb <> l ; "
+        "\u039b <> L ; "
+
+        "\u03bc <> m ; "
+        "\u039c <> M ; "
+
+        "\u03bd } $gammaLike > n\' ; "
+        "\u03bd <> n ; "
+        "\u039d } $gammaLike <> N\' ; "
+        "\u039d <> N ; "
+
+        "\u03be <> x ; "
+        "\u039e <> X ; "
+
+        "\u03bf <> o ; "
+        "\u039f <> O ; "
+
+        "\u03c0 <> p ; "
+        "\u03a0 <> P ; "
+
+        "\u03c1 <> r ; "
+        "\u03a1 <> R ; "
+
+        "[Pp] { } \u03c2 > \' ; "
+        "[Pp] { } \u03c3 > \' ;"
+
+        "# Caron means exception\n"
+
+        "# before a letter, initial\n"
+        "\u03c2 } $beforeLetter <> s $caron } $beforeLetter;"
+        "\u03c3 } $beforeLetter <> s } $beforeLetter;"
+
+        "# otherwise, after a letter = final\n"
+        "$afterLetter { \u03c3 <> $afterLetter { s $caron;"
+        "$afterLetter { \u03c2 <> $afterLetter { s ;"
+
+        "# otherwise (isolated) = initial\n"
+        "\u03c2 <> s $caron;"
+        "\u03c3 <> s ;"
+
+        "[Pp] { \u03a3 <> \'S ; "
+        "\u03a3 <> S ; "
+
+        "\u03c4 <> t ; "
+        "\u03a4 <> T ; "
+
+        "\u03c6 <> f ; "
+        "\u03a6 <> F ;"
+
+        "\u03c7 <> ch ; "
+        "\u03a7 } $beforeLower <> Ch ; "
+        "\u03a7 <> CH ; "
+
+        "# Completeness for ASCII\n"
+
+        "$ignore = [[:Mark:]''] * ;"
+
+        "| ch < h ;"
+        "| k  < c ;"
+        "| i  < j ;"
+        "| k < q ;"
+        "| y < u ;"
+        "| y < w ;"
+
+        "| Ch < H ;"
+        "| K < C ;"
+        "| I < J ;"
+        "| K < Q ;"
+        "| Y < W ;"
+        "| Y < U ;"
+
+        "# Completeness for Greek\n"
+
+        "\u03d0 > | \u03b2 ;"
+        "\u03d1 > | \u03b8 ;"
+        "\u03d2 > | \u03a5 ;"
+        "\u03d5 > | \u03c6 ;"
+        "\u03d6 > | \u03c0 ;"
+
+        "\u03f0 > | \u03ba ;"
+        "\u03f1 > | \u03c1 ;"
+        "\u03f2 > | \u03c3 ;"
+        "\u03f3 > j ;"
+        "\u03f4 > | \u0398 ;"
+        "\u03f5 > | \u03b5 ;"
+
+        "# delete any trailing ' marks used for roundtripping\n"
+
+        " < [\u03a0\u03c0] { \' } [Ss] ;"
+        " < [\u039d\u03bd] { \' } $egammaLike ;"
+
+        "::NFC (NFD) ; "
+     }
 }
--- a/icu4c/source/data/translit/translit_index.txt
+++ b/icu4c/source/data/translit/translit_index.txt
@ -70,6 +70,9 @@ translit_index {
 { "Latin-Greek", "file", "translit_Greek_Latin", "REVERSE" },
 { "Greek-Latin", "file", "translit_Greek_Latin", "FORWARD" },

+{ "Latin-Greek/UNGEGN", "alias", "Latin-el/UNGEGN", "" },
+{ "Greek-Latin/UNGEGN", "alias", "el-Latin/UNGEGN", "" },
+
 { "LowerLatin-Jamo", "internal", "translit_Latin_Jamo", "FORWARD" },
 { "Latin-Jamo", "alias", "Any-Lower;LowerLatin-Jamo", "" },
 { "Jamo-Latin", "file", "translit_Latin_Jamo", "REVERSE" },
--- a/icu4c/source/test/intltest/transtst.cpp
+++ b/icu4c/source/test/intltest/transtst.cpp
@ -133,6 +133,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
        TESTCASE(51,TestSanskritLatinRT);
        TESTCASE(52,TestLocaleInstantiation);
        TESTCASE(53,TestTitleAccents);
+        TESTCASE(54,TestLocaleResource);
        default: name = ""; break;
    }
 }
@ -2570,6 +2571,35 @@ void TransliteratorTest::TestTitleAccents(void) {
    delete t;
 }

+/**
+ * Basic check of rules that live in a locale resource: each listed ID is
+ * instantiated in the forward direction and run on one sample input.
+ */
+void TransliteratorTest::TestLocaleResource() {
+    // Flat table of triples: transliterator ID, input text, expected output.
+    const char* TRIPLES[] = {
+        //"Latin-Greek/UNGEGN",    "b",               "\\u03bc\\u03c0",
+        "Latin-el",              "b",               "\\u03bc\\u03c0",
+        "Latin-Greek",           "b",               "\\u03B2",
+        "Greek-Latin/UNGEGN",    "\\u03bc\\u03c0",  "b",
+        "el-Latin",              "\\u03bc\\u03c0",  "b",
+        "Greek-Latin",           "\\u03B2",         "b",
+    };
+    const int32_t count = sizeof(TRIPLES) / sizeof(TRIPLES[0]);
+    for (int32_t row = 0; row < count; row += 3) {
+        const char* id = TRIPLES[row];
+        UParseError parseErr;
+        UErrorCode status = U_ZERO_ERROR;
+        Transliterator *trans = Transliterator::createInstance(id, UTRANS_FORWARD, parseErr, status);
+        if (U_FAILURE(status)) {
+            errln((UnicodeString)"FAIL: createInstance(" + id + ")");
+        } else {
+            expect(*trans, CharsToUnicodeString(TRIPLES[row+1]), CharsToUnicodeString(TRIPLES[row+2]));
+        }
+        delete trans;  // runs on both paths, as in the original
+    }
+}
+
 //======================================================================
 // icu4c ONLY
 // These tests are not mirrored (yet) in icu4j at
--- a/icu4c/source/test/intltest/transtst.h
+++ b/icu4c/source/test/intltest/transtst.h
@ -251,6 +251,11 @@ class TransliteratorTest : public IntlTest {
     */
    void TestTitleAccents(void);

+    /**
+     * Basic test of a locale resource based rule (defined in transtst.cpp).
+     */
+    void TestLocaleResource(void);
+
    //======================================================================
    // Support methods
    //======================================================================