ICU-3140 updated Thai tailoring, Fractional UCA, test and test file

X-SVN-Rev: 12711
2003-07-30 21:12:10 +00:00 · 2003-07-30 21:12:10 +00:00 · c2a77c0fda
commit c2a77c0fda
parent 65b66ed697
4 changed files with 32942 additions and 141 deletions
--- a/icu4c/source/data/locales/th.txt
+++ b/icu4c/source/data/locales/th.txt
@ -13,143 +13,48 @@ th {
    // First put in all of the consonants, after Z
    //
    CollationElements {
-        Version { "2.0" }
-        Sequence { "[normalization on]& Z "
-        "<\u0E01"                    //  KO KAI
-        "<\u0E02"                    //  KHO KHAI
-        "<\u0E03"                    //  KHO KHUAT
-        "<\u0E04"                    //  KHO KHWAI
-        "<\u0E05"                    //  KHO KHON
-        "<\u0E06"                    //  KHO RAKHANG
-        "<\u0E07"                    //  NGO NGU
-        "<\u0E08"                    //  CHO CHAN
-        "<\u0E09"                    //  CHO CHING
-        "<\u0E0A"                    //  CHO CHANG
-        "<\u0E0B"                    //  SO SO
-        "<\u0E0C"                    //  CHO CHOE
-        "<\u0E0D"                    //  YO YING
-        "<\u0E0E"                    //  DO CHADA
-        "<\u0E0F"                    //  TO PATAK
-        "<\u0E10"                    //  THO THAN
-        "<\u0E11"                    //  THO NANGMONTHO
-        "<\u0E12"                    //  THO PHUTHAO
-        "<\u0E13"                    //  NO NEN
-        "<\u0E14"                    //  DO DEK
-        "<\u0E15"                    //  TO TAO
-        "<\u0E16"                    //  THO THUNG
-        "<\u0E17"                    //  THO THAHAN
-        "<\u0E18"                    //  THO THONG
-        "<\u0E19"                    //  NO NU
-        "<\u0E1A"                    //  BO BAIMAI
-        "<\u0E1B"                    //  PO PLA
-        "<\u0E1C"                    //  PHO PHUNG
-        "<\u0E1D"                    //  FO FA
-        "<\u0E1E"                    //  PHO PHAN
-        "<\u0E1F"                    //  FO FAN
-        "<\u0E20"                    //  PHO SAMPHAO
-        "<\u0E21"                    //  MO MA
-        "<\u0E22"                    //  YO YAK
-        "<\u0E23"                    //  RO RUA
-        "<\u0E24"                    //  RU
-        "<\u0E24\u0E45"              //  See the comment below on LAKKHANGYAO
-        "<\u0E25"                    //  LO LING
-        "<\u0E26"                    //  LU
-        "<\u0E26\u0E45"              //  See the comment below on LAKKHANGYAO
-        "<\u0E27"                    //  WO WAEN
-        "<\u0E28"                    //  SO SALA
-        "<\u0E29"                    //  SO RUSI
-        "<\u0E2A"                    //  SO SUA
-        "<\u0E2B"                    //  HO HIP
-        "<\u0E2C"                    //  LO CHULA
-        "<\u0E2D"                    //  O ANG
-        "<\u0E2E"                    //  HO NOKHUK
+        Version { "3.0" }
+        Sequence { 
+           // Tailoring of UCA for Thai Royal Institute Dictionary Sort, B.E. 2525
+           "[normalization on]" // needed because Thai uses multiple accents

-        //
-        // Normal vowels
-        //
-        "<\u0E30"                    //  SARA A
-        "<\u0E31"                    //  MAI HAN-AKAT
-        "<\u0E32"                    //  SARA AA
+           // put Ru with Lakkhangyao after Ru and put Lu with Lakkhangyao after Lu
+           // see the comment below on Lakkhangyao
+           "& \u0e24"          // U+0E24 THAI CHARACTER RU
+           "< \u0e24\u0e45"    // U+0E24 THAI CHARACTER RU  U+0E45 THAI CHARACTER LAKKHANGYAO
+           "& \u0e26"          // U+0E26 THAI CHARACTER LU
+           "< \u0e26\u0e45"    // U+0E26 THAI CHARACTER LU U+0E45 THAI CHARACTER LAKKHANGYAO

-        // Normalizer will decompose this character to \u0e4d\u0e32.
-        // This is a Bad Thing, because we want the separate
-        // characters to sort differently than this individual one.
-        // Since there's no public way to set the decomposition to be
-        // used when creating a collator, there's no way around this
-        // right now.  It's best to go ahead and leave the character
-        // in, because it occurs this way a lot more often than it
-        // occurs as separate characters.
-        "<\u0E33"                    //  SARA AM
+           // put Lakkhangyao after Sara Ai Maimalai
+           // this rare symbol also comes after all characters. But when it is used in combination
+           // with Ru and Lu, the combination is treated as a separate letter, ala CH sorting after
+           // C in traditional Spanish.
+           "& \u0e44"  // U+0E44 THAI CHARACTER SARA AI MAIMALAI
+           "< \u0e45"  // U+0E45 THAI CHARACTER LAKKHANGYAO

-        "<\u0E34"                    //  SARA I
+           // put Yamakkan just before Maitaikhu. It will behave like an accent (primary ignorable)
+           "& [before 2] \u0E47" // U+0E47 THAI CHARACTER MAITAIKHU
+           "<< \u0E4E"           // U+0E4E THAI CHARACTER YAMAKKAN

-        "<\u0E35"                    //  SARA II
-        "<\u0E36"                    //  SARA UE
-        "<\u0E37"                    //  SARA UEE
-        "<\u0E38"                    //  SARA U
-        "<\u0E39"                    //  SARA UU
+           // put Thantakat and Nikhahit just after Mai Chattawa.  They will behave like accents (primary ignorable)
+            "& \u0E4B"  // U+0E4B  THAI CHARACTER MAI CHATTAWA
+           "<< \u0E4C"  // U+0E4C  THAI CHARACTER THANTAKAT
+           "<< \u0E4D"  // U+0E4D  THAI CHARACTER NIKHAHIT

-        //
-        // Preceding vowels
-        //
-        "<\u0E40"                    //  SARA E
-        "<\u0E41"                    //  SARA AE
-        "<\u0E42"                    //  SARA O
-        "<\u0E43"                    //  SARA AI MAIMUAN
-        "<\u0E44"                    //  SARA AI MAIMALAI
+           // make punctuation and Paiyannoi...Khomut secondary ignorable. Strings that contain them will then
+           // sort after otherwise identical strings that don't contain them.

-        //
-        // Digits
-        //
-        "<\u0E50"                    //  DIGIT ZERO
-        "<\u0E51"                    //  DIGIT ONE
-        "<\u0E52"                    //  DIGIT TWO
-        "<\u0E53"                    //  DIGIT THREE
-        "<\u0E54"                    //  DIGIT FOUR
-        "<\u0E55"                    //  DIGIT FIVE
-        "<\u0E56"                    //  DIGIT SIX
-        "<\u0E57"                    //  DIGIT SEVEN
-        "<\u0E58"                    //  DIGIT EIGHT
-        "<\u0E59"                    //  DIGIT NINE
+           "& [last secondary ignorable]"
+	   "<<< ' '"    // Space
+           "<<< '-'"    // Hyphen
+           "<<< '.'"    // Full stop
+           "<<< '...'"  // Ellipsis
+           "<<< \u0E2F" // U+0E2F  THAI CHARACTER PAIYANNOI (abbreviation mark)
+           "<<< \u0E46" // U+0E46  THAI CHARACTER MAIYAMOK (repetition mark)
+           "<<< \u0E4F" // U+0E4F  THAI CHARACTER FONGMAN (ancient symbol used as bullet mark)
+           "<<< \u0E5A" // U+0E5A  THAI CHARACTER ANGKHANKHU (ancient symbol used to mark end of section or episode)
+           "<<< \u0E5B" // U+0E5B  THAI CHARACTER KHOMUT (ancient symbol used to mark end of story)
 
-        // Sorta tonal marks, but maybe not really
-        "<\u0E4D"                    //  NIKHAHIT
-
-        // Thai symbols are supposed to sort "after white space".  I'm
-        // treating this as making them sort just after the normal
-        // Latin-1 symbols, which are in turn after the white space.
-        "&'\u007d'"                   //  right-brace
-        "<\u0E2F"                    //  PAIYANNOI      (ellipsis, abbreviation)
-        "<\u0E46"                    //  MAIYAMOK
-        "<\u0E4F"                    //  FONGMAN
-        "<\u0E5A"                    //  ANGKHANKHU
-        "<\u0E5B"                    //  KHOMUT
-        "<\u0E3F"                    //  CURRENCY SYMBOL BAHT
-
-        // These symbols are supposed to be "after all characters"
-        "<\u0E4E"                    //  YAMAKKAN
-
-        // This rare symbol also comes after all characters.  But when it is
-        // used in combination with RU and LU, the combination is treated as
-        // a separate letter, ala "CH" sorting after "C" in traditional Spanish.
-        // see above.
-        "<\u0E45"                    //  LAKKHANGYAO
-
-        // Tonal marks are primary ignorables but are treated as secondary
-        // differences
-        "&\u0301"                    // acute accent
-        "<<\u0E47"                    //  MAITAIKHU
-        "<<\u0E48"                    //  MAI EK
-        "<<\u0E49"                    //  MAI THO
-        "<<\u0E4A"                    //  MAI TRI
-        "<<\u0E4B"                    //  MAI CHATTAWA
-        "<<\u0E4C"                    //  THANTHAKHAT
-
-
-        // These are supposed to be ignored, so I'm treating them as controls
-        "&\u0001 "
-        "=\u0E3A"                    //  PHINTHU
-        "='.'"                      //  period 
        }
    }
    Countries { 
--- a/icu4c/source/data/unidata/FractionalUCA.txt
+++ b/icu4c/source/data/unidata/FractionalUCA.txt
@ -17220,12 +17220,13 @@ FDD0 0089; [, E1 81, 05]
 FDD0 008A; [, E1 91, 05]
 FDD0 008B; [, E1 A1, 05]
 FDD0 008C; [, E1 B1, 05]
+FDD0 008D; [,, 3E]

 # VALUES BASED ON UCA
 [first tertiary ignorable [,,]]
 [last tertiary ignorable [,,]]
-[first secondary ignorable [,, 05]]
-[last secondary ignorable [,, 05]]
+[first secondary ignorable [,, 3E]]
+[last secondary ignorable [,, 3E]]
 [first primary ignorable [, 87, 05]]
 [last primary ignorable [, E1 B1, 05]]
 [first variable [05 07, 05, 05]]
--- a/icu4c/source/test/intltest/thcoll.cpp
+++ b/icu4c/source/test/intltest/thcoll.cpp
@ -24,14 +24,15 @@
 * The TestDictionary test expects a file of this name, with this
 * encoding, to be present in the directory $ICU/source/test/testdata.
 */
-#define TEST_FILE           "th18057.txt"
+//#define TEST_FILE           "th18057.txt"
+#define TEST_FILE           "riwords.txt"
 #define TEST_FILE_ENCODING  "UTF8"

 /**
 * This is the most failures we show in TestDictionary.  If this number
 * is < 0, we show all failures.
 */
-#define MAX_FAILURES_TO_SHOW 8
+#define MAX_FAILURES_TO_SHOW -1

 #define CASE(id,test)                 \
    case id:                          \
@ -47,7 +48,7 @@ CollationThaiTest::CollationThaiTest() {
    UErrorCode status = U_ZERO_ERROR;
    coll = Collator::createInstance(Locale("th", "TH", ""), status);
    if (coll && U_SUCCESS(status)) {
-        coll->setStrength(Collator::TERTIARY);
+        //coll->setStrength(Collator::TERTIARY);
    } else {
        delete coll;
        coll = 0;
@ -87,7 +88,7 @@ static UBool readLine(FileStream *in, UnicodeString& line, const char* encoding)
    if (T_FileStream_eof(in)) {
        return FALSE;
    }
-    char buffer[128];
+    char buffer[1024];
    char* p = buffer;
    char* limit = p + sizeof(buffer) - 1; // Leave space for 0
    while (p<limit) {
@ -204,7 +205,7 @@ void CollationThaiTest::TestDictionary(void) {

    FileStream *in = T_FileStream_open(buffer, "rb");
    if (in == 0) {
-        errln((UnicodeString)"Error: could not open test file " + buffer);
+        infoln((UnicodeString)"INFO: could not open test file " + buffer + ". Aborting test.");
        return;        
    }

@ -488,7 +489,7 @@ void CollationThaiTest::TestInvalidThai(void) {
 void CollationThaiTest::TestReordering(void) {
  const char *tests[] = { 
                          "\\u0E41c\\u0301",       "=", "\\u0E41\\u0107", // composition
-                          "\\u0E41\\uD834\\uDC00", "<", "\\u0E41\\uD834\\uDC01", // supplementaries
+                          "\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
                          "\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
                          "\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
                          "\\u0E41\\u0301",        "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
@ -525,12 +526,11 @@ void CollationThaiTest::TestReordering(void) {
  parseChars(rules, rule);
  RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
  if(U_SUCCESS(status)) {
-    //compareArray(*rcoll, testcontraction, 3);
+    compareArray(*rcoll, testcontraction, 3);
    delete rcoll;
  } else {
    errln("Couldn't instantiate collator from rules");
  }
-  //genericRulesStarter(rule, test10, 2);

 }

--- a/icu4c/source/test/testdata/riwords.txt
+++ b/icu4c/source/test/testdata/riwords.txt