ICU-3140 updated Thai tailoring, Fractional UCA, test and test file

X-SVN-Rev: 12711
This commit is contained in:
Vladimir Weinstein 2003-07-30 21:12:10 +00:00
parent 65b66ed697
commit c2a77c0fda
4 changed files with 32942 additions and 141 deletions

View File

@ -13,143 +13,48 @@ th {
// First put in all of the consonants, after Z // First put in all of the consonants, after Z
// //
CollationElements { CollationElements {
Version { "2.0" } Version { "3.0" }
Sequence { "[normalization on]& Z " Sequence {
"<\u0E01" // KO KAI // Tailoring of UCA for Thai Royal Institute Dictionary Sort, B.E. 2525
"<\u0E02" // KHO KHAI "[normalization on]" // needed because Thai uses multiple accents
"<\u0E03" // KHO KHUAT
"<\u0E04" // KHO KHWAI
"<\u0E05" // KHO KHON
"<\u0E06" // KHO RAKHANG
"<\u0E07" // NGO NGU
"<\u0E08" // CHO CHAN
"<\u0E09" // CHO CHING
"<\u0E0A" // CHO CHANG
"<\u0E0B" // SO SO
"<\u0E0C" // CHO CHOE
"<\u0E0D" // YO YING
"<\u0E0E" // DO CHADA
"<\u0E0F" // TO PATAK
"<\u0E10" // THO THAN
"<\u0E11" // THO NANGMONTHO
"<\u0E12" // THO PHUTHAO
"<\u0E13" // NO NEN
"<\u0E14" // DO DEK
"<\u0E15" // TO TAO
"<\u0E16" // THO THUNG
"<\u0E17" // THO THAHAN
"<\u0E18" // THO THONG
"<\u0E19" // NO NU
"<\u0E1A" // BO BAIMAI
"<\u0E1B" // PO PLA
"<\u0E1C" // PHO PHUNG
"<\u0E1D" // FO FA
"<\u0E1E" // PHO PHAN
"<\u0E1F" // FO FAN
"<\u0E20" // PHO SAMPHAO
"<\u0E21" // MO MA
"<\u0E22" // YO YAK
"<\u0E23" // RO RUA
"<\u0E24" // RU
"<\u0E24\u0E45" // See the comment below on LAKKHANGYAO
"<\u0E25" // LO LING
"<\u0E26" // LU
"<\u0E26\u0E45" // See the comment below on LAKKHANGYAO
"<\u0E27" // WO WAEN
"<\u0E28" // SO SALA
"<\u0E29" // SO RUSI
"<\u0E2A" // SO SUA
"<\u0E2B" // HO HIP
"<\u0E2C" // LO CHULA
"<\u0E2D" // O ANG
"<\u0E2E" // HO NOKHUK
// // put Ru with Lakkhangyao after Ru and put Lu with Lakkhangyao after Lu
// Normal vowels // see the comment below on Lakkhangyao
// "& \u0e24" // U+0E24 THAI CHARACTER RU
"<\u0E30" // SARA A "< \u0e24\u0e45" // U+0E24 THAI CHARACTER RU U+0E45 THAI CHARACTER LAKKHANGYAO
"<\u0E31" // MAI HAN-AKAT "& \u0e26" // U+0E26 THAI CHARACTER LU
"<\u0E32" // SARA AA "< \u0e26\u0e45" // U+0E26 THAI CHARACTER LU U+0E45 THAI CHARACTER LAKKHANGYAO
// Normalizer will decompose this character to \u0e4d\u0e32. // put Lakkhangyao after Sara Ai Maimalai
// This is a Bad Thing, because we want the separate // this rare symbol also comes after all characters. But when it is used in combination
// characters to sort differently than this individual one. // with Ru and Lu, the combination is treated as a separate letter, ala CH sorting after
// Since there's no public way to set the decomposition to be // C in the traditional Spanish.
// used when creating a collator, there's no way around this "& \u0e44" // U+0E44 THAI CHARACTER SARA AI MAIMALAI
// right now. It's best to go ahead and leave the character "< \u0e45" // U+0E45 THAI CHARACTER LAKKHANGYAO
// in, because it occurs this way a lot more often than it
// occurs as separate characters.
"<\u0E33" // SARA AM
"<\u0E34" // SARA I // put Yamakkan just before Maitaikhu. It will behave like an accent (primary ignorable)
"& [before 2] \u0E47" // U+0E47 THAI CHARACTER MAITAIKHU
"<< \u0E4E" // U+0E4E THAI CHARACTER YAMAKKAN
"<\u0E35" // SARA II // put Thantakat and Nikhahit just after Mai Chattawa. They will behave like an accent (primary ignorable)
"<\u0E36" // SARA UE "& \u0E4B" // U+0E4B THAI CHARACTER MAI CHATTAWA
"<\u0E37" // SARA UEE "<< \u0E4C" // U+0E4C THAI CHARACTER THANTAKAT
"<\u0E38" // SARA U "<< \u0E4D" // U+0E4D THAI CHARACTER NIKHAHIT
"<\u0E39" // SARA UU
// // make punctuation and Paiyannoi...Khomut secondary ignorable. This will make them sort after the same
// Preceding vowels // strings that don't contain them.
//
"<\u0E40" // SARA E
"<\u0E41" // SARA AE
"<\u0E42" // SARA O
"<\u0E43" // SARA AI MAIMUAN
"<\u0E44" // SARA AI MAIMALAI
// "& [last secondary ignorable]"
// Digits "<<< ' '" // Space
// "<<< '-'" // Hyphen
"<\u0E50" // DIGIT ZERO "<<< '.'" // Full stop
"<\u0E51" // DIGIT ONE "<<< '...'" // Ellipsis
"<\u0E52" // DIGIT TWO "<<< \u0E2F" // U+0E2F THAI CHARACTER PAIYANNOI (abbreviation mark)
"<\u0E53" // DIGIT THREE "<<< \u0E46" // U+0E46 THAI CHARACTER MAIYAMOK (repetition mark)
"<\u0E54" // DIGIT FOUR "<<< \u0E4F" // U+0E4F THAI CHARACTER FONGMAN (ancient symbol used as bullet mark)
"<\u0E55" // DIGIT FIVE "<<< \u0E5A" // U+0E5A THAI CHARACTER ANGKHANKHU (ancient symbol used to mark end of section or episode)
"<\u0E56" // DIGIT SIX "<<< \u0E5B" // U+0E5B THAI CHARACTER KHOMUT (ancient symbol used to mark end of story)
"<\u0E57" // DIGIT SEVEN
"<\u0E58" // DIGIT EIGHT
"<\u0E59" // DIGIT NINE
// Sorta tonal marks, but maybe not really
"<\u0E4D" // NIKHAHIT
// Thai symbols are supposed to sort "after white space". I'm
// treating this as making them sort just after the normal
// Latin-1 symbols, which are in turn after the white space.
"&'\u007d'" // right-brace
"<\u0E2F" // PAIYANNOI (ellipsis, abbreviation)
"<\u0E46" // MAIYAMOK
"<\u0E4F" // FONGMAN
"<\u0E5A" // ANGKHANKHU
"<\u0E5B" // KHOMUT
"<\u0E3F" // CURRENCY SYMBOL BAHT
// These symbols are supposed to be "after all characters"
"<\u0E4E" // YAMAKKAN
// This rare symbol also comes after all characters. But when it is
// used in combination with RU and LU, the combination is treated as
// a separate letter, ala "CH" sorting after "C" in traditional Spanish.
// see above.
"<\u0E45" // LAKKHANGYAO
// Tonal marks are primary ignorables but are treated as secondary
// differences
"&\u0301" // acute accent
"<<\u0E47" // MAITAIKHU
"<<\u0E48" // MAI EK
"<<\u0E49" // MAI THO
"<<\u0E4A" // MAI TRI
"<<\u0E4B" // MAI CHATTAWA
"<<\u0E4C" // THANTHAKHAT
// These are supposed to be ignored, so I'm treating them as controls
"&\u0001 "
"=\u0E3A" // PHINTHU
"='.'" // period
} }
} }
Countries { Countries {

View File

@ -17220,12 +17220,13 @@ FDD0 0089; [, E1 81, 05]
FDD0 008A; [, E1 91, 05] FDD0 008A; [, E1 91, 05]
FDD0 008B; [, E1 A1, 05] FDD0 008B; [, E1 A1, 05]
FDD0 008C; [, E1 B1, 05] FDD0 008C; [, E1 B1, 05]
FDD0 008D; [,, 3E]
# VALUES BASED ON UCA # VALUES BASED ON UCA
[first tertiary ignorable [,,]] [first tertiary ignorable [,,]]
[last tertiary ignorable [,,]] [last tertiary ignorable [,,]]
[first secondary ignorable [,, 05]] [first secondary ignorable [,, 3E]]
[last secondary ignorable [,, 05]] [last secondary ignorable [,, 3E]]
[first primary ignorable [, 87, 05]] [first primary ignorable [, 87, 05]]
[last primary ignorable [, E1 B1, 05]] [last primary ignorable [, E1 B1, 05]]
[first variable [05 07, 05, 05]] [first variable [05 07, 05, 05]]

View File

@ -24,14 +24,15 @@
* The TestDictionary test expects a file of this name, with this * The TestDictionary test expects a file of this name, with this
* encoding, to be present in the directory $ICU/source/test/testdata. * encoding, to be present in the directory $ICU/source/test/testdata.
*/ */
#define TEST_FILE "th18057.txt" //#define TEST_FILE "th18057.txt"
#define TEST_FILE "riwords.txt"
#define TEST_FILE_ENCODING "UTF8" #define TEST_FILE_ENCODING "UTF8"
/** /**
* This is the most failures we show in TestDictionary. If this number * This is the most failures we show in TestDictionary. If this number
* is < 0, we show all failures. * is < 0, we show all failures.
*/ */
#define MAX_FAILURES_TO_SHOW 8 #define MAX_FAILURES_TO_SHOW -1
#define CASE(id,test) \ #define CASE(id,test) \
case id: \ case id: \
@ -47,7 +48,7 @@ CollationThaiTest::CollationThaiTest() {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
coll = Collator::createInstance(Locale("th", "TH", ""), status); coll = Collator::createInstance(Locale("th", "TH", ""), status);
if (coll && U_SUCCESS(status)) { if (coll && U_SUCCESS(status)) {
coll->setStrength(Collator::TERTIARY); //coll->setStrength(Collator::TERTIARY);
} else { } else {
delete coll; delete coll;
coll = 0; coll = 0;
@ -87,7 +88,7 @@ static UBool readLine(FileStream *in, UnicodeString& line, const char* encoding)
if (T_FileStream_eof(in)) { if (T_FileStream_eof(in)) {
return FALSE; return FALSE;
} }
char buffer[128]; char buffer[1024];
char* p = buffer; char* p = buffer;
char* limit = p + sizeof(buffer) - 1; // Leave space for 0 char* limit = p + sizeof(buffer) - 1; // Leave space for 0
while (p<limit) { while (p<limit) {
@ -204,7 +205,7 @@ void CollationThaiTest::TestDictionary(void) {
FileStream *in = T_FileStream_open(buffer, "rb"); FileStream *in = T_FileStream_open(buffer, "rb");
if (in == 0) { if (in == 0) {
errln((UnicodeString)"Error: could not open test file " + buffer); infoln((UnicodeString)"INFO: could not open test file " + buffer + ". Aborting test.");
return; return;
} }
@ -488,7 +489,7 @@ void CollationThaiTest::TestInvalidThai(void) {
void CollationThaiTest::TestReordering(void) { void CollationThaiTest::TestReordering(void) {
const char *tests[] = { const char *tests[] = {
"\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition "\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition
"\\u0E41\\uD834\\uDC00", "<", "\\u0E41\\uD834\\uDC01", // supplementaries "\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
"\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary "\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
"\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP "\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
"\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration) "\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
@ -525,12 +526,11 @@ void CollationThaiTest::TestReordering(void) {
parseChars(rules, rule); parseChars(rules, rule);
RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status); RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
if(U_SUCCESS(status)) { if(U_SUCCESS(status)) {
//compareArray(*rcoll, testcontraction, 3); compareArray(*rcoll, testcontraction, 3);
delete rcoll; delete rcoll;
} else { } else {
errln("Couldn't instantiate collator from rules"); errln("Couldn't instantiate collator from rules");
} }
//genericRulesStarter(rule, test10, 2);
} }

32895
icu4c/source/test/testdata/riwords.txt vendored Normal file

File diff suppressed because it is too large Load Diff