ICU-3140 updated Thai tailoring, Fractional UCA, test and test file
X-SVN-Rev: 12711
This commit is contained in:
parent
65b66ed697
commit
c2a77c0fda
@ -13,143 +13,48 @@ th {
|
|||||||
// First put in all of the consonants, after Z
|
// First put in all of the consonants, after Z
|
||||||
//
|
//
|
||||||
CollationElements {
|
CollationElements {
|
||||||
Version { "2.0" }
|
Version { "3.0" }
|
||||||
Sequence { "[normalization on]& Z "
|
Sequence {
|
||||||
"<\u0E01" // KO KAI
|
// Tailoring of UCA for Thai Royal Institute Dictionary Sort, B.E. 2525
|
||||||
"<\u0E02" // KHO KHAI
|
"[normalization on]" // needed because Thai uses multiple accents
|
||||||
"<\u0E03" // KHO KHUAT
|
|
||||||
"<\u0E04" // KHO KHWAI
|
|
||||||
"<\u0E05" // KHO KHON
|
|
||||||
"<\u0E06" // KHO RAKHANG
|
|
||||||
"<\u0E07" // NGO NGU
|
|
||||||
"<\u0E08" // CHO CHAN
|
|
||||||
"<\u0E09" // CHO CHING
|
|
||||||
"<\u0E0A" // CHO CHANG
|
|
||||||
"<\u0E0B" // SO SO
|
|
||||||
"<\u0E0C" // CHO CHOE
|
|
||||||
"<\u0E0D" // YO YING
|
|
||||||
"<\u0E0E" // DO CHADA
|
|
||||||
"<\u0E0F" // TO PATAK
|
|
||||||
"<\u0E10" // THO THAN
|
|
||||||
"<\u0E11" // THO NANGMONTHO
|
|
||||||
"<\u0E12" // THO PHUTHAO
|
|
||||||
"<\u0E13" // NO NEN
|
|
||||||
"<\u0E14" // DO DEK
|
|
||||||
"<\u0E15" // TO TAO
|
|
||||||
"<\u0E16" // THO THUNG
|
|
||||||
"<\u0E17" // THO THAHAN
|
|
||||||
"<\u0E18" // THO THONG
|
|
||||||
"<\u0E19" // NO NU
|
|
||||||
"<\u0E1A" // BO BAIMAI
|
|
||||||
"<\u0E1B" // PO PLA
|
|
||||||
"<\u0E1C" // PHO PHUNG
|
|
||||||
"<\u0E1D" // FO FA
|
|
||||||
"<\u0E1E" // PHO PHAN
|
|
||||||
"<\u0E1F" // FO FAN
|
|
||||||
"<\u0E20" // PHO SAMPHAO
|
|
||||||
"<\u0E21" // MO MA
|
|
||||||
"<\u0E22" // YO YAK
|
|
||||||
"<\u0E23" // RO RUA
|
|
||||||
"<\u0E24" // RU
|
|
||||||
"<\u0E24\u0E45" // See the comment below on LAKKHANGYAO
|
|
||||||
"<\u0E25" // LO LING
|
|
||||||
"<\u0E26" // LU
|
|
||||||
"<\u0E26\u0E45" // See the comment below on LAKKHANGYAO
|
|
||||||
"<\u0E27" // WO WAEN
|
|
||||||
"<\u0E28" // SO SALA
|
|
||||||
"<\u0E29" // SO RUSI
|
|
||||||
"<\u0E2A" // SO SUA
|
|
||||||
"<\u0E2B" // HO HIP
|
|
||||||
"<\u0E2C" // LO CHULA
|
|
||||||
"<\u0E2D" // O ANG
|
|
||||||
"<\u0E2E" // HO NOKHUK
|
|
||||||
|
|
||||||
//
|
// put Ru with Lakkhangyao after Ru and put Lu with Lakkhangyao after Lu
|
||||||
// Normal vowels
|
// see the comment below on Lakkhangyao
|
||||||
//
|
"& \u0e24" // U+0E24 THAI CHARACTER RU
|
||||||
"<\u0E30" // SARA A
|
"< \u0e24\u0e45" // U+0E24 THAI CHARACTER RU U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||||
"<\u0E31" // MAI HAN-AKAT
|
"& \u0e26" // U+0E26 THAI CHARACTER LU
|
||||||
"<\u0E32" // SARA AA
|
"< \u0e26\u0e45" // U+0E26 THAI CHARACTER LU U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||||
|
|
||||||
// Normalizer will decompose this character to \u0e4d\u0e32.
|
// put Lakkhangyao after Sara Ai Maimalai
|
||||||
// This is a Bad Thing, because we want the separate
|
// this rare symbol also comes after all characters. But when it is used in combination
|
||||||
// characters to sort differently than this individual one.
|
// with Ru and Lu, the combination is treated as a separate letter, ala CH sorting after
|
||||||
// Since there's no public way to set the decomposition to be
|
// C in traditional Spanish.
|
||||||
// used when creating a collator, there's no way around this
|
"& \u0e44" // U+0E44 THAI CHARACTER SARA AI MAIMALAI
|
||||||
// right now. It's best to go ahead and leave the character
|
"< \u0e45" // U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||||
// in, because it occurs this way a lot more often than it
|
|
||||||
// occurs as separate characters.
|
|
||||||
"<\u0E33" // SARA AM
|
|
||||||
|
|
||||||
"<\u0E34" // SARA I
|
// put Yamakkan just before Maitaikhu. It will behave like an accent (primary ignorable)
|
||||||
|
"& [before 2] \u0E47" // U+0E47 THAI CHARACTER MAITAIKHU
|
||||||
|
"<< \u0E4E" // U+0E4E THAI CHARACTER YAMAKKAN
|
||||||
|
|
||||||
"<\u0E35" // SARA II
|
// put Thanthakhat and Nikhahit just after Mai Chattawa. They will behave like accents (primary ignorable)
|
||||||
"<\u0E36" // SARA UE
|
"& \u0E4B" // U+0E4B THAI CHARACTER MAI CHATTAWA
|
||||||
"<\u0E37" // SARA UEE
|
"<< \u0E4C" // U+0E4C THAI CHARACTER THANTAKAT
|
||||||
"<\u0E38" // SARA U
|
"<< \u0E4D" // U+0E4D THAI CHARACTER NIKHAHIT
|
||||||
"<\u0E39" // SARA UU
|
|
||||||
|
|
||||||
//
|
// make punctuation and Paiyannoi...Khomut secondary ignorable. This will make them sort after the same
|
||||||
// Preceding vowels
|
// strings that don't contain them.
|
||||||
//
|
|
||||||
"<\u0E40" // SARA E
|
|
||||||
"<\u0E41" // SARA AE
|
|
||||||
"<\u0E42" // SARA O
|
|
||||||
"<\u0E43" // SARA AI MAIMUAN
|
|
||||||
"<\u0E44" // SARA AI MAIMALAI
|
|
||||||
|
|
||||||
//
|
"& [last secondary ignorable]"
|
||||||
// Digits
|
"<<< ' '" // Space
|
||||||
//
|
"<<< '-'" // Hyphen
|
||||||
"<\u0E50" // DIGIT ZERO
|
"<<< '.'" // Full stop
|
||||||
"<\u0E51" // DIGIT ONE
|
"<<< '...'" // Ellipsis
|
||||||
"<\u0E52" // DIGIT TWO
|
"<<< \u0E2F" // U+0E2F THAI CHARACTER PAIYANNOI (abbreviation mark)
|
||||||
"<\u0E53" // DIGIT THREE
|
"<<< \u0E46" // U+0E46 THAI CHARACTER MAIYAMOK (repetition mark)
|
||||||
"<\u0E54" // DIGIT FOUR
|
"<<< \u0E4F" // U+0E4F THAI CHARACTER FONGMAN (ancient symbol used as bullet mark)
|
||||||
"<\u0E55" // DIGIT FIVE
|
"<<< \u0E5A" // U+0E5A THAI CHARACTER ANGKHANKHU (ancient symbol used to mark end of section or episode)
|
||||||
"<\u0E56" // DIGIT SIX
|
"<<< \u0E5B" // U+0E5B THAI CHARACTER KHOMUT (ancient symbol used to mark end of story)
|
||||||
"<\u0E57" // DIGIT SEVEN
|
|
||||||
"<\u0E58" // DIGIT EIGHT
|
|
||||||
"<\u0E59" // DIGIT NINE
|
|
||||||
|
|
||||||
// Sorta tonal marks, but maybe not really
|
|
||||||
"<\u0E4D" // NIKHAHIT
|
|
||||||
|
|
||||||
// Thai symbols are supposed to sort "after white space". I'm
|
|
||||||
// treating this as making them sort just after the normal
|
|
||||||
// Latin-1 symbols, which are in turn after the white space.
|
|
||||||
"&'\u007d'" // right-brace
|
|
||||||
"<\u0E2F" // PAIYANNOI (ellipsis, abbreviation)
|
|
||||||
"<\u0E46" // MAIYAMOK
|
|
||||||
"<\u0E4F" // FONGMAN
|
|
||||||
"<\u0E5A" // ANGKHANKHU
|
|
||||||
"<\u0E5B" // KHOMUT
|
|
||||||
"<\u0E3F" // CURRENCY SYMBOL BAHT
|
|
||||||
|
|
||||||
// These symbols are supposed to be "after all characters"
|
|
||||||
"<\u0E4E" // YAMAKKAN
|
|
||||||
|
|
||||||
// This rare symbol also comes after all characters. But when it is
|
|
||||||
// used in combination with RU and LU, the combination is treated as
|
|
||||||
// a separate letter, ala "CH" sorting after "C" in traditional Spanish.
|
|
||||||
// see above.
|
|
||||||
"<\u0E45" // LAKKHANGYAO
|
|
||||||
|
|
||||||
// Tonal marks are primary ignorables but are treated as secondary
|
|
||||||
// differences
|
|
||||||
"&\u0301" // acute accent
|
|
||||||
"<<\u0E47" // MAITAIKHU
|
|
||||||
"<<\u0E48" // MAI EK
|
|
||||||
"<<\u0E49" // MAI THO
|
|
||||||
"<<\u0E4A" // MAI TRI
|
|
||||||
"<<\u0E4B" // MAI CHATTAWA
|
|
||||||
"<<\u0E4C" // THANTHAKHAT
|
|
||||||
|
|
||||||
|
|
||||||
// These are supposed to be ignored, so I'm treating them as controls
|
|
||||||
"&\u0001 "
|
|
||||||
"=\u0E3A" // PHINTHU
|
|
||||||
"='.'" // period
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Countries {
|
Countries {
|
||||||
|
@ -17220,12 +17220,13 @@ FDD0 0089; [, E1 81, 05]
|
|||||||
FDD0 008A; [, E1 91, 05]
|
FDD0 008A; [, E1 91, 05]
|
||||||
FDD0 008B; [, E1 A1, 05]
|
FDD0 008B; [, E1 A1, 05]
|
||||||
FDD0 008C; [, E1 B1, 05]
|
FDD0 008C; [, E1 B1, 05]
|
||||||
|
FDD0 008D; [,, 3E]
|
||||||
|
|
||||||
# VALUES BASED ON UCA
|
# VALUES BASED ON UCA
|
||||||
[first tertiary ignorable [,,]]
|
[first tertiary ignorable [,,]]
|
||||||
[last tertiary ignorable [,,]]
|
[last tertiary ignorable [,,]]
|
||||||
[first secondary ignorable [,, 05]]
|
[first secondary ignorable [,, 3E]]
|
||||||
[last secondary ignorable [,, 05]]
|
[last secondary ignorable [,, 3E]]
|
||||||
[first primary ignorable [, 87, 05]]
|
[first primary ignorable [, 87, 05]]
|
||||||
[last primary ignorable [, E1 B1, 05]]
|
[last primary ignorable [, E1 B1, 05]]
|
||||||
[first variable [05 07, 05, 05]]
|
[first variable [05 07, 05, 05]]
|
||||||
|
@ -24,14 +24,15 @@
|
|||||||
* The TestDictionary test expects a file of this name, with this
|
* The TestDictionary test expects a file of this name, with this
|
||||||
* encoding, to be present in the directory $ICU/source/test/testdata.
|
* encoding, to be present in the directory $ICU/source/test/testdata.
|
||||||
*/
|
*/
|
||||||
#define TEST_FILE "th18057.txt"
|
//#define TEST_FILE "th18057.txt"
|
||||||
|
#define TEST_FILE "riwords.txt"
|
||||||
#define TEST_FILE_ENCODING "UTF8"
|
#define TEST_FILE_ENCODING "UTF8"
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is the most failures we show in TestDictionary. If this number
|
* This is the most failures we show in TestDictionary. If this number
|
||||||
* is < 0, we show all failures.
|
* is < 0, we show all failures.
|
||||||
*/
|
*/
|
||||||
#define MAX_FAILURES_TO_SHOW 8
|
#define MAX_FAILURES_TO_SHOW -1
|
||||||
|
|
||||||
#define CASE(id,test) \
|
#define CASE(id,test) \
|
||||||
case id: \
|
case id: \
|
||||||
@ -47,7 +48,7 @@ CollationThaiTest::CollationThaiTest() {
|
|||||||
UErrorCode status = U_ZERO_ERROR;
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
coll = Collator::createInstance(Locale("th", "TH", ""), status);
|
coll = Collator::createInstance(Locale("th", "TH", ""), status);
|
||||||
if (coll && U_SUCCESS(status)) {
|
if (coll && U_SUCCESS(status)) {
|
||||||
coll->setStrength(Collator::TERTIARY);
|
//coll->setStrength(Collator::TERTIARY);
|
||||||
} else {
|
} else {
|
||||||
delete coll;
|
delete coll;
|
||||||
coll = 0;
|
coll = 0;
|
||||||
@ -87,7 +88,7 @@ static UBool readLine(FileStream *in, UnicodeString& line, const char* encoding)
|
|||||||
if (T_FileStream_eof(in)) {
|
if (T_FileStream_eof(in)) {
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
char buffer[128];
|
char buffer[1024];
|
||||||
char* p = buffer;
|
char* p = buffer;
|
||||||
char* limit = p + sizeof(buffer) - 1; // Leave space for 0
|
char* limit = p + sizeof(buffer) - 1; // Leave space for 0
|
||||||
while (p<limit) {
|
while (p<limit) {
|
||||||
@ -204,7 +205,7 @@ void CollationThaiTest::TestDictionary(void) {
|
|||||||
|
|
||||||
FileStream *in = T_FileStream_open(buffer, "rb");
|
FileStream *in = T_FileStream_open(buffer, "rb");
|
||||||
if (in == 0) {
|
if (in == 0) {
|
||||||
errln((UnicodeString)"Error: could not open test file " + buffer);
|
infoln((UnicodeString)"INFO: could not open test file " + buffer + ". Aborting test.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -488,7 +489,7 @@ void CollationThaiTest::TestInvalidThai(void) {
|
|||||||
void CollationThaiTest::TestReordering(void) {
|
void CollationThaiTest::TestReordering(void) {
|
||||||
const char *tests[] = {
|
const char *tests[] = {
|
||||||
"\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition
|
"\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition
|
||||||
"\\u0E41\\uD834\\uDC00", "<", "\\u0E41\\uD834\\uDC01", // supplementaries
|
"\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
|
||||||
"\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
|
"\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
|
||||||
"\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
|
"\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
|
||||||
"\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
|
"\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
|
||||||
@ -525,12 +526,11 @@ void CollationThaiTest::TestReordering(void) {
|
|||||||
parseChars(rules, rule);
|
parseChars(rules, rule);
|
||||||
RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
|
RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
|
||||||
if(U_SUCCESS(status)) {
|
if(U_SUCCESS(status)) {
|
||||||
//compareArray(*rcoll, testcontraction, 3);
|
compareArray(*rcoll, testcontraction, 3);
|
||||||
delete rcoll;
|
delete rcoll;
|
||||||
} else {
|
} else {
|
||||||
errln("Couldn't instantiate collator from rules");
|
errln("Couldn't instantiate collator from rules");
|
||||||
}
|
}
|
||||||
//genericRulesStarter(rule, test10, 2);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
32895
icu4c/source/test/testdata/riwords.txt
vendored
Normal file
32895
icu4c/source/test/testdata/riwords.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user