ICU-3140 updated Thai tailoring, Fractional UCA, test and test file
X-SVN-Rev: 12711
This commit is contained in:
parent
65b66ed697
commit
c2a77c0fda
@ -13,143 +13,48 @@ th {
|
||||
// First put in all of the consonants, after Z
|
||||
//
|
||||
CollationElements {
|
||||
Version { "2.0" }
|
||||
Sequence { "[normalization on]& Z "
|
||||
"<\u0E01" // KO KAI
|
||||
"<\u0E02" // KHO KHAI
|
||||
"<\u0E03" // KHO KHUAT
|
||||
"<\u0E04" // KHO KHWAI
|
||||
"<\u0E05" // KHO KHON
|
||||
"<\u0E06" // KHO RAKHANG
|
||||
"<\u0E07" // NGO NGU
|
||||
"<\u0E08" // CHO CHAN
|
||||
"<\u0E09" // CHO CHING
|
||||
"<\u0E0A" // CHO CHANG
|
||||
"<\u0E0B" // SO SO
|
||||
"<\u0E0C" // CHO CHOE
|
||||
"<\u0E0D" // YO YING
|
||||
"<\u0E0E" // DO CHADA
|
||||
"<\u0E0F" // TO PATAK
|
||||
"<\u0E10" // THO THAN
|
||||
"<\u0E11" // THO NANGMONTHO
|
||||
"<\u0E12" // THO PHUTHAO
|
||||
"<\u0E13" // NO NEN
|
||||
"<\u0E14" // DO DEK
|
||||
"<\u0E15" // TO TAO
|
||||
"<\u0E16" // THO THUNG
|
||||
"<\u0E17" // THO THAHAN
|
||||
"<\u0E18" // THO THONG
|
||||
"<\u0E19" // NO NU
|
||||
"<\u0E1A" // BO BAIMAI
|
||||
"<\u0E1B" // PO PLA
|
||||
"<\u0E1C" // PHO PHUNG
|
||||
"<\u0E1D" // FO FA
|
||||
"<\u0E1E" // PHO PHAN
|
||||
"<\u0E1F" // FO FAN
|
||||
"<\u0E20" // PHO SAMPHAO
|
||||
"<\u0E21" // MO MA
|
||||
"<\u0E22" // YO YAK
|
||||
"<\u0E23" // RO RUA
|
||||
"<\u0E24" // RU
|
||||
"<\u0E24\u0E45" // See the comment below on LAKKHANGYAO
|
||||
"<\u0E25" // LO LING
|
||||
"<\u0E26" // LU
|
||||
"<\u0E26\u0E45" // See the comment below on LAKKHANGYAO
|
||||
"<\u0E27" // WO WAEN
|
||||
"<\u0E28" // SO SALA
|
||||
"<\u0E29" // SO RUSI
|
||||
"<\u0E2A" // SO SUA
|
||||
"<\u0E2B" // HO HIP
|
||||
"<\u0E2C" // LO CHULA
|
||||
"<\u0E2D" // O ANG
|
||||
"<\u0E2E" // HO NOKHUK
|
||||
Version { "3.0" }
|
||||
Sequence {
|
||||
// Tailoring of UCA for Thai Royal Institute Dictionary Sort, B.E. 2525
|
||||
"[normalization on]" // needed because Thai uses multiple accents
|
||||
|
||||
//
|
||||
// Normal vowels
|
||||
//
|
||||
"<\u0E30" // SARA A
|
||||
"<\u0E31" // MAI HAN-AKAT
|
||||
"<\u0E32" // SARA AA
|
||||
// put Ru with Lakkhangyao after Ru and put Lu with Lakkhangyao after Lu
|
||||
// see the comment below on Lakkhangyao
|
||||
"& \u0e24" // U+0E24 THAI CHARACTER RU
|
||||
"< \u0e24\u0e45" // U+0E24 THAI CHARACTER RU U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||
"& \u0e26" // U+0E26 THAI CHARACTER LU
|
||||
"< \u0e26\u0e45" // U+0E26 THAI CHARACTER LU U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||
|
||||
// Normalizer will decompose this character to \u0e4d\u0e32.
|
||||
// This is a Bad Thing, because we want the separate
|
||||
// characters to sort differently than this individual one.
|
||||
// Since there's no public way to set the decomposition to be
|
||||
// used when creating a collator, there's no way around this
|
||||
// right now. It's best to go ahead and leave the character
|
||||
// in, because it occurs this way a lot more often than it
|
||||
// occurs as separate characters.
|
||||
"<\u0E33" // SARA AM
|
||||
// put Lakkhangyao after Sara Ai Maimalai
|
||||
// this rare symbol also comes after all characters. But when it is used in combination
|
||||
// with Ru and Lu, the combination is treated as a separate letter, ala CH sorting after
|
||||
// C in the traditional Spanish.
|
||||
"& \u0e44" // U+0E44 THAI CHARACTER SARA AI MAIMALAI
|
||||
"< \u0e45" // U+0E45 THAI CHARACTER LAKKHANGYAO
|
||||
|
||||
"<\u0E34" // SARA I
|
||||
// put Yamakkan just before Maitaikhu. It will behave like an accent (primary ignorable)
|
||||
"& [before 2] \u0E47" // U+0E47 THAI CHARACTER MAITAIKHU
|
||||
"<< \u0E4E" // U+0E4E THAI CHARACTER YAMAKKAN
|
||||
|
||||
"<\u0E35" // SARA II
|
||||
"<\u0E36" // SARA UE
|
||||
"<\u0E37" // SARA UEE
|
||||
"<\u0E38" // SARA U
|
||||
"<\u0E39" // SARA UU
|
||||
// put Thantakat and Nikhahit just after Mai Chattawa. They will behave like accents (primary ignorable)
|
||||
"& \u0E4B" // U+0E4B THAI CHARACTER MAI CHATTAWA
|
||||
"<< \u0E4C" // U+0E4C THAI CHARACTER THANTAKAT
|
||||
"<< \u0E4D" // U+0E4D THAI CHARACTER NIKHAHIT
|
||||
|
||||
//
|
||||
// Preceding vowels
|
||||
//
|
||||
"<\u0E40" // SARA E
|
||||
"<\u0E41" // SARA AE
|
||||
"<\u0E42" // SARA O
|
||||
"<\u0E43" // SARA AI MAIMUAN
|
||||
"<\u0E44" // SARA AI MAIMALAI
|
||||
// make punctuation and Paiyannoi...Khomut secondary ignorable. This will make them sort after the same
|
||||
// strings that don't contain them.
|
||||
|
||||
//
|
||||
// Digits
|
||||
//
|
||||
"<\u0E50" // DIGIT ZERO
|
||||
"<\u0E51" // DIGIT ONE
|
||||
"<\u0E52" // DIGIT TWO
|
||||
"<\u0E53" // DIGIT THREE
|
||||
"<\u0E54" // DIGIT FOUR
|
||||
"<\u0E55" // DIGIT FIVE
|
||||
"<\u0E56" // DIGIT SIX
|
||||
"<\u0E57" // DIGIT SEVEN
|
||||
"<\u0E58" // DIGIT EIGHT
|
||||
"<\u0E59" // DIGIT NINE
|
||||
"& [last secondary ignorable]"
|
||||
"<<< ' '" // Space
|
||||
"<<< '-'" // Hyphen
|
||||
"<<< '.'" // Full stop
|
||||
"<<< '...'" // Ellipsis
|
||||
"<<< \u0E2F" // U+0E2F THAI CHARACTER PAIYANNOI (abbreviation mark)
|
||||
"<<< \u0E46" // U+0E46 THAI CHARACTER MAIYAMOK (repetition mark)
|
||||
"<<< \u0E4F" // U+0E4F THAI CHARACTER FONGMAN (ancient symbol used as bullet mark)
|
||||
"<<< \u0E5A" // U+0E5A THAI CHARACTER ANGKHANKHU (ancient symbol used to mark end of section or episode)
|
||||
"<<< \u0E5B" // U+0E5B THAI CHARACTER KHOMUT (ancient symbol used to mark end of story)
|
||||
|
||||
// Sorta tonal marks, but maybe not really
|
||||
"<\u0E4D" // NIKHAHIT
|
||||
|
||||
// Thai symbols are supposed to sort "after white space". I'm
|
||||
// treating this as making them sort just after the normal
|
||||
// Latin-1 symbols, which are in turn after the white space.
|
||||
"&'\u007d'" // right-brace
|
||||
"<\u0E2F" // PAIYANNOI (ellipsis, abbreviation)
|
||||
"<\u0E46" // MAIYAMOK
|
||||
"<\u0E4F" // FONGMAN
|
||||
"<\u0E5A" // ANGKHANKHU
|
||||
"<\u0E5B" // KHOMUT
|
||||
"<\u0E3F" // CURRENCY SYMBOL BAHT
|
||||
|
||||
// These symbols are supposed to be "after all characters"
|
||||
"<\u0E4E" // YAMAKKAN
|
||||
|
||||
// This rare symbol also comes after all characters. But when it is
|
||||
// used in combination with RU and LU, the combination is treated as
|
||||
// a separate letter, ala "CH" sorting after "C" in traditional Spanish.
|
||||
// see above.
|
||||
"<\u0E45" // LAKKHANGYAO
|
||||
|
||||
// Tonal marks are primary ignorables but are treated as secondary
|
||||
// differences
|
||||
"&\u0301" // acute accent
|
||||
"<<\u0E47" // MAITAIKHU
|
||||
"<<\u0E48" // MAI EK
|
||||
"<<\u0E49" // MAI THO
|
||||
"<<\u0E4A" // MAI TRI
|
||||
"<<\u0E4B" // MAI CHATTAWA
|
||||
"<<\u0E4C" // THANTHAKHAT
|
||||
|
||||
|
||||
// These are supposed to be ignored, so I'm treating them as controls
|
||||
"&\u0001 "
|
||||
"=\u0E3A" // PHINTHU
|
||||
"='.'" // period
|
||||
}
|
||||
}
|
||||
Countries {
|
||||
|
@ -17220,12 +17220,13 @@ FDD0 0089; [, E1 81, 05]
|
||||
FDD0 008A; [, E1 91, 05]
|
||||
FDD0 008B; [, E1 A1, 05]
|
||||
FDD0 008C; [, E1 B1, 05]
|
||||
FDD0 008D; [,, 3E]
|
||||
|
||||
# VALUES BASED ON UCA
|
||||
[first tertiary ignorable [,,]]
|
||||
[last tertiary ignorable [,,]]
|
||||
[first secondary ignorable [,, 05]]
|
||||
[last secondary ignorable [,, 05]]
|
||||
[first secondary ignorable [,, 3E]]
|
||||
[last secondary ignorable [,, 3E]]
|
||||
[first primary ignorable [, 87, 05]]
|
||||
[last primary ignorable [, E1 B1, 05]]
|
||||
[first variable [05 07, 05, 05]]
|
||||
|
@ -24,14 +24,15 @@
|
||||
* The TestDictionary test expects a file of this name, with this
|
||||
* encoding, to be present in the directory $ICU/source/test/testdata.
|
||||
*/
|
||||
#define TEST_FILE "th18057.txt"
|
||||
//#define TEST_FILE "th18057.txt"
|
||||
#define TEST_FILE "riwords.txt"
|
||||
#define TEST_FILE_ENCODING "UTF8"
|
||||
|
||||
/**
|
||||
* This is the most failures we show in TestDictionary. If this number
|
||||
* is < 0, we show all failures.
|
||||
*/
|
||||
#define MAX_FAILURES_TO_SHOW 8
|
||||
#define MAX_FAILURES_TO_SHOW -1
|
||||
|
||||
#define CASE(id,test) \
|
||||
case id: \
|
||||
@ -47,7 +48,7 @@ CollationThaiTest::CollationThaiTest() {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
coll = Collator::createInstance(Locale("th", "TH", ""), status);
|
||||
if (coll && U_SUCCESS(status)) {
|
||||
coll->setStrength(Collator::TERTIARY);
|
||||
//coll->setStrength(Collator::TERTIARY);
|
||||
} else {
|
||||
delete coll;
|
||||
coll = 0;
|
||||
@ -87,7 +88,7 @@ static UBool readLine(FileStream *in, UnicodeString& line, const char* encoding)
|
||||
if (T_FileStream_eof(in)) {
|
||||
return FALSE;
|
||||
}
|
||||
char buffer[128];
|
||||
char buffer[1024];
|
||||
char* p = buffer;
|
||||
char* limit = p + sizeof(buffer) - 1; // Leave space for 0
|
||||
while (p<limit) {
|
||||
@ -204,7 +205,7 @@ void CollationThaiTest::TestDictionary(void) {
|
||||
|
||||
FileStream *in = T_FileStream_open(buffer, "rb");
|
||||
if (in == 0) {
|
||||
errln((UnicodeString)"Error: could not open test file " + buffer);
|
||||
infoln((UnicodeString)"INFO: could not open test file " + buffer + ". Aborting test.");
|
||||
return;
|
||||
}
|
||||
|
||||
@ -488,7 +489,7 @@ void CollationThaiTest::TestInvalidThai(void) {
|
||||
void CollationThaiTest::TestReordering(void) {
|
||||
const char *tests[] = {
|
||||
"\\u0E41c\\u0301", "=", "\\u0E41\\u0107", // composition
|
||||
"\\u0E41\\uD834\\uDC00", "<", "\\u0E41\\uD834\\uDC01", // supplementaries
|
||||
"\\u0E41\\uD835\\uDFCE", "<", "\\u0E41\\uD835\\uDFCF", // supplementaries
|
||||
"\\u0E41\\uD834\\uDD5F", "=", "\\u0E41\\uD834\\uDD58\\uD834\\uDD65", // supplementary composition decomps to supplementary
|
||||
"\\u0E41\\uD87E\\uDC02", "=", "\\u0E41\\u4E41", // supplementary composition decomps to BMP
|
||||
"\\u0E41\\u0301", "=", "\\u0E41\\u0301", // unsafe (just checking backwards iteration)
|
||||
@ -525,12 +526,11 @@ void CollationThaiTest::TestReordering(void) {
|
||||
parseChars(rules, rule);
|
||||
RuleBasedCollator *rcoll = new RuleBasedCollator(rules, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
//compareArray(*rcoll, testcontraction, 3);
|
||||
compareArray(*rcoll, testcontraction, 3);
|
||||
delete rcoll;
|
||||
} else {
|
||||
errln("Couldn't instantiate collator from rules");
|
||||
}
|
||||
//genericRulesStarter(rule, test10, 2);
|
||||
|
||||
}
|
||||
|
||||
|
32895
icu4c/source/test/testdata/riwords.txt
vendored
Normal file
32895
icu4c/source/test/testdata/riwords.txt
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user