ICU-3109 Port of the Thai collation bug fix

X-SVN-Rev: 12657
This commit is contained in:
Syn Wee Quek 2003-07-22 17:52:14 +00:00
parent 68ef0bde89
commit 9372fb574c
2 changed files with 132 additions and 7 deletions

View File

@ -20,6 +20,8 @@ package com.ibm.icu.dev.test.collator;
import com.ibm.icu.dev.test.*;
import com.ibm.icu.text.*;
import java.util.Locale;
import java.util.Comparator;
import java.util.Arrays;
import java.io.*;
public class CollationThaiTest extends TestFmwk {
@ -75,7 +77,7 @@ public class CollationThaiTest extends TestFmwk {
Collator coll = null;
try {
coll = Collator.getInstance(new Locale("th", "TH", ""));
coll = getThaiCollator();
} catch (Exception e) {
errln("Error: could not construct Thai collator");
return;
@ -152,7 +154,7 @@ public class CollationThaiTest extends TestFmwk {
public void TestDictionary() {
Collator coll = null;
try {
coll = Collator.getInstance(new Locale("th", "TH", ""));
coll = getThaiCollator();
} catch (Exception e) {
errln("Error: could not construct Thai collator");
return;
@ -252,6 +254,43 @@ public class CollationThaiTest extends TestFmwk {
logln("Words checked: " + wordCount);
}
/**
 * Verifies that the Thai collator orders a set of unusual Thai strings
 * consistently.
 * The six samples are all the arrangements of two U+0E44 and two U+0E01
 * characters. They are sorted with a StrCmp comparator, then every pair
 * (earlier, later) of the sorted array is re-compared with the collator;
 * a result greater than zero is reported as an error. Each sample is also
 * run through CollationIteratorTest.backAndForth with a collation element
 * iterator created for it.
 */
public void TestInvalidThai()
{
    String[] samples = {
        "\u0E44\u0E01\u0E44\u0E01",
        "\u0E44\u0E01\u0E01\u0E44",
        "\u0E01\u0E44\u0E01\u0E44",
        "\u0E01\u0E01\u0E44\u0E44",
        "\u0E44\u0E44\u0E01\u0E01",
        "\u0E01\u0E44\u0E44\u0E01",
    };

    RuleBasedCollator thai;
    StrCmp ordering;
    try {
        thai = getThaiCollator();
        ordering = new StrCmp();
    } catch (Exception e) {
        errln("Error: could not construct Thai collator");
        return;
    }

    Arrays.sort(samples, ordering);

    int count = samples.length;
    for (int low = 0; low < count; low++) {
        String earlier = samples[low];
        // after sorting, no later element may compare less than an earlier one
        for (int high = low + 1; high < count; high++) {
            boolean outOfOrder = thai.compare(earlier, samples[high]) > 0;
            if (outOfOrder) {
                errln("Inconsistent ordering between strings " + low
                      + " and " + high);
            }
        }
        // walk the collation elements of this sample in both directions
        CollationIteratorTest.backAndForth(
            this, thai.getCollationElementIterator(earlier));
    }
}
// The three bytes 0xEF 0xBB 0xBF, i.e. the UTF-8 encoding of the byte
// order mark U+FEFF.
private static final byte BOM[] = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
// Per-instance scratch buffer with the same length as BOM. Its use is not
// visible in this excerpt -- presumably it holds leading file bytes that
// are compared against BOM; confirm against the reader code.
private byte savedBytes[]= new byte[BOM.length];
@ -343,4 +382,36 @@ public class CollationThaiTest extends TestFmwk {
target += "]";
return target;
}
// private inner class -------------------------------------------------
/**
 * Comparator that delegates every comparison to the Thai collator
 * returned by getThaiCollator(), so that it can be handed to
 * Arrays.sort.
 */
private static final class StrCmp implements Comparator
{
    /** Collator that performs the actual comparison. */
    Collator collator;

    /**
     * Fetches the Thai collator used for all comparisons.
     * @exception Exception propagated from getThaiCollator()
     */
    StrCmp() throws Exception
    {
        this.collator = getThaiCollator();
    }

    /**
     * Compares the two arguments with the Thai collator.
     * @param string1 first object to compare
     * @param string2 second object to compare
     * @return the value returned by the collator's compare
     */
    public int compare(Object string1, Object string2)
    {
        int order = this.collator.compare(string1, string2);
        return order;
    }
}
// private data members ------------------------------------------------
// Cached Thai collator; remains null until getThaiCollator() creates it on
// first use, after which the same instance is returned to every caller.
private static RuleBasedCollator m_collator_;
// private methods -----------------------------------------------------
/**
 * Returns the Thai collator, creating and caching it in m_collator_
 * the first time it is requested.
 * @return the cached collator for locale ("th", "TH", "")
 * @exception Exception declared so callers can report a failed
 *            construction
 */
private static RuleBasedCollator getThaiCollator() throws Exception
{
    if (m_collator_ != null) {
        // already built by an earlier call
        return m_collator_;
    }
    Locale thai = new Locale("th", "TH", "");
    m_collator_ = (RuleBasedCollator) Collator.getInstance(thai);
    return m_collator_;
}
}

View File

@ -355,15 +355,41 @@ public final class CollationElementIterator
}
}
else {
if (m_bufferOffset_ < 0 && m_source_.getIndex() != 0) {
if (m_bufferOffset_ < 0 && m_source_.getIndex() != 0
&& isThaiPreVowel(peekCharacter(-1))) {
// we now rearrange unconditionally
if (isThaiPreVowel(m_source_.previous())) {
backupInternalState(m_utilSpecialBackUp_);
// we have to check if the previous character is also Thai
// if not, we can just set the result
// we have already determined that the normalization
// buffer is empty
m_source_.previous();
if (m_source_.getIndex() == 0
|| !isThaiPreVowel(peekCharacter(-1))) {
result = CE_THAI_;
}
else {
result = m_collator_.m_trie_.getLeadValue(ch);
else {
// previous is also reordered
// we need to go back as long as they are being
// reordered
// count over the range of reorderable characters
// and see
// if there is an even or odd number of them
// if even, we should not reorder.
// If odd we should reorder.
int noReordered = 1; // the one we already detected
while (m_source_.getIndex() != 0
&& isThaiPreVowel(m_source_.previous())) {
noReordered ++;
}
if ((noReordered & 1) != 0) {
// odd number of reorderables
result = CE_THAI_;
} else {
result = m_collator_.m_trie_.getLeadValue(ch);
}
}
m_source_.next();
updateInternalState(m_utilSpecialBackUp_);
}
else {
result = m_collator_.m_trie_.getLeadValue(ch);
@ -1862,6 +1888,12 @@ public final class CollationElementIterator
collator.m_expansion_[++ offset];
}
}
// in case of one element expansion, we
// want to immediately return CEpos
if (m_CEBufferSize_ == 1) {
m_CEBufferSize_ = 0;
m_CEBufferOffset_ = 0;
}
return m_CEBuffer_[0];
}
@ -2526,4 +2558,26 @@ public final class CollationElementIterator
}
return cp + NON_CJK_OFFSET_; // non-CJK
}
/**
 * Reads the character located <code>offset</code> positions away from the
 * current index of the source iterator and leaves the iterator's index
 * where it was.
 * No bounds checking is done here; the caller must make sure that
 * current index + offset is a valid position.
 * @param offset distance from the current position of the character to
 *        read; 0 reads the current character
 * @return character at current position + offset
 */
private char peekCharacter(int offset)
{
    if (offset == 0) {
        // nothing to move, just read in place
        return m_source_.current();
    }
    int savedIndex = m_source_.getIndex();
    m_source_.setIndex(savedIndex + offset);
    char peeked = m_source_.current();
    // put the iterator back where the caller left it
    m_source_.setIndex(savedIndex);
    return peeked;
}
}