ICU-13549 CjkBreakEngine::divideUpDictionaryRange, problems with supplementary character handling.
X-SVN-Rev: 40949
This commit is contained in:
parent
a0c64552ab
commit
f6fbd54e92
@ -1324,8 +1324,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||||||
}
|
}
|
||||||
if (katakanaRunLength < kMaxKatakanaGroupLength) {
|
if (katakanaRunLength < kMaxKatakanaGroupLength) {
|
||||||
uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
|
uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
|
||||||
if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) {
|
if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
|
||||||
bestSnlp.setElementAt(newSnlp, j);
|
bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
|
||||||
prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
|
prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
22
icu4c/source/test/testdata/rbbitst.txt
vendored
22
icu4c/source/test/testdata/rbbitst.txt
vendored
@ -38,17 +38,8 @@
|
|||||||
|
|
||||||
|
|
||||||
# Temp debugging tests
|
# Temp debugging tests
|
||||||
<locale en>
|
#
|
||||||
<rules>
|
|
||||||
$s0=[;,*];
|
|
||||||
$s1=[a-z];
|
|
||||||
$s2=[i-n];
|
|
||||||
$s3=[x-z];
|
|
||||||
!!forward;
|
|
||||||
($s0 | '?')*
|
|
||||||
($s1 | $s2 | $s3)*;
|
|
||||||
</rules>
|
|
||||||
<data>•hello• •</data>
|
|
||||||
|
|
||||||
## FILTERED BREAK TESTS
|
## FILTERED BREAK TESTS
|
||||||
|
|
||||||
@ -327,6 +318,15 @@ $s3=[x-z];
|
|||||||
<data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
<data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
||||||
<data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
<data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
||||||
|
|
||||||
|
#
|
||||||
|
# Ticket #13549
|
||||||
|
# CjkBreakEngine::divideUpDictionaryRange: assertion failure.
|
||||||
|
#
|
||||||
|
<locale en>
|
||||||
|
<word>
|
||||||
|
<data>•\U00020029<400>\u3300<400>\U0002C400<400></data>
|
||||||
|
<data>•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD•</data>
|
||||||
|
|
||||||
#
|
#
|
||||||
# What Is Unicode in Japanese
|
# What Is Unicode in Japanese
|
||||||
# From http://unicode.org/standard/translations/japanese.html
|
# From http://unicode.org/standard/translations/japanese.html
|
||||||
|
@ -102,7 +102,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
|
boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES ||
|
||||||
Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
|
Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0);
|
||||||
CharacterIterator text;
|
CharacterIterator text;
|
||||||
int numChars = 0;
|
int numCodePts = 0;
|
||||||
if (isNormalized) {
|
if (isNormalized) {
|
||||||
text = new java.text.StringCharacterIterator(prenormstr);
|
text = new java.text.StringCharacterIterator(prenormstr);
|
||||||
int index = 0;
|
int index = 0;
|
||||||
@ -110,8 +110,8 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
while (index < prenormstr.length()) {
|
while (index < prenormstr.length()) {
|
||||||
int codepoint = prenormstr.codePointAt(index);
|
int codepoint = prenormstr.codePointAt(index);
|
||||||
index += Character.charCount(codepoint);
|
index += Character.charCount(codepoint);
|
||||||
numChars++;
|
numCodePts++;
|
||||||
charPositions[numChars] = index;
|
charPositions[numCodePts] = index;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
|
String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC);
|
||||||
@ -122,37 +122,43 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
charPositions[0] = 0;
|
charPositions[0] = 0;
|
||||||
while (index < normalizer.endIndex()) {
|
while (index < normalizer.endIndex()) {
|
||||||
normalizer.next();
|
normalizer.next();
|
||||||
numChars++;
|
numCodePts++;
|
||||||
index = normalizer.getIndex();
|
index = normalizer.getIndex();
|
||||||
charPositions[numChars] = index;
|
charPositions[numCodePts] = index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// From here on out, do the algorithm. Note that our indices
|
// From here on out, do the algorithm. Note that our indices
|
||||||
// refer to indices within the normalized string.
|
// refer to indices within the normalized string.
|
||||||
int[] bestSnlp = new int[numChars + 1];
|
int[] bestSnlp = new int[numCodePts + 1];
|
||||||
bestSnlp[0] = 0;
|
bestSnlp[0] = 0;
|
||||||
for (int i = 1; i <= numChars; i++) {
|
for (int i = 1; i <= numCodePts; i++) {
|
||||||
bestSnlp[i] = kint32max;
|
bestSnlp[i] = kint32max;
|
||||||
}
|
}
|
||||||
|
|
||||||
int[] prev = new int[numChars + 1];
|
int[] prev = new int[numCodePts + 1];
|
||||||
for (int i = 0; i <= numChars; i++) {
|
for (int i = 0; i <= numCodePts; i++) {
|
||||||
prev[i] = -1;
|
prev[i] = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
final int maxWordSize = 20;
|
final int maxWordSize = 20;
|
||||||
int values[] = new int[numChars];
|
int values[] = new int[numCodePts];
|
||||||
int lengths[] = new int[numChars];
|
int lengths[] = new int[numCodePts];
|
||||||
// dynamic programming to find the best segmentation
|
// dynamic programming to find the best segmentation
|
||||||
|
|
||||||
|
// In outer loop, i is the code point index,
|
||||||
|
// ix is the corresponding code unit index.
|
||||||
|
// They differ when the string contains supplementary characters.
|
||||||
|
int ix = 0;
|
||||||
|
text.setIndex(ix);
|
||||||
boolean is_prev_katakana = false;
|
boolean is_prev_katakana = false;
|
||||||
for (int i = 0; i < numChars; i++) {
|
for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) {
|
||||||
text.setIndex(i);
|
ix = text.getIndex();
|
||||||
if (bestSnlp[i] == kint32max) {
|
if (bestSnlp[i] == kint32max) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i);
|
int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i);
|
||||||
int[] count_ = new int[1];
|
int[] count_ = new int[1];
|
||||||
fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
|
fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values);
|
||||||
int count = count_[0];
|
int count = count_[0];
|
||||||
@ -162,7 +168,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
// with the highest value possible (i.e. the least likely to occur).
|
// with the highest value possible (i.e. the least likely to occur).
|
||||||
// Exclude Korean characters from this treatment, as they should be
|
// Exclude Korean characters from this treatment, as they should be
|
||||||
// left together by default.
|
// left together by default.
|
||||||
text.setIndex(i); // fDictionary.matches() advances the text position; undo that.
|
text.setIndex(ix); // fDictionary.matches() advances the text position; undo that.
|
||||||
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
|
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) {
|
||||||
values[count] = maxSnlp;
|
values[count] = maxSnlp;
|
||||||
lengths[count] = 1;
|
lengths[count] = 1;
|
||||||
@ -186,7 +192,7 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
if (!is_prev_katakana && is_katakana) {
|
if (!is_prev_katakana && is_katakana) {
|
||||||
int j = i + 1;
|
int j = i + 1;
|
||||||
next32(text);
|
next32(text);
|
||||||
while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
|
while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) {
|
||||||
next32(text);
|
next32(text);
|
||||||
++j;
|
++j;
|
||||||
}
|
}
|
||||||
@ -202,13 +208,13 @@ class CjkBreakEngine extends DictionaryBreakEngine {
|
|||||||
is_prev_katakana = is_katakana;
|
is_prev_katakana = is_katakana;
|
||||||
}
|
}
|
||||||
|
|
||||||
int t_boundary[] = new int[numChars + 1];
|
int t_boundary[] = new int[numCodePts + 1];
|
||||||
int numBreaks = 0;
|
int numBreaks = 0;
|
||||||
if (bestSnlp[numChars] == kint32max) {
|
if (bestSnlp[numCodePts] == kint32max) {
|
||||||
t_boundary[numBreaks] = numChars;
|
t_boundary[numBreaks] = numCodePts;
|
||||||
numBreaks++;
|
numBreaks++;
|
||||||
} else {
|
} else {
|
||||||
for (int i = numChars; i > 0; i = prev[i]) {
|
for (int i = numCodePts; i > 0; i = prev[i]) {
|
||||||
t_boundary[numBreaks] = i;
|
t_boundary[numBreaks] = i;
|
||||||
numBreaks++;
|
numBreaks++;
|
||||||
}
|
}
|
||||||
|
@ -38,19 +38,8 @@
|
|||||||
|
|
||||||
|
|
||||||
# Temp debugging tests
|
# Temp debugging tests
|
||||||
<locale en>
|
#
|
||||||
<word>
|
|
||||||
<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
|
||||||
コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\
|
|
||||||
よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\
|
|
||||||
何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\
|
|
||||||
んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\
|
|
||||||
すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\
|
|
||||||
が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\
|
|
||||||
の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\
|
|
||||||
。<0></data>
|
|
||||||
|
|
||||||
#<data><0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\
|
|
||||||
|
|
||||||
## FILTERED BREAK TESTS
|
## FILTERED BREAK TESTS
|
||||||
|
|
||||||
@ -329,6 +318,15 @@
|
|||||||
<data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
<data>•ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
||||||
<data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
<data>•\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400></data>
|
||||||
|
|
||||||
|
#
|
||||||
|
# Ticket #13549
|
||||||
|
# CjkBreakEngine::divideUpDictionaryRange: assertion failure.
|
||||||
|
#
|
||||||
|
<locale en>
|
||||||
|
<word>
|
||||||
|
<data>•\U00020029<400>\u3300<400>\U0002C400<400></data>
|
||||||
|
<data>•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD•</data>
|
||||||
|
|
||||||
#
|
#
|
||||||
# What Is Unicode in Japanese
|
# What Is Unicode in Japanese
|
||||||
# From http://unicode.org/standard/translations/japanese.html
|
# From http://unicode.org/standard/translations/japanese.html
|
||||||
|
Loading…
Reference in New Issue
Block a user