diff --git a/icu4c/source/common/dictbe.cpp b/icu4c/source/common/dictbe.cpp index 18fa188a7e..dde8072966 100644 --- a/icu4c/source/common/dictbe.cpp +++ b/icu4c/source/common/dictbe.cpp @@ -1324,8 +1324,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText, } if (katakanaRunLength < kMaxKatakanaGroupLength) { uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength); - if (newSnlp < (uint32_t)bestSnlp.elementAti(j)) { - bestSnlp.setElementAt(newSnlp, j); + if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) { + bestSnlp.setElementAt(newSnlp, i+katakanaRunLength); prev.setElementAt(i, i+katakanaRunLength); // prev[j] = i; } } diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 0d4c1633b6..761b3e01b5 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -38,17 +38,8 @@ # Temp debugging tests - - -$s0=[;,*]; -$s1=[a-z]; -$s2=[i-n]; -$s3=[x-z]; -!!forward; -($s0 | '?')* -($s1 | $s2 | $s3)*; - -•hello• • +# + ## FILTERED BREAK TESTS @@ -327,6 +318,15 @@ $s3=[x-z]; •ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400> •\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400> +# +# Ticket #13549 +# CjkBreakEngine::divideUpDictionaryRange: assertion failure. 
+# + + +•\U00020029<400>\u3300<400>\U0002C400<400> +•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD• + # # What Is Unicode in Japanese # From http://unicode.org/standard/translations/japanese.html diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java index b2c4c61b7f..0e21779bdd 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CjkBreakEngine.java @@ -102,7 +102,7 @@ class CjkBreakEngine extends DictionaryBreakEngine { boolean isNormalized = Normalizer.quickCheck(prenormstr, Normalizer.NFKC) == Normalizer.YES || Normalizer.isNormalized(prenormstr, Normalizer.NFKC, 0); CharacterIterator text; - int numChars = 0; + int numCodePts = 0; if (isNormalized) { text = new java.text.StringCharacterIterator(prenormstr); int index = 0; @@ -110,8 +110,8 @@ class CjkBreakEngine extends DictionaryBreakEngine { while (index < prenormstr.length()) { int codepoint = prenormstr.codePointAt(index); index += Character.charCount(codepoint); - numChars++; - charPositions[numChars] = index; + numCodePts++; + charPositions[numCodePts] = index; } } else { String normStr = Normalizer.normalize(prenormstr, Normalizer.NFKC); @@ -122,37 +122,43 @@ class CjkBreakEngine extends DictionaryBreakEngine { charPositions[0] = 0; while (index < normalizer.endIndex()) { normalizer.next(); - numChars++; + numCodePts++; index = normalizer.getIndex(); - charPositions[numChars] = index; + charPositions[numCodePts] = index; } } // From here on out, do the algorithm. Note that our indices // refer to indices within the normalized string. 
- int[] bestSnlp = new int[numChars + 1]; + int[] bestSnlp = new int[numCodePts + 1]; bestSnlp[0] = 0; - for (int i = 1; i <= numChars; i++) { + for (int i = 1; i <= numCodePts; i++) { bestSnlp[i] = kint32max; } - int[] prev = new int[numChars + 1]; - for (int i = 0; i <= numChars; i++) { + int[] prev = new int[numCodePts + 1]; + for (int i = 0; i <= numCodePts; i++) { prev[i] = -1; } final int maxWordSize = 20; - int values[] = new int[numChars]; - int lengths[] = new int[numChars]; + int values[] = new int[numCodePts]; + int lengths[] = new int[numCodePts]; // dynamic programming to find the best segmentation + + // In outer loop, i is the code point index, + // ix is the corresponding code unit index. + // They differ when the string contains supplementary characters. + int ix = 0; + text.setIndex(ix); boolean is_prev_katakana = false; - for (int i = 0; i < numChars; i++) { - text.setIndex(i); + for (int i = 0; i < numCodePts; i++, text.setIndex(ix), next32(text)) { + ix = text.getIndex(); if (bestSnlp[i] == kint32max) { continue; } - int maxSearchLength = (i + maxWordSize < numChars) ? maxWordSize : (numChars - i); + int maxSearchLength = (i + maxWordSize < numCodePts) ? maxWordSize : (numCodePts - i); int[] count_ = new int[1]; fDictionary.matches(text, maxSearchLength, lengths, count_, maxSearchLength, values); int count = count_[0]; @@ -162,7 +168,7 @@ class CjkBreakEngine extends DictionaryBreakEngine { // with the highest value possible (i.e. the least likely to occur). // Exclude Korean characters from this treatment, as they should be // left together by default. - text.setIndex(i); // fDictionary.matches() advances the text position; undo that. + text.setIndex(ix); // fDictionary.matches() advances the text position; undo that. 
if ((count == 0 || lengths[0] != 1) && current32(text) != DONE32 && !fHangulWordSet.contains(current32(text))) { values[count] = maxSnlp; lengths[count] = 1; @@ -186,7 +192,7 @@ class CjkBreakEngine extends DictionaryBreakEngine { if (!is_prev_katakana && is_katakana) { int j = i + 1; next32(text); - while (j < numChars && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { + while (j < numCodePts && (j - i) < kMaxKatakanaGroupLength && isKatakana(current32(text))) { next32(text); ++j; } @@ -202,13 +208,13 @@ class CjkBreakEngine extends DictionaryBreakEngine { is_prev_katakana = is_katakana; } - int t_boundary[] = new int[numChars + 1]; + int t_boundary[] = new int[numCodePts + 1]; int numBreaks = 0; - if (bestSnlp[numChars] == kint32max) { - t_boundary[numBreaks] = numChars; + if (bestSnlp[numCodePts] == kint32max) { + t_boundary[numBreaks] = numCodePts; numBreaks++; } else { - for (int i = numChars; i > 0; i = prev[i]) { + for (int i = numCodePts; i > 0; i = prev[i]) { t_boundary[numBreaks] = i; numBreaks++; } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt index 1450a98d7b..761b3e01b5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt @@ -38,19 +38,8 @@ # Temp debugging tests - - -<0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\ -コンピューター<400>は<400>、<0>文字<400>や<400>記号<400>など<400>の<400>それぞれに<400>番号<400>を<400>割り振る<400>こと<400>によって<400>扱える<400>\ -よう<400>にし<400>ます<400>。<0>ユニ<400>コード<400>が<400>出来る<400>まで<400>は<400>、<0>これらの<400>番号<400>を<400>割り振る<400>仕組み<400>が<400>\ -何<400>百<400>種類<400>も<400>存在<400>しま<400>した<400>。<0>どの<400>一つ<400>を<400>とっても<400>、<0>十分<400>な<400>文字<400>を<400>含<400>\ -んで<400>は<400>いま<400>せん<400>で<400>した<400>。<0>例えば<400>、<0>欧州<400>連合<400>一つ<400>を<400>見<400>て<400>も<400>、<0>その<400>\ 
-すべて<400>の<400>言語<400>を<400>カバー<400>する<400>ため<400>に<400>は<400>、<0>いくつか<400>の<400>異なる<400>符号<400>化<400>の<400>仕組み<400>\ -が<400>必要<400>で<400>した<400>。<0>英語<400>の<400>よう<400>な<400>一つ<400>の<400>言語<400>に<400>限<400>って<400>も<400>、<0>一つ<400>だけ<400>\ -の<400>符号<400>化<400>の<400>仕組み<400>では<400>、<0>一般<400>的<400>に<400>使<400>われる<400>すべて<400>の<400>文字<400>、<0>句読点<400>、<0>\ -。<0> +# -#<0>コンピューター<400>は<400>、<0>本質<400>的<400>に<400>は<400>数字<400>しか<400>扱う<400>こと<400>が<400>でき<400>ま<400>せん<400>。<0>\ ## FILTERED BREAK TESTS @@ -329,6 +318,15 @@ •ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400> •\U00011700<200>ロ<400>から<400>売却<400>完了<400>時<400>の<400>時価<400>が<400>提示<400>さ<400>れ<400>て<400>いる<400> +# +# Ticket #13549 +# CjkBreakEngine::divideUpDictionaryRange: assertion failure. +# + + +•\U00020029<400>\u3300<400>\U0002C400<400> +•\uFAD7<400>\u331B<400>\u87DF<400>\u006D<200>\uFFFD• + # # What Is Unicode in Japanese # From http://unicode.org/standard/translations/japanese.html