ICU-11790 UnicodeString rather than LocalPointer<UnicodeString>; moveFrom() = adoptInstead(orphan())
X-SVN-Rev: 37965
This commit is contained in:
parent
e95ec1d6a3
commit
815ae73855
@ -1138,8 +1138,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// UnicodeString version of input UText, NFKC normalized in necessary.
|
||||
LocalPointer<UnicodeString> inString;
|
||||
// UnicodeString version of input UText, NFKC normalized if necessary.
|
||||
UnicodeString inString;
|
||||
|
||||
// inputMap[inStringIndex] = corresponding native index from UText inText.
|
||||
// If NULL then mapping is 1:1
|
||||
@ -1153,12 +1153,12 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
inText->chunkNativeStart <= rangeStart &&
|
||||
inText->chunkNativeLimit >= rangeEnd &&
|
||||
inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
|
||||
|
||||
// Input UTtxt is in one contiguous UTF-16 chunk.
|
||||
// Use Read-only aliasing UnicodeString constructor on it.
|
||||
inString.adoptInstead(new UnicodeString(FALSE,
|
||||
inText->chunkContents + rangeStart - inText->chunkNativeStart,
|
||||
rangeEnd - rangeStart));
|
||||
|
||||
// Input UText is in one contiguous UTF-16 chunk.
|
||||
// Use Read-only aliasing UnicodeString.
|
||||
inString.setTo(FALSE,
|
||||
inText->chunkContents + rangeStart - inText->chunkNativeStart,
|
||||
rangeEnd - rangeStart);
|
||||
} else {
|
||||
// Copy the text from the original inText (UText) to inString (UnicodeString).
|
||||
// Create a map from UnicodeString indices -> UText offsets.
|
||||
@ -1168,14 +1168,16 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
if (limit > utext_nativeLength(inText)) {
|
||||
limit = utext_nativeLength(inText);
|
||||
}
|
||||
inString.adoptInstead(new UnicodeString);
|
||||
inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
while (utext_getNativeIndex(inText) < limit) {
|
||||
int32_t nativePosition = utext_getNativeIndex(inText);
|
||||
UChar32 c = utext_next32(inText);
|
||||
U_ASSERT(c != U_SENTINEL);
|
||||
inString->append(c);
|
||||
while (inputMap->size() < inString->length()) {
|
||||
inString.append(c);
|
||||
while (inputMap->size() < inString.length()) {
|
||||
inputMap->addElement(nativePosition, status);
|
||||
}
|
||||
}
|
||||
@ -1183,8 +1185,8 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
}
|
||||
|
||||
|
||||
if (!nfkcNorm2->isNormalized(*inString, status)) {
|
||||
LocalPointer<UnicodeString> normalizedInput(new UnicodeString(), status);
|
||||
if (!nfkcNorm2->isNormalized(inString, status)) {
|
||||
UnicodeString normalizedInput;
|
||||
// normalizedMap[normalizedInput position] == original UText position.
|
||||
LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
|
||||
if (U_FAILURE(status)) {
|
||||
@ -1193,55 +1195,60 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
|
||||
UnicodeString fragment;
|
||||
UnicodeString normalizedFragment;
|
||||
for (int32_t srcI = 0; srcI < inString->length();) { // Once per normalization chunk
|
||||
for (int32_t srcI = 0; srcI < inString.length();) { // Once per normalization chunk
|
||||
fragment.remove();
|
||||
int32_t fragmentStartI = srcI;
|
||||
UChar32 c = inString->char32At(srcI);
|
||||
UChar32 c = inString.char32At(srcI);
|
||||
for (;;) {
|
||||
fragment.append(c);
|
||||
srcI = inString->moveIndex32(srcI, 1);
|
||||
if (srcI == inString->length()) {
|
||||
srcI = inString.moveIndex32(srcI, 1);
|
||||
if (srcI == inString.length()) {
|
||||
break;
|
||||
}
|
||||
c = inString->char32At(srcI);
|
||||
c = inString.char32At(srcI);
|
||||
if (nfkcNorm2->hasBoundaryBefore(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
nfkcNorm2->normalize(fragment, normalizedFragment, status);
|
||||
normalizedInput->append(normalizedFragment);
|
||||
normalizedInput.append(normalizedFragment);
|
||||
|
||||
// Map every position in the normalized chunk to the start of the chunk
|
||||
// in the original input.
|
||||
int32_t fragmentOriginalStart = (inputMap.isValid())? inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
|
||||
while (normalizedMap->size() < normalizedInput->length()) {
|
||||
int32_t fragmentOriginalStart = inputMap.isValid() ?
|
||||
inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
|
||||
while (normalizedMap->size() < normalizedInput.length()) {
|
||||
normalizedMap->addElement(fragmentOriginalStart, status);
|
||||
if (U_FAILURE(status)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
U_ASSERT(normalizedMap->size() == normalizedInput->length());
|
||||
int32_t nativeEnd = (inputMap.isValid())? inputMap->elementAti(inString->length()) : inString->length()+rangeStart;
|
||||
U_ASSERT(normalizedMap->size() == normalizedInput.length());
|
||||
int32_t nativeEnd = inputMap.isValid() ?
|
||||
inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
|
||||
normalizedMap->addElement(nativeEnd, status);
|
||||
|
||||
inputMap.adoptInstead(normalizedMap.orphan());
|
||||
inString.adoptInstead(normalizedInput.orphan());
|
||||
inputMap.moveFrom(normalizedMap);
|
||||
inString.moveFrom(normalizedInput);
|
||||
}
|
||||
|
||||
int32_t numCodePts = inString->countChar32();
|
||||
if (numCodePts != inString->length()) {
|
||||
int32_t numCodePts = inString.countChar32();
|
||||
if (numCodePts != inString.length()) {
|
||||
// There are supplementary characters in the input.
|
||||
// The dictionary will produce boundary positions in terms of code point indexes,
|
||||
// not in terms of code unit string indexes.
|
||||
// Use the inputMap mechanism to take care of this in addition to indexing differences
|
||||
// from normalization and/or UTF-8 input.
|
||||
UBool hadExistingMap = (inputMap.isValid());
|
||||
UBool hadExistingMap = inputMap.isValid();
|
||||
if (!hadExistingMap) {
|
||||
inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
|
||||
inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
int32_t cpIdx = 0;
|
||||
for (int32_t cuIdx = 0; ; cuIdx = inString->moveIndex32(cuIdx, 1)) {
|
||||
for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
|
||||
U_ASSERT(cuIdx >= cpIdx);
|
||||
if (hadExistingMap) {
|
||||
inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
|
||||
@ -1249,7 +1256,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
inputMap->addElement(cuIdx+rangeStart, status);
|
||||
}
|
||||
cpIdx++;
|
||||
if (cuIdx == inString->length()) {
|
||||
if (cuIdx == inString.length()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -1278,7 +1285,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
lengths.setSize(numCodePts);
|
||||
|
||||
UText fu = UTEXT_INITIALIZER;
|
||||
utext_openUnicodeString(&fu, inString.getAlias(), &status);
|
||||
utext_openUnicodeString(&fu, &inString, &status);
|
||||
|
||||
// Dynamic programming to find the best segmentation.
|
||||
|
||||
@ -1286,7 +1293,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
// ix is the corresponding string (code unit) index.
|
||||
// They differ when the string contains supplementary characters.
|
||||
int32_t ix = 0;
|
||||
for (int32_t i = 0; i < numCodePts; ++i, ix = inString->moveIndex32(ix, 1)) {
|
||||
for (int32_t i = 0; i < numCodePts; ++i, ix = inString.moveIndex32(ix, 1)) {
|
||||
if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
|
||||
continue;
|
||||
}
|
||||
@ -1304,7 +1311,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
// Exclude Korean characters from this treatment, as they should be left
|
||||
// together by default.
|
||||
if ((count == 0 || lengths.elementAti(0) != 1) &&
|
||||
!fHangulWordSet.contains(inString->char32At(ix))) {
|
||||
!fHangulWordSet.contains(inString.char32At(ix))) {
|
||||
values.setElementAt(maxSnlp, count); // 255
|
||||
lengths.setElementAt(1, count++);
|
||||
}
|
||||
@ -1325,14 +1332,14 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
||||
// specified in the katakanaCost table according to its length.
|
||||
|
||||
bool is_prev_katakana = false;
|
||||
bool is_katakana = isKatakana(inString->char32At(ix));
|
||||
bool is_katakana = isKatakana(inString.char32At(ix));
|
||||
int32_t katakanaRunLength = 1;
|
||||
if (!is_prev_katakana && is_katakana) {
|
||||
int32_t j = inString->moveIndex32(ix, 1);
|
||||
int32_t j = inString.moveIndex32(ix, 1);
|
||||
// Find the end of the continuous run of Katakana characters
|
||||
while (j < inString->length() && katakanaRunLength < kMaxKatakanaGroupLength &&
|
||||
isKatakana(inString->char32At(j))) {
|
||||
j = inString->moveIndex32(j, 1);
|
||||
while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
|
||||
isKatakana(inString.char32At(j))) {
|
||||
j = inString.moveIndex32(j, 1);
|
||||
katakanaRunLength++;
|
||||
}
|
||||
if (katakanaRunLength < kMaxKatakanaGroupLength) {
|
||||
|
Loading…
Reference in New Issue
Block a user