ICU-12410 Edits::Iterator getters not fields, tracks indexes, findSourceIndex(), skip-unchanged iterators, ucasemap_toLowerWithEdits() & ucasemap_toUpperWithEdits()

X-SVN-Rev: 39548
This commit is contained in:
Markus Scherer 2017-01-06 00:20:31 +00:00
parent 20994f490a
commit 00f2e12b65
6 changed files with 368 additions and 115 deletions

View File

@ -197,53 +197,113 @@ public:
UBool next(UErrorCode &errorCode);
/**
* TRUE if this edit replaces oldLength units with newLength different ones.
* FALSE if oldLength units remain unchanged.
* Finds the edit that contains the source index.
* The source index may be found in a non-change
* even if normal iteration would skip non-changes.
* Normal iteration can continue from a found edit.
*
* The iterator state before this search logically does not matter.
* (It may affect the performance of the search.)
*
* The iterator state after this search is undefined
* if the source index is out of bounds for the source string.
*
* @param i source index
* @return TRUE if the edit for the source index was found
* @internal ICU 59 technology preview
*/
UBool changed;
UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
/**
* Number of units in the original string which are replaced or remain unchanged.
* @return TRUE if this edit replaces oldLength() units with newLength() different ones.
* FALSE if oldLength units remain unchanged.
* @internal ICU 59 technology preview
*/
int32_t oldLength;
UBool hasChange() const { return changed; }
/**
* Number of units in the modified string, if changed is TRUE.
* Same as oldLength if changed is FALSE.
* @return the number of units in the original string which are replaced or remain unchanged.
* @internal ICU 59 technology preview
*/
int32_t newLength;
int32_t oldLength() const { return oldLength_; }
/**
* @return the number of units in the modified string, if hasChange() is TRUE.
* Same as oldLength if hasChange() is FALSE.
* @internal ICU 59 technology preview
*/
int32_t newLength() const { return newLength_; }
/**
* @return the current index into the source string
* @internal ICU 59 technology preview
*/
int32_t sourceIndex() const { return srcIndex; }
/**
* @return the current index into the replacement-characters-only string,
* not counting unchanged spans
* @internal ICU 59 technology preview
*/
int32_t replacementIndex() const { return replIndex; }
/**
* @return the current index into the full destination string
* @internal ICU 59 technology preview
*/
int32_t destinationIndex() const { return destIndex; }
private:
friend class Edits;
Iterator(const uint16_t *a, int32_t len, UBool crs) :
array(a), index(0), length(len), width(0), remaining(0), coarse(crs) {}
Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
int32_t readLength(int32_t head);
void updateIndexes();
UBool noNext();
const uint16_t *array;
int32_t index, length;
int32_t width, remaining;
UBool coarse;
int32_t remaining;
UBool onlyChanges, coarse;
UBool changed;
int32_t oldLength_, newLength_;
int32_t srcIndex, replIndex, destIndex;
};
/**
* Returns an Iterator for coarse-grained changes for simple string updates.
* Skips non-changes.
* @return an Iterator that merges adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getCoarseChangesIterator() const {
return Iterator(array, length, TRUE, TRUE);
}
/**
* Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
* @return an Iterator that merges adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getCoarseIterator() const {
return Iterator(array, length, TRUE);
return Iterator(array, length, FALSE, TRUE);
}
/**
* Returns an Iterator for fine-grained changes for modifying text with metadata.
* Returns an Iterator for fine-grained changes for modifying styled text.
* Skips non-changes.
* @return an Iterator that separates adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getFineChangesIterator() const {
return Iterator(array, length, TRUE, FALSE);
}
/**
* Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
* @return an Iterator that separates adjacent changes.
* @internal ICU 59 technology preview
*/
Iterator getFineIterator() const {
return Iterator(array, length, FALSE);
return Iterator(array, length, FALSE, FALSE);
}
private:
@ -383,6 +443,76 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
*/
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
#if U_SHOW_CPLUSPLUS_API
#ifndef U_HIDE_INTERNAL_API
/**
* Lowercases the characters in a UTF-16 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any). Can be NULL.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToLower
* @internal ICU 59 technology preview
*/
U_CAPI int32_t U_EXPORT2
ucasemap_toLowerWithEdits(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode);
/**
* Uppercases the characters in a UTF-16 string and optionally records edits.
* Casing is locale-dependent and context-sensitive.
* The result may be longer or shorter than the original.
* The source string and the destination buffer must not overlap.
*
* @param csm UCaseMap service object.
* @param dest A buffer for the result string. The result will be NUL-terminated if
* the buffer is large enough.
* The contents is undefined in case of failure.
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
* dest may be NULL and the function will only return the length of the result
* without writing any of the result string.
* @param src The original string.
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any). Can be NULL.
* @param pErrorCode Must be a valid pointer to an error code value,
* which must not indicate a failure before the function call.
* @return The length of the result string, if successful - or in case of a buffer overflow,
* in which case it will be greater than destCapacity.
*
* @see u_strToLower
* @internal ICU 59 technology preview
*/
U_CAPI int32_t U_EXPORT2
ucasemap_toUpperWithEdits(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode);
#endif // U_HIDE_INTERNAL_API
#endif // U_SHOW_CPLUSPLUS_API
#if !UCONFIG_NO_BREAK_ITERATION
/**

View File

@ -140,9 +140,8 @@ UnicodeString::caseMap(const UCaseMap *csm,
Edits edits;
edits.setWriteUnchanged(FALSE);
UChar replacementChars[200];
int32_t replacementLength = stringCaseMapper(
csm, replacementChars, UPRV_LENGTHOF(replacementChars),
oldArray, oldLength, &edits, &errorCode);
stringCaseMapper(csm, replacementChars, UPRV_LENGTHOF(replacementChars),
oldArray, oldLength, &edits, &errorCode);
UErrorCode editsError = U_ZERO_ERROR;
if (edits.setErrorCode(editsError)) {
setToBogus();
@ -150,22 +149,17 @@ UnicodeString::caseMap(const UCaseMap *csm,
}
newLength = oldLength + edits.lengthDelta();
if (U_SUCCESS(errorCode)) {
if (!cloneArrayIfNeeded(newLength, newLength)) {
// Grow the buffer at most once, not for multiple doReplace() calls.
if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
return *this;
}
int32_t index = 0; // index into this string
int32_t replIndex = 0; // index into replacementChars
for (Edits::Iterator iter = edits.getCoarseIterator(); iter.next(errorCode);) {
if (iter.changed) {
doReplace(index, iter.oldLength, replacementChars, replIndex, iter.newLength);
replIndex += iter.newLength;
}
index += iter.newLength;
for (Edits::Iterator iter = edits.getCoarseChangesIterator(); iter.next(errorCode);) {
doReplace(iter.destinationIndex(), iter.oldLength(),
replacementChars, iter.replacementIndex(), iter.newLength());
}
if (U_FAILURE(errorCode)) {
setToBogus();
}
U_ASSERT(replIndex == replacementLength);
return *this;
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
// common overflow handling below

View File

@ -167,8 +167,8 @@ ustrcase_internalFold(const UCaseMap *csm,
UErrorCode *pErrorCode);
/**
* Implements argument checking and buffer handling
* for string case mapping as a common function.
* Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
* Implements argument checking.
*/
U_CFUNC int32_t
ustrcase_map(const UCaseMap *csm,
@ -178,6 +178,18 @@ ustrcase_map(const UCaseMap *csm,
icu::Edits *edits,
UErrorCode *pErrorCode);
/**
* Common string case mapping implementation for old-fashioned u_strToXyz() functions
* that allow the source string to overlap the destination buffer.
* Implements argument checking and internally works with an intermediate buffer if necessary.
*/
U_CFUNC int32_t
ustrcase_mapWithOverlap(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UStringCaseMapper *stringCaseMapper,
UErrorCode *pErrorCode);
/**
* UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
* UTF-8 version of UStringCaseMapper.

View File

@ -62,11 +62,11 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
} else {
csm.iter=ubrk_open(UBRK_WORD, csm.locale, src, srcLength, pErrorCode);
}
int32_t length=ustrcase_map(
int32_t length=ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToTitle, NULL, pErrorCode);
ustrcase_internalToTitle, pErrorCode);
if(titleIter==NULL && csm.iter!=NULL) {
ubrk_close(csm.iter);
}

View File

@ -226,82 +226,11 @@ UBool Edits::hasChanges() const {
return FALSE;
}
UBool Edits::Iterator::next(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
// Always set all relevant public fields: Do not rely on them not having been touched.
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
changed = TRUE;
oldLength = newLength = width;
--remaining;
return TRUE;
}
if (index >= length) {
return FALSE;
}
int32_t u = array[index++];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = FALSE;
oldLength = u + 1;
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
++index;
if (u >= (INT32_MAX - oldLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += u + 1;
}
newLength = oldLength;
return TRUE;
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
if (coarse) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
oldLength = newLength = w * len;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
oldLength = newLength = width = u >> 12;
remaining = u & 0xfff;
return TRUE;
}
} else {
U_ASSERT(u <= 0x7fff);
oldLength = readLength((u >> 6) & 0x3f);
newLength = readLength(u & 0x3f);
if (!coarse) {
return TRUE;
}
}
// Combine adjacent changes.
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
len = w * len;
if (len > (INT32_MAX - oldLength) || len > (INT32_MAX - newLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += len;
newLength += len;
} else {
U_ASSERT(u <= 0x7fff);
int32_t oldLen = readLength((u >> 6) & 0x3f);
int32_t newLen = readLength(u & 0x3f);
if (oldLen > (INT32_MAX - oldLength) || newLen > (INT32_MAX - newLength)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return FALSE;
}
oldLength += oldLen;
newLength += newLen;
}
}
return TRUE;
}
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
array(a), index(0), length(len), remaining(0),
onlyChanges(oc), coarse(crs),
changed(FALSE), oldLength_(0), newLength_(0),
srcIndex(0), replIndex(0), destIndex(0) {}
int32_t Edits::Iterator::readLength(int32_t head) {
if (head < LENGTH_IN_1TRAIL) {
@ -322,6 +251,129 @@ int32_t Edits::Iterator::readLength(int32_t head) {
}
}
void Edits::Iterator::updateIndexes() {
srcIndex += oldLength_;
if (changed) {
replIndex += newLength_;
}
destIndex += newLength_;
}
UBool Edits::Iterator::noNext() {
// Empty span beyond the string.
oldLength_ = newLength_ = 0;
return FALSE;
}
UBool Edits::Iterator::next(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return FALSE; }
// We have an errorCode in case we need to start guarding against integer overflows.
// It is also convenient for caller loops if we bail out when an error was set elsewhere.
updateIndexes();
if (remaining > 0) {
// Fine-grained iterator: Continue a sequence of equal-length changes.
--remaining;
return TRUE;
}
if (index >= length) {
return noNext();
}
int32_t u = array[index++];
if (u <= MAX_UNCHANGED) {
// Combine adjacent unchanged ranges.
changed = FALSE;
oldLength_ = u + 1;
while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
++index;
oldLength_ += u + 1;
}
newLength_ = oldLength_;
if (onlyChanges) {
updateIndexes();
if (index >= length) {
return noNext();
}
// already fetched u > MAX_UNCHANGED at index
++index;
} else {
return TRUE;
}
}
changed = TRUE;
if (u <= MAX_SHORT_CHANGE) {
if (coarse) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
oldLength_ = newLength_ = len * w;
} else {
// Split a sequence of equal-length changes that was compressed into one unit.
oldLength_ = newLength_ = u >> 12;
remaining = u & 0xfff;
return TRUE;
}
} else {
U_ASSERT(u <= 0x7fff);
oldLength_ = readLength((u >> 6) & 0x3f);
newLength_ = readLength(u & 0x3f);
if (!coarse) {
return TRUE;
}
}
// Combine adjacent changes.
while (index < length && (u = array[index]) > MAX_UNCHANGED) {
++index;
if (u <= MAX_SHORT_CHANGE) {
int32_t w = u >> 12;
int32_t len = (u & 0xfff) + 1;
len = len * w;
oldLength_ += len;
newLength_ += len;
} else {
U_ASSERT(u <= 0x7fff);
int32_t oldLen = readLength((u >> 6) & 0x3f);
int32_t newLen = readLength(u & 0x3f);
oldLength_ += oldLen;
newLength_ += newLen;
}
}
return TRUE;
}
UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || i < 0) { return FALSE; }
if (i < srcIndex) {
// Reset the iterator to the start.
index = remaining = srcIndex = replIndex = destIndex = 0;
} else if (i < (srcIndex + oldLength_)) {
// The index is in the current span.
return TRUE;
}
while (next(errorCode)) {
if (i < (srcIndex + oldLength_)) {
// The index is in the current span.
return TRUE;
}
if (remaining > 0) {
// Is the index in one of the remaining compressed edits?
// srcIndex is the start of the current span, before the remaining ones.
int32_t len = (remaining + 1) * oldLength_;
if (i < (srcIndex + len)) {
int32_t n = (i - srcIndex) / oldLength_; // 1 <= n <= remaining
len = n * oldLength_;
srcIndex += len;
replIndex += len;
destIndex += len;
remaining -= n;
return TRUE;
}
// Make next() skip all of these edits at once.
oldLength_ = newLength_ = len;
remaining = 0;
}
}
return FALSE;
}
U_NAMESPACE_END
U_NAMESPACE_USE
@ -1360,6 +1412,45 @@ ustrcase_map(const UCaseMap *csm,
UStringCaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode *pErrorCode) {
int32_t destLength;
/* check argument values */
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if( destCapacity<0 ||
(dest==NULL && destCapacity>0) ||
src==NULL ||
srcLength<-1
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
/* get the string length */
if(srcLength==-1) {
srcLength=u_strlen(src);
}
/* check for overlapping source and destination */
if( dest!=NULL &&
((src>=dest && src<(dest+destCapacity)) ||
(dest>=src && dest<(src+srcLength)))
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
}
U_CFUNC int32_t
ustrcase_mapWithOverlap(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UStringCaseMapper *stringCaseMapper,
UErrorCode *pErrorCode) {
UChar buffer[300];
UChar *temp;
@ -1404,7 +1495,7 @@ ustrcase_map(const UCaseMap *csm,
temp=dest;
}
destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, edits, pErrorCode);
destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, NULL, pErrorCode);
if(temp!=dest) {
/* copy the result string to the destination buffer */
if (U_SUCCESS(*pErrorCode) && 0 < destLength && destLength <= destCapacity) {
@ -1428,11 +1519,37 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
UCaseMap csm=UCASEMAP_INITIALIZER;
csm.csp=ucase_getSingleton();
csm.options=options;
return ustrcase_map(
return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalFold, NULL, pErrorCode);
ustrcase_internalFold, pErrorCode);
}
U_CAPI int32_t U_EXPORT2
ucasemap_toLowerWithEdits(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) {
return ustrcase_map(
csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToLower, edits, pErrorCode);
}
U_CAPI int32_t U_EXPORT2
ucasemap_toUpperWithEdits(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) {
return ustrcase_map(
csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToUpper, edits, pErrorCode);
}
/* case-insensitive string comparisons -------------------------------------- */

View File

@ -90,11 +90,11 @@ u_strToLower(UChar *dest, int32_t destCapacity,
UErrorCode *pErrorCode) {
UCaseMap csm=UCASEMAP_INITIALIZER;
setTempCaseMap(&csm, locale);
return ustrcase_map(
return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToLower, NULL, pErrorCode);
ustrcase_internalToLower, pErrorCode);
}
U_CAPI int32_t U_EXPORT2
@ -104,9 +104,9 @@ u_strToUpper(UChar *dest, int32_t destCapacity,
UErrorCode *pErrorCode) {
UCaseMap csm=UCASEMAP_INITIALIZER;
setTempCaseMap(&csm, locale);
return ustrcase_map(
return ustrcase_mapWithOverlap(
&csm,
dest, destCapacity,
src, srcLength,
ustrcase_internalToUpper, NULL, pErrorCode);
ustrcase_internalToUpper, pErrorCode);
}