ICU-12410 low-level UTF-16 case mapping functions optionally write metadata to new class Edits

X-SVN-Rev: 39546
This commit is contained in:
Markus Scherer 2016-12-31 00:23:31 +00:00
parent fb0439839b
commit 6fc7fe3b65
6 changed files with 460 additions and 148 deletions

View File

@ -25,6 +25,12 @@
#include "unicode/ustring.h" #include "unicode/ustring.h"
#include "unicode/localpointer.h" #include "unicode/localpointer.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/uobject.h"
#endif // U_SHOW_CPLUSPLUS_API
/** /**
* \file * \file
* \brief C API: Unicode case mapping functions using a UCaseMap service object. * \brief C API: Unicode case mapping functions using a UCaseMap service object.
@ -94,6 +100,98 @@ U_NAMESPACE_BEGIN
*/ */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close); U_DEFINE_LOCAL_OPEN_POINTER(LocalUCaseMapPointer, UCaseMap, ucasemap_close);
#ifndef U_HIDE_INTERNAL_API
/**
* Records lengths of string edits but not replacement text.
* Supports replacements, insertions, deletions in linear progression.
* Does not support moving/reordering of text.
*
* @internal ICU 59 technology preview
*/
class Edits final : public UMemory {
public:
/**
* Constructs an empty object.
* @internal ICU 59 technology preview
*/
Edits() :
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
errorCode(U_ZERO_ERROR) {}
~Edits();
/**
* Resets the data but may not release memory.
* @internal ICU 59 technology preview
*/
void reset();
/**
* Adds a record for an unchanged segment of text.
* @internal ICU 59 technology preview
*/
void addUnchanged(int32_t unchangedLength);
/**
* Adds a record for a text replacement/insertion/deletion.
* @internal ICU 59 technology preview
*/
void addReplace(int32_t oldLength, int32_t newLength);
/**
* Sets the UErrorCode if an error occurred while recording edits.
* Preserves older error codes in the outErrorCode.
* @return TRUE if U_FAILURE(outErrorCode)
* @internal ICU 59 technology preview
*/
UBool setErrorCode(UErrorCode &outErrorCode);
/**
* How much longer is the new text compared with the old text?
* @return new length minus old length
* @internal ICU 59 technology preview
*/
int32_t lengthDelta() const { return delta; }
private:
Edits(const Edits &) = delete;
Edits &operator=(const Edits &) = delete;
void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
void append(int32_t r);
void append(const uint16_t *buffer, int32_t bLength);
UBool growArray();
static const int32_t STACK_CAPACITY = 100;
uint16_t *array;
int32_t capacity;
int32_t length;
int32_t delta;
UErrorCode errorCode;
uint16_t stackArray[STACK_CAPACITY];
};
/**
* Omit unchanged text when case-mapping with Edits.
*
* TODO: revisit which bit to use; currently:
* - 31..20: old normalization options (only deprecated Unicode 3.2)
* shifted up for unorm_compare()
* - 19..16: more options specific to unorm_compare() (currently bits 19, 17, 16)
* - 15..12: more string compare options (currently bits 15 & 12)
* - 11.. 8: titlecase mapping options (currently bits 9..8)
* - 7.. 0: case folding options, but only bit 0 currently used
*
* could overlay any normalization and string *comparison* option bits
* with case *mapping* option bits
* *unless* we start using UCaseMap for string comparison functions
*
* future: German sharp s may need locale variant or option bit
*
* @internal ICU 59 technology preview
*/
#define UCASEMAP_OMIT_UNCHANGED 0x4000
#endif // U_HIDE_INTERNAL_API
U_NAMESPACE_END U_NAMESPACE_END
#endif #endif
@ -422,4 +520,19 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
const char *src, int32_t srcLength, const char *src, int32_t srcLength,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
// Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper.
/**
* Internal string case mapping function type.
* All error checking must be done.
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
* src and dest must not overlap.
* @internal
*/
typedef int32_t U_CALLCONV
UStringCaseMapper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode);
#endif #endif

View File

@ -55,25 +55,6 @@ U_STABLE int32_t U_EXPORT2
u_strlen(const UChar *s); u_strlen(const UChar *s);
#endif #endif
/**
* \def U_STRING_CASE_MAPPER_DEFINED
* @internal
*/
#ifndef U_STRING_CASE_MAPPER_DEFINED
#define U_STRING_CASE_MAPPER_DEFINED
/**
* Internal string case mapping function type.
* @internal
*/
typedef int32_t U_CALLCONV
UStringCaseMapper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
#if !UCONFIG_NO_BREAK_ITERATION #if !UCONFIG_NO_BREAK_ITERATION

View File

@ -18,6 +18,7 @@
#define __USTR_IMP_H__ #define __USTR_IMP_H__
#include "unicode/utypes.h" #include "unicode/utypes.h"
#include "unicode/ucasemap.h"
#include "unicode/uiter.h" #include "unicode/uiter.h"
#include "ucase.h" #include "ucase.h"
@ -129,28 +130,12 @@ typedef struct UCaseMap UCaseMap;
U_CFUNC void U_CFUNC void
ustrcase_setTempCaseMapLocale(UCaseMap *csm, const char *locale); ustrcase_setTempCaseMapLocale(UCaseMap *csm, const char *locale);
#ifndef U_STRING_CASE_MAPPER_DEFINED
#define U_STRING_CASE_MAPPER_DEFINED
/**
* String case mapping function type, used by ustrcase_map().
* All error checking must be done.
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
* src and dest must not overlap.
*/
typedef int32_t U_CALLCONV
UStringCaseMapper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UErrorCode *pErrorCode);
#endif
/** Implements UStringCaseMapper. */ /** Implements UStringCaseMapper. */
U_CFUNC int32_t U_CALLCONV U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(const UCaseMap *csm, ustrcase_internalToLower(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
/** Implements UStringCaseMapper. */ /** Implements UStringCaseMapper. */
@ -158,6 +143,7 @@ U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(const UCaseMap *csm, ustrcase_internalToUpper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
#if !UCONFIG_NO_BREAK_ITERATION #if !UCONFIG_NO_BREAK_ITERATION
@ -167,6 +153,7 @@ U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap *csm, ustrcase_internalToTitle(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
#endif #endif
@ -176,6 +163,7 @@ U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(const UCaseMap *csm, ustrcase_internalFold(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
/** /**
@ -187,6 +175,7 @@ ustrcase_map(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
UStringCaseMapper *stringCaseMapper, UStringCaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode *pErrorCode); UErrorCode *pErrorCode);
/** /**

View File

@ -87,7 +87,7 @@ ucasemap_toTitle(UCaseMap *csm,
csm, csm,
dest, destCapacity, dest, destCapacity,
src, srcLength, src, srcLength,
ustrcase_internalToTitle, pErrorCode); ustrcase_internalToTitle, NULL, pErrorCode);
} }
#endif // !UCONFIG_NO_BREAK_ITERATION #endif // !UCONFIG_NO_BREAK_ITERATION

View File

@ -32,6 +32,190 @@
#include "ustr_imp.h" #include "ustr_imp.h"
#include "uassert.h" #include "uassert.h"
U_NAMESPACE_BEGIN
namespace {
// 0000uuuuuuuuuuuu records u+1 unchanged text units.
const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
// 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
// No length change.
const int32_t MAX_SHORT_WIDTH = 6;
const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
const int32_t MAX_SHORT_CHANGE = 0x6fff;
// 0111mmmmmmnnnnnn records a replacement of m text units with n.
// m or n = 61: actual length follows in the next edits array unit.
// m or n = 62..63: actual length follows in the next two edits array units.
// Bit 30 of the actual length is in the head unit.
// Trailing units have bit 15 set.
const int32_t LENGTH_IN_1TRAIL = 61;
const int32_t LENGTH_IN_2TRAIL = 62;
} // namespace
Edits::~Edits() {
if(array != stackArray) {
uprv_free(array);
}
}
void Edits::reset() {
length = 0;
}
void Edits::addUnchanged(int32_t unchangedLength) {
if(U_FAILURE(errorCode) || unchangedLength == 0) { return; }
if(unchangedLength < 0) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Merge into previous unchanged-text record, if any.
int32_t last = lastUnit();
if(last < MAX_UNCHANGED) {
int32_t remaining = MAX_UNCHANGED - last;
if (remaining >= unchangedLength) {
setLastUnit(last + unchangedLength);
return;
}
setLastUnit(MAX_UNCHANGED);
unchangedLength -= remaining;
}
// Split large lengths into multiple units.
while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
append(MAX_UNCHANGED);
unchangedLength -= MAX_UNCHANGED_LENGTH;
}
// Write a small (remaining) length.
if(unchangedLength > 0) {
append(unchangedLength - 1);
}
}
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
if(U_FAILURE(errorCode)) { return; }
if(oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
// Replacement of short oldLength text units by same-length new text.
// Merge into previous short-replacement record, if any.
int32_t last = lastUnit();
if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
(last >> 12) == oldLength && (last & 0xfff) < MAX_SHORT_CHANGE_LENGTH) {
setLastUnit(last + 1);
return;
}
append(oldLength << 12);
return;
}
if(oldLength < 0 || newLength < 0) {
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (oldLength == 0 && newLength == 0) {
return;
}
int32_t newDelta = newLength - oldLength;
if (newDelta != 0) {
if (newDelta > 0 ? newDelta > (INT32_MAX - delta) : newDelta < (INT32_MIN - delta)) {
// Integer overflow or underflow.
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
delta += newDelta;
}
uint16_t buffer[5];
int32_t bLength = 1;
int32_t head = 0x7000;
if(oldLength < LENGTH_IN_1TRAIL) {
head |= oldLength << 6;
} else if(oldLength <= 0x7fff) {
head |= LENGTH_IN_1TRAIL << 6;
buffer[bLength++] = (uint16_t)(0x8000 | oldLength);
} else {
head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
buffer[bLength++] = (uint16_t)(0x8000 | (oldLength >> 15));
buffer[bLength++] = (uint16_t)(0x8000 | oldLength);
}
if(newLength < LENGTH_IN_1TRAIL) {
head |= newLength;
} else if(newLength <= 0x7fff) {
head |= LENGTH_IN_1TRAIL;
buffer[bLength++] = (uint16_t)(0x8000 | newLength);
} else {
head |= LENGTH_IN_2TRAIL + (newLength >> 30);
buffer[bLength++] = (uint16_t)(0x8000 | (newLength >> 15));
buffer[bLength++] = (uint16_t)(0x8000 | newLength);
}
if(bLength == 1) {
append(head);
} else {
buffer[0] = (uint16_t)head;
append(buffer, bLength);
}
}
void Edits::append(int32_t r) {
if(length < capacity || growArray()) {
array[length++] = (uint16_t)r;
}
}
void Edits::append(const uint16_t *buffer, int32_t bLength) {
if(bLength > (INT32_MAX - length)) {
errorCode = U_INDEX_OUTOFBOUNDS_ERROR; // Integer overflow.
return;
}
if((length + bLength) < capacity || growArray()) {
int32_t i = 0;
do {
array[length++] = buffer[i++];
} while (i < bLength);
}
}
UBool Edits::growArray() {
int32_t newCapacity;
if (array == stackArray) {
newCapacity = 2000;
} else if (capacity == INT32_MAX) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
return FALSE;
} else if (capacity >= (INT32_MAX / 2)) {
newCapacity = INT32_MAX;
} else {
newCapacity = 2 * capacity;
}
// Grow by at least 5 units so that a maximal change record will fit.
if ((newCapacity - capacity) < 5) {
errorCode = U_BUFFER_OVERFLOW_ERROR;
return FALSE;
}
uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
if (newArray == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return FALSE;
}
uprv_memcpy(newArray, array, (size_t)length * 2);
if (array != stackArray) {
uprv_free(array);
}
array = newArray;
capacity = newCapacity;
return TRUE;
}
UBool Edits::setErrorCode(UErrorCode &outErrorCode) {
if(U_FAILURE(outErrorCode)) { return TRUE; }
if(U_SUCCESS(errorCode)) { return FALSE; }
outErrorCode = errorCode;
return TRUE;
}
U_NAMESPACE_END
U_NAMESPACE_USE U_NAMESPACE_USE
/* string casing ------------------------------------------------------------ */ /* string casing ------------------------------------------------------------ */
@ -39,21 +223,43 @@ U_NAMESPACE_USE
/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
static inline int32_t static inline int32_t
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
int32_t result, const UChar *s) { int32_t result, const UChar *s,
uint32_t options, int32_t cpLength, icu::Edits *edits) {
UChar32 c; UChar32 c;
int32_t length; int32_t length;
/* decode the result */ /* decode the result */
if(result<0) { if(result<0) {
/* (not) original code point */ /* (not) original code point */
if(edits!=NULL) {
edits->addUnchanged(cpLength);
}
if(options & UCASEMAP_OMIT_UNCHANGED) {
return destIndex;
}
c=~result; c=~result;
length=U16_LENGTH(c); if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
} else if(result<=UCASE_MAX_STRING_LENGTH) { dest[destIndex++]=(UChar)c;
c=U_SENTINEL; return destIndex;
length=result; }
length=cpLength;
} else { } else {
c=result; if(result<=UCASE_MAX_STRING_LENGTH) {
length=U16_LENGTH(c); c=U_SENTINEL;
length=result;
} else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
dest[destIndex++]=(UChar)result;
if(edits!=NULL) {
edits->addReplace(cpLength, 1);
}
return destIndex;
} else {
c=result;
length=U16_LENGTH(c);
}
if(edits!=NULL) {
edits->addReplace(cpLength, length);
}
} }
if(length>(INT32_MAX-destIndex)) { if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow return -1; // integer overflow
@ -99,8 +305,8 @@ appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
} }
static inline int32_t static inline int32_t
appendString(UChar *dest, int32_t destIndex, int32_t destCapacity, appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
const UChar *s, int32_t length) { const UChar *s, int32_t length, icu::Edits *edits) {
if(length>0) { if(length>0) {
if(length>(INT32_MAX-destIndex)) { if(length>(INT32_MAX-destIndex)) {
return -1; // integer overflow return -1; // integer overflow
@ -109,6 +315,9 @@ appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
u_memcpy(dest+destIndex, s, length); u_memcpy(dest+destIndex, s, length);
} }
destIndex+=length; destIndex+=length;
if(edits!=NULL) {
edits->addUnchanged(length);
}
} }
return destIndex; return destIndex;
} }
@ -154,31 +363,26 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, UCaseContext *csc, const UChar *src, UCaseContext *csc,
int32_t srcStart, int32_t srcLimit, int32_t srcStart, int32_t srcLimit,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
const UChar *s; int32_t locCache=csm->locCache;
UChar32 c, c2 = 0;
int32_t srcIndex, destIndex;
int32_t locCache;
locCache=csm->locCache;
/* case mapping loop */ /* case mapping loop */
srcIndex=srcStart; int32_t srcIndex=srcStart;
destIndex=0; int32_t destIndex=0;
while(srcIndex<srcLimit) { while(srcIndex<srcLimit) {
csc->cpStart=srcIndex; int32_t cpStart;
csc->cpStart=cpStart=srcIndex;
UChar32 c;
U16_NEXT(src, srcIndex, srcLimit, c); U16_NEXT(src, srcIndex, srcLimit, c);
csc->cpLimit=srcIndex; csc->cpLimit=srcIndex;
const UChar *s;
c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { destIndex = appendResult(dest, destIndex, destCapacity, c, s,
/* fast path version of appendResult() for BMP results */ csm->options, srcIndex - cpStart, edits);
dest[destIndex++]=(UChar)c2; if (destIndex < 0) {
} else { *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
destIndex=appendResult(dest, destIndex, destCapacity, c, s); return 0;
if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
} }
} }
@ -194,12 +398,8 @@ U_CFUNC int32_t U_CALLCONV
ustrcase_internalToTitle(const UCaseMap *csm, ustrcase_internalToTitle(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
const UChar *s;
UChar32 c;
int32_t prev, titleStart, titleLimit, idx, destIndex;
UBool isFirstIndex;
if(U_FAILURE(*pErrorCode)) { if(U_FAILURE(*pErrorCode)) {
return 0; return 0;
} }
@ -213,13 +413,14 @@ ustrcase_internalToTitle(const UCaseMap *csm,
UCaseContext csc=UCASECONTEXT_INITIALIZER; UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src; csc.p=(void *)src;
csc.limit=srcLength; csc.limit=srcLength;
destIndex=0; int32_t destIndex=0;
prev=0; int32_t prev=0;
isFirstIndex=TRUE; UBool isFirstIndex=TRUE;
/* titlecasing loop */ /* titlecasing loop */
while(prev<srcLength) { while(prev<srcLength) {
/* find next index where to titlecase */ /* find next index where to titlecase */
int32_t idx;
if(isFirstIndex) { if(isFirstIndex) {
isFirstIndex=FALSE; isFirstIndex=FALSE;
idx=bi->first(); idx=bi->first();
@ -245,7 +446,9 @@ ustrcase_internalToTitle(const UCaseMap *csm,
*/ */
if(prev<idx) { if(prev<idx) {
/* find and copy uncased characters [prev..titleStart[ */ /* find and copy uncased characters [prev..titleStart[ */
titleStart=titleLimit=prev; int32_t titleStart=prev;
int32_t titleLimit=prev;
UChar32 c;
U16_NEXT(src, titleLimit, idx, c); U16_NEXT(src, titleLimit, idx, c);
if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
/* Adjust the titlecasing index (titleStart) to the next cased character. */ /* Adjust the titlecasing index (titleStart) to the next cased character. */
@ -263,7 +466,8 @@ ustrcase_internalToTitle(const UCaseMap *csm,
break; /* cased letter at [titleStart..titleLimit[ */ break; /* cased letter at [titleStart..titleLimit[ */
} }
} }
destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev); destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+prev, titleStart-prev, edits);
if(destIndex<0) { if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0; return 0;
@ -274,8 +478,11 @@ ustrcase_internalToTitle(const UCaseMap *csm,
/* titlecase c which is from [titleStart..titleLimit[ */ /* titlecase c which is from [titleStart..titleLimit[ */
csc.cpStart=titleStart; csc.cpStart=titleStart;
csc.cpLimit=titleLimit; csc.cpLimit=titleLimit;
c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); const UChar *s;
destIndex=appendResult(dest, destIndex, destCapacity, c, s); c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s,
csm->locale, &locCache);
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
csm->options, titleLimit-titleStart, edits);
if(destIndex<0) { if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0; return 0;
@ -291,6 +498,9 @@ ustrcase_internalToTitle(const UCaseMap *csm,
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0; return 0;
} }
if(edits!=NULL) {
edits->addReplace(1, 1);
}
titleLimit++; titleLimit++;
} }
@ -304,7 +514,7 @@ ustrcase_internalToTitle(const UCaseMap *csm,
dest+destIndex, destCapacity-destIndex, dest+destIndex, destCapacity-destIndex,
src, &csc, src, &csc,
titleLimit, idx, titleLimit, idx,
pErrorCode); edits, pErrorCode);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
*pErrorCode=U_ZERO_ERROR; *pErrorCode=U_ZERO_ERROR;
} }
@ -313,7 +523,8 @@ ustrcase_internalToTitle(const UCaseMap *csm,
} }
} else { } else {
/* Optionally just copy the rest of the word unchanged. */ /* Optionally just copy the rest of the word unchanged. */
destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit); destIndex=appendUnchanged(dest, destIndex, destCapacity,
src+titleLimit, idx-titleLimit, edits);
if(destIndex<0) { if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0; return 0;
@ -816,6 +1027,7 @@ UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i,
int32_t toUpper(const UCaseMap *csm, int32_t toUpper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
int32_t locCache = UCASE_LOC_GREEK; int32_t locCache = UCASE_LOC_GREEK;
int32_t destIndex=0; int32_t destIndex=0;
@ -890,35 +1102,64 @@ int32_t toUpper(const UCaseMap *csm,
data &= ~HAS_EITHER_DIALYTIKA; data &= ~HAS_EITHER_DIALYTIKA;
} }
} }
destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) { UBool change;
destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika if ((csm->options & UCASEMAP_OMIT_UNCHANGED) == 0 && edits == NULL) {
} change = TRUE; // common, simple usage
if (destIndex >= 0 && addTonos) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
}
while (destIndex >= 0 && numYpogegrammeni > 0) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
--numYpogegrammeni;
}
if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
} else {
const UChar *s;
UChar32 c2 = 0;
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
/* fast path version of appendResult() for BMP results */
dest[destIndex++]=(UChar)c2;
} else { } else {
destIndex=appendResult(dest, destIndex, destCapacity, c, s); // Find out first whether we are changing the text.
change = src[i] != upper || numYpogegrammeni > 0;
int32_t i2 = i + 1;
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
change |= i2 >= nextIndex || src[i2] != 0x308;
++i2;
}
if (addTonos) {
change |= i2 >= nextIndex || src[i2] != 0x301;
++i2;
}
int32_t oldLength = nextIndex - i;
int32_t newLength = (i2 - i) + numYpogegrammeni;
change |= oldLength != newLength;
if (change) {
if (edits != NULL) {
edits->addReplace(oldLength, newLength);
}
} else {
if (edits != NULL) {
edits->addUnchanged(oldLength);
}
// Write unchanged text?
change |= (csm->options & UCASEMAP_OMIT_UNCHANGED) == 0;
}
}
if (change) {
destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
}
if (destIndex >= 0 && addTonos) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
}
while (destIndex >= 0 && numYpogegrammeni > 0) {
destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
--numYpogegrammeni;
}
if(destIndex<0) { if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0; return 0;
} }
} }
} else {
const UChar *s;
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
csm->options, nextIndex - i, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
} }
i = nextIndex; i = nextIndex;
state = nextState; state = nextState;
@ -939,6 +1180,7 @@ U_CFUNC int32_t U_CALLCONV
ustrcase_internalToLower(const UCaseMap *csm, ustrcase_internalToLower(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
UCaseContext csc=UCASECONTEXT_INITIALIZER; UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src; csc.p=(void *)src;
@ -947,17 +1189,18 @@ ustrcase_internalToLower(const UCaseMap *csm,
csm, ucase_toFullLower, csm, ucase_toFullLower,
dest, destCapacity, dest, destCapacity,
src, &csc, 0, srcLength, src, &csc, 0, srcLength,
pErrorCode); edits, pErrorCode);
} }
U_CFUNC int32_t U_CALLCONV U_CFUNC int32_t U_CALLCONV
ustrcase_internalToUpper(const UCaseMap *csm, ustrcase_internalToUpper(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
int32_t locCache = csm->locCache; int32_t locCache = csm->locCache;
if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) { if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode); return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, edits, pErrorCode);
} }
UCaseContext csc=UCASECONTEXT_INITIALIZER; UCaseContext csc=UCASECONTEXT_INITIALIZER;
csc.p=(void *)src; csc.p=(void *)src;
@ -966,49 +1209,37 @@ ustrcase_internalToUpper(const UCaseMap *csm,
csm, ucase_toFullUpper, csm, ucase_toFullUpper,
dest, destCapacity, dest, destCapacity,
src, &csc, 0, srcLength, src, &csc, 0, srcLength,
pErrorCode); edits, pErrorCode);
}
static int32_t
ustr_foldCase(const UCaseProps *csp,
UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
uint32_t options,
UErrorCode *pErrorCode) {
int32_t srcIndex, destIndex;
const UChar *s;
UChar32 c, c2 = 0;
/* case mapping loop */
srcIndex=destIndex=0;
while(srcIndex<srcLength) {
U16_NEXT(src, srcIndex, srcLength, c);
c=ucase_toFullFolding(csp, c, &s, options);
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
/* fast path version of appendResult() for BMP results */
dest[destIndex++]=(UChar)c2;
} else {
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
if(destIndex<0) {
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
}
if(destIndex>destCapacity) {
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
return destIndex;
} }
U_CFUNC int32_t U_CALLCONV U_CFUNC int32_t U_CALLCONV
ustrcase_internalFold(const UCaseMap *csm, ustrcase_internalFold(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); /* case mapping loop */
int32_t srcIndex = 0;
int32_t destIndex = 0;
while (srcIndex < srcLength) {
int32_t cpStart = srcIndex;
UChar32 c;
U16_NEXT(src, srcIndex, srcLength, c);
const UChar *s;
c = ucase_toFullFolding(csm->csp, c, &s, csm->options);
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
csm->options, srcIndex - cpStart, edits);
if (destIndex < 0) {
*pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
}
// TODO: are these internal functions called where destIndex>destCapacity is not already checked? (see u_terminateUChars())
if (destIndex > destCapacity) {
*pErrorCode = U_BUFFER_OVERFLOW_ERROR;
}
return destIndex;
} }
U_CFUNC int32_t U_CFUNC int32_t
@ -1016,6 +1247,7 @@ ustrcase_map(const UCaseMap *csm,
UChar *dest, int32_t destCapacity, UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength, const UChar *src, int32_t srcLength,
UStringCaseMapper *stringCaseMapper, UStringCaseMapper *stringCaseMapper,
icu::Edits *edits,
UErrorCode *pErrorCode) { UErrorCode *pErrorCode) {
UChar buffer[300]; UChar buffer[300];
UChar *temp; UChar *temp;
@ -1061,14 +1293,11 @@ ustrcase_map(const UCaseMap *csm,
temp=dest; temp=dest;
} }
destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode); destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, edits, pErrorCode);
if(temp!=dest) { if(temp!=dest) {
/* copy the result string to the destination buffer */ /* copy the result string to the destination buffer */
if(destLength>0) { if (U_SUCCESS(*pErrorCode) && 0 < destLength && destLength <= destCapacity) {
int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; u_memmove(dest, temp, destLength);
if(copyLength>0) {
u_memmove(dest, temp, copyLength);
}
} }
if(temp!=buffer) { if(temp!=buffer) {
uprv_free(temp); uprv_free(temp);
@ -1092,7 +1321,7 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
&csm, &csm,
dest, destCapacity, dest, destCapacity,
src, srcLength, src, srcLength,
ustrcase_internalFold, pErrorCode); ustrcase_internalFold, NULL, pErrorCode);
} }
/* case-insensitive string comparisons -------------------------------------- */ /* case-insensitive string comparisons -------------------------------------- */

View File

@ -94,7 +94,7 @@ u_strToLower(UChar *dest, int32_t destCapacity,
&csm, &csm,
dest, destCapacity, dest, destCapacity,
src, srcLength, src, srcLength,
ustrcase_internalToLower, pErrorCode); ustrcase_internalToLower, NULL, pErrorCode);
} }
U_CAPI int32_t U_EXPORT2 U_CAPI int32_t U_EXPORT2
@ -108,5 +108,5 @@ u_strToUpper(UChar *dest, int32_t destCapacity,
&csm, &csm,
dest, destCapacity, dest, destCapacity,
src, srcLength, src, srcLength,
ustrcase_internalToUpper, pErrorCode); ustrcase_internalToUpper, NULL, pErrorCode);
} }