ICU-12410 class Edits, class CaseMap with new low-level functions that work with Edits, simpler case properties code, some cleanup
X-SVN-Rev: 39684
This commit is contained in:
commit
4c2fad3e36
@ -94,6 +94,7 @@ stringtriebuilder.o bytestriebuilder.o \
|
||||
bytestrie.o bytestrieiterator.o \
|
||||
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
|
||||
dictionarydata.o \
|
||||
edits.o \
|
||||
appendable.o ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
|
||||
utf_impl.o ustring.o ustrcase.o ucasemap.o ucasemap_titlecase_brkiter.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
|
||||
unistr_case_locale.o ustrcase_locale.o unistr_titlecase_brkiter.o ustr_titlecase_brkiter.o \
|
||||
|
@ -15,6 +15,7 @@
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "charstr.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
|
@ -453,6 +453,7 @@
|
||||
<ClCompile Include="cstring.cpp" />
|
||||
<ClCompile Include="cstr.cpp" />
|
||||
<ClCompile Include="cwchar.cpp" />
|
||||
<ClCompile Include="edits.cpp" />
|
||||
<ClCompile Include="messagepattern.cpp" />
|
||||
<ClCompile Include="schriter.cpp" />
|
||||
<ClCompile Include="stringpiece.cpp" />
|
||||
@ -1515,6 +1516,20 @@
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\casemap.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
@ -1536,6 +1551,20 @@
|
||||
<ClInclude Include="cstring.h" />
|
||||
<ClInclude Include="cstr.h" />
|
||||
<ClInclude Include="cwchar.h" />
|
||||
<CustomBuild Include="unicode\edits.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\messagepattern.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
@ -1620,6 +1649,7 @@
|
||||
</Command>
|
||||
<Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\include\unicode\%(Filename)%(Extension);%(Outputs)</Outputs>
|
||||
</CustomBuild>
|
||||
<ClInclude Include="ucasemap_imp.h" />
|
||||
<CustomBuild Include="unicode\ucharstrie.h">
|
||||
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">copy "%(FullPath)" ..\..\include\unicode
|
||||
</Command>
|
||||
|
@ -478,6 +478,9 @@
|
||||
<ClCompile Include="cwchar.cpp">
|
||||
<Filter>strings</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="edits.cpp">
|
||||
<Filter>strings</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="schriter.cpp">
|
||||
<Filter>strings</Filter>
|
||||
</ClCompile>
|
||||
@ -870,6 +873,9 @@
|
||||
<ClInclude Include="cwchar.h">
|
||||
<Filter>strings</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="ucasemap_imp.h">
|
||||
<Filter>strings</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="uinvchar.h">
|
||||
<Filter>strings</Filter>
|
||||
</ClInclude>
|
||||
@ -1096,9 +1102,15 @@
|
||||
<CustomBuild Include="unicode\bytestream.h">
|
||||
<Filter>strings</Filter>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\casemap.h">
|
||||
<Filter>strings</Filter>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\chariter.h">
|
||||
<Filter>strings</Filter>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\edits.h">
|
||||
<Filter>strings</Filter>
|
||||
</CustomBuild>
|
||||
<CustomBuild Include="unicode\rep.h">
|
||||
<Filter>strings</Filter>
|
||||
</CustomBuild>
|
||||
|
346
icu4c/source/common/edits.cpp
Normal file
346
icu4c/source/common/edits.cpp
Normal file
@ -0,0 +1,346 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// edits.cpp
|
||||
// created: 2017feb08 Markus W. Scherer
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {

// Unit layout 0000uuuuuuuuuuuu: a run of u+1 unchanged text units.
// The largest such unit value is 0x0fff, i.e. a run of 0x1000 units.
const int32_t MAX_UNCHANGED = 0x0fff;
const int32_t MAX_UNCHANGED_LENGTH = MAX_UNCHANGED + 1;

// Unit layout 0wwwcccccccccccc with w=1..6: ccc+1 consecutive replacements,
// each of w text units by w text units (so the total length does not change).
const int32_t MAX_SHORT_WIDTH = 6;
const int32_t MAX_SHORT_CHANGE_LENGTH = 0xfff;
// Largest short-change unit: maximum width with a full counter (0x6fff).
const int32_t MAX_SHORT_CHANGE = (MAX_SHORT_WIDTH << 12) | MAX_SHORT_CHANGE_LENGTH;

// Unit layout 0111mmmmmmnnnnnn: one replacement of m text units with n.
// A field value of 61 means the real length is stored in the next array unit;
// 62..63 means it is stored in the next two array units, with bit 30 of the
// real length kept in the lowest bit of the field in the head unit.
// All trailing units have bit 15 set.
const int32_t LENGTH_IN_1TRAIL = 61;
const int32_t LENGTH_IN_2TRAIL = 62;

}  // namespace
|
||||
|
||||
// Frees the edits array, unless it is still the built-in stackArray
// (which was never allocated with uprv_malloc).
Edits::~Edits() {
    if (array == stackArray) {
        return;
    }
    uprv_free(array);
}
|
||||
|
||||
void Edits::reset() {
|
||||
length = delta = 0;
|
||||
}
|
||||
|
||||
// Records a run of unchangedLength unchanged text units.
// Does nothing if an error is already set or the length is 0;
// a negative length sets U_ILLEGAL_ARGUMENT_ERROR.
void Edits::addUnchanged(int32_t unchangedLength) {
    if (U_FAILURE(errorCode)) {
        return;
    }
    if (unchangedLength <= 0) {
        if (unchangedLength < 0) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        }
        return;
    }

    // If the previous unit is an unchanged-text record that is not yet full,
    // top it up first.
    const int32_t prev = lastUnit();
    if (prev < MAX_UNCHANGED) {
        const int32_t room = MAX_UNCHANGED - prev;
        if (unchangedLength <= room) {
            setLastUnit(prev + unchangedLength);
            return;
        }
        setLastUnit(MAX_UNCHANGED);
        unchangedLength -= room;
    }

    // Emit as many full units as needed for a large length.
    for (; unchangedLength >= MAX_UNCHANGED_LENGTH; unchangedLength -= MAX_UNCHANGED_LENGTH) {
        append(MAX_UNCHANGED);
    }

    // Emit whatever is left; a unit value u stands for u+1 text units.
    if (unchangedLength > 0) {
        append(unchangedLength - 1);
    }
}
|
||||
|
||||
// Records a replacement of oldLength text units with newLength text units.
// Does nothing if an error is already set or both lengths are 0.
// A negative length sets U_ILLEGAL_ARGUMENT_ERROR; an overflow of the
// accumulated length delta sets U_INDEX_OUTOFBOUNDS_ERROR.
void Edits::addReplace(int32_t oldLength, int32_t newLength) {
    if (U_FAILURE(errorCode)) {
        return;
    }

    // Short same-length replacement (1..6 units by as many units):
    // use the compact record, merging into the previous one when it has
    // the same width and its counter is not yet full.
    if (oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
        const int32_t prev = lastUnit();
        const UBool mergeable =
            MAX_UNCHANGED < prev && prev < MAX_SHORT_CHANGE &&
            (prev >> 12) == oldLength && (prev & 0xfff) < MAX_SHORT_CHANGE_LENGTH;
        if (mergeable) {
            setLastUnit(prev + 1);
        } else {
            append(oldLength << 12);
        }
        return;
    }

    if (oldLength < 0 || newLength < 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if (oldLength == 0 && newLength == 0) {
        return;
    }

    // Track the total length difference, guarding against int32_t overflow.
    // A zero change can never trip either check, so no special case is needed.
    const int32_t change = newLength - oldLength;
    const UBool overflow = change > 0 && delta >= 0 && change > (INT32_MAX - delta);
    const UBool underflow = change < 0 && delta < 0 && change < (INT32_MIN - delta);
    if (overflow || underflow) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return;
    }
    delta += change;

    int32_t head = 0x7000;
    if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
        // Both lengths fit directly into the head unit.
        append(head | (oldLength << 6) | newLength);
        return;
    }

    // A head unit plus up to two trail units per length: need room for 5 units.
    if ((capacity - length) < 5 && !growArray()) {
        return;
    }

    // Trail units are written behind the head slot; the head itself is stored last.
    uint16_t *out = array + length + 1;

    if (oldLength < LENGTH_IN_1TRAIL) {
        head |= oldLength << 6;
    } else if (oldLength <= 0x7fff) {
        head |= LENGTH_IN_1TRAIL << 6;
        *out++ = (uint16_t)(0x8000 | oldLength);
    } else {
        // Bit 30 of the length goes into the head; bits 29..15 and 14..0 into trail units.
        head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
        *out++ = (uint16_t)(0x8000 | (oldLength >> 15));
        *out++ = (uint16_t)(0x8000 | oldLength);
    }

    if (newLength < LENGTH_IN_1TRAIL) {
        head |= newLength;
    } else if (newLength <= 0x7fff) {
        head |= LENGTH_IN_1TRAIL;
        *out++ = (uint16_t)(0x8000 | newLength);
    } else {
        head |= LENGTH_IN_2TRAIL + (newLength >> 30);
        *out++ = (uint16_t)(0x8000 | (newLength >> 15));
        *out++ = (uint16_t)(0x8000 | newLength);
    }

    array[length] = (uint16_t)head;
    length = (int32_t)(out - array);
}
|
||||
|
||||
// Appends one unit (the low 16 bits of r) to the edits array,
// growing the array first if it is full. If growing fails, nothing is written.
void Edits::append(int32_t r) {
    if (length >= capacity && !growArray()) {
        return;
    }
    array[length] = (uint16_t)r;
    ++length;
}
|
||||
|
||||
// Replaces the edits array with a larger heap-allocated one and copies the
// existing units over. Returns TRUE on success. On failure, sets errorCode
// (U_INDEX_OUTOFBOUNDS_ERROR or U_MEMORY_ALLOCATION_ERROR), leaves the
// current array in place, and returns FALSE.
UBool Edits::growArray() {
    int32_t grown;
    if (array == stackArray) {
        // First growth: move from the built-in array to the heap.
        grown = 2000;
    } else if (capacity == INT32_MAX) {
        // Deliberately not U_BUFFER_OVERFLOW_ERROR: on a string transform API
        // that could be mistaken for an overflow of the result string buffer.
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return FALSE;
    } else {
        // Double the capacity, saturating at INT32_MAX.
        grown = (capacity >= (INT32_MAX / 2)) ? INT32_MAX : 2 * capacity;
    }

    // The array must grow by at least 5 units so that a maximal change record fits.
    if ((grown - capacity) < 5) {
        errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
        return FALSE;
    }

    uint16_t *bigger = (uint16_t *)uprv_malloc((size_t)grown * 2);
    if (bigger == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return FALSE;
    }
    uprv_memcpy(bigger, array, (size_t)length * 2);

    // Only a previously heap-allocated array is freed.
    if (array != stackArray) {
        uprv_free(array);
    }
    array = bigger;
    capacity = grown;
    return TRUE;
}
|
||||
|
||||
// Propagates this object's error state into outErrorCode.
// Returns TRUE if outErrorCode indicates a failure afterwards:
// either it already did (then it is left unchanged), or this object's
// failure code was copied into it. Returns FALSE if neither has failed.
UBool Edits::copyErrorTo(UErrorCode &outErrorCode) {
    if (U_SUCCESS(outErrorCode)) {
        if (U_SUCCESS(errorCode)) {
            return FALSE;
        }
        outErrorCode = errorCode;
    }
    return TRUE;
}
|
||||
|
||||
// Returns TRUE if at least one change has been recorded: either the total
// length delta is nonzero, or some unit in the array is not an
// unchanged-text record (its value is above MAX_UNCHANGED).
UBool Edits::hasChanges() const {
    if (delta != 0) {
        return TRUE;
    }
    const uint16_t *limit = array + length;
    for (const uint16_t *unit = array; unit < limit; ++unit) {
        if (*unit > MAX_UNCHANGED) {
            return TRUE;
        }
    }
    return FALSE;
}
|
||||
|
||||
// Sets up an iterator over the edits array a, which holds len units.
// oc is stored as the onlyChanges_ flag and crs as the coarse flag.
// The read position, the count of remaining compressed changes, the current
// span lengths and the source/replacement/destination indexes all start at 0,
// and the current span is initially marked as not changed.
Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
|
||||
array(a), index(0), length(len), remaining(0),
|
||||
onlyChanges_(oc), coarse(crs),
|
||||
changed(FALSE), oldLength_(0), newLength_(0),
|
||||
srcIndex(0), replIndex(0), destIndex(0) {}
|
||||
|
||||
// Decodes one length of a long replacement record.
// head is the 6-bit length field taken from the record's head unit:
// values below 61 are the length itself; 61 means the length is in the next
// array unit; 62..63 mean it is spread over the next two units, with the
// lowest bit of head supplying bit 30.
// Advances index past any trail units that were read.
int32_t Edits::Iterator::readLength(int32_t head) {
    if (head < LENGTH_IN_1TRAIL) {
        return head;
    }

    if (head < LENGTH_IN_2TRAIL) {
        U_ASSERT(index < length);
        U_ASSERT(array[index] >= 0x8000);
        const int32_t value = array[index] & 0x7fff;
        ++index;
        return value;
    }

    U_ASSERT((index + 2) <= length);
    U_ASSERT(array[index] >= 0x8000);
    U_ASSERT(array[index + 1] >= 0x8000);
    const int32_t top = (head & 1) << 30;
    const int32_t middle = (int32_t)(array[index] & 0x7fff) << 15;
    const int32_t bottom = array[index + 1] & 0x7fff;
    index += 2;
    return top | middle | bottom;
}
|
||||
|
||||
void Edits::Iterator::updateIndexes() {
|
||||
srcIndex += oldLength_;
|
||||
if (changed) {
|
||||
replIndex += newLength_;
|
||||
}
|
||||
destIndex += newLength_;
|
||||
}
|
||||
|
||||
// Puts the iterator into its end state: an empty, unchanged span
// (there is no change beyond the end of the string). Always returns FALSE,
// so that callers can write "return noNext();".
UBool Edits::Iterator::noNext() {
    oldLength_ = 0;
    newLength_ = 0;
    changed = FALSE;
    return FALSE;
}
|
||||
|
||||
// Advances to the next span of edits.
// Returns TRUE if there is another span, whose properties are then available
// in changed, oldLength_, newLength_ and the running indexes.
// Returns FALSE if errorCode already indicates a failure or the edits are
// exhausted. With onlyChanges, unchanged spans are skipped (their lengths
// still advance the indexes). If the iterator is coarse, adjacent changes are
// merged into one span; otherwise each change is returned separately.
//
// The errorCode parameter exists in case integer-overflow guards are needed
// later; it also lets caller loops stop when an error was set elsewhere.
UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
    if (U_FAILURE(errorCode)) {
        return FALSE;
    }
    updateIndexes();

    // Fine-grained iteration: still inside a compressed run of equal-length changes.
    if (remaining > 0) {
        --remaining;
        return TRUE;
    }
    if (index >= length) {
        return noNext();
    }

    int32_t unit = array[index++];
    if (unit <= MAX_UNCHANGED) {
        // Merge all adjacent unchanged-text records into one span.
        changed = FALSE;
        oldLength_ = unit + 1;
        for (; index < length; ++index) {
            unit = array[index];
            if (unit > MAX_UNCHANGED) {
                break;
            }
            oldLength_ += unit + 1;
        }
        newLength_ = oldLength_;
        if (!onlyChanges) {
            return TRUE;
        }
        // Skip the unchanged span and continue with the change that follows it.
        updateIndexes();
        if (index >= length) {
            return noNext();
        }
        // unit already holds the change record at index; consume it.
        ++index;
    }

    changed = TRUE;
    if (unit > MAX_SHORT_CHANGE) {
        // Long replacement record; the old length must be decoded before the new one
        // because their trail units are stored in that order.
        U_ASSERT(unit <= 0x7fff);
        oldLength_ = readLength((unit >> 6) & 0x3f);
        newLength_ = readLength(unit & 0x3f);
        if (!coarse) {
            return TRUE;
        }
    } else if (!coarse) {
        // Expand a compressed run of equal-length changes one change at a time.
        oldLength_ = newLength_ = unit >> 12;
        remaining = unit & 0xfff;
        return TRUE;
    } else {
        // Coarse: take the whole compressed run at once.
        const int32_t width = unit >> 12;
        const int32_t count = (unit & 0xfff) + 1;
        oldLength_ = newLength_ = count * width;
    }

    // Coarse iteration only: fold all directly following changes into this span.
    while (index < length) {
        unit = array[index];
        if (unit <= MAX_UNCHANGED) {
            break;
        }
        ++index;
        if (unit <= MAX_SHORT_CHANGE) {
            const int32_t runLength = ((unit & 0xfff) + 1) * (unit >> 12);
            oldLength_ += runLength;
            newLength_ += runLength;
        } else {
            U_ASSERT(unit <= 0x7fff);
            const int32_t oldPart = readLength((unit >> 6) & 0x3f);
            const int32_t newPart = readLength(unit & 0x3f);
            oldLength_ += oldPart;
            newLength_ += newPart;
        }
    }
    return TRUE;
}
|
||||
|
||||
// Positions the iterator on the span that contains source index i.
// Returns TRUE if such a span was found. Returns FALSE if i is negative,
// errorCode already indicates a failure, or the edits end before index i.
// Iterates with next(FALSE, ...), i.e. unchanged spans are not skipped.
UBool Edits::Iterator::findSourceIndex(int32_t i, UErrorCode &errorCode) {
    if (i < 0 || U_FAILURE(errorCode)) {
        return FALSE;
    }

    if (i < srcIndex) {
        // The target lies behind the current position: restart from the beginning.
        index = 0;
        remaining = 0;
        oldLength_ = 0;
        newLength_ = 0;
        srcIndex = 0;
        replIndex = 0;
        destIndex = 0;
    } else if (i < (srcIndex + oldLength_)) {
        // Already on the right span.
        return TRUE;
    }

    while (next(FALSE, errorCode)) {
        if (i < (srcIndex + oldLength_)) {
            // The target is inside the span just fetched.
            return TRUE;
        }
        if (remaining <= 0) {
            continue;
        }

        // The current span is the first of a compressed run of equal-length
        // changes; srcIndex is the start of the run's current element.
        const int32_t runLength = (remaining + 1) * oldLength_;
        if (i < (srcIndex + runLength)) {
            // Jump directly to the element of the run that contains i.
            const int32_t skipped = (i - srcIndex) / oldLength_;  // 1 <= skipped <= remaining
            const int32_t offset = skipped * oldLength_;
            srcIndex += offset;
            replIndex += offset;
            destIndex += offset;
            remaining -= skipped;
            return TRUE;
        }

        // Not in this run: let the following next() step over all of it at once.
        oldLength_ = newLength_ = runLength;
        remaining = 0;
    }
    return FALSE;
}
|
||||
|
||||
U_NAMESPACE_END
|
@ -13,6 +13,7 @@
|
||||
|
||||
#include "unicode/locdspnm.h"
|
||||
#include "unicode/simpleformatter.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/udisplaycontext.h"
|
||||
#include "unicode/brkiter.h"
|
||||
|
@ -33,6 +33,7 @@
|
||||
|
||||
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "putilimp.h"
|
||||
#include "mutex.h"
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/uscript.h"
|
||||
|
@ -46,13 +46,6 @@ struct UCaseProps {
|
||||
#define INCLUDED_FROM_UCASE_CPP
|
||||
#include "ucase_props_data.h"
|
||||
|
||||
/* UCaseProps singleton ----------------------------------------------------- */
|
||||
|
||||
U_CAPI const UCaseProps * U_EXPORT2
|
||||
ucase_getSingleton() {
|
||||
return &ucase_props_singleton;
|
||||
}
|
||||
|
||||
/* set of property starts for UnicodeSet ------------------------------------ */
|
||||
|
||||
static UBool U_CALLCONV
|
||||
@ -64,13 +57,13 @@ _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, ui
|
||||
}
|
||||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* add the start code point of each same-value range of the trie */
|
||||
utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
|
||||
utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
|
||||
|
||||
/* add code points with hardcoded properties, plus the ones following them */
|
||||
|
||||
@ -133,14 +126,14 @@ static const uint8_t flagsOffset[256]={
|
||||
/* simple case mappings ----------------------------------------------------- */
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_tolower(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_tolower(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
|
||||
@ -150,14 +143,14 @@ ucase_tolower(const UCaseProps *csp, UChar32 c) {
|
||||
}
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_toupper(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_toupper(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
|
||||
@ -167,14 +160,14 @@ ucase_toupper(const UCaseProps *csp, UChar32 c) {
|
||||
}
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_totitle(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_totitle(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t idx;
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
|
||||
@ -198,7 +191,7 @@ static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
|
||||
|
||||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
|
||||
ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
|
||||
uint16_t props;
|
||||
|
||||
/*
|
||||
@ -229,7 +222,7 @@ ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
|
||||
break;
|
||||
}
|
||||
|
||||
props=UTRIE2_GET16(&csp->trie, c);
|
||||
props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
@ -243,7 +236,7 @@ ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
|
||||
* c has exceptions, so there may be multiple simple and/or
|
||||
* full case mappings. Add them all.
|
||||
*/
|
||||
const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
const UChar *closure;
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t idx, closureLength, fullLength, length;
|
||||
@ -338,10 +331,10 @@ strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
|
||||
}
|
||||
|
||||
U_CFUNC UBool U_EXPORT2
|
||||
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
|
||||
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
|
||||
int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
|
||||
|
||||
if(csp->unfold==NULL || s==NULL) {
|
||||
if(ucase_props_singleton.unfold==NULL || s==NULL) {
|
||||
return FALSE; /* no reverse case folding data, or no string */
|
||||
}
|
||||
if(length<=1) {
|
||||
@ -355,7 +348,7 @@ ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
const uint16_t *unfold=csp->unfold;
|
||||
const uint16_t *unfold=ucase_props_singleton.unfold;
|
||||
unfoldRows=unfold[UCASE_UNFOLD_ROWS];
|
||||
unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
|
||||
unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
|
||||
@ -381,7 +374,7 @@ ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length
|
||||
for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
|
||||
U16_NEXT_UNSAFE(p, i, c);
|
||||
sa->add(sa->set, c);
|
||||
ucase_addCaseClosure(csp, c, sa);
|
||||
ucase_addCaseClosure(c, sa);
|
||||
}
|
||||
return TRUE;
|
||||
} else if(result<0) {
|
||||
@ -430,38 +423,38 @@ U_NAMESPACE_END
|
||||
|
||||
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getType(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_getType(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
return UCASE_GET_TYPE(props);
|
||||
}
|
||||
|
||||
/** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_getTypeOrIgnorable(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
return UCASE_GET_TYPE_AND_IGNORABLE(props);
|
||||
}
|
||||
|
||||
/** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
|
||||
static inline int32_t
|
||||
getDotType(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
getDotType(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
return props&UCASE_DOT_MASK;
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
|
||||
return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
|
||||
ucase_isSoftDotted(UChar32 c) {
|
||||
return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_isCaseSensitive(UChar32 c) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
return (UBool)((props&UCASE_SENSITIVE)!=0);
|
||||
}
|
||||
|
||||
@ -545,12 +538,10 @@ ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
|
||||
* zero or more case-ignorable characters.
|
||||
*/
|
||||
|
||||
#define is_a(c) ((c)=='a' || (c)=='A')
|
||||
#define is_d(c) ((c)=='d' || (c)=='D')
|
||||
#define is_e(c) ((c)=='e' || (c)=='E')
|
||||
#define is_i(c) ((c)=='i' || (c)=='I')
|
||||
#define is_l(c) ((c)=='l' || (c)=='L')
|
||||
#define is_n(c) ((c)=='n' || (c)=='N')
|
||||
#define is_r(c) ((c)=='r' || (c)=='R')
|
||||
#define is_t(c) ((c)=='t' || (c)=='T')
|
||||
#define is_u(c) ((c)=='u' || (c)=='U')
|
||||
@ -565,16 +556,7 @@ ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
|
||||
* Accepts both 2- and 3-letter codes and accepts case variants.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ucase_getCaseLocale(const char *locale, int32_t *locCache) {
|
||||
int32_t result;
|
||||
char c;
|
||||
|
||||
if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
|
||||
return result;
|
||||
}
|
||||
|
||||
result=UCASE_LOC_ROOT;
|
||||
|
||||
ucase_getCaseLocale(const char *locale) {
|
||||
/*
|
||||
* This function used to use uloc_getLanguage(), but the current code
|
||||
* removes the dependency of this low-level code on uloc implementation code
|
||||
@ -584,44 +566,12 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
|
||||
* Because this code does not want to depend on uloc, the caller must
|
||||
* pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
|
||||
*/
|
||||
c=*locale++;
|
||||
if(is_t(c)) {
|
||||
/* tr or tur? */
|
||||
c=*locale++;
|
||||
if(is_u(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_r(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
result=UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(is_a(c)) {
|
||||
/* az or aze? */
|
||||
c=*locale++;
|
||||
if(is_z(c)) {
|
||||
c=*locale++;
|
||||
if(is_e(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
result=UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(is_l(c)) {
|
||||
/* lt or lit? */
|
||||
c=*locale++;
|
||||
if(is_i(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_t(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
result=UCASE_LOC_LITHUANIAN;
|
||||
}
|
||||
}
|
||||
} else if(is_e(c)) {
|
||||
char c=*locale++;
|
||||
// Fastpath for English "en" which is often used for default (=root locale) case mappings,
|
||||
// and for Chinese "zh": Very common but no special case mapping behavior.
|
||||
// Then check lowercase vs. uppercase to reduce the number of comparisons
|
||||
// for other locales without special behavior.
|
||||
if(c=='e') {
|
||||
/* el or ell? */
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
@ -630,27 +580,135 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
result=UCASE_LOC_GREEK;
|
||||
return UCASE_LOC_GREEK;
|
||||
}
|
||||
}
|
||||
} else if(is_n(c)) {
|
||||
/* nl or nld? */
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
// en, es, ... -> root
|
||||
} else if(c=='z') {
|
||||
return UCASE_LOC_ROOT;
|
||||
#if U_CHARSET_FAMILY==U_ASCII_FAMILY
|
||||
} else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z
|
||||
#elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
|
||||
} else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
|
||||
#else
|
||||
# error Unknown charset family!
|
||||
#endif
|
||||
// lowercase c
|
||||
if(c=='t') {
|
||||
/* tr or tur? */
|
||||
c=*locale++;
|
||||
if(is_d(c)) {
|
||||
c=*locale;
|
||||
if(is_u(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
result=UCASE_LOC_DUTCH;
|
||||
if(is_r(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(c=='a') {
|
||||
/* az or aze? */
|
||||
c=*locale++;
|
||||
if(is_z(c)) {
|
||||
c=*locale++;
|
||||
if(is_e(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(c=='l') {
|
||||
/* lt or lit? */
|
||||
c=*locale++;
|
||||
if(is_i(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_t(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_LITHUANIAN;
|
||||
}
|
||||
}
|
||||
} else if(c=='n') {
|
||||
/* nl or nld? */
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
c=*locale++;
|
||||
if(is_d(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_DUTCH;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// uppercase c
|
||||
// Same code as for lowercase c but also check for 'E'.
|
||||
if(c=='T') {
|
||||
/* tr or tur? */
|
||||
c=*locale++;
|
||||
if(is_u(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_r(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(c=='A') {
|
||||
/* az or aze? */
|
||||
c=*locale++;
|
||||
if(is_z(c)) {
|
||||
c=*locale++;
|
||||
if(is_e(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_TURKISH;
|
||||
}
|
||||
}
|
||||
} else if(c=='L') {
|
||||
/* lt or lit? */
|
||||
c=*locale++;
|
||||
if(is_i(c)) {
|
||||
c=*locale++;
|
||||
}
|
||||
if(is_t(c)) {
|
||||
c=*locale;
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_LITHUANIAN;
|
||||
}
|
||||
}
|
||||
} else if(c=='E') {
|
||||
/* el or ell? */
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_GREEK;
|
||||
}
|
||||
}
|
||||
} else if(c=='N') {
|
||||
/* nl or nld? */
|
||||
c=*locale++;
|
||||
if(is_l(c)) {
|
||||
c=*locale++;
|
||||
if(is_d(c)) {
|
||||
c=*locale;
|
||||
}
|
||||
if(is_sep(c)) {
|
||||
return UCASE_LOC_DUTCH;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(locCache!=NULL) {
|
||||
*locCache=result;
|
||||
}
|
||||
return result;
|
||||
return UCASE_LOC_ROOT;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -662,7 +720,7 @@ ucase_getCaseLocale(const char *locale, int32_t *locCache) {
|
||||
* it is also cased or not.
|
||||
*/
|
||||
static UBool
|
||||
isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
|
||||
isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
|
||||
UChar32 c;
|
||||
|
||||
if(iter==NULL) {
|
||||
@ -670,7 +728,7 @@ isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void
|
||||
}
|
||||
|
||||
for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
|
||||
int32_t type=ucase_getTypeOrIgnorable(csp, c);
|
||||
int32_t type=ucase_getTypeOrIgnorable(c);
|
||||
if(type&4) {
|
||||
/* case-ignorable, continue with the loop */
|
||||
} else if(type!=UCASE_NONE) {
|
||||
@ -685,7 +743,7 @@ isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void
|
||||
|
||||
/* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
|
||||
static UBool
|
||||
isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
|
||||
isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
|
||||
UChar32 c;
|
||||
int32_t dotType;
|
||||
int8_t dir;
|
||||
@ -695,7 +753,7 @@ isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *
|
||||
}
|
||||
|
||||
for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
|
||||
dotType=getDotType(csp, c);
|
||||
dotType=getDotType(c);
|
||||
if(dotType==UCASE_SOFT_DOTTED) {
|
||||
return TRUE; /* preceded by TYPE_i */
|
||||
} else if(dotType!=UCASE_OTHER_ACCENT) {
|
||||
@ -742,7 +800,7 @@ isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *
|
||||
|
||||
/* Is preceded by base character 'I' with no intervening cc=230 ? */
|
||||
static UBool
|
||||
isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
|
||||
isPrecededBy_I(UCaseContextIterator *iter, void *context) {
|
||||
UChar32 c;
|
||||
int32_t dotType;
|
||||
int8_t dir;
|
||||
@ -755,7 +813,7 @@ isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context)
|
||||
if(c==0x49) {
|
||||
return TRUE; /* preceded by I */
|
||||
}
|
||||
dotType=getDotType(csp, c);
|
||||
dotType=getDotType(c);
|
||||
if(dotType!=UCASE_OTHER_ACCENT) {
|
||||
return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
|
||||
}
|
||||
@ -766,7 +824,7 @@ isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context)
|
||||
|
||||
/* Is followed by one or more cc==230 ? */
|
||||
static UBool
|
||||
isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
|
||||
isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
|
||||
UChar32 c;
|
||||
int32_t dotType;
|
||||
int8_t dir;
|
||||
@ -776,7 +834,7 @@ isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *c
|
||||
}
|
||||
|
||||
for(dir=1; (c=iter(context, dir))>=0; dir=0) {
|
||||
dotType=getDotType(csp, c);
|
||||
dotType=getDotType(c);
|
||||
if(dotType==UCASE_ABOVE) {
|
||||
return TRUE; /* at least one cc==230 following */
|
||||
} else if(dotType!=UCASE_OTHER_ACCENT) {
|
||||
@ -789,7 +847,7 @@ isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *c
|
||||
|
||||
/* Is followed by a dot above (without cc==230 in between) ? */
|
||||
static UBool
|
||||
isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
|
||||
isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
|
||||
UChar32 c;
|
||||
int32_t dotType;
|
||||
int8_t dir;
|
||||
@ -802,7 +860,7 @@ isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *co
|
||||
if(c==0x307) {
|
||||
return TRUE;
|
||||
}
|
||||
dotType=getDotType(csp, c);
|
||||
dotType=getDotType(c);
|
||||
if(dotType!=UCASE_OTHER_ACCENT) {
|
||||
return FALSE; /* next base character or cc==230 in between */
|
||||
}
|
||||
@ -812,20 +870,20 @@ isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *co
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullLower(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache) {
|
||||
int32_t loc) {
|
||||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t full;
|
||||
|
||||
@ -833,7 +891,6 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
|
||||
if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
|
||||
/* use hardcoded conditions and mappings */
|
||||
int32_t loc=ucase_getCaseLocale(locale, locCache);
|
||||
|
||||
/*
|
||||
* Test for conditional mappings first
|
||||
@ -844,7 +901,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
if( loc==UCASE_LOC_LITHUANIAN &&
|
||||
/* base characters, find accents above */
|
||||
(((c==0x49 || c==0x4a || c==0x12e) &&
|
||||
isFollowedByMoreAbove(csp, iter, context)) ||
|
||||
isFollowedByMoreAbove(iter, context)) ||
|
||||
/* precomposed with accent above, no need to find one */
|
||||
(c==0xcc || c==0xcd || c==0x128))
|
||||
) {
|
||||
@ -896,7 +953,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
*/
|
||||
return 0x69;
|
||||
} else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
|
||||
} else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
|
||||
/*
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
@ -905,7 +962,7 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
*/
|
||||
return 0; /* remove the dot (continue without output) */
|
||||
} else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
|
||||
} else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
|
||||
/*
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
@ -922,8 +979,8 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
*pString=iDot;
|
||||
return 2;
|
||||
} else if( c==0x3a3 &&
|
||||
!isFollowedByCasedLetter(csp, iter, context, 1) &&
|
||||
isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
|
||||
!isFollowedByCasedLetter(iter, context, 1) &&
|
||||
isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
|
||||
) {
|
||||
/* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
|
||||
/*
|
||||
@ -957,21 +1014,21 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
|
||||
/* internal */
|
||||
static int32_t
|
||||
toUpperOrTitle(const UCaseProps *csp, UChar32 c,
|
||||
toUpperOrTitle(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache,
|
||||
int32_t loc,
|
||||
UBool upperNotTitle) {
|
||||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t full, idx;
|
||||
|
||||
@ -979,8 +1036,6 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
|
||||
|
||||
if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
|
||||
/* use hardcoded conditions and mappings */
|
||||
int32_t loc=ucase_getCaseLocale(locale, locCache);
|
||||
|
||||
if(loc==UCASE_LOC_TURKISH && c==0x69) {
|
||||
/*
|
||||
# Turkish and Azeri
|
||||
@ -994,7 +1049,7 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
|
||||
0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
|
||||
*/
|
||||
return 0x130;
|
||||
} else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
|
||||
} else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
|
||||
/*
|
||||
# Lithuanian
|
||||
|
||||
@ -1052,19 +1107,19 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullUpper(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache) {
|
||||
return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
|
||||
int32_t caseLocale) {
|
||||
return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullTitle(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache) {
|
||||
return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
|
||||
int32_t caseLocale) {
|
||||
return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
|
||||
}
|
||||
|
||||
/* case folding ------------------------------------------------------------- */
|
||||
@ -1110,14 +1165,14 @@ ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
|
||||
|
||||
/* return the simple case folding mapping for c */
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
ucase_fold(UChar32 c, uint32_t options) {
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t idx;
|
||||
if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
|
||||
@ -1170,19 +1225,19 @@ ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullFolding(UChar32 c,
|
||||
const UChar **pString,
|
||||
uint32_t options) {
|
||||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
uint16_t props=UTRIE2_GET16(&csp->trie, c);
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t full, idx;
|
||||
|
||||
@ -1244,66 +1299,59 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
|
||||
|
||||
/* case mapping properties API ---------------------------------------------- */
|
||||
|
||||
#define GET_CASE_PROPS() &ucase_props_singleton
|
||||
|
||||
/* public API (see uchar.h) */
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isULowercase(UChar32 c) {
|
||||
return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
|
||||
return (UBool)(UCASE_LOWER==ucase_getType(c));
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isUUppercase(UChar32 c) {
|
||||
return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
|
||||
return (UBool)(UCASE_UPPER==ucase_getType(c));
|
||||
}
|
||||
|
||||
/* Transforms the Unicode character to its lower case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_tolower(UChar32 c) {
|
||||
return ucase_tolower(GET_CASE_PROPS(), c);
|
||||
return ucase_tolower(c);
|
||||
}
|
||||
|
||||
/* Transforms the Unicode character to its upper case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_toupper(UChar32 c) {
|
||||
return ucase_toupper(GET_CASE_PROPS(), c);
|
||||
return ucase_toupper(c);
|
||||
}
|
||||
|
||||
/* Transforms the Unicode character to its title case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_totitle(UChar32 c) {
|
||||
return ucase_totitle(GET_CASE_PROPS(), c);
|
||||
return ucase_totitle(c);
|
||||
}
|
||||
|
||||
/* return the simple case folding mapping for c */
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_foldCase(UChar32 c, uint32_t options) {
|
||||
return ucase_fold(GET_CASE_PROPS(), c, options);
|
||||
return ucase_fold(c, options);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t U_EXPORT2
|
||||
ucase_hasBinaryProperty(UChar32 c, UProperty which) {
|
||||
/* case mapping properties */
|
||||
const UChar *resultString;
|
||||
int32_t locCache;
|
||||
const UCaseProps *csp=GET_CASE_PROPS();
|
||||
if(csp==NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
switch(which) {
|
||||
case UCHAR_LOWERCASE:
|
||||
return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
|
||||
return (UBool)(UCASE_LOWER==ucase_getType(c));
|
||||
case UCHAR_UPPERCASE:
|
||||
return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
|
||||
return (UBool)(UCASE_UPPER==ucase_getType(c));
|
||||
case UCHAR_SOFT_DOTTED:
|
||||
return ucase_isSoftDotted(csp, c);
|
||||
return ucase_isSoftDotted(c);
|
||||
case UCHAR_CASE_SENSITIVE:
|
||||
return ucase_isCaseSensitive(csp, c);
|
||||
return ucase_isCaseSensitive(c);
|
||||
case UCHAR_CASED:
|
||||
return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
|
||||
return (UBool)(UCASE_NONE!=ucase_getType(c));
|
||||
case UCHAR_CASE_IGNORABLE:
|
||||
return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
|
||||
return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
|
||||
/*
|
||||
* Note: The following Changes_When_Xyz are defined as testing whether
|
||||
* the NFD form of the input changes when Xyz-case-mapped.
|
||||
@ -1317,21 +1365,17 @@ ucase_hasBinaryProperty(UChar32 c, UProperty which) {
|
||||
* start sets for normalization and case mappings.
|
||||
*/
|
||||
case UCHAR_CHANGES_WHEN_LOWERCASED:
|
||||
locCache=UCASE_LOC_ROOT;
|
||||
return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
|
||||
return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
|
||||
case UCHAR_CHANGES_WHEN_UPPERCASED:
|
||||
locCache=UCASE_LOC_ROOT;
|
||||
return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
|
||||
return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
|
||||
case UCHAR_CHANGES_WHEN_TITLECASED:
|
||||
locCache=UCASE_LOC_ROOT;
|
||||
return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
|
||||
return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
|
||||
/* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
|
||||
case UCHAR_CHANGES_WHEN_CASEMAPPED:
|
||||
locCache=UCASE_LOC_ROOT;
|
||||
return (UBool)(
|
||||
ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
|
||||
ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
|
||||
ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
|
||||
ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
|
||||
ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
|
||||
ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
|
||||
default:
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -37,18 +37,8 @@ U_NAMESPACE_END
|
||||
|
||||
/* library API -------------------------------------------------------------- */
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
struct UCaseProps;
|
||||
typedef struct UCaseProps UCaseProps;
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
U_CAPI const UCaseProps * U_EXPORT2
|
||||
ucase_getSingleton(void);
|
||||
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Requires non-NULL locale ID but otherwise does the equivalent of
|
||||
@ -56,7 +46,7 @@ ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *
|
||||
* Accepts both 2- and 3-letter codes and accepts case variants.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ucase_getCaseLocale(const char *locale, int32_t *locCache);
|
||||
ucase_getCaseLocale(const char *locale);
|
||||
|
||||
/* Casing locale types for ucase_getCaseLocale */
|
||||
enum {
|
||||
@ -87,16 +77,16 @@ enum {
|
||||
/* single-code point functions */
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_tolower(const UCaseProps *csp, UChar32 c);
|
||||
ucase_tolower(UChar32 c);
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_toupper(const UCaseProps *csp, UChar32 c);
|
||||
ucase_toupper(UChar32 c);
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_totitle(const UCaseProps *csp, UChar32 c);
|
||||
ucase_totitle(UChar32 c);
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
|
||||
ucase_fold(UChar32 c, uint32_t options);
|
||||
|
||||
/**
|
||||
* Adds all simple case mappings and the full case folding for c to sa,
|
||||
@ -108,7 +98,7 @@ ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
|
||||
* - for k include the Kelvin sign
|
||||
*/
|
||||
U_CFUNC void U_EXPORT2
|
||||
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
|
||||
ucase_addCaseClosure(UChar32 c, const USetAdder *sa);
|
||||
|
||||
/**
|
||||
* Maps the string to single code points and adds the associated case closure
|
||||
@ -123,7 +113,7 @@ ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
|
||||
* @return TRUE if the string was found
|
||||
*/
|
||||
U_CFUNC UBool U_EXPORT2
|
||||
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
|
||||
ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa);
|
||||
|
||||
#ifdef __cplusplus
|
||||
U_NAMESPACE_BEGIN
|
||||
@ -157,17 +147,17 @@ U_NAMESPACE_END
|
||||
|
||||
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getType(const UCaseProps *csp, UChar32 c);
|
||||
ucase_getType(UChar32 c);
|
||||
|
||||
/** @return like ucase_getType() but also sets UCASE_IGNORABLE if c is case-ignorable */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
|
||||
ucase_getTypeOrIgnorable(UChar32 c);
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
|
||||
ucase_isSoftDotted(UChar32 c);
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
|
||||
ucase_isCaseSensitive(UChar32 c);
|
||||
|
||||
/* string case mapping functions */
|
||||
|
||||
@ -240,10 +230,7 @@ enum {
|
||||
* @param context Pointer to be passed into iter.
|
||||
* @param pString If the mapping result is a string, then the pointer is
|
||||
* written to *pString.
|
||||
* @param locale Locale ID for locale-dependent mappings.
|
||||
* @param locCache Initialize to 0; may be used to cache the result of parsing
|
||||
* the locale ID for subsequent calls.
|
||||
* Can be NULL.
|
||||
* @param caseLocale Case locale value from ucase_getCaseLocale().
|
||||
* @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
|
||||
*
|
||||
* @see UCaseContextIterator
|
||||
@ -251,25 +238,25 @@ enum {
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullLower(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache);
|
||||
int32_t caseLocale);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullUpper(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache);
|
||||
int32_t caseLocale);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullTitle(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache);
|
||||
int32_t caseLocale);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
|
||||
ucase_toFullFolding(UChar32 c,
|
||||
const UChar **pString,
|
||||
uint32_t options);
|
||||
|
||||
@ -283,10 +270,10 @@ U_CDECL_BEGIN
|
||||
* @internal
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UCaseMapFull(const UCaseProps *csp, UChar32 c,
|
||||
UCaseMapFull(UChar32 c,
|
||||
UCaseContextIterator *iter, void *context,
|
||||
const UChar **pString,
|
||||
const char *locale, int32_t *locCache);
|
||||
int32_t caseLocale);
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
|
@ -33,46 +33,46 @@
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "ucase.h"
|
||||
#include "ucasemap_imp.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* UCaseMap service object -------------------------------------------------- */
|
||||
|
||||
UCaseMap::UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode) :
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
iter(NULL),
|
||||
#endif
|
||||
caseLocale(UCASE_LOC_UNKNOWN), options(opts) {
|
||||
ucasemap_setLocale(this, localeID, pErrorCode);
|
||||
}
|
||||
|
||||
UCaseMap::~UCaseMap() {
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
delete iter;
|
||||
#endif
|
||||
}
|
||||
|
||||
U_CAPI UCaseMap * U_EXPORT2
|
||||
ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode) {
|
||||
UCaseMap *csm;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
csm=(UCaseMap *)uprv_malloc(sizeof(UCaseMap));
|
||||
UCaseMap *csm = new UCaseMap(locale, options, pErrorCode);
|
||||
if(csm==NULL) {
|
||||
*pErrorCode = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
} else if (U_FAILURE(*pErrorCode)) {
|
||||
delete csm;
|
||||
return NULL;
|
||||
}
|
||||
uprv_memset(csm, 0, sizeof(UCaseMap));
|
||||
|
||||
csm->csp=ucase_getSingleton();
|
||||
ucasemap_setLocale(csm, locale, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
uprv_free(csm);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
csm->options=options;
|
||||
return csm;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucasemap_close(UCaseMap *csm) {
|
||||
if(csm!=NULL) {
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
// Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
|
||||
delete reinterpret_cast<BreakIterator *>(csm->iter);
|
||||
#endif
|
||||
uprv_free(csm);
|
||||
}
|
||||
delete csm;
|
||||
}
|
||||
|
||||
U_CAPI const char * U_EXPORT2
|
||||
@ -87,13 +87,16 @@ ucasemap_getOptions(const UCaseMap *csm) {
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
|
||||
int32_t length;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
if (locale != NULL && *locale == 0) {
|
||||
csm->locale[0] = 0;
|
||||
csm->caseLocale = UCASE_LOC_ROOT;
|
||||
return;
|
||||
}
|
||||
|
||||
length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
|
||||
int32_t length=uloc_getName(locale, csm->locale, (int32_t)sizeof(csm->locale), pErrorCode);
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR || length==sizeof(csm->locale)) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
/* we only really need the language code for case mappings */
|
||||
@ -102,16 +105,20 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
|
||||
if(length==sizeof(csm->locale)) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
csm->locCache=0;
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
ucase_getCaseLocale(csm->locale, &csm->locCache);
|
||||
csm->caseLocale=UCASE_LOC_UNKNOWN;
|
||||
csm->caseLocale = ucase_getCaseLocale(csm->locale);
|
||||
} else {
|
||||
csm->locale[0]=0;
|
||||
csm->caseLocale = UCASE_LOC_ROOT;
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode * /*pErrorCode*/) {
|
||||
ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
csm->options=options;
|
||||
}
|
||||
|
||||
@ -258,7 +265,7 @@ utf8_caseContextIterator(void *context, int8_t dir) {
|
||||
* context [0..srcLength[ into account.
|
||||
*/
|
||||
static int32_t
|
||||
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
|
||||
_caseMap(int32_t caseLocale, uint32_t /* TODO: options */, UCaseMapFull *map,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, UCaseContext *csc,
|
||||
int32_t srcStart, int32_t srcLimit,
|
||||
@ -266,9 +273,6 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
|
||||
const UChar *s = NULL;
|
||||
UChar32 c, c2 = 0;
|
||||
int32_t srcIndex, destIndex;
|
||||
int32_t locCache;
|
||||
|
||||
locCache=csm->locCache;
|
||||
|
||||
/* case mapping loop */
|
||||
srcIndex=srcStart;
|
||||
@ -286,7 +290,7 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
|
||||
}
|
||||
continue;
|
||||
}
|
||||
c=map(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &locCache);
|
||||
c=map(c, utf8_caseContextIterator, csc, &s, caseLocale);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
|
||||
/* fast path version of appendResult() for ASCII results */
|
||||
dest[destIndex++]=(uint8_t)c2;
|
||||
@ -308,10 +312,11 @@ _caseMap(const UCaseMap *csm, UCaseMapFull *map,
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
ucasemap_internalUTF8ToTitle(
|
||||
int32_t caseLocale, uint32_t options, BreakIterator *iter,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *s;
|
||||
UChar32 c;
|
||||
int32_t prev, titleStart, titleLimit, idx, destIndex;
|
||||
@ -321,12 +326,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Use the C++ abstract base class to minimize dependencies.
|
||||
// TODO: Change UCaseMap.iter to store a BreakIterator directly.
|
||||
BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
|
||||
|
||||
/* set up local variables */
|
||||
int32_t locCache=csm->locCache;
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
@ -339,9 +339,9 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
/* find next index where to titlecase */
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=FALSE;
|
||||
idx=bi->first();
|
||||
idx=iter->first();
|
||||
} else {
|
||||
idx=bi->next();
|
||||
idx=iter->next();
|
||||
}
|
||||
if(idx==UBRK_DONE || idx>srcLength) {
|
||||
idx=srcLength;
|
||||
@ -364,7 +364,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
titleStart=titleLimit=prev;
|
||||
U8_NEXT(src, titleLimit, idx, c);
|
||||
if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
|
||||
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
|
||||
/* Adjust the titlecasing index (titleStart) to the next cased character. */
|
||||
for(;;) {
|
||||
titleStart=titleLimit;
|
||||
@ -376,7 +376,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
break;
|
||||
}
|
||||
U8_NEXT(src, titleLimit, idx, c);
|
||||
if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
|
||||
if(UCASE_NONE!=ucase_getType(c)) {
|
||||
break; /* cased letter at [titleStart..titleLimit[ */
|
||||
}
|
||||
}
|
||||
@ -392,7 +392,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
if(c>=0) {
|
||||
csc.cpStart=titleStart;
|
||||
csc.cpLimit=titleLimit;
|
||||
c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, &csc, &s, csm->locale, &locCache);
|
||||
c=ucase_toFullTitle(c, utf8_caseContextIterator, &csc, &s, caseLocale);
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
|
||||
} else {
|
||||
// Malformed UTF-8.
|
||||
@ -405,7 +405,7 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
|
||||
/* Special case Dutch IJ titlecasing */
|
||||
if (titleStart+1 < idx &&
|
||||
ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_DUTCH &&
|
||||
caseLocale == UCASE_LOC_DUTCH &&
|
||||
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
|
||||
(src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
|
||||
@ -413,11 +413,11 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
}
|
||||
/* lowercase [titleLimit..index[ */
|
||||
if(titleLimit<idx) {
|
||||
if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
destIndex+=
|
||||
_caseMap(
|
||||
csm, ucase_toFullLower,
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest+destIndex, destCapacity-destIndex,
|
||||
src, &csc,
|
||||
titleLimit, idx,
|
||||
@ -454,11 +454,11 @@ ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
U_NAMESPACE_BEGIN
|
||||
namespace GreekUpper {
|
||||
|
||||
UBool isFollowedByCasedLetter(const UCaseProps *csp, const uint8_t *s, int32_t i, int32_t length) {
|
||||
UBool isFollowedByCasedLetter(const uint8_t *s, int32_t i, int32_t length) {
|
||||
while (i < length) {
|
||||
UChar32 c;
|
||||
U8_NEXT(s, i, length, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(csp, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(c);
|
||||
if ((type & UCASE_IGNORABLE) != 0) {
|
||||
// Case-ignorable, continue with the loop.
|
||||
} else if (type != UCASE_NONE) {
|
||||
@ -471,11 +471,10 @@ UBool isFollowedByCasedLetter(const UCaseProps *csp, const uint8_t *s, int32_t i
|
||||
}
|
||||
|
||||
// Keep this consistent with the UTF-16 version in ustrcase.cpp and the Java version in CaseMap.java.
|
||||
int32_t toUpper(const UCaseMap *csm,
|
||||
int32_t toUpper(int32_t caseLocale, uint32_t /* TODO: options */,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t locCache = UCASE_LOC_GREEK;
|
||||
int32_t destIndex=0;
|
||||
uint32_t state = 0;
|
||||
for (int32_t i = 0; i < srcLength;) {
|
||||
@ -483,7 +482,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
UChar32 c;
|
||||
U8_NEXT(src, nextIndex, srcLength, c);
|
||||
uint32_t nextState = 0;
|
||||
int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(c);
|
||||
if ((type & UCASE_IGNORABLE) != 0) {
|
||||
// c is case-ignorable
|
||||
nextState |= (state & AFTER_CASED);
|
||||
@ -533,7 +532,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
(data & HAS_ACCENT) != 0 &&
|
||||
numYpogegrammeni == 0 &&
|
||||
(state & AFTER_CASED) == 0 &&
|
||||
!isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
|
||||
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
|
||||
// Keep disjunctive "or" with (only) a tonos.
|
||||
// We use the same "word boundary" conditions as for the Final_Sigma test.
|
||||
if (i == nextIndex) {
|
||||
@ -569,7 +568,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
} else if(c>=0) {
|
||||
const UChar *s;
|
||||
UChar32 c2 = 0;
|
||||
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
|
||||
c=ucase_toFullUpper(c, NULL, NULL, &s, caseLocale);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
|
||||
/* fast path version of appendResult() for ASCII results */
|
||||
dest[destIndex++]=(uint8_t)c2;
|
||||
@ -602,7 +601,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
U_NAMESPACE_END
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8ToLower(const UCaseMap *csm,
|
||||
ucasemap_internalUTF8ToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
@ -610,37 +609,35 @@ ucasemap_internalUTF8ToLower(const UCaseMap *csm,
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
return _caseMap(
|
||||
csm, ucase_toFullLower,
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8ToUpper(const UCaseMap *csm,
|
||||
ucasemap_internalUTF8ToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t locCache = csm->locCache;
|
||||
if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
|
||||
return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
|
||||
if (caseLocale == UCASE_LOC_GREEK) {
|
||||
return GreekUpper::toUpper(caseLocale, options, dest, destCapacity, src, srcLength, pErrorCode);
|
||||
}
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
return _caseMap(
|
||||
csm, ucase_toFullUpper,
|
||||
caseLocale, options, ucase_toFullUpper,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
utf8_foldCase(const UCaseProps *csp,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
static int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8Fold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t srcIndex, destIndex;
|
||||
|
||||
const UChar *s;
|
||||
@ -661,7 +658,7 @@ utf8_foldCase(const UCaseProps *csp,
|
||||
}
|
||||
continue;
|
||||
}
|
||||
c=ucase_toFullFolding(csp, c, &s, options);
|
||||
c=ucase_toFullFolding(c, &s, options);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0x7f : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0x7f)) {
|
||||
/* fast path version of appendResult() for ASCII results */
|
||||
dest[destIndex++]=(uint8_t)c2;
|
||||
@ -680,16 +677,8 @@ utf8_foldCase(const UCaseProps *csp,
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8Fold(const UCaseMap *csm,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return utf8_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
ucasemap_mapUTF8(const UCaseMap *csm,
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
@ -723,7 +712,8 @@ ucasemap_mapUTF8(const UCaseMap *csm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
destLength=stringCaseMapper(csm, dest, destCapacity, src, srcLength, pErrorCode);
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
dest, destCapacity, src, srcLength, pErrorCode);
|
||||
return u_terminateChars((char *)dest, destCapacity, destLength, pErrorCode);
|
||||
}
|
||||
|
||||
@ -734,10 +724,11 @@ ucasemap_utf8ToLower(const UCaseMap *csm,
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(csm,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToLower, pErrorCode);
|
||||
return ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToLower, pErrorCode);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
@ -745,10 +736,11 @@ ucasemap_utf8ToUpper(const UCaseMap *csm,
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(csm,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToUpper, pErrorCode);
|
||||
return ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToUpper, pErrorCode);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
@ -756,8 +748,9 @@ ucasemap_utf8FoldCase(const UCaseMap *csm,
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return ucasemap_mapUTF8(csm,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8Fold, pErrorCode);
|
||||
return ucasemap_mapUTF8(
|
||||
UCASE_LOC_ROOT, csm->options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8Fold, pErrorCode);
|
||||
}
|
||||
|
236
icu4c/source/common/ucasemap_imp.h
Normal file
236
icu4c/source/common/ucasemap_imp.h
Normal file
@ -0,0 +1,236 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// ucasemap_imp.h
|
||||
// created: 2017feb08 Markus W. Scherer
|
||||
|
||||
#ifndef __UCASEMAP_IMP_H__
|
||||
#define __UCASEMAP_IMP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "ucase.h"
|
||||
|
||||
#ifndef U_COMPARE_IGNORE_CASE
|
||||
/* see also unorm.h */
|
||||
/**
|
||||
* Option bit for unorm_compare:
|
||||
* Perform case-insensitive comparison.
|
||||
*/
|
||||
#define U_COMPARE_IGNORE_CASE 0x10000
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Internal API, used by u_strcasecmp() etc.
|
||||
* Compare strings case-insensitively,
|
||||
* in code point order or code unit order.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
u_strcmpFold(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API, used for detecting length of
|
||||
* shared prefix case-insensitively.
|
||||
* @param s1 input string 1
|
||||
* @param length1 length of string 1, or -1 (NULL terminated)
|
||||
* @param s2 input string 2
|
||||
* @param length2 length of string 2, or -1 (NULL terminated)
|
||||
* @param options compare options
|
||||
* @param matchLen1 (output) length of partial prefix match in s1
|
||||
* @param matchLen2 (output) length of partial prefix match in s2
|
||||
* @param pErrorCode receives error status
|
||||
*/
|
||||
U_CAPI void
|
||||
u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
int32_t *matchLen1, int32_t *matchLen2,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Are the Unicode properties loaded?
|
||||
* This must be used before internal functions are called that do
|
||||
* not perform this check.
|
||||
* Generate a debug assertion failure if data is not loaded.
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
uprv_haveProperties(UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include "unicode/unistr.h" // for UStringCaseMapper
|
||||
|
||||
/*
|
||||
* Internal string casing functions implementing
|
||||
* ustring.h/ustrcase.cpp and UnicodeString case mapping functions.
|
||||
*/
|
||||
|
||||
struct UCaseMap : public icu::UMemory {
|
||||
/** Implements most of ucasemap_open(). */
|
||||
UCaseMap(const char *localeID, uint32_t opts, UErrorCode *pErrorCode);
|
||||
~UCaseMap();
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
icu::BreakIterator *iter; /* We adopt the iterator, so we own it. */
|
||||
#endif
|
||||
char locale[32];
|
||||
int32_t caseLocale;
|
||||
uint32_t options;
|
||||
};
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION
|
||||
# define UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
# define UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
# define UCASEMAP_BREAK_ITERATOR
|
||||
# define UCASEMAP_BREAK_ITERATOR_NULL
|
||||
#else
|
||||
# define UCASEMAP_BREAK_ITERATOR_PARAM icu::BreakIterator *iter,
|
||||
# define UCASEMAP_BREAK_ITERATOR_UNUSED icu::BreakIterator *,
|
||||
# define UCASEMAP_BREAK_ITERATOR iter,
|
||||
# define UCASEMAP_BREAK_ITERATOR_NULL NULL,
|
||||
#endif
|
||||
|
||||
U_CFUNC int32_t
|
||||
ustrcase_getCaseLocale(const char *locale);
|
||||
|
||||
// TODO: swap src / dest if approved for new public api
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options,
|
||||
icu::BreakIterator *iter,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalFold(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Common string case mapping implementation for ucasemap_toXyz() and UnicodeString::toXyz().
|
||||
* Implements argument checking.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UStringCaseMapper *stringCaseMapper,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Common string case mapping implementation for old-fashioned u_strToXyz() functions
|
||||
* that allow the source string to overlap the destination buffer.
|
||||
* Implements argument checking and internally works with an intermediate buffer if necessary.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UStringCaseMapper *stringCaseMapper,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
|
||||
* UTF-8 version of UStringCaseMapper.
|
||||
* All error checking must be done.
|
||||
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
|
||||
* src and dest must not overlap.
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UTF8CaseMapper(int32_t caseLocale, uint32_t options,
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
icu::BreakIterator *iter,
|
||||
#endif
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/** Implements UTF8CaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(int32_t caseLocale, uint32_t options,
|
||||
icu::BreakIterator *iter,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Implements argument checking and buffer handling
|
||||
* for UTF-8 string case mapping as a common function.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
namespace GreekUpper {
|
||||
|
||||
// Data bits.
|
||||
static const uint32_t UPPER_MASK = 0x3ff;
|
||||
static const uint32_t HAS_VOWEL = 0x1000;
|
||||
static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
|
||||
static const uint32_t HAS_ACCENT = 0x4000;
|
||||
static const uint32_t HAS_DIALYTIKA = 0x8000;
|
||||
// Further bits during data building and processing, not stored in the data map.
|
||||
static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
|
||||
static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
|
||||
|
||||
static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
|
||||
static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
|
||||
HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
|
||||
static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
|
||||
|
||||
// State bits.
|
||||
static const uint32_t AFTER_CASED = 1;
|
||||
static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
|
||||
|
||||
uint32_t getLetterData(UChar32 c);
|
||||
|
||||
/**
|
||||
* Returns a non-zero value for each of the Greek combining diacritics
|
||||
* listed in The Unicode Standard, version 8, chapter 7.2 Greek,
|
||||
* plus some perispomeni look-alikes.
|
||||
*/
|
||||
uint32_t getDiacriticData(UChar32 c);
|
||||
|
||||
} // namespace GreekUpper
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
#endif // __UCASEMAP_IMP_H__
|
@ -26,20 +26,22 @@
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucase.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
U_CAPI const UBreakIterator * U_EXPORT2
|
||||
ucasemap_getBreakIterator(const UCaseMap *csm) {
|
||||
return csm->iter;
|
||||
return reinterpret_cast<UBreakIterator *>(csm->iter);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode * /*pErrorCode*/) {
|
||||
// Do not call ubrk_close() so that we do not depend on all of the BreakIterator code.
|
||||
delete reinterpret_cast<BreakIterator *>(csm->iter);
|
||||
csm->iter=iterToAdopt;
|
||||
ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
delete csm->iter;
|
||||
csm->iter=reinterpret_cast<BreakIterator *>(iterToAdopt);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
@ -47,21 +49,23 @@ ucasemap_utf8ToTitle(UCaseMap *csm,
|
||||
char *dest, int32_t destCapacity,
|
||||
const char *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
UText utext=UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UText utext=UTEXT_INITIALIZER;
|
||||
utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
|
||||
if(csm->iter==NULL) {
|
||||
csm->iter=ubrk_open(UBRK_WORD, csm->locale,
|
||||
NULL, 0,
|
||||
pErrorCode);
|
||||
csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
|
||||
}
|
||||
ubrk_setUText(csm->iter, &utext, pErrorCode);
|
||||
int32_t length=ucasemap_mapUTF8(csm,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToTitle, pErrorCode);
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
csm->iter->setText(&utext, *pErrorCode);
|
||||
int32_t length=ucasemap_mapUTF8(
|
||||
csm->caseLocale, csm->options, csm->iter,
|
||||
(uint8_t *)dest, destCapacity,
|
||||
(const uint8_t *)src, srcLength,
|
||||
ucasemap_internalUTF8ToTitle, pErrorCode);
|
||||
utext_close(&utext);
|
||||
return length;
|
||||
}
|
||||
|
@ -23,6 +23,7 @@
|
||||
#if !UCONFIG_NO_CONVERSION && !UCONFIG_NO_LEGACY_CONVERSION
|
||||
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ucnv_ext.h"
|
||||
|
197
icu4c/source/common/unicode/casemap.h
Normal file
197
icu4c/source/common/unicode/casemap.h
Normal file
@ -0,0 +1,197 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// casemap.h
|
||||
// created: 2017jan12 Markus W. Scherer
|
||||
|
||||
#ifndef __CASEMAP_H__
|
||||
#define __CASEMAP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: Low-level C++ case mapping functions.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
class BreakIterator;
|
||||
class Edits;
|
||||
|
||||
/**
|
||||
* Low-level C++ case mapping functions.
|
||||
*
|
||||
* @draft ICU 59
|
||||
*/
|
||||
class U_COMMON_API CaseMap final : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Lowercases a UTF-16 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
* the buffer is large enough.
|
||||
* The contents are undefined in case of failure.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the result
|
||||
* without writing any of the result string.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
* When the result would be longer than destCapacity,
|
||||
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
|
||||
*
|
||||
* @see u_strToLower
|
||||
* @draft ICU 59
|
||||
*/
|
||||
static int32_t toLower(
|
||||
const char *locale, uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Uppercases a UTF-16 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
* the buffer is large enough.
|
||||
* The contents are undefined in case of failure.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the result
|
||||
* without writing any of the result string.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
* When the result would be longer than destCapacity,
|
||||
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
|
||||
*
|
||||
* @see u_strToUpper
|
||||
* @draft ICU 59
|
||||
*/
|
||||
static int32_t toUpper(
|
||||
const char *locale, uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* Titlecases a UTF-16 string and optionally records edits.
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* Titlecasing uses a break iterator to find the first characters of words
|
||||
* that are to be titlecased. It titlecases those characters and lowercases
|
||||
* all others. (This can be modified with options bits.)
|
||||
*
|
||||
* @param locale The locale ID. ("" = root locale, NULL = default locale.)
|
||||
* @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
|
||||
* U_TITLECASE_NO_LOWERCASE, U_TITLECASE_NO_BREAK_ADJUSTMENT.
|
||||
* @param iter A break iterator to find the first characters of words that are to be titlecased.
|
||||
* It is set to the source string (setText())
|
||||
* and used one or more times for iteration (first() and next()).
|
||||
* If NULL, then a word break iterator for the locale is used
|
||||
* (or something equivalent).
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
* the buffer is large enough.
|
||||
* The contents are undefined in case of failure.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the result
|
||||
* without writing any of the result string.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
* When the result would be longer than destCapacity,
|
||||
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
|
||||
*
|
||||
* @see u_strToTitle
|
||||
* @see ucasemap_toTitle
|
||||
* @draft ICU 59
|
||||
*/
|
||||
static int32_t toTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
#endif // UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* Case-folds a UTF-16 string and optionally records edits.
|
||||
*
|
||||
* Case folding is locale-independent and not context-sensitive,
|
||||
* but there is an option for whether to include or exclude mappings for dotted I
|
||||
* and dotless i that are marked with 'T' in CaseFolding.txt.
|
||||
*
|
||||
* The result may be longer or shorter than the original.
|
||||
* The source string and the destination buffer must not overlap.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See UCASEMAP_OMIT_UNCHANGED_TEXT,
|
||||
* U_FOLD_CASE_DEFAULT, U_FOLD_CASE_EXCLUDE_SPECIAL_I.
|
||||
* @param src The original string.
|
||||
* @param srcLength The length of the original string. If -1, then src must be NUL-terminated.
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
* the buffer is large enough.
|
||||
* The contents are undefined in case of failure.
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the result
|
||||
* without writing any of the result string.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits->reset() first. edits can be NULL.
|
||||
* @param errorCode Reference to an in/out error code value
|
||||
* which must not indicate a failure before the function call.
|
||||
* @return The length of the result string, if successful.
|
||||
* When the result would be longer than destCapacity,
|
||||
* the full length is returned and a U_BUFFER_OVERFLOW_ERROR is set.
|
||||
*
|
||||
* @see u_strFoldCase
|
||||
* @draft ICU 59
|
||||
*/
|
||||
static int32_t fold(
|
||||
uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
private:
|
||||
CaseMap() = delete;
|
||||
CaseMap(const CaseMap &other) = delete;
|
||||
CaseMap &operator=(const CaseMap &other) = delete;
|
||||
};
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __CASEMAP_H__
|
245
icu4c/source/common/unicode/edits.h
Normal file
245
icu4c/source/common/unicode/edits.h
Normal file
@ -0,0 +1,245 @@
|
||||
// © 2016 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
// edits.h
|
||||
// created: 2016dec30 Markus W. Scherer
|
||||
|
||||
#ifndef __EDITS_H__
|
||||
#define __EDITS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: C++ class Edits for low-level string transformations on styled text.
|
||||
*/
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Records lengths of string edits but not replacement text.
|
||||
* Supports replacements, insertions, deletions in linear progression.
|
||||
* Does not support moving/reordering of text.
|
||||
*
|
||||
* An Edits object tracks a separate UErrorCode, but ICU string transformation functions
|
||||
* (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
|
||||
*
|
||||
* @draft ICU 59
|
||||
*/
|
||||
class U_COMMON_API Edits final : public UMemory {
|
||||
public:
|
||||
/**
|
||||
* Constructs an empty object.
|
||||
* The object starts out using its built-in stack array (STACK_CAPACITY units),
|
||||
* with no recorded units, a length delta of 0, and U_ZERO_ERROR.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Edits() :
|
||||
array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0),
|
||||
errorCode(U_ZERO_ERROR) {}
|
||||
/**
|
||||
* Destructor.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
~Edits();
|
||||
|
||||
/**
|
||||
* Resets the data but may not release memory.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
void reset();
|
||||
|
||||
/**
|
||||
* Adds a record for an unchanged segment of text.
|
||||
* Normally called from inside ICU string transformation functions, not user code.
|
||||
* @param unchangedLength length of the unchanged text segment to record
|
||||
* @draft ICU 59
|
||||
*/
|
||||
void addUnchanged(int32_t unchangedLength);
|
||||
/**
|
||||
* Adds a record for a text replacement/insertion/deletion.
|
||||
* Normally called from inside ICU string transformation functions, not user code.
|
||||
* @param oldLength number of units of the original text that are replaced
|
||||
* @param newLength number of units of the replacement text
|
||||
* @draft ICU 59
|
||||
*/
|
||||
void addReplace(int32_t oldLength, int32_t newLength);
|
||||
/**
|
||||
* Sets the UErrorCode if an error occurred while recording edits.
|
||||
* Preserves older error codes in the outErrorCode.
|
||||
* Normally called from inside ICU string transformation functions, not user code.
|
||||
* @param outErrorCode the caller's error code; an older error code stored in it is preserved
|
||||
* @return TRUE if U_FAILURE(outErrorCode)
|
||||
* @draft ICU 59
|
||||
*/
|
||||
UBool copyErrorTo(UErrorCode &outErrorCode);
|
||||
|
||||
/**
|
||||
* How much longer is the new text compared with the old text?
|
||||
* @return new length minus old length
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t lengthDelta() const { return delta; }
|
||||
/**
|
||||
* @return TRUE if there are any change edits
|
||||
* @draft ICU 59
|
||||
*/
|
||||
UBool hasChanges() const;
|
||||
|
||||
/**
|
||||
* Access to the list of edits.
|
||||
* Instances are obtained from the get*Iterator() functions of Edits;
|
||||
* the only public constructor is the copy constructor.
|
||||
* @see getCoarseIterator
|
||||
* @see getFineIterator
|
||||
* @draft ICU 59
|
||||
*/
|
||||
struct Iterator final : public UMemory {
|
||||
/**
|
||||
* Copy constructor.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator(const Iterator &other) = default;
|
||||
/**
|
||||
* Assignment operator.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator &operator=(const Iterator &other) = default;
|
||||
|
||||
/**
|
||||
* Advances to the next edit.
|
||||
* Whether non-changes are skipped is fixed by the function that created this iterator.
|
||||
* @param errorCode ICU error code, passed through to the internal next();
|
||||
* callers should check U_FAILURE() after iterating.
|
||||
* NOTE(review): presumably follows the usual ICU in/out convention - confirm in edits.cpp
|
||||
* @return TRUE if there is another edit
|
||||
* @draft ICU 59
|
||||
*/
|
||||
UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
|
||||
|
||||
/**
|
||||
* Finds the edit that contains the source index.
|
||||
* The source index may be found in a non-change
|
||||
* even if normal iteration would skip non-changes.
|
||||
* Normal iteration can continue from a found edit.
|
||||
*
|
||||
* The iterator state before this search logically does not matter.
|
||||
* (It may affect the performance of the search.)
|
||||
*
|
||||
* The iterator state after this search is undefined
|
||||
* if the source index is out of bounds for the source string.
|
||||
*
|
||||
* @param i source index
|
||||
* @param errorCode ICU error code.
|
||||
* NOTE(review): presumably follows the usual ICU in/out convention - confirm in edits.cpp
|
||||
* @return TRUE if the edit for the source index was found
|
||||
* @draft ICU 59
|
||||
*/
|
||||
UBool findSourceIndex(int32_t i, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* @return TRUE if this edit replaces oldLength() units with newLength() different ones.
|
||||
* FALSE if oldLength units remain unchanged.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
UBool hasChange() const { return changed; }
|
||||
/**
|
||||
* @return the number of units in the original string which are replaced or remain unchanged.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t oldLength() const { return oldLength_; }
|
||||
/**
|
||||
* @return the number of units in the modified string, if hasChange() is TRUE.
|
||||
* Same as oldLength if hasChange() is FALSE.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t newLength() const { return newLength_; }
|
||||
|
||||
/**
|
||||
* @return the current index into the source string
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t sourceIndex() const { return srcIndex; }
|
||||
/**
|
||||
* @return the current index into the replacement-characters-only string,
|
||||
* not counting unchanged spans
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t replacementIndex() const { return replIndex; }
|
||||
/**
|
||||
* @return the current index into the full destination string
|
||||
* @draft ICU 59
|
||||
*/
|
||||
int32_t destinationIndex() const { return destIndex; }
|
||||
|
||||
private:
|
||||
friend class Edits;  // Edits creates iterators through the private constructor below.
|
||||
|
||||
Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);  // a/len: the Edits array and its length; oc: only-changes flag; crs: coarse flag (see the get*Iterator() factories of Edits)
|
||||
|
||||
int32_t readLength(int32_t head);
|
||||
void updateIndexes();
|
||||
UBool noNext();
|
||||
UBool next(UBool onlyChanges, UErrorCode &errorCode);  // called by the public next() with onlyChanges_
|
||||
|
||||
const uint16_t *array;  // read-only view of the edit units being iterated
|
||||
int32_t index, length;
|
||||
int32_t remaining;
|
||||
UBool onlyChanges_, coarse;  // iteration mode flags
|
||||
|
||||
UBool changed;  // returned by hasChange()
|
||||
int32_t oldLength_, newLength_;  // returned by oldLength() and newLength()
|
||||
int32_t srcIndex, replIndex, destIndex;  // returned by sourceIndex(), replacementIndex(), destinationIndex()
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns an Iterator for coarse-grained changes for simple string updates.
|
||||
* Skips non-changes.
|
||||
* @return an Iterator that merges adjacent changes.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator getCoarseChangesIterator() const {
|
||||
return Iterator(array, length, TRUE, TRUE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
|
||||
* @return an Iterator that merges adjacent changes.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator getCoarseIterator() const {
|
||||
return Iterator(array, length, FALSE, TRUE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an Iterator for fine-grained changes for modifying styled text.
|
||||
* Skips non-changes.
|
||||
* @return an Iterator that separates adjacent changes.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator getFineChangesIterator() const {
|
||||
return Iterator(array, length, TRUE, FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
|
||||
* @return an Iterator that separates adjacent changes.
|
||||
* @draft ICU 59
|
||||
*/
|
||||
Iterator getFineIterator() const {
|
||||
return Iterator(array, length, FALSE, FALSE);
|
||||
}
|
||||
|
||||
private:
|
||||
Edits(const Edits &) = delete;  // not copyable
|
||||
Edits &operator=(const Edits &) = delete;  // not assignable
|
||||
|
||||
void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }  // no bounds check: requires length > 0
|
||||
int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }  // 0xffff when no unit has been recorded
|
||||
|
||||
void append(int32_t r);
|
||||
UBool growArray();
|
||||
|
||||
static const int32_t STACK_CAPACITY = 100;  // number of units in stackArray
|
||||
uint16_t *array;  // initially points to stackArray (see the constructor)
|
||||
int32_t capacity;  // initially STACK_CAPACITY
|
||||
int32_t length;  // number of units in use; passed to Iterator together with array
|
||||
int32_t delta;  // returned by lengthDelta()
|
||||
UErrorCode errorCode;  // starts as U_ZERO_ERROR; presumably what copyErrorTo() reports - confirm in edits.cpp
|
||||
uint16_t stackArray[STACK_CAPACITY];
|
||||
};
|
||||
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __EDITS_H__
|
@ -33,10 +33,8 @@
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/strenum.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
@ -48,6 +46,9 @@ U_NAMESPACE_BEGIN
|
||||
// Forward Declarations
|
||||
void U_CALLCONV locale_available_init(); /**< @internal */
|
||||
|
||||
class StringEnumeration;
|
||||
class UnicodeString;
|
||||
|
||||
/**
|
||||
* A <code>Locale</code> object represents a specific geographical, political,
|
||||
* or cultural region. An operation that requires a <code>Locale</code> to perform
|
||||
|
@ -22,8 +22,8 @@
|
||||
#define __UCASEMAP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
@ -185,6 +185,15 @@ ucasemap_setOptions(UCaseMap *csm, uint32_t options, UErrorCode *pErrorCode);
|
||||
*/
|
||||
#define U_TITLECASE_NO_BREAK_ADJUSTMENT 0x200
|
||||
|
||||
/**
|
||||
* Omit unchanged text when case-mapping with Edits.
|
||||
*
|
||||
* @see CaseMap
|
||||
* @see Edits
|
||||
* @draft ICU 59
|
||||
*/
|
||||
#define UCASEMAP_OMIT_UNCHANGED_TEXT 0x4000
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
@ -253,7 +262,7 @@ ucasemap_setBreakIterator(UCaseMap *csm, UBreakIterator *iterToAdopt, UErrorCode
|
||||
* @param dest A buffer for the result string. The result will be NUL-terminated if
|
||||
* the buffer is large enough.
|
||||
* The contents are undefined in case of failure.
|
||||
* @param destCapacity The size of the buffer (number of bytes). If it is 0, then
|
||||
* @param destCapacity The size of the buffer (number of UChars). If it is 0, then
|
||||
* dest may be NULL and the function will only return the length of the result
|
||||
* without writing any of the result string.
|
||||
* @param src The original string.
|
||||
@ -272,7 +281,7 @@ ucasemap_toTitle(UCaseMap *csm,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
#endif // UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/**
|
||||
* Lowercase the characters in a UTF-8 string.
|
||||
|
@ -23,7 +23,9 @@
|
||||
#include "unicode/localpointer.h"
|
||||
|
||||
#if U_SHOW_CPLUSPLUS_API
|
||||
#include "unicode/strenum.h"
|
||||
U_NAMESPACE_BEGIN
|
||||
class StringEnumeration;
|
||||
U_NAMESPACE_END
|
||||
#endif
|
||||
|
||||
/**
|
||||
|
@ -33,7 +33,6 @@
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/stringpiece.h"
|
||||
#include "unicode/bytestream.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
|
||||
struct UConverter; // unicode/ucnv.h
|
||||
|
||||
@ -55,30 +54,34 @@ U_STABLE int32_t U_EXPORT2
|
||||
u_strlen(const UChar *s);
|
||||
#endif
|
||||
|
||||
/**
|
||||
* \def U_STRING_CASE_MAPPER_DEFINED
|
||||
* @internal
|
||||
*/
|
||||
#ifndef U_STRING_CASE_MAPPER_DEFINED
|
||||
#define U_STRING_CASE_MAPPER_DEFINED
|
||||
|
||||
/**
|
||||
* Internal string case mapping function type.
|
||||
* @internal
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UStringCaseMapper(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
class BreakIterator; // unicode/brkiter.h
|
||||
#endif
|
||||
class Edits;
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// Not #ifndef U_HIDE_INTERNAL_API because UnicodeString needs the UStringCaseMapper.
|
||||
/**
|
||||
* Internal string case mapping function type.
|
||||
* All error checking must be done.
|
||||
* src and dest must not overlap.
|
||||
* @internal
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UStringCaseMapper(int32_t caseLocale, uint32_t options,
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
icu::BreakIterator *iter,
|
||||
#endif
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Locale; // unicode/locid.h
|
||||
class StringCharacterIterator;
|
||||
class UnicodeStringAppendable; // unicode/appendable.h
|
||||
@ -3592,7 +3595,11 @@ private:
|
||||
* as in ustr_imp.h for ustrcase_map().
|
||||
*/
|
||||
UnicodeString &
|
||||
caseMap(const UCaseMap *csm, UStringCaseMapper *stringCaseMapper);
|
||||
caseMap(int32_t caseLocale, uint32_t options,
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
BreakIterator *iter,
|
||||
#endif
|
||||
UStringCaseMapper *stringCaseMapper);
|
||||
|
||||
// ref counting
|
||||
void addRef(void);
|
||||
|
@ -184,7 +184,6 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
return *this;
|
||||
}
|
||||
if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
|
||||
const UCaseProps *csp = ucase_getSingleton();
|
||||
{
|
||||
UnicodeSet foldSet(*this);
|
||||
UnicodeString str;
|
||||
@ -207,7 +206,6 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
int32_t n = getRangeCount();
|
||||
UChar32 result;
|
||||
const UChar *full;
|
||||
int32_t locCache = 0;
|
||||
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
@ -216,22 +214,22 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
if (attribute & USET_CASE_INSENSITIVE) {
|
||||
// full case closure
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
ucase_addCaseClosure(csp, cp, &sa);
|
||||
ucase_addCaseClosure(cp, &sa);
|
||||
}
|
||||
} else {
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
result = ucase_toFullLower(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
result = ucase_toFullTitle(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
result = ucase_toFullUpper(cp, NULL, NULL, &full, UCASE_LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullFolding(csp, cp, &full, 0);
|
||||
result = ucase_toFullFolding(cp, &full, 0);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
}
|
||||
}
|
||||
@ -241,7 +239,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
str = *(const UnicodeString *) strings->elementAt(j);
|
||||
str.foldCase();
|
||||
if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
|
||||
if(!ucase_addStringCaseClosure(str.getBuffer(), str.length(), &sa)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
|
@ -195,7 +195,7 @@ void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
|
||||
if(U_SUCCESS(status)) {
|
||||
impl->addPropertyStarts(&sa, status);
|
||||
}
|
||||
ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
|
||||
ucase_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
}
|
||||
case UPROPS_SRC_NFC: {
|
||||
@ -228,7 +228,7 @@ void U_CALLCONV UnicodeSet_initInclusion(int32_t src, UErrorCode &status) {
|
||||
}
|
||||
#endif
|
||||
case UPROPS_SRC_CASE:
|
||||
ucase_addPropertyStarts(ucase_getSingleton(), &sa, &status);
|
||||
ucase_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
case UPROPS_SRC_BIDI:
|
||||
ubidi_addPropertyStarts(ubidi_getSingleton(), &sa, &status);
|
||||
|
@ -19,14 +19,17 @@
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "uassert.h"
|
||||
#include "ucasemap_imp.h"
|
||||
#include "uelement.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -87,56 +90,104 @@ UnicodeString::doCaseCompare(int32_t start,
|
||||
//========================================
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::caseMap(const UCaseMap *csm,
|
||||
UnicodeString::caseMap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UStringCaseMapper *stringCaseMapper) {
|
||||
if(isEmpty() || !isWritable()) {
|
||||
// nothing to do
|
||||
return *this;
|
||||
}
|
||||
|
||||
UChar oldBuffer[2 * US_STACKBUF_SIZE];
|
||||
UChar *oldArray;
|
||||
int32_t oldLength = length();
|
||||
int32_t newLength;
|
||||
UBool writable = isBufferWritable();
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
|
||||
// Try to avoid heap-allocating a new character array for this string.
|
||||
if (writable ? oldLength <= UPRV_LENGTHOF(oldBuffer) : oldLength < US_STACKBUF_SIZE) {
|
||||
// Short string: Copy the contents into a temporary buffer and
|
||||
// case-map back into the current array, or into the stack buffer.
|
||||
UChar *buffer = getArrayStart();
|
||||
int32_t capacity;
|
||||
oldArray = oldBuffer;
|
||||
u_memcpy(oldBuffer, buffer, oldLength);
|
||||
if (writable) {
|
||||
capacity = getCapacity();
|
||||
} else {
|
||||
// Switch from the read-only alias or shared heap buffer to the stack buffer.
|
||||
if (!cloneArrayIfNeeded(US_STACKBUF_SIZE, US_STACKBUF_SIZE, /* doCopyArray= */ FALSE)) {
|
||||
return *this;
|
||||
}
|
||||
U_ASSERT(fUnion.fFields.fLengthAndFlags & kUsingStackBuffer);
|
||||
buffer = fUnion.fStackFields.fBuffer;
|
||||
capacity = US_STACKBUF_SIZE;
|
||||
}
|
||||
newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
buffer, capacity,
|
||||
oldArray, oldLength, NULL, errorCode);
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
setLength(newLength);
|
||||
return *this;
|
||||
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// common overflow handling below
|
||||
} else {
|
||||
setToBogus();
|
||||
return *this;
|
||||
}
|
||||
} else {
|
||||
// Longer string or read-only buffer:
|
||||
// Collect only changes and then apply them to this string.
|
||||
// Case mapping often changes only small parts of a string,
|
||||
// and often does not change its length.
|
||||
oldArray = getArrayStart();
|
||||
Edits edits;
|
||||
UChar replacementChars[200];
|
||||
stringCaseMapper(caseLocale, options | UCASEMAP_OMIT_UNCHANGED_TEXT, UCASEMAP_BREAK_ITERATOR
|
||||
replacementChars, UPRV_LENGTHOF(replacementChars),
|
||||
oldArray, oldLength, &edits, errorCode);
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
// Grow the buffer at most once, not for multiple doReplace() calls.
|
||||
newLength = oldLength + edits.lengthDelta();
|
||||
if (newLength > oldLength && !cloneArrayIfNeeded(newLength, newLength)) {
|
||||
return *this;
|
||||
}
|
||||
for (Edits::Iterator ei = edits.getCoarseChangesIterator(); ei.next(errorCode);) {
|
||||
doReplace(ei.destinationIndex(), ei.oldLength(),
|
||||
replacementChars, ei.replacementIndex(), ei.newLength());
|
||||
}
|
||||
if (U_FAILURE(errorCode)) {
|
||||
setToBogus();
|
||||
}
|
||||
return *this;
|
||||
} else if (errorCode == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// common overflow handling below
|
||||
newLength = oldLength + edits.lengthDelta();
|
||||
} else {
|
||||
setToBogus();
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle buffer overflow, newLength is known.
|
||||
// We need to allocate a new buffer for the internal string case mapping function.
|
||||
// This is very similar to how doReplace() keeps the old array pointer
|
||||
// and deletes the old array itself after it is done.
|
||||
// In addition, we are forcing cloneArrayIfNeeded() to always allocate a new array.
|
||||
UChar oldStackBuffer[US_STACKBUF_SIZE];
|
||||
UChar *oldArray;
|
||||
int32_t oldLength;
|
||||
|
||||
if(fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) {
|
||||
// copy the stack buffer contents because it will be overwritten
|
||||
oldArray = oldStackBuffer;
|
||||
oldLength = getShortLength();
|
||||
u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
|
||||
} else {
|
||||
oldArray = getArrayStart();
|
||||
oldLength = length();
|
||||
}
|
||||
|
||||
int32_t capacity;
|
||||
if(oldLength <= US_STACKBUF_SIZE) {
|
||||
capacity = US_STACKBUF_SIZE;
|
||||
} else {
|
||||
capacity = oldLength + 20;
|
||||
}
|
||||
int32_t *bufferToDelete = 0;
|
||||
if(!cloneArrayIfNeeded(capacity, capacity, FALSE, &bufferToDelete, TRUE)) {
|
||||
if (!cloneArrayIfNeeded(newLength, newLength, FALSE, &bufferToDelete, TRUE)) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Case-map, and if the result is too long, then reallocate and repeat.
|
||||
UErrorCode errorCode;
|
||||
int32_t newLength;
|
||||
do {
|
||||
errorCode = U_ZERO_ERROR;
|
||||
newLength = stringCaseMapper(csm, getArrayStart(), getCapacity(),
|
||||
oldArray, oldLength, &errorCode);
|
||||
setLength(newLength);
|
||||
} while(errorCode==U_BUFFER_OVERFLOW_ERROR && cloneArrayIfNeeded(newLength, newLength, FALSE));
|
||||
|
||||
errorCode = U_ZERO_ERROR;
|
||||
newLength = stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
getArrayStart(), getCapacity(),
|
||||
oldArray, oldLength, NULL, errorCode);
|
||||
if (bufferToDelete) {
|
||||
uprv_free(bufferToDelete);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
setLength(newLength);
|
||||
} else {
|
||||
setToBogus();
|
||||
}
|
||||
return *this;
|
||||
@ -144,10 +195,7 @@ UnicodeString::caseMap(const UCaseMap *csm,
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::foldCase(uint32_t options) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
csm.csp=ucase_getSingleton();
|
||||
csm.options=options;
|
||||
return caseMap(&csm, ustrcase_internalFold);
|
||||
return caseMap(UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalFold);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -19,9 +19,9 @@
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -29,44 +29,28 @@ U_NAMESPACE_BEGIN
|
||||
// Write implementation
|
||||
//========================================
|
||||
|
||||
/*
|
||||
* Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
|
||||
* Do this fast because it is called with every function call.
|
||||
*/
|
||||
static inline void
|
||||
setTempCaseMap(UCaseMap *csm, const char *locale) {
|
||||
if(csm->csp==NULL) {
|
||||
csm->csp=ucase_getSingleton();
|
||||
}
|
||||
if(locale!=NULL && locale[0]==0) {
|
||||
csm->locale[0]=0;
|
||||
} else {
|
||||
ustrcase_setTempCaseMapLocale(csm, locale);
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toLower() {
|
||||
return toLower(Locale::getDefault());
|
||||
return caseMap(ustrcase_getCaseLocale(NULL), 0,
|
||||
UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalToLower);
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toLower(const Locale &locale) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
setTempCaseMap(&csm, locale.getName());
|
||||
return caseMap(&csm, ustrcase_internalToLower);
|
||||
return caseMap(ustrcase_getCaseLocale(locale.getBaseName()), 0,
|
||||
UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalToLower);
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toUpper() {
|
||||
return toUpper(Locale::getDefault());
|
||||
return caseMap(ustrcase_getCaseLocale(NULL), 0,
|
||||
UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalToUpper);
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toUpper(const Locale &locale) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
setTempCaseMap(&csm, locale.getName());
|
||||
return caseMap(&csm, ustrcase_internalToUpper);
|
||||
return caseMap(ustrcase_getCaseLocale(locale.getBaseName()), 0,
|
||||
UCASEMAP_BREAK_ITERATOR_NULL ustrcase_internalToUpper);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -22,36 +22,10 @@
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
unistr_case_internalToTitle(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
ubrk_setText(csm->iter, src, srcLength, pErrorCode);
|
||||
return ustrcase_internalToTitle(csm, dest, destCapacity, src, srcLength, pErrorCode);
|
||||
}
|
||||
|
||||
/*
|
||||
* Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
|
||||
* Do this fast because it is called with every function call.
|
||||
*/
|
||||
static inline void
|
||||
setTempCaseMap(UCaseMap *csm, const char *locale) {
|
||||
if(csm->csp==NULL) {
|
||||
csm->csp=ucase_getSingleton();
|
||||
}
|
||||
if(locale!=NULL && locale[0]==0) {
|
||||
csm->locale[0]=0;
|
||||
} else {
|
||||
ustrcase_setTempCaseMapLocale(csm, locale);
|
||||
}
|
||||
}
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -67,9 +41,6 @@ UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale) {
|
||||
|
||||
UnicodeString &
|
||||
UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t options) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
csm.options=options;
|
||||
setTempCaseMap(&csm, locale.getName());
|
||||
BreakIterator *bi=titleIter;
|
||||
if(bi==NULL) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
@ -79,8 +50,8 @@ UnicodeString::toTitle(BreakIterator *titleIter, const Locale &locale, uint32_t
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
csm.iter=reinterpret_cast<UBreakIterator *>(bi);
|
||||
caseMap(&csm, unistr_case_internalToTitle);
|
||||
bi->setText(*this);
|
||||
caseMap(ustrcase_getCaseLocale(locale.getBaseName()), options, bi, ustrcase_internalToTitle);
|
||||
if(titleIter==NULL) {
|
||||
delete bi;
|
||||
}
|
||||
|
@ -145,7 +145,6 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
const Normalizer2Impl *nfcImpl;
|
||||
const UCaseProps *csp;
|
||||
|
||||
/* current-level start/limit - s1/s2 as current */
|
||||
const UChar *start1, *start2, *limit1, *limit2;
|
||||
@ -183,11 +182,6 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
||||
} else {
|
||||
nfcImpl=NULL;
|
||||
}
|
||||
if((options&U_COMPARE_IGNORE_CASE)!=0) {
|
||||
csp=ucase_getSingleton();
|
||||
} else {
|
||||
csp=NULL;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
@ -319,7 +313,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
||||
*/
|
||||
|
||||
if( level1==0 && (options&U_COMPARE_IGNORE_CASE) &&
|
||||
(length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
|
||||
(length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
|
||||
) {
|
||||
/* cp1 case-folds to the code point "length" or to p[length] */
|
||||
if(U_IS_SURROGATE(c1)) {
|
||||
@ -364,7 +358,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
||||
}
|
||||
|
||||
if( level2==0 && (options&U_COMPARE_IGNORE_CASE) &&
|
||||
(length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
|
||||
(length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
|
||||
) {
|
||||
/* cp2 case-folds to the code point "length" or to p[length] */
|
||||
if(U_IS_SURROGATE(c2)) {
|
||||
|
@ -128,9 +128,8 @@ static UBool changesWhenCasefolded(const BinaryProperty &/*prop*/, UChar32 c, UP
|
||||
}
|
||||
if(c>=0) {
|
||||
/* single code point */
|
||||
const UCaseProps *csp=ucase_getSingleton();
|
||||
const UChar *resultString;
|
||||
return (UBool)(ucase_toFullFolding(csp, c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
|
||||
return (UBool)(ucase_toFullFolding(c, &resultString, U_FOLD_CASE_DEFAULT)>=0);
|
||||
} else {
|
||||
/* guess some large but stack-friendly capacity */
|
||||
UChar dest[2*UCASE_MAX_STRING_LENGTH];
|
||||
@ -576,14 +575,13 @@ u_getFC_NFKC_Closure(UChar32 c, UChar *dest, int32_t destCapacity, UErrorCode *p
|
||||
// case folding and NFKC.)
|
||||
// For the derivation, see Unicode's DerivedNormalizationProps.txt.
|
||||
const Normalizer2 *nfkc=Normalizer2::getNFKCInstance(*pErrorCode);
|
||||
const UCaseProps *csp=ucase_getSingleton();
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
// first: b = NFKC(Fold(a))
|
||||
UnicodeString folded1String;
|
||||
const UChar *folded1;
|
||||
int32_t folded1Length=ucase_toFullFolding(csp, c, &folded1, U_FOLD_CASE_DEFAULT);
|
||||
int32_t folded1Length=ucase_toFullFolding(c, &folded1, U_FOLD_CASE_DEFAULT);
|
||||
if(folded1Length<0) {
|
||||
const Normalizer2Impl *nfkcImpl=Normalizer2Factory::getImpl(nfkc);
|
||||
if(nfkcImpl->getCompQuickCheck(nfkcImpl->getNorm16(c))!=UNORM_NO) {
|
||||
|
@ -18,23 +18,6 @@
|
||||
#define __USTR_IMP_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "ucase.h"
|
||||
|
||||
/** Simple declaration to avoid including unicode/ubrk.h. */
|
||||
#ifndef UBRK_TYPEDEF_UBREAK_ITERATOR
|
||||
# define UBRK_TYPEDEF_UBREAK_ITERATOR
|
||||
typedef struct UBreakIterator UBreakIterator;
|
||||
#endif
|
||||
|
||||
#ifndef U_COMPARE_IGNORE_CASE
|
||||
/* see also unorm.h */
|
||||
/**
|
||||
* Option bit for unorm_compare:
|
||||
* Perform case-insensitive comparison.
|
||||
*/
|
||||
#define U_COMPARE_IGNORE_CASE 0x10000
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for strncmp style.
|
||||
@ -53,211 +36,6 @@ uprv_strCompare(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
UBool strncmpStyle, UBool codePointOrder);
|
||||
|
||||
/**
|
||||
* Internal API, used by u_strcasecmp() etc.
|
||||
* Compare strings case-insensitively,
|
||||
* in code point order or code unit order.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
u_strcmpFold(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API, used for detecting length of
|
||||
* shared prefix case-insensitively.
|
||||
* @param s1 input string 1
|
||||
* @param length1 length of string 1, or -1 (NULL terminated)
|
||||
* @param s2 input string 2
|
||||
* @param length2 length of string 2, or -1 (NULL terminated)
|
||||
* @param options compare options
|
||||
* @param matchLen1 (output) length of partial prefix match in s1
|
||||
* @param matchLen2 (output) length of partial prefix match in s2
|
||||
* @param pErrorCode receives error status
|
||||
*/
|
||||
U_CAPI void
|
||||
u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
int32_t *matchLen1, int32_t *matchLen2,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Are the Unicode properties loaded?
|
||||
* This must be used before internal functions are called that do
|
||||
* not perform this check.
|
||||
* Generate a debug assertion failure if data is not loaded.
|
||||
*/
|
||||
U_CFUNC UBool
|
||||
uprv_haveProperties(UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Load the Unicode property data.
|
||||
* Intended primarily for use from u_init().
|
||||
* Has no effect if property data is already loaded.
|
||||
* NOT thread safe.
|
||||
*/
|
||||
/*U_CFUNC int8_t
|
||||
uprv_loadPropsData(UErrorCode *errorCode);*/
|
||||
|
||||
/*
|
||||
* Internal string casing functions implementing
|
||||
* ustring.h/ustrcase.c and UnicodeString case mapping functions.
|
||||
*/
|
||||
|
||||
struct UCaseMap {
|
||||
const UCaseProps *csp;
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
UBreakIterator *iter; /* We adopt the iterator, so we own it. */
|
||||
#endif
|
||||
char locale[32];
|
||||
int32_t locCache;
|
||||
uint32_t options;
|
||||
};
|
||||
|
||||
#ifndef __UCASEMAP_H__
|
||||
typedef struct UCaseMap UCaseMap;
|
||||
#endif
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION
|
||||
# define UCASEMAP_INITIALIZER { NULL, { 0 }, 0, 0 }
|
||||
#else
|
||||
# define UCASEMAP_INITIALIZER { NULL, NULL, { 0 }, 0, 0 }
|
||||
#endif
|
||||
|
||||
U_CFUNC void
|
||||
ustrcase_setTempCaseMapLocale(UCaseMap *csm, const char *locale);
|
||||
|
||||
#ifndef U_STRING_CASE_MAPPER_DEFINED
|
||||
#define U_STRING_CASE_MAPPER_DEFINED
|
||||
|
||||
/**
|
||||
* String case mapping function type, used by ustrcase_map().
|
||||
* All error checking must be done.
|
||||
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
|
||||
* src and dest must not overlap.
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UStringCaseMapper(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToLower(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToUpper(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToTitle(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/** Implements UStringCaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalFold(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Implements argument checking and buffer handling
|
||||
* for string case mapping as a common function.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ustrcase_map(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UStringCaseMapper *stringCaseMapper,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* UTF-8 string case mapping function type, used by ucasemap_mapUTF8().
|
||||
* UTF-8 version of UStringCaseMapper.
|
||||
* All error checking must be done.
|
||||
* The UCaseMap must be fully initialized, with locale and/or iter set as needed.
|
||||
* src and dest must not overlap.
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UTF8CaseMapper(const UCaseMap *csm,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/** Implements UTF8CaseMapper. */
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(const UCaseMap *csm,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Implements argument checking and buffer handling
|
||||
* for UTF-8 string case mapping as a common function.
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
ucasemap_mapUTF8(const UCaseMap *csm,
|
||||
uint8_t *dest, int32_t destCapacity,
|
||||
const uint8_t *src, int32_t srcLength,
|
||||
UTF8CaseMapper *stringCaseMapper,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
namespace GreekUpper {
|
||||
|
||||
// Data bits.
|
||||
static const uint32_t UPPER_MASK = 0x3ff;
|
||||
static const uint32_t HAS_VOWEL = 0x1000;
|
||||
static const uint32_t HAS_YPOGEGRAMMENI = 0x2000;
|
||||
static const uint32_t HAS_ACCENT = 0x4000;
|
||||
static const uint32_t HAS_DIALYTIKA = 0x8000;
|
||||
// Further bits during data building and processing, not stored in the data map.
|
||||
static const uint32_t HAS_COMBINING_DIALYTIKA = 0x10000;
|
||||
static const uint32_t HAS_OTHER_GREEK_DIACRITIC = 0x20000;
|
||||
|
||||
static const uint32_t HAS_VOWEL_AND_ACCENT = HAS_VOWEL | HAS_ACCENT;
|
||||
static const uint32_t HAS_VOWEL_AND_ACCENT_AND_DIALYTIKA =
|
||||
HAS_VOWEL_AND_ACCENT | HAS_DIALYTIKA;
|
||||
static const uint32_t HAS_EITHER_DIALYTIKA = HAS_DIALYTIKA | HAS_COMBINING_DIALYTIKA;
|
||||
|
||||
// State bits.
|
||||
static const uint32_t AFTER_CASED = 1;
|
||||
static const uint32_t AFTER_VOWEL_WITH_ACCENT = 2;
|
||||
|
||||
uint32_t getLetterData(UChar32 c);
|
||||
|
||||
/**
|
||||
* Returns a non-zero value for each of the Greek combining diacritics
|
||||
* listed in The Unicode Standard, version 8, chapter 7.2 Greek,
|
||||
* plus some perispomeni look-alikes.
|
||||
*/
|
||||
uint32_t getDiacriticData(UChar32 c);
|
||||
|
||||
} // namespace GreekUpper
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // __cplusplus
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ustr_hashUCharsN(const UChar *str, int32_t length);
|
||||
|
||||
|
@ -22,31 +22,18 @@
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucase.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* functions available in the common library (for unistr_case.cpp) */
|
||||
|
||||
/*
|
||||
* Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
|
||||
* Do this fast because it is called with every function call.
|
||||
* Duplicate of the same function in ustrcase.cpp, to keep it inline.
|
||||
*/
|
||||
static inline void
|
||||
setTempCaseMap(UCaseMap *csm, const char *locale) {
|
||||
if(csm->csp==NULL) {
|
||||
csm->csp=ucase_getSingleton();
|
||||
}
|
||||
if(locale!=NULL && locale[0]==0) {
|
||||
csm->locale[0]=0;
|
||||
} else {
|
||||
ustrcase_setTempCaseMapLocale(csm, locale);
|
||||
}
|
||||
}
|
||||
|
||||
/* public API functions */
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
@ -55,39 +42,73 @@ u_strToTitle(UChar *dest, int32_t destCapacity,
|
||||
UBreakIterator *titleIter,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
setTempCaseMap(&csm, locale);
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
BreakIterator *iter;
|
||||
if(titleIter!=NULL) {
|
||||
ubrk_setText(csm.iter=titleIter, src, srcLength, pErrorCode);
|
||||
iter=reinterpret_cast<BreakIterator *>(titleIter);
|
||||
} else {
|
||||
csm.iter=ubrk_open(UBRK_WORD, csm.locale, src, srcLength, pErrorCode);
|
||||
iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
|
||||
ownedIter.adoptInstead(iter);
|
||||
}
|
||||
int32_t length=ustrcase_map(
|
||||
&csm,
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
iter->setText(s);
|
||||
return ustrcase_mapWithOverlap(
|
||||
ustrcase_getCaseLocale(locale), 0, iter,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToTitle, pErrorCode);
|
||||
if(titleIter==NULL && csm.iter!=NULL) {
|
||||
ubrk_close(csm.iter);
|
||||
}
|
||||
return length;
|
||||
ustrcase_internalToTitle, *pErrorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
int32_t CaseMap::toTitle(
|
||||
const char *locale, uint32_t options, BreakIterator *iter,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
LocalPointer<BreakIterator> ownedIter;
|
||||
if(iter==NULL) {
|
||||
iter=BreakIterator::createWordInstance(Locale(locale), errorCode);
|
||||
ownedIter.adoptInstead(iter);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
iter->setText(s);
|
||||
return ustrcase_map(
|
||||
ustrcase_getCaseLocale(locale), options, iter,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToTitle, edits, errorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucasemap_toTitle(UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
if(csm->iter!=NULL) {
|
||||
ubrk_setText(csm->iter, src, srcLength, pErrorCode);
|
||||
} else {
|
||||
csm->iter=ubrk_open(UBRK_WORD, csm->locale, src, srcLength, pErrorCode);
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if (csm->iter == NULL) {
|
||||
csm->iter = BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
|
||||
}
|
||||
if (U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
UnicodeString s(srcLength<0, src, srcLength);
|
||||
csm->iter->setText(s);
|
||||
return ustrcase_map(
|
||||
csm,
|
||||
csm->caseLocale, csm->options, csm->iter,
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToTitle, pErrorCode);
|
||||
ustrcase_internalToTitle, NULL, *pErrorCode);
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_BREAK_ITERATION
|
||||
|
@ -22,6 +22,8 @@
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/ubrk.h"
|
||||
@ -29,9 +31,30 @@
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucase.h"
|
||||
#include "ucasemap_imp.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "uassert.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
namespace {
|
||||
|
||||
int32_t checkOverflowAndEditsError(int32_t destIndex, int32_t destCapacity,
|
||||
Edits *edits, UErrorCode &errorCode) {
|
||||
if (U_SUCCESS(errorCode)) {
|
||||
if (destIndex > destCapacity) {
|
||||
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
||||
} else if (edits != NULL) {
|
||||
edits->copyErrorTo(errorCode);
|
||||
}
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
/* string casing ------------------------------------------------------------ */
|
||||
@ -39,21 +62,43 @@ U_NAMESPACE_USE
|
||||
/* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */
|
||||
static inline int32_t
|
||||
appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
int32_t result, const UChar *s) {
|
||||
int32_t result, const UChar *s,
|
||||
int32_t cpLength, uint32_t options, icu::Edits *edits) {
|
||||
UChar32 c;
|
||||
int32_t length;
|
||||
|
||||
/* decode the result */
|
||||
if(result<0) {
|
||||
/* (not) original code point */
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(cpLength);
|
||||
if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
c=~result;
|
||||
length=U16_LENGTH(c);
|
||||
} else if(result<=UCASE_MAX_STRING_LENGTH) {
|
||||
c=U_SENTINEL;
|
||||
length=result;
|
||||
if(destIndex<destCapacity && c<=0xffff) { // BMP slightly-fastpath
|
||||
dest[destIndex++]=(UChar)c;
|
||||
return destIndex;
|
||||
}
|
||||
length=cpLength;
|
||||
} else {
|
||||
c=result;
|
||||
length=U16_LENGTH(c);
|
||||
if(result<=UCASE_MAX_STRING_LENGTH) {
|
||||
c=U_SENTINEL;
|
||||
length=result;
|
||||
} else if(destIndex<destCapacity && result<=0xffff) { // BMP slightly-fastpath
|
||||
dest[destIndex++]=(UChar)result;
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(cpLength, 1);
|
||||
}
|
||||
return destIndex;
|
||||
} else {
|
||||
c=result;
|
||||
length=U16_LENGTH(c);
|
||||
}
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(cpLength, length);
|
||||
}
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
@ -99,9 +144,15 @@ appendUChar(UChar *dest, int32_t destIndex, int32_t destCapacity, UChar c) {
|
||||
}
|
||||
|
||||
static inline int32_t
|
||||
appendString(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const UChar *s, int32_t length) {
|
||||
appendUnchanged(UChar *dest, int32_t destIndex, int32_t destCapacity,
|
||||
const UChar *s, int32_t length, uint32_t options, icu::Edits *edits) {
|
||||
if(length>0) {
|
||||
if(edits!=NULL) {
|
||||
edits->addUnchanged(length);
|
||||
if(options & UCASEMAP_OMIT_UNCHANGED_TEXT) {
|
||||
return destIndex;
|
||||
}
|
||||
}
|
||||
if(length>(INT32_MAX-destIndex)) {
|
||||
return -1; // integer overflow
|
||||
}
|
||||
@ -150,84 +201,66 @@ utf16_caseContextIterator(void *context, int8_t dir) {
|
||||
* context [0..srcLength[ into account.
|
||||
*/
|
||||
static int32_t
|
||||
_caseMap(const UCaseMap *csm, UCaseMapFull *map,
|
||||
_caseMap(int32_t caseLocale, uint32_t options, UCaseMapFull *map,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, UCaseContext *csc,
|
||||
int32_t srcStart, int32_t srcLimit,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *s;
|
||||
UChar32 c, c2 = 0;
|
||||
int32_t srcIndex, destIndex;
|
||||
int32_t locCache;
|
||||
|
||||
locCache=csm->locCache;
|
||||
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
srcIndex=srcStart;
|
||||
destIndex=0;
|
||||
int32_t srcIndex=srcStart;
|
||||
int32_t destIndex=0;
|
||||
while(srcIndex<srcLimit) {
|
||||
csc->cpStart=srcIndex;
|
||||
int32_t cpStart;
|
||||
csc->cpStart=cpStart=srcIndex;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, srcIndex, srcLimit, c);
|
||||
csc->cpLimit=srcIndex;
|
||||
c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
|
||||
/* fast path version of appendResult() for BMP results */
|
||||
dest[destIndex++]=(UChar)c2;
|
||||
} else {
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
const UChar *s;
|
||||
c=map(c, utf16_caseContextIterator, csc, &s, caseLocale);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if(destIndex>destCapacity) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToTitle(const UCaseMap *csm,
|
||||
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UChar *s;
|
||||
UChar32 c;
|
||||
int32_t prev, titleStart, titleLimit, idx, destIndex;
|
||||
UBool isFirstIndex;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Use the C++ abstract base class to minimize dependencies.
|
||||
// TODO: Change UCaseMap.iter to store a BreakIterator directly.
|
||||
BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter);
|
||||
|
||||
/* set up local variables */
|
||||
int32_t locCache=csm->locCache;
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
destIndex=0;
|
||||
prev=0;
|
||||
isFirstIndex=TRUE;
|
||||
int32_t destIndex=0;
|
||||
int32_t prev=0;
|
||||
UBool isFirstIndex=TRUE;
|
||||
|
||||
/* titlecasing loop */
|
||||
while(prev<srcLength) {
|
||||
/* find next index where to titlecase */
|
||||
int32_t index;
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=FALSE;
|
||||
idx=bi->first();
|
||||
index=iter->first();
|
||||
} else {
|
||||
idx=bi->next();
|
||||
index=iter->next();
|
||||
}
|
||||
if(idx==UBRK_DONE || idx>srcLength) {
|
||||
idx=srcLength;
|
||||
if(index==UBRK_DONE || index>srcLength) {
|
||||
index=srcLength;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -243,29 +276,32 @@ ustrcase_internalToTitle(const UCaseMap *csm,
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<idx) {
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
titleStart=titleLimit=prev;
|
||||
U16_NEXT(src, titleLimit, idx, c);
|
||||
if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
|
||||
int32_t titleStart=prev;
|
||||
int32_t titleLimit=prev;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, titleLimit, index, c);
|
||||
if((options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(c)) {
|
||||
/* Adjust the titlecasing index (titleStart) to the next cased character. */
|
||||
for(;;) {
|
||||
titleStart=titleLimit;
|
||||
if(titleLimit==idx) {
|
||||
if(titleLimit==index) {
|
||||
/*
|
||||
* only uncased characters in [prev..index[
|
||||
* stop with titleStart==titleLimit==index
|
||||
*/
|
||||
break;
|
||||
}
|
||||
U16_NEXT(src, titleLimit, idx, c);
|
||||
if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
|
||||
U16_NEXT(src, titleLimit, index, c);
|
||||
if(UCASE_NONE!=ucase_getType(c)) {
|
||||
break; /* cased letter at [titleStart..titleLimit[ */
|
||||
}
|
||||
}
|
||||
destIndex=appendString(dest, destIndex, destCapacity, src+prev, titleStart-prev);
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+prev, titleStart-prev, options, edits);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -274,48 +310,64 @@ ustrcase_internalToTitle(const UCaseMap *csm,
|
||||
/* titlecase c which is from [titleStart..titleLimit[ */
|
||||
csc.cpStart=titleStart;
|
||||
csc.cpLimit=titleLimit;
|
||||
c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache);
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
|
||||
const UChar *s;
|
||||
c=ucase_toFullTitle(c, utf16_caseContextIterator, &csc, &s, caseLocale);
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s,
|
||||
titleLimit-titleStart, options, edits);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Special case Dutch IJ titlecasing */
|
||||
if (titleStart+1 < idx &&
|
||||
ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH &&
|
||||
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069) &&
|
||||
(src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A)) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
if (titleStart+1 < index &&
|
||||
caseLocale == UCASE_LOC_DUTCH &&
|
||||
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
|
||||
if (src[titleStart+1] == 0x006A) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
titleLimit++;
|
||||
} else if (src[titleStart+1] == 0x004A) {
|
||||
// Keep the capital J from getting lowercased.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleStart+1, 1, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
titleLimit++;
|
||||
}
|
||||
titleLimit++;
|
||||
}
|
||||
|
||||
/* lowercase [titleLimit..index[ */
|
||||
if(titleLimit<idx) {
|
||||
if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
if(titleLimit<index) {
|
||||
if((options&U_TITLECASE_NO_LOWERCASE)==0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
destIndex+=
|
||||
_caseMap(
|
||||
csm, ucase_toFullLower,
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest+destIndex, destCapacity-destIndex,
|
||||
src, &csc,
|
||||
titleLimit, idx,
|
||||
pErrorCode);
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
titleLimit, index,
|
||||
edits, errorCode);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return destIndex;
|
||||
}
|
||||
} else {
|
||||
/* Optionally just copy the rest of the word unchanged. */
|
||||
destIndex=appendString(dest, destIndex, destCapacity, src+titleLimit, idx-titleLimit);
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleLimit, index-titleLimit, options, edits);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -323,13 +375,10 @@ ustrcase_internalToTitle(const UCaseMap *csm,
|
||||
}
|
||||
}
|
||||
|
||||
prev=idx;
|
||||
prev=index;
|
||||
}
|
||||
|
||||
if(destIndex>destCapacity) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return destIndex;
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_BREAK_ITERATION
|
||||
@ -791,11 +840,11 @@ uint32_t getDiacriticData(UChar32 c) {
|
||||
}
|
||||
}
|
||||
|
||||
UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i, int32_t length) {
|
||||
UBool isFollowedByCasedLetter(const UChar *s, int32_t i, int32_t length) {
|
||||
while (i < length) {
|
||||
UChar32 c;
|
||||
U16_NEXT(s, i, length, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(csp, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(c);
|
||||
if ((type & UCASE_IGNORABLE) != 0) {
|
||||
// Case-ignorable, continue with the loop.
|
||||
} else if (type != UCASE_NONE) {
|
||||
@ -813,11 +862,11 @@ UBool isFollowedByCasedLetter(const UCaseProps *csp, const UChar *s, int32_t i,
|
||||
* for each character.
|
||||
* TODO: Try to re-consolidate one way or another with the non-Greek function.
|
||||
*/
|
||||
int32_t toUpper(const UCaseMap *csm,
|
||||
int32_t toUpper(uint32_t options,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t locCache = UCASE_LOC_GREEK;
|
||||
Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t destIndex=0;
|
||||
uint32_t state = 0;
|
||||
for (int32_t i = 0; i < srcLength;) {
|
||||
@ -825,7 +874,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
UChar32 c;
|
||||
U16_NEXT(src, nextIndex, srcLength, c);
|
||||
uint32_t nextState = 0;
|
||||
int32_t type = ucase_getTypeOrIgnorable(csm->csp, c);
|
||||
int32_t type = ucase_getTypeOrIgnorable(c);
|
||||
if ((type & UCASE_IGNORABLE) != 0) {
|
||||
// c is case-ignorable
|
||||
nextState |= (state & AFTER_CASED);
|
||||
@ -872,7 +921,7 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
(data & HAS_ACCENT) != 0 &&
|
||||
numYpogegrammeni == 0 &&
|
||||
(state & AFTER_CASED) == 0 &&
|
||||
!isFollowedByCasedLetter(csm->csp, src, nextIndex, srcLength)) {
|
||||
!isFollowedByCasedLetter(src, nextIndex, srcLength)) {
|
||||
// Keep disjunctive "or" with (only) a tonos.
|
||||
// We use the same "word boundary" conditions as for the Final_Sigma test.
|
||||
if (i == nextIndex) {
|
||||
@ -890,44 +939,68 @@ int32_t toUpper(const UCaseMap *csm,
|
||||
data &= ~HAS_EITHER_DIALYTIKA;
|
||||
}
|
||||
}
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
|
||||
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
|
||||
|
||||
UBool change = TRUE;
|
||||
if (edits != NULL) {
|
||||
// Find out first whether we are changing the text.
|
||||
change = src[i] != upper || numYpogegrammeni > 0;
|
||||
int32_t i2 = i + 1;
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
change |= i2 >= nextIndex || src[i2] != 0x308;
|
||||
++i2;
|
||||
}
|
||||
if (addTonos) {
|
||||
change |= i2 >= nextIndex || src[i2] != 0x301;
|
||||
++i2;
|
||||
}
|
||||
int32_t oldLength = nextIndex - i;
|
||||
int32_t newLength = (i2 - i) + numYpogegrammeni;
|
||||
change |= oldLength != newLength;
|
||||
if (change) {
|
||||
if (edits != NULL) {
|
||||
edits->addReplace(oldLength, newLength);
|
||||
}
|
||||
} else {
|
||||
if (edits != NULL) {
|
||||
edits->addUnchanged(oldLength);
|
||||
}
|
||||
// Write unchanged text?
|
||||
change = (options & UCASEMAP_OMIT_UNCHANGED_TEXT) == 0;
|
||||
}
|
||||
}
|
||||
if (destIndex >= 0 && addTonos) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
|
||||
}
|
||||
while (destIndex >= 0 && numYpogegrammeni > 0) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
|
||||
--numYpogegrammeni;
|
||||
}
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
|
||||
if (change) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, (UChar)upper);
|
||||
if (destIndex >= 0 && (data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x308); // restore or add a dialytika
|
||||
}
|
||||
if (destIndex >= 0 && addTonos) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x301);
|
||||
}
|
||||
while (destIndex >= 0 && numYpogegrammeni > 0) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x399);
|
||||
--numYpogegrammeni;
|
||||
}
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const UChar *s;
|
||||
UChar32 c2 = 0;
|
||||
c=ucase_toFullUpper(csm->csp, c, NULL, NULL, &s, csm->locale, &locCache);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
|
||||
/* fast path version of appendResult() for BMP results */
|
||||
dest[destIndex++]=(UChar)c2;
|
||||
} else {
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
c=ucase_toFullUpper(c, NULL, NULL, &s, UCASE_LOC_GREEK);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
nextIndex - i, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
i = nextIndex;
|
||||
state = nextState;
|
||||
}
|
||||
|
||||
if(destIndex>destCapacity) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return destIndex;
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
} // namespace GreekUpper
|
||||
@ -936,94 +1009,79 @@ U_NAMESPACE_END
|
||||
/* functions available in the common library (for unistr_case.cpp) */
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToLower(const UCaseMap *csm,
|
||||
ustrcase_internalToLower(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
return _caseMap(
|
||||
csm, ucase_toFullLower,
|
||||
int32_t destIndex = _caseMap(
|
||||
caseLocale, options, ucase_toFullLower,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
pErrorCode);
|
||||
edits, errorCode);
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToUpper(const UCaseMap *csm,
|
||||
ustrcase_internalToUpper(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t locCache = csm->locCache;
|
||||
if (ucase_getCaseLocale(csm->locale, &locCache) == UCASE_LOC_GREEK) {
|
||||
return GreekUpper::toUpper(csm, dest, destCapacity, src, srcLength, pErrorCode);
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
if (caseLocale == UCASE_LOC_GREEK) {
|
||||
return GreekUpper::toUpper(options, dest, destCapacity, src, srcLength, edits, errorCode);
|
||||
}
|
||||
UCaseContext csc=UCASECONTEXT_INITIALIZER;
|
||||
csc.p=(void *)src;
|
||||
csc.limit=srcLength;
|
||||
return _caseMap(
|
||||
csm, ucase_toFullUpper,
|
||||
int32_t destIndex = _caseMap(
|
||||
caseLocale, options, ucase_toFullUpper,
|
||||
dest, destCapacity,
|
||||
src, &csc, 0, srcLength,
|
||||
pErrorCode);
|
||||
edits, errorCode);
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
static int32_t
|
||||
ustr_foldCase(const UCaseProps *csp,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t srcIndex, destIndex;
|
||||
|
||||
const UChar *s;
|
||||
UChar32 c, c2 = 0;
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalFold(int32_t /* caseLocale */, uint32_t options, UCASEMAP_BREAK_ITERATOR_UNUSED
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
/* case mapping loop */
|
||||
srcIndex=destIndex=0;
|
||||
while(srcIndex<srcLength) {
|
||||
int32_t srcIndex = 0;
|
||||
int32_t destIndex = 0;
|
||||
while (srcIndex < srcLength) {
|
||||
int32_t cpStart = srcIndex;
|
||||
UChar32 c;
|
||||
U16_NEXT(src, srcIndex, srcLength, c);
|
||||
c=ucase_toFullFolding(csp, c, &s, options);
|
||||
if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) {
|
||||
/* fast path version of appendResult() for BMP results */
|
||||
dest[destIndex++]=(UChar)c2;
|
||||
} else {
|
||||
destIndex=appendResult(dest, destIndex, destCapacity, c, s);
|
||||
if(destIndex<0) {
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
const UChar *s;
|
||||
c = ucase_toFullFolding(c, &s, options);
|
||||
destIndex = appendResult(dest, destIndex, destCapacity, c, s,
|
||||
srcIndex - cpStart, options, edits);
|
||||
if (destIndex < 0) {
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if(destIndex>destCapacity) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
return destIndex;
|
||||
}
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalFold(const UCaseMap *csm,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UErrorCode *pErrorCode) {
|
||||
return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode);
|
||||
return checkOverflowAndEditsError(destIndex, destCapacity, edits, errorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
ustrcase_map(const UCaseMap *csm,
|
||||
ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UStringCaseMapper *stringCaseMapper,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[300];
|
||||
UChar *temp;
|
||||
|
||||
icu::Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
int32_t destLength;
|
||||
|
||||
/* check argument values */
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if( destCapacity<0 ||
|
||||
@ -1031,7 +1089,53 @@ ustrcase_map(const UCaseMap *csm,
|
||||
src==NULL ||
|
||||
srcLength<-1
|
||||
) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* get the string length */
|
||||
if(srcLength==-1) {
|
||||
srcLength=u_strlen(src);
|
||||
}
|
||||
|
||||
/* check for overlapping source and destination */
|
||||
if( dest!=NULL &&
|
||||
((src>=dest && src<(dest+destCapacity)) ||
|
||||
(dest>=src && dest<(src+srcLength)))
|
||||
) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if(edits!=NULL) {
|
||||
edits->reset();
|
||||
}
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
dest, destCapacity, src, srcLength, edits, errorCode);
|
||||
return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
ustrcase_mapWithOverlap(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
||||
UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UStringCaseMapper *stringCaseMapper,
|
||||
UErrorCode &errorCode) {
|
||||
UChar buffer[300];
|
||||
UChar *temp;
|
||||
|
||||
int32_t destLength;
|
||||
|
||||
/* check argument values */
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if( destCapacity<0 ||
|
||||
(dest==NULL && destCapacity>0) ||
|
||||
src==NULL ||
|
||||
srcLength<-1
|
||||
) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -1053,7 +1157,7 @@ ustrcase_map(const UCaseMap *csm,
|
||||
/* allocate a buffer */
|
||||
temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR);
|
||||
if(temp==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
@ -1061,21 +1165,19 @@ ustrcase_map(const UCaseMap *csm,
|
||||
temp=dest;
|
||||
}
|
||||
|
||||
destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode);
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
temp, destCapacity, src, srcLength, NULL, errorCode);
|
||||
if(temp!=dest) {
|
||||
/* copy the result string to the destination buffer */
|
||||
if(destLength>0) {
|
||||
int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity;
|
||||
if(copyLength>0) {
|
||||
u_memmove(dest, temp, copyLength);
|
||||
}
|
||||
if (U_SUCCESS(errorCode) && 0 < destLength && destLength <= destCapacity) {
|
||||
u_memmove(dest, temp, destLength);
|
||||
}
|
||||
if(temp!=buffer) {
|
||||
uprv_free(temp);
|
||||
}
|
||||
}
|
||||
|
||||
return u_terminateUChars(dest, destCapacity, destLength, pErrorCode);
|
||||
return u_terminateUChars(dest, destCapacity, destLength, &errorCode);
|
||||
}
|
||||
|
||||
/* public API functions */
|
||||
@ -1085,16 +1187,29 @@ u_strFoldCase(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
csm.csp=ucase_getSingleton();
|
||||
csm.options=options;
|
||||
return ustrcase_map(
|
||||
&csm,
|
||||
return ustrcase_mapWithOverlap(
|
||||
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalFold, pErrorCode);
|
||||
ustrcase_internalFold, *pErrorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
int32_t CaseMap::fold(
|
||||
uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
return ustrcase_map(
|
||||
UCASE_LOC_ROOT, options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalFold, edits, errorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/* case-insensitive string comparisons -------------------------------------- */
|
||||
|
||||
/*
|
||||
@ -1134,8 +1249,6 @@ static int32_t _cmpFold(
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t cmpRes = 0;
|
||||
|
||||
const UCaseProps *csp;
|
||||
|
||||
/* current-level start/limit - s1/s2 as current */
|
||||
const UChar *start1, *start2, *limit1, *limit2;
|
||||
|
||||
@ -1167,7 +1280,6 @@ static int32_t _cmpFold(
|
||||
* assume that at least the option U_COMPARE_IGNORE_CASE is set
|
||||
* otherwise this function would have to behave exactly as uprv_strCompare()
|
||||
*/
|
||||
csp=ucase_getSingleton();
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
@ -1349,7 +1461,7 @@ static int32_t _cmpFold(
|
||||
*/
|
||||
|
||||
if( level1==0 &&
|
||||
(length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0
|
||||
(length=ucase_toFullFolding((UChar32)cp1, &p, options))>=0
|
||||
) {
|
||||
/* cp1 case-folds to the code point "length" or to p[length] */
|
||||
if(U_IS_SURROGATE(c1)) {
|
||||
@ -1395,7 +1507,7 @@ static int32_t _cmpFold(
|
||||
}
|
||||
|
||||
if( level2==0 &&
|
||||
(length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0
|
||||
(length=ucase_toFullFolding((UChar32)cp2, &p, options))>=0
|
||||
) {
|
||||
/* cp2 case-folds to the code point "length" or to p[length] */
|
||||
if(U_IS_SURROGATE(c2)) {
|
||||
|
@ -18,66 +18,24 @@
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "uassert.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "ucase.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "ucasemap_imp.h"
|
||||
|
||||
U_CFUNC void
|
||||
ustrcase_setTempCaseMapLocale(UCaseMap *csm, const char *locale) {
|
||||
/*
|
||||
* We could call ucasemap_setLocale(), but here we really only care about
|
||||
* the initial language subtag, we need not return the real string via
|
||||
* ucasemap_getLocale(), and we don't care about only getting "x" from
|
||||
* "x-some-thing" etc.
|
||||
*
|
||||
* We ignore locales with a longer-than-3 initial subtag.
|
||||
*
|
||||
* We also do not fill in the locCache because it is rarely used,
|
||||
* and not worth setting unless we reuse it for many case mapping operations.
|
||||
* (That's why UCaseMap was created.)
|
||||
*/
|
||||
int i;
|
||||
char c;
|
||||
|
||||
/* the internal functions require locale!=NULL */
|
||||
if(locale==NULL) {
|
||||
// Do not call uprv_getDefaultLocaleID() because that does not see
|
||||
// changes to the default locale via uloc_setDefault().
|
||||
// It would also be inefficient if used frequently because uprv_getDefaultLocaleID()
|
||||
// does not cache the locale ID.
|
||||
//
|
||||
// Unfortunately, uloc_getDefault() has many dependencies.
|
||||
// We only care about a small set of language subtags,
|
||||
// and we do not need the locale ID to be canonicalized.
|
||||
//
|
||||
// Best is to not call case mapping functions with a NULL locale ID.
|
||||
locale=uloc_getDefault();
|
||||
U_CFUNC int32_t
|
||||
ustrcase_getCaseLocale(const char *locale) {
|
||||
if (locale == NULL) {
|
||||
locale = uloc_getDefault();
|
||||
}
|
||||
for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) {
|
||||
csm->locale[i]=c;
|
||||
}
|
||||
if(i<=3) {
|
||||
csm->locale[i]=0; /* Up to 3 non-separator characters. */
|
||||
if (*locale == 0) {
|
||||
return UCASE_LOC_ROOT;
|
||||
} else {
|
||||
csm->locale[0]=0; /* Longer-than-3 initial subtag: Ignore. */
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Set parameters on an empty UCaseMap, for UCaseMap-less API functions.
|
||||
* Do this fast because it is called with every function call.
|
||||
*/
|
||||
static inline void
|
||||
setTempCaseMap(UCaseMap *csm, const char *locale) {
|
||||
if(csm->csp==NULL) {
|
||||
csm->csp=ucase_getSingleton();
|
||||
}
|
||||
if(locale!=NULL && locale[0]==0) {
|
||||
csm->locale[0]=0;
|
||||
} else {
|
||||
ustrcase_setTempCaseMapLocale(csm, locale);
|
||||
return ucase_getCaseLocale(locale);
|
||||
}
|
||||
}
|
||||
|
||||
@ -88,13 +46,11 @@ u_strToLower(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
setTempCaseMap(&csm, locale);
|
||||
return ustrcase_map(
|
||||
&csm,
|
||||
return ustrcase_mapWithOverlap(
|
||||
ustrcase_getCaseLocale(locale), 0, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToLower, pErrorCode);
|
||||
ustrcase_internalToLower, *pErrorCode);
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
@ -102,11 +58,37 @@ u_strToUpper(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
const char *locale,
|
||||
UErrorCode *pErrorCode) {
|
||||
UCaseMap csm=UCASEMAP_INITIALIZER;
|
||||
setTempCaseMap(&csm, locale);
|
||||
return ustrcase_map(
|
||||
&csm,
|
||||
return ustrcase_mapWithOverlap(
|
||||
ustrcase_getCaseLocale(locale), 0, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToUpper, pErrorCode);
|
||||
ustrcase_internalToUpper, *pErrorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
int32_t CaseMap::toLower(
|
||||
const char *locale, uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
return ustrcase_map(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToLower, edits, errorCode);
|
||||
}
|
||||
|
||||
int32_t CaseMap::toUpper(
|
||||
const char *locale, uint32_t options,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UChar *dest, int32_t destCapacity, Edits *edits,
|
||||
UErrorCode &errorCode) {
|
||||
return ustrcase_map(
|
||||
ustrcase_getCaseLocale(locale), options, UCASEMAP_BREAK_ITERATOR_NULL
|
||||
dest, destCapacity,
|
||||
src, srcLength,
|
||||
ustrcase_internalToUpper, edits, errorCode);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "cstring.h"
|
||||
|
@ -13,7 +13,9 @@
|
||||
|
||||
#include "unicode/dcfmtsym.h"
|
||||
#include "unicode/plurrule.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ucurr.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "affixpatternparser.h"
|
||||
#include "charstr.h"
|
||||
#include "precision.h"
|
||||
|
@ -92,7 +92,6 @@ UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(CaseMapTransliterator)
|
||||
*/
|
||||
CaseMapTransliterator::CaseMapTransliterator(const UnicodeString &id, UCaseMapFull *map) :
|
||||
Transliterator(id, 0),
|
||||
fCsp(ucase_getSingleton()),
|
||||
fMap(map)
|
||||
{
|
||||
// TODO test incremental mode with context-sensitive text (e.g. greek sigma)
|
||||
@ -110,7 +109,7 @@ CaseMapTransliterator::~CaseMapTransliterator() {
|
||||
*/
|
||||
CaseMapTransliterator::CaseMapTransliterator(const CaseMapTransliterator& o) :
|
||||
Transliterator(o),
|
||||
fCsp(o.fCsp), fMap(o.fMap)
|
||||
fMap(o.fMap)
|
||||
{
|
||||
}
|
||||
|
||||
@ -119,7 +118,6 @@ CaseMapTransliterator::CaseMapTransliterator(const CaseMapTransliterator& o) :
|
||||
*/
|
||||
/*CaseMapTransliterator& CaseMapTransliterator::operator=(const CaseMapTransliterator& o) {
|
||||
Transliterator::operator=(o);
|
||||
fCsp = o.fCsp;
|
||||
fMap = o.fMap;
|
||||
return *this;
|
||||
}*/
|
||||
@ -151,14 +149,14 @@ void CaseMapTransliterator::handleTransliterate(Replaceable& text,
|
||||
UnicodeString tmp;
|
||||
const UChar *s;
|
||||
UChar32 c;
|
||||
int32_t textPos, delta, result, locCache=0;
|
||||
int32_t textPos, delta, result;
|
||||
|
||||
for(textPos=offsets.start; textPos<offsets.limit;) {
|
||||
csc.cpStart=textPos;
|
||||
c=text.char32At(textPos);
|
||||
csc.cpLimit=textPos+=U16_LENGTH(c);
|
||||
|
||||
result=fMap(fCsp, c, utrans_rep_caseContextIterator, &csc, &s, "", &locCache);
|
||||
result=fMap(c, utrans_rep_caseContextIterator, &csc, &s, UCASE_LOC_ROOT);
|
||||
|
||||
if(csc.b1 && isIncremental) {
|
||||
// fMap() tried to look beyond the context limit
|
||||
|
@ -84,7 +84,6 @@ protected:
|
||||
UTransPosition& offsets,
|
||||
UBool isIncremental) const;
|
||||
|
||||
const UCaseProps *fCsp;
|
||||
UCaseMapFull *fMap;
|
||||
|
||||
private:
|
||||
|
@ -20,6 +20,7 @@
|
||||
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/plurrule.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/numsys.h"
|
||||
#include "cstring.h"
|
||||
|
@ -49,6 +49,7 @@
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/numsys.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "uresimp.h"
|
||||
#include "ucurrimp.h"
|
||||
#include "charstr.h"
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "unicode/decimfmt.h"
|
||||
#include "uresimp.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "ureslocs.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "unicode/format.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/parseerr.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "uvector.h"
|
||||
#include "hash.h"
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/udisplaycontext.h"
|
||||
#include "unicode/brkiter.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
|
@ -19,8 +19,7 @@
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
CaseFoldingUTextIterator::CaseFoldingUTextIterator(UText &text) :
|
||||
fUText(text), fcsp(NULL), fFoldChars(NULL), fFoldLength(0) {
|
||||
fcsp = ucase_getSingleton();
|
||||
fUText(text), fFoldChars(NULL), fFoldLength(0) {
|
||||
}
|
||||
|
||||
CaseFoldingUTextIterator::~CaseFoldingUTextIterator() {}
|
||||
@ -35,7 +34,7 @@ UChar32 CaseFoldingUTextIterator::next() {
|
||||
if (originalC == U_SENTINEL) {
|
||||
return originalC;
|
||||
}
|
||||
fFoldLength = ucase_toFullFolding(fcsp, originalC, &fFoldChars, U_FOLD_CASE_DEFAULT);
|
||||
fFoldLength = ucase_toFullFolding(originalC, &fFoldChars, U_FOLD_CASE_DEFAULT);
|
||||
if (fFoldLength >= UCASE_MAX_STRING_LENGTH || fFoldLength < 0) {
|
||||
// input code point folds to a single code point, possibly itself.
|
||||
// See comment in ucase.h for explanation of return values from ucase_toFullFolding().
|
||||
@ -65,8 +64,7 @@ UBool CaseFoldingUTextIterator::inExpansion() {
|
||||
|
||||
|
||||
CaseFoldingUCharIterator::CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit) :
|
||||
fChars(chars), fIndex(start), fLimit(limit), fcsp(NULL), fFoldChars(NULL), fFoldLength(0) {
|
||||
fcsp = ucase_getSingleton();
|
||||
fChars(chars), fIndex(start), fLimit(limit), fFoldChars(NULL), fFoldLength(0) {
|
||||
}
|
||||
|
||||
|
||||
@ -84,7 +82,7 @@ UChar32 CaseFoldingUCharIterator::next() {
|
||||
}
|
||||
U16_NEXT(fChars, fIndex, fLimit, originalC);
|
||||
|
||||
fFoldLength = ucase_toFullFolding(fcsp, originalC, &fFoldChars, U_FOLD_CASE_DEFAULT);
|
||||
fFoldLength = ucase_toFullFolding(originalC, &fFoldChars, U_FOLD_CASE_DEFAULT);
|
||||
if (fFoldLength >= UCASE_MAX_STRING_LENGTH || fFoldLength < 0) {
|
||||
// input code point folds to a single code point, possibly itself.
|
||||
// See comment in ucase.h for explanation of return values from ucase_toFullFolding().
|
||||
|
@ -374,7 +374,6 @@ class CaseFoldingUTextIterator: public UMemory {
|
||||
// folding of the same code point from the original UText.
|
||||
private:
|
||||
UText &fUText;
|
||||
const UCaseProps *fcsp;
|
||||
const UChar *fFoldChars;
|
||||
int32_t fFoldLength;
|
||||
int32_t fFoldIndex;
|
||||
@ -404,7 +403,6 @@ class CaseFoldingUCharIterator: public UMemory {
|
||||
const UChar *fChars;
|
||||
int64_t fIndex;
|
||||
int64_t fLimit;
|
||||
const UCaseProps *fcsp;
|
||||
const UChar *fFoldChars;
|
||||
int32_t fFoldLength;
|
||||
int32_t fFoldIndex;
|
||||
|
@ -15,6 +15,7 @@
|
||||
#if !UCONFIG_NO_FORMATTING && !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
#include "unicode/dtfmtsym.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/ureldatefmt.h"
|
||||
#include "unicode/udisplaycontext.h"
|
||||
#include "unicode/unum.h"
|
||||
|
@ -20,7 +20,7 @@
|
||||
#include "unicode/udisplaycontext.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/brkiter.h"
|
||||
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "reldtfmt.h"
|
||||
#include "cmemory.h"
|
||||
#include "uresimp.h"
|
||||
|
@ -48,6 +48,7 @@
|
||||
#include "unicode/simpletz.h"
|
||||
#include "unicode/rbtz.h"
|
||||
#include "unicode/tzfmt.h"
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "unicode/utf16.h"
|
||||
#include "unicode/vtzone.h"
|
||||
#include "unicode/udisplaycontext.h"
|
||||
@ -64,6 +65,7 @@
|
||||
#include <float.h>
|
||||
#include "smpdtfst.h"
|
||||
#include "sharednumberformat.h"
|
||||
#include "ucasemap_imp.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "charstr.h"
|
||||
#include "uvector.h"
|
||||
|
@ -97,7 +97,7 @@ void TitlecaseTransliterator::handleTransliterate(
|
||||
int32_t start;
|
||||
for (start = offsets.start - 1; start >= offsets.contextStart; start -= U16_LENGTH(c)) {
|
||||
c = text.char32At(start);
|
||||
type=ucase_getTypeOrIgnorable(fCsp, c);
|
||||
type=ucase_getTypeOrIgnorable(c);
|
||||
if(type>0) { // cased
|
||||
doTitle=FALSE;
|
||||
break;
|
||||
@ -118,19 +118,19 @@ void TitlecaseTransliterator::handleTransliterate(
|
||||
|
||||
UnicodeString tmp;
|
||||
const UChar *s;
|
||||
int32_t textPos, delta, result, locCache=0;
|
||||
int32_t textPos, delta, result;
|
||||
|
||||
for(textPos=offsets.start; textPos<offsets.limit;) {
|
||||
csc.cpStart=textPos;
|
||||
c=text.char32At(textPos);
|
||||
csc.cpLimit=textPos+=U16_LENGTH(c);
|
||||
|
||||
type=ucase_getTypeOrIgnorable(fCsp, c);
|
||||
type=ucase_getTypeOrIgnorable(c);
|
||||
if(type>=0) { // not case-ignorable
|
||||
if(doTitle) {
|
||||
result=ucase_toFullTitle(fCsp, c, utrans_rep_caseContextIterator, &csc, &s, "", &locCache);
|
||||
result=ucase_toFullTitle(c, utrans_rep_caseContextIterator, &csc, &s, UCASE_LOC_ROOT);
|
||||
} else {
|
||||
result=ucase_toFullLower(fCsp, c, utrans_rep_caseContextIterator, &csc, &s, "", &locCache);
|
||||
result=ucase_toFullLower(c, utrans_rep_caseContextIterator, &csc, &s, UCASE_LOC_ROOT);
|
||||
}
|
||||
doTitle = (UBool)(type==0); // doTitle=isUncased
|
||||
|
||||
|
@ -14,8 +14,10 @@
|
||||
#include "unicode/calendar.h"
|
||||
#include "unicode/tzfmt.h"
|
||||
#include "unicode/numsys.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/udat.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "tzgnames.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "unicode/rbtz.h"
|
||||
#include "unicode/simpleformatter.h"
|
||||
#include "unicode/simpletz.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/vtzone.h"
|
||||
|
||||
#include "cmemory.h"
|
||||
|
@ -15,6 +15,7 @@
|
||||
|
||||
#if !UCONFIG_NO_FORMATTING
|
||||
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/timezone.h"
|
||||
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
|
@ -47,6 +47,7 @@ U_NAMESPACE_BEGIN
|
||||
|
||||
struct CollationData;
|
||||
|
||||
class CharacterIterator;
|
||||
class CollationIterator;
|
||||
class RuleBasedCollator;
|
||||
class UCollationPCE;
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/unum.h"
|
||||
#include "unicode/unistr.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
|
@ -230,6 +230,7 @@ typedef enum UDateDirection {
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class BreakIterator;
|
||||
class RelativeDateTimeCacheData;
|
||||
class SharedNumberFormat;
|
||||
class SharedPluralRules;
|
||||
|
@ -17,7 +17,7 @@
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/simpletz.h"
|
||||
|
||||
#include "unicode/strenum.h"
|
||||
#include "umutex.h"
|
||||
#include "uvector.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -27,6 +27,7 @@
|
||||
#include "unicode/ucasemap.h"
|
||||
#include "cmemory.h"
|
||||
#include "cintltst.h"
|
||||
#include "ucasemap_imp.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* test string case mapping functions --------------------------------------- */
|
||||
@ -744,11 +745,12 @@ TestUCaseMap(void) {
|
||||
if(0!=strcmp(locale, "tr")) {
|
||||
log_err("ucasemap_getLocale(ucasemap_open(\"tur\"))==%s!=\"tr\"\n", locale);
|
||||
}
|
||||
/* overly long locale IDs get truncated to their language code to avoid unnecessary allocation */
|
||||
/* overly long locale IDs may get truncated to their language code to avoid unnecessary allocation */
|
||||
ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode);
|
||||
locale=ucasemap_getLocale(csm);
|
||||
if(0!=strcmp(locale, "i-klingon")) {
|
||||
log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s!=\"i-klingon\"\n", locale);
|
||||
if(0!=strncmp(locale, "i-klingon", 9)) {
|
||||
log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s\n"
|
||||
" does not start with \"i-klingon\"\n", locale);
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
|
@ -34,7 +34,7 @@
|
||||
#include "uprops.h"
|
||||
#include "uset_imp.h"
|
||||
#include "usc_impl.h"
|
||||
#include "udatamem.h" /* for testing ucase_openBinary() */
|
||||
#include "udatamem.h"
|
||||
#include "cucdapi.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
@ -59,7 +59,6 @@ static void TestNumericProperties(void);
|
||||
static void TestPropertyNames(void);
|
||||
static void TestPropertyValues(void);
|
||||
static void TestConsistency(void);
|
||||
static void TestUCase(void);
|
||||
static void TestUBiDiProps(void);
|
||||
static void TestCaseFolding(void);
|
||||
|
||||
@ -196,7 +195,6 @@ void addUnicodeTest(TestNode** root)
|
||||
addTest(root, &TestPropertyNames, "tsutil/cucdtst/TestPropertyNames");
|
||||
addTest(root, &TestPropertyValues, "tsutil/cucdtst/TestPropertyValues");
|
||||
addTest(root, &TestConsistency, "tsutil/cucdtst/TestConsistency");
|
||||
addTest(root, &TestUCase, "tsutil/cucdtst/TestUCase");
|
||||
addTest(root, &TestUBiDiProps, "tsutil/cucdtst/TestUBiDiProps");
|
||||
addTest(root, &TestCaseFolding, "tsutil/cucdtst/TestCaseFolding");
|
||||
}
|
||||
@ -3256,47 +3254,6 @@ TestConsistency() {
|
||||
*/
|
||||
#define HARDCODED_DATA_4497 1
|
||||
|
||||
/* API coverage for ucase.c */
|
||||
static void TestUCase() {
|
||||
#if !HARDCODED_DATA_4497
|
||||
UDataMemory *pData;
|
||||
UCaseProps *csp;
|
||||
const UCaseProps *ccsp;
|
||||
UErrorCode errorCode;
|
||||
|
||||
/* coverage for ucase_openBinary() */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
pData=udata_open(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_data_err("unable to open " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
|
||||
u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
csp=ucase_openBinary((const uint8_t *)pData->pHeader, -1, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("ucase_openBinary() fails for the contents of " UCASE_DATA_NAME "." UCASE_DATA_TYPE ": %s\n",
|
||||
u_errorName(errorCode));
|
||||
udata_close(pData);
|
||||
return;
|
||||
}
|
||||
|
||||
if(UCASE_LOWER!=ucase_getType(csp, 0xdf)) { /* verify islower(sharp s) */
|
||||
log_err("ucase_openBinary() does not seem to return working UCaseProps\n");
|
||||
}
|
||||
|
||||
ucase_close(csp);
|
||||
udata_close(pData);
|
||||
|
||||
/* coverage for ucase_getDummy() */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
ccsp=ucase_getDummy(&errorCode);
|
||||
if(ucase_tolower(ccsp, 0x41)!=0x41) {
|
||||
log_err("ucase_tolower(dummy, A)!=A\n");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* API coverage for ubidi_props.c */
|
||||
static void TestUBiDiProps() {
|
||||
#if !HARDCODED_DATA_4497
|
||||
|
@ -41,6 +41,7 @@
|
||||
#include "apicoll.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/ucol.h"
|
||||
|
||||
|
@ -7,6 +7,7 @@
|
||||
*********************************************************************/
|
||||
|
||||
#include "locnmtst.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cstring.h"
|
||||
|
||||
/*
|
||||
|
@ -21,8 +21,10 @@
|
||||
#include "unicode/measfmt.h"
|
||||
#include "unicode/measure.h"
|
||||
#include "unicode/measunit.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/tmunit.h"
|
||||
#include "unicode/plurrule.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "charstr.h"
|
||||
#include "cstr.h"
|
||||
#include "unicode/reldatefmt.h"
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/measfmt.h"
|
||||
#include "unicode/curramt.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "digitlst.h"
|
||||
#include "textfile.h"
|
||||
#include "tokiter.h"
|
||||
|
@ -19,6 +19,8 @@
|
||||
*/
|
||||
|
||||
#include "unicode/std_string.h"
|
||||
#include "unicode/casemap.h"
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ures.h"
|
||||
#include "unicode/uloc.h"
|
||||
@ -31,10 +33,52 @@
|
||||
#include "unicode/tstdtmod.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
struct EditChange {
|
||||
UBool change;
|
||||
int32_t oldLength, newLength;
|
||||
};
|
||||
|
||||
class StringCaseTest: public IntlTest {
|
||||
public:
|
||||
StringCaseTest();
|
||||
virtual ~StringCaseTest();
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=0);
|
||||
|
||||
void TestCaseConversion();
|
||||
|
||||
void TestCasingImpl(const UnicodeString &input,
|
||||
const UnicodeString &output,
|
||||
int32_t whichCase,
|
||||
void *iter, const char *localeID, uint32_t options);
|
||||
void TestCasing();
|
||||
void TestFullCaseFoldingIterator();
|
||||
void TestGreekUpper();
|
||||
void TestLongUpper();
|
||||
void TestMalformedUTF8();
|
||||
void TestBufferOverflow();
|
||||
void TestEdits();
|
||||
void TestCaseMapWithEdits();
|
||||
void TestLongUnicodeString();
|
||||
|
||||
private:
|
||||
void assertGreekUpper(const char *s, const char *expected);
|
||||
void checkEditsIter(
|
||||
const UnicodeString &name, Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators
|
||||
const EditChange expected[], int32_t expLength, UBool withUnchanged,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
Locale GREEK_LOCALE_;
|
||||
};
|
||||
|
||||
StringCaseTest::StringCaseTest() : GREEK_LOCALE_("el") {}
|
||||
|
||||
StringCaseTest::~StringCaseTest() {}
|
||||
|
||||
extern IntlTest *createStringCaseTest() {
|
||||
return new StringCaseTest();
|
||||
}
|
||||
|
||||
void
|
||||
StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) {
|
||||
if(exec) {
|
||||
@ -50,6 +94,9 @@ StringCaseTest::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
|
||||
TESTCASE_AUTO(TestLongUpper);
|
||||
TESTCASE_AUTO(TestMalformedUTF8);
|
||||
TESTCASE_AUTO(TestBufferOverflow);
|
||||
TESTCASE_AUTO(TestEdits);
|
||||
TESTCASE_AUTO(TestCaseMapWithEdits);
|
||||
TESTCASE_AUTO(TestLongUnicodeString);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
@ -848,3 +895,214 @@ void StringCaseTest::TestBufferOverflow() {
|
||||
errorCode.reset();
|
||||
#endif // U_HAVE_STD_STRING
|
||||
}
|
||||
|
||||
void StringCaseTest::checkEditsIter(
|
||||
const UnicodeString &name,
|
||||
Edits::Iterator ei1, Edits::Iterator ei2, // two equal iterators
|
||||
const EditChange expected[], int32_t expLength, UBool withUnchanged,
|
||||
UErrorCode &errorCode) {
|
||||
assertFalse(name, ei2.findSourceIndex(-1, errorCode));
|
||||
|
||||
int32_t expSrcIndex = 0;
|
||||
int32_t expDestIndex = 0;
|
||||
int32_t expReplIndex = 0;
|
||||
for (int32_t expIndex = 0; expIndex < expLength; ++expIndex) {
|
||||
const EditChange &expect = expected[expIndex];
|
||||
UnicodeString msg = UnicodeString(name).append(u' ') + expIndex;
|
||||
if (withUnchanged || expect.change) {
|
||||
assertTrue(msg, ei1.next(errorCode));
|
||||
assertEquals(msg, expect.change, ei1.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei1.oldLength());
|
||||
assertEquals(msg, expect.newLength, ei1.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei1.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei1.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei1.replacementIndex());
|
||||
}
|
||||
|
||||
if (expect.oldLength > 0) {
|
||||
assertTrue(msg, ei2.findSourceIndex(expSrcIndex, errorCode));
|
||||
assertEquals(msg, expect.change, ei2.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei2.oldLength());
|
||||
assertEquals(msg, expect.newLength, ei2.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei2.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei2.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei2.replacementIndex());
|
||||
if (!withUnchanged) {
|
||||
// For some iterators, move past the current range
|
||||
// so that findSourceIndex() has to look before the current index.
|
||||
ei2.next(errorCode);
|
||||
ei2.next(errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
expSrcIndex += expect.oldLength;
|
||||
expDestIndex += expect.newLength;
|
||||
if (expect.change) {
|
||||
expReplIndex += expect.newLength;
|
||||
}
|
||||
}
|
||||
// TODO: remove casts from u"" when merging into trunk
|
||||
UnicodeString msg = UnicodeString(name).append((const UChar *)u" end");
|
||||
assertFalse(msg, ei1.next(errorCode));
|
||||
assertFalse(msg, ei1.hasChange());
|
||||
assertEquals(msg, 0, ei1.oldLength());
|
||||
assertEquals(msg, 0, ei1.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei1.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei1.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei1.replacementIndex());
|
||||
|
||||
assertFalse(name, ei2.findSourceIndex(expSrcIndex, errorCode));
|
||||
}
|
||||
|
||||
void StringCaseTest::TestEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestEdits");
|
||||
Edits edits;
|
||||
assertFalse("new Edits", edits.hasChanges());
|
||||
assertEquals("new Edits", 0, edits.lengthDelta());
|
||||
edits.addUnchanged(1); // multiple unchanged ranges are combined
|
||||
edits.addUnchanged(10000); // too long, and they are split
|
||||
edits.addReplace(0, 0);
|
||||
edits.addUnchanged(2);
|
||||
assertFalse("unchanged 10003", edits.hasChanges());
|
||||
assertEquals("unchanged 10003", 0, edits.lengthDelta());
|
||||
edits.addReplace(1, 1); // multiple short equal-length edits are compressed
|
||||
edits.addUnchanged(0);
|
||||
edits.addReplace(1, 1);
|
||||
edits.addReplace(1, 1);
|
||||
edits.addReplace(0, 10);
|
||||
edits.addReplace(100, 0);
|
||||
edits.addReplace(3000, 4000); // variable-length encoding
|
||||
edits.addReplace(100000, 100000);
|
||||
assertTrue("some edits", edits.hasChanges());
|
||||
assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
|
||||
UErrorCode outErrorCode = U_ZERO_ERROR;
|
||||
assertFalse("edits done: copyErrorTo", edits.copyErrorTo(outErrorCode));
|
||||
|
||||
static const EditChange coarseExpectedChanges[] = {
|
||||
{ FALSE, 10003, 10003 },
|
||||
{ TRUE, 103103, 104013 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"coarse",
|
||||
edits.getCoarseIterator(), edits.getCoarseIterator(),
|
||||
coarseExpectedChanges, UPRV_LENGTHOF(coarseExpectedChanges), TRUE, errorCode);
|
||||
checkEditsIter((const UChar *)u"coarse changes",
|
||||
edits.getCoarseChangesIterator(), edits.getCoarseChangesIterator(),
|
||||
coarseExpectedChanges, UPRV_LENGTHOF(coarseExpectedChanges), FALSE, errorCode);
|
||||
|
||||
static const EditChange fineExpectedChanges[] = {
|
||||
{ FALSE, 10003, 10003 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 0, 10 },
|
||||
{ TRUE, 100, 0 },
|
||||
{ TRUE, 3000, 4000 },
|
||||
{ TRUE, 100000, 100000 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"fine",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
fineExpectedChanges, UPRV_LENGTHOF(fineExpectedChanges), TRUE, errorCode);
|
||||
checkEditsIter((const UChar *)u"fine changes",
|
||||
edits.getFineChangesIterator(), edits.getFineChangesIterator(),
|
||||
fineExpectedChanges, UPRV_LENGTHOF(fineExpectedChanges), FALSE, errorCode);
|
||||
|
||||
edits.reset();
|
||||
assertFalse("reset", edits.hasChanges());
|
||||
assertEquals("reset", 0, edits.lengthDelta());
|
||||
Edits::Iterator ei = edits.getCoarseChangesIterator();
|
||||
assertFalse("reset then iterator", ei.next(errorCode));
|
||||
}
|
||||
|
||||
void StringCaseTest::TestCaseMapWithEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestEdits");
|
||||
UChar dest[20];
|
||||
Edits edits;
|
||||
|
||||
int32_t length = CaseMap::toLower("tr", UCASEMAP_OMIT_UNCHANGED_TEXT,
|
||||
(const UChar *)u"IstanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals((const UChar *)u"toLower(Istanbul)", UnicodeString((const UChar *)u"ıb"), UnicodeString(TRUE, dest, length));
|
||||
static const EditChange lowerExpectedChanges[] = {
|
||||
{ TRUE, 1, 1 },
|
||||
{ FALSE, 4, 4 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ FALSE, 2, 2 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"toLower(Istanbul)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
lowerExpectedChanges, UPRV_LENGTHOF(lowerExpectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
edits.reset();
|
||||
length = CaseMap::toUpper("el", UCASEMAP_OMIT_UNCHANGED_TEXT,
|
||||
(const UChar *)u"Πατάτα", 6, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals((const UChar *)u"toUpper(Πατάτα)", UnicodeString((const UChar *)u"ΑΤΑΤΑ"), UnicodeString(TRUE, dest, length));
|
||||
static const EditChange upperExpectedChanges[] = {
|
||||
{ FALSE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 1 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"toUpper(Πατάτα)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
upperExpectedChanges, UPRV_LENGTHOF(upperExpectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
edits.reset();
|
||||
length = CaseMap::toTitle("nl",
|
||||
UCASEMAP_OMIT_UNCHANGED_TEXT |
|
||||
U_TITLECASE_NO_BREAK_ADJUSTMENT |
|
||||
U_TITLECASE_NO_LOWERCASE,
|
||||
NULL, (const UChar *)u"IjssEL IglOo", 12,
|
||||
dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals((const UChar *)u"toTitle(IjssEL IglOo)", UnicodeString((const UChar *)u"J"), UnicodeString(TRUE, dest, length));
|
||||
static const EditChange titleExpectedChanges[] = {
|
||||
{ FALSE, 1, 1 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ FALSE, 10, 10 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"toTitle(IjssEL IglOo)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
titleExpectedChanges, UPRV_LENGTHOF(titleExpectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
edits.reset();
|
||||
length = CaseMap::fold(UCASEMAP_OMIT_UNCHANGED_TEXT | U_FOLD_CASE_EXCLUDE_SPECIAL_I,
|
||||
(const UChar *)u"IßtanBul", 8, dest, UPRV_LENGTHOF(dest), &edits, errorCode);
|
||||
assertEquals((const UChar *)u"foldCase(IßtanBul)", UnicodeString((const UChar *)u"ıssb"), UnicodeString(TRUE, dest, length));
|
||||
static const EditChange foldExpectedChanges[] = {
|
||||
{ TRUE, 1, 1 },
|
||||
{ TRUE, 1, 2 },
|
||||
{ FALSE, 3, 3 },
|
||||
{ TRUE, 1, 1 },
|
||||
{ FALSE, 2, 2 }
|
||||
};
|
||||
checkEditsIter((const UChar *)u"foldCase(IßtanBul)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
foldExpectedChanges, UPRV_LENGTHOF(foldExpectedChanges),
|
||||
TRUE, errorCode);
|
||||
}
|
||||
|
||||
void StringCaseTest::TestLongUnicodeString() {
|
||||
// Code coverage for UnicodeString case mapping code handling
|
||||
// long strings or many changes in a string.
|
||||
UnicodeString s(TRUE,
|
||||
(const UChar *)
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF"
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF"
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF"
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF"
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF"
|
||||
u"aaaaaaaaaabbbbbbbbbbccccccccccddddddddddeeeeeeeeeeF", 6 * 51);
|
||||
UnicodeString expected(TRUE,
|
||||
(const UChar *)
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF"
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF"
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF"
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF"
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF"
|
||||
u"AAAAAAAAAABBBBBBBBBBCCCCCCCCCCDDDDDDDDDDEEEEEEEEEEF", 6 * 51);
|
||||
s.toUpper(Locale::getRoot());
|
||||
assertEquals("string length 306", expected, s);
|
||||
}
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include "unicode/messagepattern.h"
|
||||
#include "unicode/selfmt.h"
|
||||
#include "unicode/gregocal.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include <stdio.h>
|
||||
|
||||
void
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "unicode/tzrule.h"
|
||||
#include "unicode/calendar.h"
|
||||
#include "unicode/gregocal.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ucal.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/locid.h"
|
||||
#include "unicode/strenum.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/uenum.h"
|
||||
#include "unicode/utf16.h"
|
||||
@ -29,11 +30,13 @@ using namespace std;
|
||||
|
||||
UnicodeStringTest::~UnicodeStringTest() {}
|
||||
|
||||
extern IntlTest *createStringCaseTest();
|
||||
|
||||
void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *par)
|
||||
{
|
||||
if (exec) logln("TestSuite UnicodeStringTest: ");
|
||||
TESTCASE_AUTO_BEGIN;
|
||||
TESTCASE_AUTO_CLASS(StringCaseTest);
|
||||
TESTCASE_AUTO_CREATE_CLASS(StringCaseTest);
|
||||
TESTCASE_AUTO(TestBasicManipulation);
|
||||
TESTCASE_AUTO(TestCompare);
|
||||
TESTCASE_AUTO(TestExtract);
|
||||
|
@ -94,30 +94,4 @@ public:
|
||||
void TestMoveSwap();
|
||||
};
|
||||
|
||||
class StringCaseTest: public IntlTest {
|
||||
public:
|
||||
StringCaseTest();
|
||||
virtual ~StringCaseTest();
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=0);
|
||||
|
||||
void TestCaseConversion();
|
||||
|
||||
void TestCasingImpl(const UnicodeString &input,
|
||||
const UnicodeString &output,
|
||||
int32_t whichCase,
|
||||
void *iter, const char *localeID, uint32_t options);
|
||||
void TestCasing();
|
||||
void TestFullCaseFoldingIterator();
|
||||
void TestGreekUpper();
|
||||
void TestLongUpper();
|
||||
void TestMalformedUTF8();
|
||||
void TestBufferOverflow();
|
||||
|
||||
private:
|
||||
void assertGreekUpper(const char *s, const char *expected);
|
||||
|
||||
Locale GREEK_LOCALE_;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -22,6 +22,7 @@
|
||||
#include "unicode/ucharstrie.h"
|
||||
#include "unicode/bytestrie.h"
|
||||
#include "unicode/ucnv.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utf16.h"
|
||||
|
||||
#include "charstr.h"
|
||||
|
@ -2,9 +2,14 @@
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import java.io.IOException;
|
||||
|
||||
public final class CaseMap {
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.Edits;
|
||||
import com.ibm.icu.util.ICUUncheckedIOException;
|
||||
|
||||
public final class CaseMapImpl {
|
||||
/**
|
||||
* Implementation of UCaseProps.ContextIterator, iterates over a String.
|
||||
* See ustrcase.c/utf16_caseContextIterator().
|
||||
@ -12,11 +17,11 @@ public final class CaseMap {
|
||||
public static final class StringContextIterator implements UCaseProps.ContextIterator {
|
||||
/**
|
||||
* Constructor.
|
||||
* @param s String to iterate over.
|
||||
* @param src String to iterate over.
|
||||
*/
|
||||
public StringContextIterator(String s) {
|
||||
this.s=s;
|
||||
limit=s.length();
|
||||
public StringContextIterator(CharSequence src) {
|
||||
this.s=src;
|
||||
limit=src.length();
|
||||
cpStart=cpLimit=index=0;
|
||||
dir=0;
|
||||
}
|
||||
@ -60,7 +65,7 @@ public final class CaseMap {
|
||||
public int nextCaseMapCP() {
|
||||
cpStart=cpLimit;
|
||||
if(cpLimit<limit) {
|
||||
int c=s.codePointAt(cpLimit);
|
||||
int c=Character.codePointAt(s, cpLimit);
|
||||
cpLimit+=Character.charCount(c);
|
||||
return c;
|
||||
} else {
|
||||
@ -84,6 +89,10 @@ public final class CaseMap {
|
||||
return cpLimit;
|
||||
}
|
||||
|
||||
public int getCPLength() {
|
||||
return cpLimit-cpStart;
|
||||
}
|
||||
|
||||
// implement UCaseProps.ContextIterator
|
||||
// The following code is not used anywhere in this private class
|
||||
@Override
|
||||
@ -108,11 +117,11 @@ public final class CaseMap {
|
||||
int c;
|
||||
|
||||
if(dir>0 && index<s.length()) {
|
||||
c=s.codePointAt(index);
|
||||
c=Character.codePointAt(s, index);
|
||||
index+=Character.charCount(c);
|
||||
return c;
|
||||
} else if(dir<0 && index>0) {
|
||||
c=s.codePointBefore(index);
|
||||
c=Character.codePointBefore(s, index);
|
||||
index-=Character.charCount(c);
|
||||
return c;
|
||||
}
|
||||
@ -120,44 +129,242 @@ public final class CaseMap {
|
||||
}
|
||||
|
||||
// variables
|
||||
protected String s;
|
||||
protected CharSequence s;
|
||||
protected int index, limit, cpStart, cpLimit;
|
||||
protected int dir; // 0=initial state >0=forward <0=backward
|
||||
}
|
||||
|
||||
/** Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}. */
|
||||
private static final void appendResult(int c, StringBuilder result) {
|
||||
// Decode the result.
|
||||
if (c < 0) {
|
||||
// (not) original code point
|
||||
result.appendCodePoint(~c);
|
||||
} else if (c <= UCaseProps.MAX_STRING_LENGTH) {
|
||||
// The mapping has already been appended to result.
|
||||
/**
|
||||
* Omit unchanged text when case-mapping with Edits.
|
||||
*/
|
||||
public static final int OMIT_UNCHANGED_TEXT = 0x4000;
|
||||
|
||||
private static int appendCodePoint(Appendable a, int c) throws IOException {
|
||||
if (c <= Character.MAX_VALUE) {
|
||||
a.append((char)c);
|
||||
return 1;
|
||||
} else {
|
||||
// Append the single-code point mapping.
|
||||
result.appendCodePoint(c);
|
||||
a.append((char)(0xd7c0 + (c >> 10)));
|
||||
a.append((char)(Character.MIN_LOW_SURROGATE + (c & 0x3ff)));
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Move the other string case mapping functions from UCharacter to here, too.
|
||||
|
||||
public static String toUpper(ULocale locale, String str) {
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
int[] locCache = new int[] { UCaseProps.getCaseLocale(locale, null) };
|
||||
if (locCache[0] == UCaseProps.LOC_GREEK) {
|
||||
return GreekUpper.toUpper(str, locCache);
|
||||
/**
|
||||
* Appends a full case mapping result, see {@link UCaseProps#MAX_STRING_LENGTH}.
|
||||
* @throws IOException
|
||||
*/
|
||||
private static void appendResult(int result, Appendable dest,
|
||||
int cpLength, int options, Edits edits) throws IOException {
|
||||
// Decode the result.
|
||||
if (result < 0) {
|
||||
// (not) original code point
|
||||
if (edits != null) {
|
||||
edits.addUnchanged(cpLength);
|
||||
if ((options & OMIT_UNCHANGED_TEXT) != 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
appendCodePoint(dest, ~result);
|
||||
} else if (result <= UCaseProps.MAX_STRING_LENGTH) {
|
||||
// The mapping has already been appended to result.
|
||||
if (edits != null) {
|
||||
edits.addReplace(cpLength, result);
|
||||
}
|
||||
} else {
|
||||
// Append the single-code point mapping.
|
||||
int length = appendCodePoint(dest, result);
|
||||
if (edits != null) {
|
||||
edits.addReplace(cpLength, length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
StringContextIterator iter = new StringContextIterator(str);
|
||||
StringBuilder result = new StringBuilder(str.length());
|
||||
private static final void appendUnchanged(CharSequence src, int start, int length,
|
||||
Appendable dest, int options, Edits edits) throws IOException {
|
||||
if (length > 0) {
|
||||
if (edits != null) {
|
||||
edits.addUnchanged(length);
|
||||
if ((options & OMIT_UNCHANGED_TEXT) != 0) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
dest.append(src, start, start + length);
|
||||
}
|
||||
}
|
||||
|
||||
private static void internalToLower(int caseLocale, int options, StringContextIterator iter,
|
||||
Appendable dest, Edits edits) throws IOException {
|
||||
int c;
|
||||
while((c=iter.nextCaseMapCP())>=0) {
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, iter, result, locale, locCache);
|
||||
appendResult(c, result);
|
||||
while ((c = iter.nextCaseMapCP()) >= 0) {
|
||||
c = UCaseProps.INSTANCE.toFullLower(c, iter, dest, caseLocale);
|
||||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||||
}
|
||||
}
|
||||
|
||||
public static <A extends Appendable> A toLower(int caseLocale, int options,
|
||||
CharSequence src, A dest, Edits edits) {
|
||||
try {
|
||||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
StringContextIterator iter = new StringContextIterator(src);
|
||||
internalToLower(caseLocale, options, iter, dest, edits);
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static <A extends Appendable> A toUpper(int caseLocale, int options,
|
||||
CharSequence src, A dest, Edits edits) {
|
||||
try {
|
||||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
if (caseLocale == UCaseProps.LOC_GREEK) {
|
||||
return GreekUpper.toUpper(options, src, dest, edits);
|
||||
}
|
||||
StringContextIterator iter = new StringContextIterator(src);
|
||||
int c;
|
||||
while ((c = iter.nextCaseMapCP()) >= 0) {
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, iter, dest, caseLocale);
|
||||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||||
}
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static <A extends Appendable> A toTitle(
|
||||
int caseLocale, int options, BreakIterator titleIter,
|
||||
CharSequence src, A dest, Edits edits) {
|
||||
try {
|
||||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
|
||||
/* set up local variables */
|
||||
StringContextIterator iter = new StringContextIterator(src);
|
||||
int srcLength = src.length();
|
||||
int prev=0;
|
||||
boolean isFirstIndex=true;
|
||||
|
||||
/* titlecasing loop */
|
||||
while(prev<srcLength) {
|
||||
/* find next index where to titlecase */
|
||||
int index;
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=false;
|
||||
index=titleIter.first();
|
||||
} else {
|
||||
index=titleIter.next();
|
||||
}
|
||||
if(index==BreakIterator.DONE || index>srcLength) {
|
||||
index=srcLength;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
// find and copy uncased characters [prev..titleStart[
|
||||
int titleStart=prev;
|
||||
iter.setLimit(index);
|
||||
int c=iter.nextCaseMapCP();
|
||||
if((options&UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT)==0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
|
||||
// Adjust the titlecasing index (titleStart) to the next cased character.
|
||||
while((c=iter.nextCaseMapCP())>=0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
|
||||
// If c<0 then we have only uncased characters in [prev..index[
|
||||
// and stopped with titleStart==titleLimit==index.
|
||||
titleStart=iter.getCPStart();
|
||||
appendUnchanged(src, prev, titleStart-prev, dest, options, edits);
|
||||
}
|
||||
|
||||
if(titleStart<index) {
|
||||
int titleLimit=iter.getCPLimit();
|
||||
// titlecase c which is from [titleStart..titleLimit[
|
||||
c = UCaseProps.INSTANCE.toFullTitle(c, iter, dest, caseLocale);
|
||||
appendResult(c, dest, iter.getCPLength(), options, edits);
|
||||
|
||||
// Special case Dutch IJ titlecasing
|
||||
if (titleStart+1 < index && caseLocale == UCaseProps.LOC_DUTCH) {
|
||||
char c1 = src.charAt(titleStart);
|
||||
if ((c1 == 'i' || c1 == 'I')) {
|
||||
char c2 = src.charAt(titleStart+1);
|
||||
if (c2 == 'j') {
|
||||
dest.append('J');
|
||||
if (edits != null) {
|
||||
edits.addReplace(1, 1);
|
||||
}
|
||||
c = iter.nextCaseMapCP();
|
||||
titleLimit++;
|
||||
assert c == c2;
|
||||
assert titleLimit == iter.getCPLimit();
|
||||
} else if (c2 == 'J') {
|
||||
// Keep the capital J from getting lowercased.
|
||||
appendUnchanged(src, titleStart + 1, 1, dest, options, edits);
|
||||
c = iter.nextCaseMapCP();
|
||||
titleLimit++;
|
||||
assert c == c2;
|
||||
assert titleLimit == iter.getCPLimit();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// lowercase [titleLimit..index[
|
||||
if(titleLimit<index) {
|
||||
if((options&UCharacter.TITLECASE_NO_LOWERCASE)==0) {
|
||||
// Normal operation: Lowercase the rest of the word.
|
||||
internalToLower(caseLocale, options, iter, dest, edits);
|
||||
} else {
|
||||
// Optionally just copy the rest of the word unchanged.
|
||||
appendUnchanged(src, titleLimit, index-titleLimit, dest, options, edits);
|
||||
iter.moveToLimit();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prev=index;
|
||||
}
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public static <A extends Appendable> A fold(int options,
|
||||
CharSequence src, A dest, Edits edits) {
|
||||
try {
|
||||
if (edits != null) {
|
||||
edits.reset();
|
||||
}
|
||||
int length = src.length();
|
||||
for (int i = 0; i < length;) {
|
||||
int c = Character.codePointAt(src, i);
|
||||
int cpLength = Character.charCount(c);
|
||||
i += cpLength;
|
||||
c = UCaseProps.INSTANCE.toFullFolding(c, dest, options);
|
||||
appendResult(c, dest, cpLength, options, edits);
|
||||
}
|
||||
return dest;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
private static final class GreekUpper {
|
||||
@ -661,12 +868,13 @@ public final class CaseMap {
|
||||
* TODO: Try to re-consolidate one way or another with the non-Greek function.
|
||||
*
|
||||
* <p>Keep this consistent with the C++ versions in ustrcase.cpp (UTF-16) and ucasemap.cpp (UTF-8).
|
||||
* @throws IOException
|
||||
*/
|
||||
private static String toUpper(CharSequence s, int[] locCache) {
|
||||
StringBuilder result = new StringBuilder(s.length());
|
||||
private static <A extends Appendable> A toUpper(int options,
|
||||
CharSequence src, A dest, Edits edits) throws IOException {
|
||||
int state = 0;
|
||||
for (int i = 0; i < s.length();) {
|
||||
int c = Character.codePointAt(s, i);
|
||||
for (int i = 0; i < src.length();) {
|
||||
int c = Character.codePointAt(src, i);
|
||||
int nextIndex = i + Character.charCount(c);
|
||||
int nextState = 0;
|
||||
int type = UCaseProps.INSTANCE.getTypeOrIgnorable(c);
|
||||
@ -695,8 +903,8 @@ public final class CaseMap {
|
||||
numYpogegrammeni = 1;
|
||||
}
|
||||
// Skip combining diacritics after this Greek letter.
|
||||
while (nextIndex < s.length()) {
|
||||
int diacriticData = getDiacriticData(s.charAt(nextIndex));
|
||||
while (nextIndex < src.length()) {
|
||||
int diacriticData = getDiacriticData(src.charAt(nextIndex));
|
||||
if (diacriticData != 0) {
|
||||
data |= diacriticData;
|
||||
if ((diacriticData & HAS_YPOGEGRAMMENI) != 0) {
|
||||
@ -716,7 +924,7 @@ public final class CaseMap {
|
||||
(data & HAS_ACCENT) != 0 &&
|
||||
numYpogegrammeni == 0 &&
|
||||
(state & AFTER_CASED) == 0 &&
|
||||
!isFollowedByCasedLetter(s, nextIndex)) {
|
||||
!isFollowedByCasedLetter(src, nextIndex)) {
|
||||
// Keep disjunctive "or" with (only) a tonos.
|
||||
// We use the same "word boundary" conditions as for the Final_Sigma test.
|
||||
if (i == nextIndex) {
|
||||
@ -734,25 +942,59 @@ public final class CaseMap {
|
||||
data &= ~HAS_EITHER_DIALYTIKA;
|
||||
}
|
||||
}
|
||||
result.appendCodePoint(upper);
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
result.append('\u0308'); // restore or add a dialytika
|
||||
|
||||
boolean change;
|
||||
if (edits == null) {
|
||||
change = true; // common, simple usage
|
||||
} else {
|
||||
// Find out first whether we are changing the text.
|
||||
change = src.charAt(i) != upper || numYpogegrammeni > 0;
|
||||
int i2 = i + 1;
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
change |= i2 >= nextIndex || src.charAt(i2) != 0x308;
|
||||
++i2;
|
||||
}
|
||||
if (addTonos) {
|
||||
change |= i2 >= nextIndex || src.charAt(i2) != 0x301;
|
||||
++i2;
|
||||
}
|
||||
int oldLength = nextIndex - i;
|
||||
int newLength = (i2 - i) + numYpogegrammeni;
|
||||
change |= oldLength != newLength;
|
||||
if (change) {
|
||||
if (edits != null) {
|
||||
edits.addReplace(oldLength, newLength);
|
||||
}
|
||||
} else {
|
||||
if (edits != null) {
|
||||
edits.addUnchanged(oldLength);
|
||||
}
|
||||
// Write unchanged text?
|
||||
change = (options & OMIT_UNCHANGED_TEXT) == 0;
|
||||
}
|
||||
}
|
||||
if (addTonos) {
|
||||
result.append('\u0301');
|
||||
}
|
||||
while (numYpogegrammeni > 0) {
|
||||
result.append('Ι');
|
||||
--numYpogegrammeni;
|
||||
|
||||
if (change) {
|
||||
dest.append((char)upper);
|
||||
if ((data & HAS_EITHER_DIALYTIKA) != 0) {
|
||||
dest.append('\u0308'); // restore or add a dialytika
|
||||
}
|
||||
if (addTonos) {
|
||||
dest.append('\u0301');
|
||||
}
|
||||
while (numYpogegrammeni > 0) {
|
||||
dest.append('Ι');
|
||||
--numYpogegrammeni;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, null, result, null, locCache);
|
||||
appendResult(c, result);
|
||||
c = UCaseProps.INSTANCE.toFullUpper(c, null, dest, UCaseProps.LOC_GREEK);
|
||||
appendResult(c, dest, nextIndex - i, options, edits);
|
||||
}
|
||||
i = nextIndex;
|
||||
state = nextState;
|
||||
}
|
||||
return result.toString();
|
||||
return dest;
|
||||
}
|
||||
}
|
||||
}
|
@ -24,6 +24,7 @@ package com.ibm.icu.impl;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
@ -71,7 +72,7 @@ public final class UCaseProps {
|
||||
// read exceptions[]
|
||||
count=indexes[IX_EXC_LENGTH];
|
||||
if(count>0) {
|
||||
exceptions=ICUBinary.getChars(bytes, count, 0);
|
||||
exceptions=ICUBinary.getString(bytes, count, 0);
|
||||
}
|
||||
|
||||
// read unfold[]
|
||||
@ -150,7 +151,7 @@ public final class UCaseProps {
|
||||
*
|
||||
* @param excWord (in) initial exceptions word
|
||||
* @param index (in) desired slot index
|
||||
* @param excOffset (in) offset into exceptions[] after excWord=exceptions[excOffset++];
|
||||
* @param excOffset (in) offset into exceptions[] after excWord=exceptions.charAt(excOffset++);
|
||||
* @return bits 31..0: slot value
|
||||
* 63..32: modified excOffset, moved to the last char of the value, use +1 for beginning of next slot
|
||||
*/
|
||||
@ -158,11 +159,11 @@ public final class UCaseProps {
|
||||
long value;
|
||||
if((excWord&EXC_DOUBLE_SLOTS)==0) {
|
||||
excOffset+=slotOffset(excWord, index);
|
||||
value=exceptions[excOffset];
|
||||
value=exceptions.charAt(excOffset);
|
||||
} else {
|
||||
excOffset+=2*slotOffset(excWord, index);
|
||||
value=exceptions[excOffset++];
|
||||
value=(value<<16)|exceptions[excOffset];
|
||||
value=exceptions.charAt(excOffset++);
|
||||
value=(value<<16)|exceptions.charAt(excOffset);
|
||||
}
|
||||
return value |((long)excOffset<<32);
|
||||
}
|
||||
@ -172,11 +173,11 @@ public final class UCaseProps {
|
||||
int value;
|
||||
if((excWord&EXC_DOUBLE_SLOTS)==0) {
|
||||
excOffset+=slotOffset(excWord, index);
|
||||
value=exceptions[excOffset];
|
||||
value=exceptions.charAt(excOffset);
|
||||
} else {
|
||||
excOffset+=2*slotOffset(excWord, index);
|
||||
value=exceptions[excOffset++];
|
||||
value=(value<<16)|exceptions[excOffset];
|
||||
value=exceptions.charAt(excOffset++);
|
||||
value=(value<<16)|exceptions.charAt(excOffset);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
@ -191,7 +192,7 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
if(hasSlot(excWord, EXC_LOWER)) {
|
||||
c=getSlotValue(excWord, EXC_LOWER, excOffset);
|
||||
}
|
||||
@ -207,7 +208,7 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
if(hasSlot(excWord, EXC_UPPER)) {
|
||||
c=getSlotValue(excWord, EXC_UPPER, excOffset);
|
||||
}
|
||||
@ -223,7 +224,7 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int index;
|
||||
if(hasSlot(excWord, EXC_TITLE)) {
|
||||
index=EXC_TITLE;
|
||||
@ -291,7 +292,7 @@ public final class UCaseProps {
|
||||
*/
|
||||
int excOffset0, excOffset=getExceptionsOffset(props);
|
||||
int closureOffset;
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int index, closureLength, fullLength, length;
|
||||
|
||||
excOffset0=excOffset;
|
||||
@ -334,7 +335,7 @@ public final class UCaseProps {
|
||||
/* add the full case folding string */
|
||||
length=fullLength&0xf;
|
||||
if(length!=0) {
|
||||
set.add(new String(exceptions, excOffset, length));
|
||||
set.add(exceptions.substring(excOffset, excOffset+length));
|
||||
excOffset+=length;
|
||||
}
|
||||
|
||||
@ -348,8 +349,9 @@ public final class UCaseProps {
|
||||
}
|
||||
|
||||
/* add each code point in the closure string */
|
||||
for(index=0; index<closureLength; index+=UTF16.getCharCount(c)) {
|
||||
c=UTF16.charAt(exceptions, closureOffset, exceptions.length, index);
|
||||
int limit=closureOffset+closureLength;
|
||||
for(index=closureOffset; index<limit; index+=UTF16.getCharCount(c)) {
|
||||
c=exceptions.codePointAt(index);
|
||||
set.add(c);
|
||||
}
|
||||
}
|
||||
@ -468,7 +470,7 @@ public final class UCaseProps {
|
||||
if(!propsHasException(props)) {
|
||||
return props&DOT_MASK;
|
||||
} else {
|
||||
return (exceptions[getExceptionsOffset(props)]>>EXC_DOT_SHIFT)&DOT_MASK;
|
||||
return (exceptions.charAt(getExceptionsOffset(props))>>EXC_DOT_SHIFT)&DOT_MASK;
|
||||
}
|
||||
}
|
||||
|
||||
@ -605,38 +607,49 @@ public final class UCaseProps {
|
||||
*/
|
||||
public static final int MAX_STRING_LENGTH=0x1f;
|
||||
|
||||
private static final int LOC_UNKNOWN=0;
|
||||
private static final int LOC_ROOT=1;
|
||||
//private static final int LOC_UNKNOWN=0;
|
||||
public static final int LOC_ROOT=1;
|
||||
private static final int LOC_TURKISH=2;
|
||||
private static final int LOC_LITHUANIAN=3;
|
||||
static final int LOC_GREEK=4;
|
||||
public static final int LOC_DUTCH=5;
|
||||
|
||||
/*
|
||||
* Checks and caches the type of locale ID as it is relevant for case mapping.
|
||||
* If the locCache is not null, then it must be initialized with locCache[0]=0 .
|
||||
*/
|
||||
static final int getCaseLocale(ULocale locale, int[] locCache) {
|
||||
int result;
|
||||
|
||||
if(locCache!=null && (result=locCache[0])!=LOC_UNKNOWN) {
|
||||
return result;
|
||||
public static final int getCaseLocale(Locale locale) {
|
||||
return getCaseLocale(locale.getLanguage());
|
||||
}
|
||||
public static final int getCaseLocale(ULocale locale) {
|
||||
return getCaseLocale(locale.getLanguage());
|
||||
}
|
||||
/** Accepts both 2- and 3-letter language subtags. */
|
||||
private static final int getCaseLocale(String language) {
|
||||
// Check the subtag length to reduce the number of comparisons
|
||||
// for locales without special behavior.
|
||||
// Fastpath for English "en" which is often used for default (=root locale) case mappings,
|
||||
// and for Chinese "zh": Very common but no special case mapping behavior.
|
||||
if(language.length()==2) {
|
||||
if(language.equals("en") || language.charAt(0)>'t') {
|
||||
return LOC_ROOT;
|
||||
} else if(language.equals("tr") || language.equals("az")) {
|
||||
return LOC_TURKISH;
|
||||
} else if(language.equals("el")) {
|
||||
return LOC_GREEK;
|
||||
} else if(language.equals("lt")) {
|
||||
return LOC_LITHUANIAN;
|
||||
} else if(language.equals("nl")) {
|
||||
return LOC_DUTCH;
|
||||
}
|
||||
} else if(language.length()==3) {
|
||||
if(language.equals("tur") || language.equals("aze")) {
|
||||
return LOC_TURKISH;
|
||||
} else if(language.equals("ell")) {
|
||||
return LOC_GREEK;
|
||||
} else if(language.equals("lit")) {
|
||||
return LOC_LITHUANIAN;
|
||||
} else if(language.equals("nld")) {
|
||||
return LOC_DUTCH;
|
||||
}
|
||||
}
|
||||
|
||||
result=LOC_ROOT;
|
||||
|
||||
String language=locale.getLanguage();
|
||||
if(language.equals("tr") || language.equals("tur") || language.equals("az") || language.equals("aze")) {
|
||||
result=LOC_TURKISH;
|
||||
} else if(language.equals("el") || language.equals("ell")) {
|
||||
result=LOC_GREEK;
|
||||
} else if(language.equals("lt") || language.equals("lit")) {
|
||||
result=LOC_LITHUANIAN;
|
||||
}
|
||||
|
||||
if(locCache!=null) {
|
||||
locCache[0]=result;
|
||||
}
|
||||
return result;
|
||||
return LOC_ROOT;
|
||||
}
|
||||
|
||||
/* Is followed by {case-ignorable}* cased ? (dir determines looking forward/backward) */
|
||||
@ -797,19 +810,14 @@ public final class UCaseProps {
|
||||
* See ContextIterator for details.
|
||||
* If iter==null then a context-independent result is returned.
|
||||
* @param out If the mapping result is a string, then it is appended to out.
|
||||
* @param locale Locale ID for locale-dependent mappings.
|
||||
* @param locCache Initialize locCache[0] to 0; may be used to cache the result of parsing
|
||||
* the locale ID for subsequent calls.
|
||||
* Can be null.
|
||||
* @param caseLocale Case locale value from ucase_getCaseLocale().
|
||||
* @return Output code point or string length, see MAX_STRING_LENGTH.
|
||||
*
|
||||
* @see ContextIterator
|
||||
* @see #MAX_STRING_LENGTH
|
||||
* @internal
|
||||
*/
|
||||
public final int toFullLower(int c, ContextIterator iter,
|
||||
StringBuilder out,
|
||||
ULocale locale, int[] locCache) {
|
||||
public final int toFullLower(int c, ContextIterator iter, Appendable out, int caseLocale) {
|
||||
int result, props;
|
||||
|
||||
result=c;
|
||||
@ -820,22 +828,20 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props), excOffset2;
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int full;
|
||||
|
||||
excOffset2=excOffset;
|
||||
|
||||
if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
|
||||
/* use hardcoded conditions and mappings */
|
||||
int loc=getCaseLocale(locale, locCache);
|
||||
|
||||
/*
|
||||
* Test for conditional mappings first
|
||||
* (otherwise the unconditional default mappings are always taken),
|
||||
* then test for characters that have unconditional mappings in SpecialCasing.txt,
|
||||
* then get the UnicodeData.txt mappings.
|
||||
*/
|
||||
if( loc==LOC_LITHUANIAN &&
|
||||
if( caseLocale==LOC_LITHUANIAN &&
|
||||
/* base characters, find accents above */
|
||||
(((c==0x49 || c==0x4a || c==0x12e) &&
|
||||
isFollowedByMoreAbove(iter)) ||
|
||||
@ -858,30 +864,34 @@ public final class UCaseProps {
|
||||
00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||||
0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||||
*/
|
||||
switch(c) {
|
||||
case 0x49: /* LATIN CAPITAL LETTER I */
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
case 0x4a: /* LATIN CAPITAL LETTER J */
|
||||
out.append(jDot);
|
||||
return 2;
|
||||
case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
|
||||
out.append(iOgonekDot);
|
||||
return 2;
|
||||
case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
|
||||
out.append(iDotGrave);
|
||||
return 3;
|
||||
case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
|
||||
out.append(iDotAcute);
|
||||
return 3;
|
||||
case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
|
||||
out.append(iDotTilde);
|
||||
return 3;
|
||||
default:
|
||||
return 0; /* will not occur */
|
||||
try {
|
||||
switch(c) {
|
||||
case 0x49: /* LATIN CAPITAL LETTER I */
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
case 0x4a: /* LATIN CAPITAL LETTER J */
|
||||
out.append(jDot);
|
||||
return 2;
|
||||
case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
|
||||
out.append(iOgonekDot);
|
||||
return 2;
|
||||
case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */
|
||||
out.append(iDotGrave);
|
||||
return 3;
|
||||
case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */
|
||||
out.append(iDotAcute);
|
||||
return 3;
|
||||
case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
|
||||
out.append(iDotTilde);
|
||||
return 3;
|
||||
default:
|
||||
return 0; /* will not occur */
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
/* # Turkish and Azeri */
|
||||
} else if(loc==LOC_TURKISH && c==0x130) {
|
||||
} else if(caseLocale==LOC_TURKISH && c==0x130) {
|
||||
/*
|
||||
# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||||
# The following rules handle those cases.
|
||||
@ -890,7 +900,7 @@ public final class UCaseProps {
|
||||
0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
*/
|
||||
return 0x69;
|
||||
} else if(loc==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
|
||||
} else if(caseLocale==LOC_TURKISH && c==0x307 && isPrecededBy_I(iter)) {
|
||||
/*
|
||||
# When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||||
# This matches the behavior of the canonically equivalent I-dot_above
|
||||
@ -899,7 +909,7 @@ public final class UCaseProps {
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
*/
|
||||
return 0; /* remove the dot (continue without output) */
|
||||
} else if(loc==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
|
||||
} else if(caseLocale==LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter)) {
|
||||
/*
|
||||
# When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||||
|
||||
@ -913,8 +923,12 @@ public final class UCaseProps {
|
||||
|
||||
0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
*/
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
try {
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
} else if( c==0x3a3 &&
|
||||
!isFollowedByCasedLetter(iter, 1) &&
|
||||
isFollowedByCasedLetter(iter, -1) /* -1=preceded */
|
||||
@ -936,11 +950,15 @@ public final class UCaseProps {
|
||||
/* start of full case mapping strings */
|
||||
excOffset=(int)(value>>32)+1;
|
||||
|
||||
/* set the output pointer to the lowercase mapping */
|
||||
out.append(exceptions, excOffset, full);
|
||||
try {
|
||||
// append the lowercase mapping
|
||||
out.append(exceptions, excOffset, excOffset+full);
|
||||
|
||||
/* return the string length */
|
||||
return full;
|
||||
/* return the string length */
|
||||
return full;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -954,8 +972,8 @@ public final class UCaseProps {
|
||||
|
||||
/* internal */
|
||||
private final int toUpperOrTitle(int c, ContextIterator iter,
|
||||
StringBuilder out,
|
||||
ULocale locale, int[] locCache,
|
||||
Appendable out,
|
||||
int loc,
|
||||
boolean upperNotTitle) {
|
||||
int result;
|
||||
int props;
|
||||
@ -968,15 +986,13 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props), excOffset2;
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int full, index;
|
||||
|
||||
excOffset2=excOffset;
|
||||
|
||||
if((excWord&EXC_CONDITIONAL_SPECIAL)!=0) {
|
||||
/* use hardcoded conditions and mappings */
|
||||
int loc=getCaseLocale(locale, locCache);
|
||||
|
||||
if(loc==LOC_TURKISH && c==0x69) {
|
||||
/*
|
||||
# Turkish and Azeri
|
||||
@ -1026,11 +1042,15 @@ public final class UCaseProps {
|
||||
}
|
||||
|
||||
if(full!=0) {
|
||||
/* set the output pointer to the result string */
|
||||
out.append(exceptions, excOffset, full);
|
||||
try {
|
||||
// append the result string
|
||||
out.append(exceptions, excOffset, excOffset+full);
|
||||
|
||||
/* return the string length */
|
||||
return full;
|
||||
/* return the string length */
|
||||
return full;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1049,15 +1069,15 @@ public final class UCaseProps {
|
||||
}
|
||||
|
||||
public final int toFullUpper(int c, ContextIterator iter,
|
||||
StringBuilder out,
|
||||
ULocale locale, int[] locCache) {
|
||||
return toUpperOrTitle(c, iter, out, locale, locCache, true);
|
||||
Appendable out,
|
||||
int caseLocale) {
|
||||
return toUpperOrTitle(c, iter, out, caseLocale, true);
|
||||
}
|
||||
|
||||
public final int toFullTitle(int c, ContextIterator iter,
|
||||
StringBuilder out,
|
||||
ULocale locale, int[] locCache) {
|
||||
return toUpperOrTitle(c, iter, out, locale, locCache, false);
|
||||
Appendable out,
|
||||
int caseLocale) {
|
||||
return toUpperOrTitle(c, iter, out, caseLocale, false);
|
||||
}
|
||||
|
||||
/* case folding ------------------------------------------------------------- */
|
||||
@ -1117,7 +1137,7 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props);
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int index;
|
||||
if((excWord&EXC_CONDITIONAL_FOLD)!=0) {
|
||||
/* special case folding mappings, hardcoded */
|
||||
@ -1168,7 +1188,7 @@ public final class UCaseProps {
|
||||
* together in a way that they still fold to common result strings.
|
||||
*/
|
||||
|
||||
public final int toFullFolding(int c, StringBuilder out, int options) {
|
||||
public final int toFullFolding(int c, Appendable out, int options) {
|
||||
int result;
|
||||
int props;
|
||||
|
||||
@ -1180,7 +1200,7 @@ public final class UCaseProps {
|
||||
}
|
||||
} else {
|
||||
int excOffset=getExceptionsOffset(props), excOffset2;
|
||||
int excWord=exceptions[excOffset++];
|
||||
int excWord=exceptions.charAt(excOffset++);
|
||||
int full, index;
|
||||
|
||||
excOffset2=excOffset;
|
||||
@ -1194,8 +1214,12 @@ public final class UCaseProps {
|
||||
return 0x69;
|
||||
} else if(c==0x130) {
|
||||
/* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
try {
|
||||
out.append(iDot);
|
||||
return 2;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* Turkic mappings */
|
||||
@ -1219,11 +1243,15 @@ public final class UCaseProps {
|
||||
full=(full>>4)&0xf;
|
||||
|
||||
if(full!=0) {
|
||||
/* set the output pointer to the result string */
|
||||
out.append(exceptions, excOffset, full);
|
||||
try {
|
||||
// append the result string
|
||||
out.append(exceptions, excOffset, excOffset+full);
|
||||
|
||||
/* return the string length */
|
||||
return full;
|
||||
/* return the string length */
|
||||
return full;
|
||||
} catch (IOException e) {
|
||||
throw new ICUUncheckedIOException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1242,7 +1270,6 @@ public final class UCaseProps {
|
||||
|
||||
/* case mapping properties API ---------------------------------------------- */
|
||||
|
||||
private static final int[] rootLocCache = { LOC_ROOT };
|
||||
/*
|
||||
* We need a StringBuilder for multi-code point output from the
|
||||
* full case mapping functions. However, we do not actually use that output,
|
||||
@ -1282,20 +1309,20 @@ public final class UCaseProps {
|
||||
*/
|
||||
case UProperty.CHANGES_WHEN_LOWERCASED:
|
||||
dummyStringBuilder.setLength(0);
|
||||
return toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
|
||||
return toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0;
|
||||
case UProperty.CHANGES_WHEN_UPPERCASED:
|
||||
dummyStringBuilder.setLength(0);
|
||||
return toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
|
||||
return toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0;
|
||||
case UProperty.CHANGES_WHEN_TITLECASED:
|
||||
dummyStringBuilder.setLength(0);
|
||||
return toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
|
||||
return toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
|
||||
/* case UProperty.CHANGES_WHEN_CASEFOLDED: -- in UCharacterProperty.java */
|
||||
case UProperty.CHANGES_WHEN_CASEMAPPED:
|
||||
dummyStringBuilder.setLength(0);
|
||||
return
|
||||
toFullLower(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
|
||||
toFullUpper(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0 ||
|
||||
toFullTitle(c, null, dummyStringBuilder, ULocale.ROOT, rootLocCache)>=0;
|
||||
toFullLower(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
|
||||
toFullUpper(c, null, dummyStringBuilder, LOC_ROOT)>=0 ||
|
||||
toFullTitle(c, null, dummyStringBuilder, LOC_ROOT)>=0;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
@ -1303,7 +1330,7 @@ public final class UCaseProps {
|
||||
|
||||
// data members -------------------------------------------------------- ***
|
||||
private int indexes[];
|
||||
private char exceptions[];
|
||||
private String exceptions;
|
||||
private char unfold[];
|
||||
|
||||
private Trie2_16 trie;
|
||||
|
@ -15,8 +15,7 @@ import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import com.ibm.icu.impl.CaseMap;
|
||||
import com.ibm.icu.impl.CaseMap.StringContextIterator;
|
||||
import com.ibm.icu.impl.CaseMapImpl;
|
||||
import com.ibm.icu.impl.IllegalIcuArgumentException;
|
||||
import com.ibm.icu.impl.Trie2;
|
||||
import com.ibm.icu.impl.UBiDiProps;
|
||||
@ -29,6 +28,7 @@ import com.ibm.icu.impl.UPropertyAliases;
|
||||
import com.ibm.icu.lang.UCharacterEnums.ECharacterCategory;
|
||||
import com.ibm.icu.lang.UCharacterEnums.ECharacterDirection;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.Edits;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.util.RangeValueIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
@ -4875,7 +4875,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toUpperCase(String str)
|
||||
{
|
||||
return toUpperCase(ULocale.getDefault(), str);
|
||||
return toUpperCase(getDefaultCaseLocale(), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4887,7 +4887,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toLowerCase(String str)
|
||||
{
|
||||
return toLowerCase(ULocale.getDefault(), str);
|
||||
return toLowerCase(getDefaultCaseLocale(), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4910,7 +4910,94 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toTitleCase(String str, BreakIterator breakiter)
|
||||
{
|
||||
return toTitleCase(ULocale.getDefault(), str, breakiter);
|
||||
return toTitleCase(Locale.getDefault(), str, breakiter, 0);
|
||||
}
|
||||
|
||||
private static int getDefaultCaseLocale() {
|
||||
return UCaseProps.getCaseLocale(Locale.getDefault());
|
||||
}
|
||||
|
||||
private static int getCaseLocale(Locale locale) {
|
||||
if (locale == null) {
|
||||
locale = Locale.getDefault();
|
||||
}
|
||||
return UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
private static int getCaseLocale(ULocale locale) {
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
return UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
private static String toLowerCase(int caseLocale, String str) {
|
||||
if (str.length() <= 100) {
|
||||
if (str.isEmpty()) {
|
||||
return str;
|
||||
}
|
||||
// Collect and apply only changes.
|
||||
// Good if no or few changes. Bad (slow) if many changes.
|
||||
Edits edits = new Edits();
|
||||
StringBuilder replacementChars = CaseMapImpl.toLower(
|
||||
caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
|
||||
return applyEdits(str, replacementChars, edits);
|
||||
} else {
|
||||
return CaseMapImpl.toLower(caseLocale, 0, str,
|
||||
new StringBuilder(str.length()), null).toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static String toUpperCase(int caseLocale, String str) {
|
||||
if (str.length() <= 100) {
|
||||
if (str.isEmpty()) {
|
||||
return str;
|
||||
}
|
||||
// Collect and apply only changes.
|
||||
// Good if no or few changes. Bad (slow) if many changes.
|
||||
Edits edits = new Edits();
|
||||
StringBuilder replacementChars = CaseMapImpl.toUpper(
|
||||
caseLocale, CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
|
||||
return applyEdits(str, replacementChars, edits);
|
||||
} else {
|
||||
return CaseMapImpl.toUpper(caseLocale, 0, str,
|
||||
new StringBuilder(str.length()), null).toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static String toTitleCase(int caseLocale, int options, BreakIterator titleIter, String str) {
|
||||
if (str.length() <= 100) {
|
||||
if (str.isEmpty()) {
|
||||
return str;
|
||||
}
|
||||
// Collect and apply only changes.
|
||||
// Good if no or few changes. Bad (slow) if many changes.
|
||||
Edits edits = new Edits();
|
||||
StringBuilder replacementChars = CaseMapImpl.toTitle(
|
||||
caseLocale, options | CaseMapImpl.OMIT_UNCHANGED_TEXT, titleIter, str,
|
||||
new StringBuilder(), edits);
|
||||
return applyEdits(str, replacementChars, edits);
|
||||
} else {
|
||||
return CaseMapImpl.toTitle(caseLocale, options, titleIter, str,
|
||||
new StringBuilder(str.length()), null).toString();
|
||||
}
|
||||
}
|
||||
|
||||
private static String applyEdits(String str, StringBuilder replacementChars, Edits edits) {
|
||||
if (!edits.hasChanges()) {
|
||||
return str;
|
||||
}
|
||||
StringBuilder result = new StringBuilder(str.length() + edits.lengthDelta());
|
||||
for (Edits.Iterator ei = edits.getCoarseIterator(); ei.next();) {
|
||||
if (ei.hasChange()) {
|
||||
int i = ei.replacementIndex();
|
||||
result.append(replacementChars, i, i + ei.newLength());
|
||||
} else {
|
||||
int i = ei.sourceIndex();
|
||||
result.append(str, i, i + ei.oldLength());
|
||||
}
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4923,7 +5010,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toUpperCase(Locale locale, String str)
|
||||
{
|
||||
return toUpperCase(ULocale.forLocale(locale), str);
|
||||
return toUpperCase(getCaseLocale(locale), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4935,7 +5022,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* @stable ICU 3.2
|
||||
*/
|
||||
public static String toUpperCase(ULocale locale, String str) {
|
||||
return CaseMap.toUpper(locale, str);
|
||||
return toUpperCase(getCaseLocale(locale), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4948,7 +5035,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toLowerCase(Locale locale, String str)
|
||||
{
|
||||
return toLowerCase(ULocale.forLocale(locale), str);
|
||||
return toLowerCase(getCaseLocale(locale), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -4960,31 +5047,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* @stable ICU 3.2
|
||||
*/
|
||||
public static String toLowerCase(ULocale locale, String str) {
|
||||
StringContextIterator iter = new StringContextIterator(str);
|
||||
StringBuilder result = new StringBuilder(str.length());
|
||||
int[] locCache = new int[1];
|
||||
int c;
|
||||
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
locCache[0]=0;
|
||||
|
||||
while((c=iter.nextCaseMapCP())>=0) {
|
||||
c = UCaseProps.INSTANCE.toFullLower(c, iter, result, locale, locCache);
|
||||
|
||||
/* decode the result */
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
continue;
|
||||
/* } else { append single-code point mapping */
|
||||
}
|
||||
result.appendCodePoint(c);
|
||||
}
|
||||
return result.toString();
|
||||
return toLowerCase(getCaseLocale(locale), str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5009,7 +5072,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
public static String toTitleCase(Locale locale, String str,
|
||||
BreakIterator breakiter)
|
||||
{
|
||||
return toTitleCase(ULocale.forLocale(locale), str, breakiter);
|
||||
return toTitleCase(locale, str, breakiter, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5059,126 +5122,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* @see #TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
*/
|
||||
public static String toTitleCase(ULocale locale, String str,
|
||||
BreakIterator titleIter,
|
||||
int options) {
|
||||
StringContextIterator iter = new StringContextIterator(str);
|
||||
StringBuilder result = new StringBuilder(str.length());
|
||||
int[] locCache = new int[1];
|
||||
int c, nc, srcLength = str.length();
|
||||
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
locCache[0]=0;
|
||||
|
||||
BreakIterator titleIter, int options) {
|
||||
if(titleIter == null) {
|
||||
if (locale == null) {
|
||||
locale = ULocale.getDefault();
|
||||
}
|
||||
titleIter = BreakIterator.getWordInstance(locale);
|
||||
}
|
||||
titleIter.setText(str);
|
||||
|
||||
int prev, titleStart, index;
|
||||
boolean isFirstIndex;
|
||||
boolean isDutch = locale.getLanguage().equals("nl");
|
||||
boolean FirstIJ = true;
|
||||
|
||||
/* set up local variables */
|
||||
prev=0;
|
||||
isFirstIndex=true;
|
||||
|
||||
/* titlecasing loop */
|
||||
while(prev<srcLength) {
|
||||
/* find next index where to titlecase */
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=false;
|
||||
index=titleIter.first();
|
||||
} else {
|
||||
index=titleIter.next();
|
||||
}
|
||||
if(index==BreakIterator.DONE || index>srcLength) {
|
||||
index=srcLength;
|
||||
}
|
||||
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first case letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
iter.setLimit(index);
|
||||
c=iter.nextCaseMapCP();
|
||||
if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {
|
||||
while((c=iter.nextCaseMapCP())>=0
|
||||
&& UCaseProps.NONE==UCaseProps.INSTANCE.getType(c)) {}
|
||||
titleStart=iter.getCPStart();
|
||||
if(prev<titleStart) {
|
||||
result.append(str, prev, titleStart);
|
||||
}
|
||||
} else {
|
||||
titleStart=prev;
|
||||
}
|
||||
|
||||
if(titleStart<index) {
|
||||
FirstIJ = true;
|
||||
/* titlecase c which is from titleStart */
|
||||
c = UCaseProps.INSTANCE.toFullTitle(c, iter, result, locale, locCache);
|
||||
|
||||
/* decode the result and lowercase up to index */
|
||||
for(;;) {
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
result.appendCodePoint(c);
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
} else {
|
||||
/* append single-code point mapping */
|
||||
result.appendCodePoint(c);
|
||||
}
|
||||
|
||||
if((options&TITLECASE_NO_LOWERCASE)!=0) {
|
||||
/* Optionally just copy the rest of the word unchanged. */
|
||||
|
||||
int titleLimit=iter.getCPLimit();
|
||||
if(titleLimit<index) {
|
||||
/* Special Case - Dutch IJ Titlecasing */
|
||||
if (isDutch && c == 0x0049 && str.charAt(titleLimit) == 'j') {
|
||||
result.append('J').append(str, titleLimit + 1, index);
|
||||
} else {
|
||||
result.append(str, titleLimit, index);
|
||||
}
|
||||
}
|
||||
iter.moveToLimit();
|
||||
break;
|
||||
} else if((nc=iter.nextCaseMapCP())>=0) {
|
||||
if (isDutch && (nc == 0x004A || nc == 0x006A)
|
||||
&& (c == 0x0049) && (FirstIJ == true)) {
|
||||
c = 0x004A; /* J */
|
||||
FirstIJ = false;
|
||||
} else {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
c = UCaseProps.INSTANCE.toFullLower(nc, iter, result, locale,
|
||||
locCache);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prev=index;
|
||||
}
|
||||
return result.toString();
|
||||
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
|
||||
}
|
||||
|
||||
|
||||
@ -5281,7 +5233,11 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
public static String toTitleCase(Locale locale, String str,
|
||||
BreakIterator titleIter,
|
||||
int options) {
|
||||
return toTitleCase(ULocale.forLocale(locale), str, titleIter, options);
|
||||
if(titleIter == null) {
|
||||
titleIter = BreakIterator.getWordInstance(locale);
|
||||
}
|
||||
titleIter.setText(str);
|
||||
return toTitleCase(getCaseLocale(locale), options, titleIter, str);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -5398,27 +5354,19 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
public static final String foldCase(String str, int options) {
|
||||
StringBuilder result = new StringBuilder(str.length());
|
||||
int c, i, length;
|
||||
|
||||
length = str.length();
|
||||
for(i=0; i<length;) {
|
||||
c=str.codePointAt(i);
|
||||
i+=Character.charCount(c);
|
||||
c = UCaseProps.INSTANCE.toFullFolding(c, result, options);
|
||||
|
||||
/* decode the result */
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
continue;
|
||||
/* } else { append single-code point mapping */
|
||||
if (str.length() <= 100) {
|
||||
if (str.isEmpty()) {
|
||||
return str;
|
||||
}
|
||||
result.appendCodePoint(c);
|
||||
// Collect and apply only changes.
|
||||
// Good if no or few changes. Bad (slow) if many changes.
|
||||
Edits edits = new Edits();
|
||||
StringBuilder replacementChars = CaseMapImpl.fold(
|
||||
options | CaseMapImpl.OMIT_UNCHANGED_TEXT, str, new StringBuilder(), edits);
|
||||
return applyEdits(str, replacementChars, edits);
|
||||
} else {
|
||||
return CaseMapImpl.fold(options, str, new StringBuilder(str.length()), null).toString();
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
|
339
icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
Normal file
339
icu4j/main/classes/core/src/com/ibm/icu/text/CaseMap.java
Normal file
@ -0,0 +1,339 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
import com.ibm.icu.impl.CaseMapImpl;
|
||||
import com.ibm.icu.impl.UCaseProps;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
* Low-level case mapping options and methods. Immutable.
|
||||
* "Setters" return instances with the union of the current and new options set.
|
||||
*
|
||||
* This class is not intended for public subclassing.
|
||||
*
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract class CaseMap {
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected int internalOptions;
|
||||
|
||||
private CaseMap(int opt) { internalOptions = opt; }
|
||||
|
||||
private static int getCaseLocale(Locale locale) {
|
||||
if (locale == null) {
|
||||
locale = Locale.getDefault();
|
||||
}
|
||||
return UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return Lowercasing object with default options.
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static Lower toLower() { return Lower.DEFAULT; }
|
||||
/**
|
||||
* @return Uppercasing object with default options.
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static Upper toUpper() { return Upper.DEFAULT; }
|
||||
/**
|
||||
* @return Titlecasing object with default options.
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static Title toTitle() { return Title.DEFAULT; }
|
||||
/**
|
||||
* @return Case folding object with default options.
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static Fold fold() { return Fold.DEFAULT; }
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* omits unchanged text when case-mapping with {@link Edits}.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract CaseMap omitUnchangedText();
|
||||
|
||||
/**
|
||||
* Lowercasing options and methods. Immutable.
|
||||
*
|
||||
* @see #toLower()
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Lower extends CaseMap {
|
||||
private static final Lower DEFAULT = new Lower(0);
|
||||
private static final Lower OMIT_UNCHANGED = new Lower(CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private Lower(int opt) { super(opt); }
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
@Override
|
||||
public Lower omitUnchangedText() {
|
||||
return OMIT_UNCHANGED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Lowercases a string and optionally records edits (see {@link #omitUnchangedText}).
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param locale The locale ID. Can be null for {@link Locale#getDefault}.
|
||||
* (See {@link ULocale#toLocale}.)
|
||||
* @param src The original string.
|
||||
* @param dest A buffer for the result string. Must not be null.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits.reset() first. edits can be null.
|
||||
* @return dest with the result string (or only changes) appended.
|
||||
*
|
||||
* @see UCharacter#toLowerCase(Locale, String)
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public <A extends Appendable> A apply(
|
||||
Locale locale, CharSequence src, A dest, Edits edits) {
|
||||
return CaseMapImpl.toLower(getCaseLocale(locale), internalOptions, src, dest, edits);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Uppercasing options and methods. Immutable.
|
||||
*
|
||||
* @see #toUpper()
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Upper extends CaseMap {
|
||||
private static final Upper DEFAULT = new Upper(0);
|
||||
private static final Upper OMIT_UNCHANGED = new Upper(CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private Upper(int opt) { super(opt); }
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
@Override
|
||||
public Upper omitUnchangedText() {
|
||||
return OMIT_UNCHANGED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Uppercases a string and optionally records edits (see {@link #omitUnchangedText}).
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param locale The locale ID. Can be null for {@link Locale#getDefault}.
|
||||
* (See {@link ULocale#toLocale}.)
|
||||
* @param src The original string.
|
||||
* @param dest A buffer for the result string. Must not be null.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits.reset() first. edits can be null.
|
||||
* @return dest with the result string (or only changes) appended.
|
||||
*
|
||||
* @see UCharacter#toUpperCase(Locale, String)
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public <A extends Appendable> A apply(
|
||||
Locale locale, CharSequence src, A dest, Edits edits) {
|
||||
return CaseMapImpl.toUpper(getCaseLocale(locale), internalOptions, src, dest, edits);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Titlecasing options and methods. Immutable.
|
||||
*
|
||||
* @see #toTitle()
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Title extends CaseMap {
|
||||
private static final Title DEFAULT = new Title(0);
|
||||
private static final Title OMIT_UNCHANGED = new Title(CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private Title(int opt) { super(opt); }
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
@Override
|
||||
public Title omitUnchangedText() {
|
||||
if (internalOptions == 0 || internalOptions == CaseMapImpl.OMIT_UNCHANGED_TEXT) {
|
||||
return OMIT_UNCHANGED;
|
||||
}
|
||||
return new Title(internalOptions | CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* does not lowercase non-initial parts of words when titlecasing.
|
||||
*
|
||||
* <p>By default, titlecasing will titlecase the first cased character
|
||||
* of a word and lowercase all other characters.
|
||||
* With this option, the other characters will not be modified.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see UCharacter#TITLECASE_NO_LOWERCASE
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title noLowercase() {
|
||||
return new Title(internalOptions | UCharacter.TITLECASE_NO_LOWERCASE);
|
||||
}
|
||||
|
||||
// TODO: update references to the Unicode Standard for recent version
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* does not adjust the titlecasing indexes from BreakIterator::next() indexes;
|
||||
* titlecases exactly the characters at breaks from the iterator.
|
||||
*
|
||||
* <p>By default, titlecasing will take each break iterator index,
|
||||
* adjust it by looking for the next cased character, and titlecase that one.
|
||||
* Other characters are lowercased.
|
||||
*
|
||||
* <p>This follows Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see UCharacter#TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Title noBreakAdjustment() {
|
||||
return new Title(internalOptions | UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Titlecases a string and optionally records edits (see {@link #omitUnchangedText}).
|
||||
* Casing is locale-dependent and context-sensitive.
|
||||
* The result may be longer or shorter than the original.
|
||||
*
|
||||
* <p>Titlecasing uses a break iterator to find the first characters of words
|
||||
* that are to be titlecased. It titlecases those characters and lowercases
|
||||
* all others. (This can be modified with options bits.)
|
||||
*
|
||||
* @param locale The locale ID. Can be null for {@link Locale#getDefault}.
|
||||
* (See {@link ULocale#toLocale}.)
|
||||
* @param iter A break iterator to find the first characters of words that are to be titlecased.
|
||||
* It is set to the source string (setText())
|
||||
* and used one or more times for iteration (first() and next()).
|
||||
* If null, then a word break iterator for the locale is used
|
||||
* (or something equivalent).
|
||||
* @param src The original string.
|
||||
* @param dest A buffer for the result string. Must not be null.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits.reset() first. edits can be null.
|
||||
* @return dest with the result string (or only changes) appended.
|
||||
*
|
||||
* @see UCharacter#toTitleCase(Locale, String, BreakIterator, int)
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public <A extends Appendable> A apply(
|
||||
Locale locale, BreakIterator iter, CharSequence src, A dest, Edits edits) {
|
||||
if (iter == null) {
|
||||
iter = BreakIterator.getWordInstance(locale);
|
||||
}
|
||||
iter.setText(src.toString());
|
||||
return CaseMapImpl.toTitle(
|
||||
getCaseLocale(locale), internalOptions, iter, src, dest, edits);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Case folding options and methods. Immutable.
|
||||
*
|
||||
* @see #fold()
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Fold extends CaseMap {
|
||||
private static final Fold DEFAULT = new Fold(0);
|
||||
private static final Fold TURKIC = new Fold(UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I);
|
||||
private static final Fold OMIT_UNCHANGED = new Fold(CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private static final Fold TURKIC_OMIT_UNCHANGED = new Fold(
|
||||
UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I | CaseMapImpl.OMIT_UNCHANGED_TEXT);
|
||||
private Fold(int opt) { super(opt); }
|
||||
|
||||
/**
|
||||
* {@inheritDoc}
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
@Override
|
||||
public Fold omitUnchangedText() {
|
||||
return (internalOptions & UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0 ?
|
||||
OMIT_UNCHANGED : TURKIC_OMIT_UNCHANGED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an instance that behaves like this one but
|
||||
* handles dotted I and dotless i appropriately for Turkic languages (tr, az).
|
||||
*
|
||||
* <p>Uses the Unicode CaseFolding.txt mappings marked with 'T' that
|
||||
* are to be excluded for default mappings and
|
||||
* included for the Turkic-specific mappings.
|
||||
*
|
||||
* @return an options object with this option.
|
||||
* @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Fold turkic() {
|
||||
return (internalOptions & CaseMapImpl.OMIT_UNCHANGED_TEXT) == 0 ?
|
||||
TURKIC : TURKIC_OMIT_UNCHANGED;
|
||||
}
|
||||
|
||||
/**
|
||||
* Case-folds a string and optionally records edits (see {@link #omitUnchangedText}).
|
||||
*
|
||||
* <p>Case-folding is locale-independent and not context-sensitive,
|
||||
* but there is an option for whether to include or exclude mappings for dotted I
|
||||
* and dotless i that are marked with 'T' in CaseFolding.txt.
|
||||
*
|
||||
* <p>The result may be longer or shorter than the original.
|
||||
*
|
||||
* @param src The original string.
|
||||
* @param dest A buffer for the result string. Must not be null.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* This function calls edits.reset() first. edits can be null.
|
||||
* @return dest with the result string (or only changes) appended.
|
||||
*
|
||||
* @see UCharacter#foldCase(String, int)
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public <A extends Appendable> A apply(CharSequence src, A dest, Edits edits) {
|
||||
return CaseMapImpl.fold(internalOptions, src, dest, edits);
|
||||
}
|
||||
}
|
||||
}
|
494
icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
Normal file
494
icu4j/main/classes/core/src/com/ibm/icu/text/Edits.java
Normal file
@ -0,0 +1,494 @@
|
||||
// © 2017 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.nio.BufferOverflowException;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Records lengths of string edits but not replacement text.
|
||||
* Supports replacements, insertions, deletions in linear progression.
|
||||
* Does not support moving/reordering of text.
|
||||
*
|
||||
* @draft ICU 59
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public final class Edits {
    // Encoding of the 16-bit units in the edits array:
    //
    // 0000uuuuuuuuuuuu records u+1 unchanged text units.
    private static final int MAX_UNCHANGED_LENGTH = 0x1000;
    private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;

    // 0wwwcccccccccccc with w=1..6 records ccc+1 replacements of w:w text units.
    // No length change.
    private static final int MAX_SHORT_WIDTH = 6;
    private static final int MAX_SHORT_CHANGE_LENGTH = 0xfff;
    private static final int MAX_SHORT_CHANGE = 0x6fff;

    // 0111mmmmmmnnnnnn records a replacement of m text units with n.
    // m or n = 61: actual length follows in the next edits array unit.
    // m or n = 62..63: actual length follows in the next two edits array units.
    // Bit 30 of the actual length is in the head unit.
    // Trailing units have bit 15 set.
    private static final int LENGTH_IN_1TRAIL = 61;
    private static final int LENGTH_IN_2TRAIL = 62;

    private static final int STACK_CAPACITY = 100;

    /** Storage for the encoded edit records; only the first {@code length} units are in use. */
    private char[] array;
    /** Number of units of {@code array} currently in use. */
    private int length;
    /** Sum of (newLength - oldLength) over the general replacement records added so far. */
    private int delta;

    /**
     * Constructs an empty object.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Edits() {
        array = new char[STACK_CAPACITY];
    }

    /**
     * Resets the data but may not release memory.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void reset() {
        length = 0;
        delta = 0;
    }

    /** Overwrites the last unit in use. Callers make sure that there is one. */
    private void setLastUnit(int last) {
        array[length - 1] = (char)last;
    }

    /** Returns the last unit in use, or 0xffff (which matches no mergeable record) if there is none. */
    private int lastUnit() {
        if (length <= 0) {
            return 0xffff;
        }
        return array[length - 1];
    }

    /**
     * Adds a record for an unchanged segment of text.
     * Normally called from inside ICU string transformation functions, not user code.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void addUnchanged(int unchangedLength) {
        if (unchangedLength < 0) {
            throw new IllegalArgumentException(
                    "addUnchanged(" + unchangedLength + "): length must not be negative");
        }
        int pending = unchangedLength;
        // First top up a preceding unchanged-text record, if it still has room.
        final int tail = lastUnit();
        if (tail < MAX_UNCHANGED) {
            final int room = MAX_UNCHANGED - tail;
            if (pending <= room) {
                setLastUnit(tail + pending);
                return;
            }
            setLastUnit(MAX_UNCHANGED);
            pending -= room;
        }
        // Emit as many full units as needed for a large length.
        for (; pending >= MAX_UNCHANGED_LENGTH; pending -= MAX_UNCHANGED_LENGTH) {
            append(MAX_UNCHANGED);
        }
        // Emit whatever is left over; a unit value u stands for u+1 text units.
        if (pending > 0) {
            append(pending - 1);
        }
    }

    /**
     * Adds a record for a text replacement/insertion/deletion.
     * Normally called from inside ICU string transformation functions, not user code.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public void addReplace(int oldLength, int newLength) {
        if (oldLength == newLength && 0 < oldLength && oldLength <= MAX_SHORT_WIDTH) {
            // Short same-length replacement: compact encoding, no length delta.
            addShortReplace(oldLength);
            return;
        }

        if (oldLength < 0 || newLength < 0) {
            throw new IllegalArgumentException(
                    "addReplace(" + oldLength + ", " + newLength +
                    "): both lengths must be non-negative");
        }
        if (oldLength == 0 && newLength == 0) {
            return;
        }
        final int newDelta = newLength - oldLength;
        if (newDelta != 0) {
            // Refuse to let the accumulated delta wrap around in either direction.
            final boolean overflow =
                    newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta);
            final boolean underflow =
                    newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta);
            if (overflow || underflow) {
                throw new IndexOutOfBoundsException();
            }
            delta += newDelta;
        }

        if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
            // Both lengths fit into the head unit.
            append(0x7000 | (oldLength << 6) | newLength);
            return;
        }
        // A head unit plus up to two trail units per length: at most 5 units.
        if ((array.length - length) < 5 && !growArray()) {
            return;
        }
        final int headPos = length;
        int end = putTrail(oldLength, headPos + 1);
        end = putTrail(newLength, end);
        array[headPos] = (char)(0x7000 | (headField(oldLength) << 6) | headField(newLength));
        length = end;
    }

    /**
     * Records one replacement of {@code width} units by {@code width} units,
     * merging it into a preceding short-replacement record of the same width when that has room.
     */
    private void addShortReplace(int width) {
        final int tail = lastUnit();
        final boolean mergeable =
                MAX_UNCHANGED < tail && tail < MAX_SHORT_CHANGE &&
                (tail >> 12) == width && (tail & 0xfff) < MAX_SHORT_CHANGE_LENGTH;
        if (mergeable) {
            setLastUnit(tail + 1);
        } else {
            append(width << 12);
        }
    }

    /** Returns the 6-bit head-unit field for a length: the length itself or a trail-count marker. */
    private static int headField(int len) {
        if (len < LENGTH_IN_1TRAIL) {
            return len;
        }
        if (len <= 0x7fff) {
            return LENGTH_IN_1TRAIL;
        }
        return LENGTH_IN_2TRAIL + (len >> 30);
    }

    /**
     * Writes the trail units (if any) for a length starting at {@code pos}.
     * @return the array position after the written units
     */
    private int putTrail(int len, int pos) {
        if (len < LENGTH_IN_1TRAIL) {
            return pos;
        }
        if (len > 0x7fff) {
            array[pos++] = (char)(0x8000 | (len >> 15));
        }
        array[pos++] = (char)(0x8000 | len);
        return pos;
    }

    /** Appends one unit, growing the array first if it is full. */
    private void append(int r) {
        if (length >= array.length && !growArray()) {
            return;
        }
        array[length++] = (char)r;
    }

    /**
     * Replaces the array with a larger copy.
     * @return true
     * @throws BufferOverflowException if the array cannot grow by at least 5 units
     */
    private boolean growArray() {
        final int capacity = array.length;
        final int newCapacity;
        if (capacity == STACK_CAPACITY) {
            newCapacity = 2000;
        } else if (capacity == Integer.MAX_VALUE) {
            throw new BufferOverflowException();
        } else if (capacity >= (Integer.MAX_VALUE / 2)) {
            newCapacity = Integer.MAX_VALUE;
        } else {
            newCapacity = 2 * capacity;
        }
        // Grow by at least 5 units so that a maximal change record will fit.
        if ((newCapacity - capacity) < 5) {
            throw new BufferOverflowException();
        }
        array = Arrays.copyOf(array, newCapacity);
        return true;
    }

    /**
     * How much longer is the new text compared with the old text?
     * @return new length minus old length
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public int lengthDelta() { return delta; }

    /**
     * @return true if there are any change edits
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public boolean hasChanges() {
        if (delta != 0) {
            return true;
        }
        // Same-length changes leave delta at 0, so look for any non-"unchanged" unit.
        int i = 0;
        while (i < length) {
            if (array[i++] > MAX_UNCHANGED) {
                return true;
            }
        }
        return false;
    }

    /**
     * Access to the list of edits.
     * @see #getCoarseIterator
     * @see #getFineIterator
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public static final class Iterator {
        private final char[] array;
        private int index;
        private final int length;
        /** Number of further equal-length changes still to report from one compressed unit. */
        private int remaining;
        private final boolean onlyChanges_, coarse;

        private boolean changed;
        private int oldLength_, newLength_;
        private int srcIndex, replIndex, destIndex;

        private Iterator(char[] a, int len, boolean oc, boolean crs) {
            array = a;
            length = len;
            onlyChanges_ = oc;
            coarse = crs;
        }

        /** Decodes a length from its head-unit field, consuming trail units as needed. */
        private int readLength(int head) {
            if (head < LENGTH_IN_1TRAIL) {
                return head;
            }
            if (head < LENGTH_IN_2TRAIL) {
                assert(index < length);
                assert(array[index] >= 0x8000);
                return array[index++] & 0x7fff;
            }
            assert((index + 2) <= length);
            assert(array[index] >= 0x8000);
            assert(array[index + 1] >= 0x8000);
            final int high = array[index] & 0x7fff;
            final int low = array[index + 1] & 0x7fff;
            index += 2;
            return ((head & 1) << 30) | (high << 15) | low;
        }

        /** Moves the string indexes past the current edit. */
        private void updateIndexes() {
            srcIndex += oldLength_;
            if (changed) {
                replIndex += newLength_;
            }
            destIndex += newLength_;
        }

        /** Sets the "no edit" state used beyond the end of the list. Always returns false. */
        private boolean noNext() {
            // No change beyond the string.
            changed = false;
            oldLength_ = newLength_ = 0;
            return false;
        }

        /**
         * Advances to the next edit.
         * @return true if there is another edit
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean next() {
            return next(onlyChanges_);
        }

        private boolean next(boolean onlyChanges) {
            updateIndexes();
            if (remaining > 0) {
                // Fine-grained iterator: Continue a sequence of equal-length changes.
                --remaining;
                return true;
            }
            if (index >= length) {
                return noNext();
            }
            int u = array[index++];
            if (u <= MAX_UNCHANGED) {
                // Combine adjacent unchanged ranges.
                changed = false;
                int span = u + 1;
                while (index < length && array[index] <= MAX_UNCHANGED) {
                    span += array[index++] + 1;
                }
                oldLength_ = newLength_ = span;
                if (!onlyChanges) {
                    return true;
                }
                // Skip the unchanged span and continue with the change after it, if any.
                updateIndexes();
                if (index >= length) {
                    return noNext();
                }
                u = array[index++];
            }
            changed = true;
            if (u <= MAX_SHORT_CHANGE) {
                if (!coarse) {
                    // Split a sequence of equal-length changes that was compressed into one unit.
                    oldLength_ = newLength_ = u >> 12;
                    remaining = u & 0xfff;
                    return true;
                }
                oldLength_ = newLength_ = ((u & 0xfff) + 1) * (u >> 12);
            } else {
                assert(u <= 0x7fff);
                oldLength_ = readLength((u >> 6) & 0x3f);
                newLength_ = readLength(u & 0x3f);
                if (!coarse) {
                    return true;
                }
            }
            // Combine adjacent changes.
            while (index < length && (u = array[index]) > MAX_UNCHANGED) {
                ++index;
                if (u <= MAX_SHORT_CHANGE) {
                    final int span = ((u & 0xfff) + 1) * (u >> 12);
                    oldLength_ += span;
                    newLength_ += span;
                } else {
                    assert(u <= 0x7fff);
                    final int oldLen = readLength((u >> 6) & 0x3f);
                    final int newLen = readLength(u & 0x3f);
                    oldLength_ += oldLen;
                    newLength_ += newLen;
                }
            }
            return true;
        }

        /**
         * Finds the edit that contains the source index.
         * The source index may be found in a non-change
         * even if normal iteration would skip non-changes.
         * Normal iteration can continue from a found edit.
         *
         * <p>The iterator state before this search logically does not matter.
         * (It may affect the performance of the search.)
         *
         * <p>The iterator state after this search is undefined
         * if the source index is out of bounds for the source string.
         *
         * @param i source index
         * @return true if the edit for the source index was found
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean findSourceIndex(int i) {
            if (i < 0) {
                return false;
            }
            if (i < srcIndex) {
                // Reset the iterator to the start.
                index = 0;
                remaining = 0;
                oldLength_ = 0;
                newLength_ = 0;
                srcIndex = 0;
                replIndex = 0;
                destIndex = 0;
            } else if (i < (srcIndex + oldLength_)) {
                // The index is in the current span.
                return true;
            }
            while (next(false)) {
                if (i < (srcIndex + oldLength_)) {
                    // The index is in the current span.
                    return true;
                }
                if (remaining <= 0) {
                    continue;
                }
                // Is the index in one of the remaining compressed edits?
                // srcIndex is the start of the current span, before the remaining ones.
                final int runLength = (remaining + 1) * oldLength_;
                if (i < (srcIndex + runLength)) {
                    final int skipped = (i - srcIndex) / oldLength_;  // 1 <= skipped <= remaining
                    final int offset = skipped * oldLength_;
                    srcIndex += offset;
                    replIndex += offset;
                    destIndex += offset;
                    remaining -= skipped;
                    return true;
                }
                // Make next() skip all of these edits at once.
                oldLength_ = newLength_ = runLength;
                remaining = 0;
            }
            return false;
        }

        /**
         * @return true if this edit replaces oldLength() units with newLength() different ones.
         *         false if oldLength units remain unchanged.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public boolean hasChange() { return changed; }

        /**
         * @return the number of units in the original string which are replaced or remain unchanged.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int oldLength() { return oldLength_; }

        /**
         * @return the number of units in the modified string, if hasChange() is true.
         *         Same as oldLength if hasChange() is false.
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int newLength() { return newLength_; }

        /**
         * @return the current index into the source string
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int sourceIndex() { return srcIndex; }

        /**
         * @return the current index into the replacement-characters-only string,
         *         not counting unchanged spans
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int replacementIndex() { return replIndex; }

        /**
         * @return the current index into the full destination string
         * @draft ICU 59
         * @provisional This API might change or be removed in a future release.
         */
        public int destinationIndex() { return destIndex; }
    }

    /**
     * Returns an Iterator for coarse-grained changes for simple string updates.
     * Skips non-changes.
     * @return an Iterator that merges adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getCoarseChangesIterator() {
        return new Iterator(array, length, true, true);
    }

    /**
     * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
     * @return an Iterator that merges adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getCoarseIterator() {
        return new Iterator(array, length, false, true);
    }

    /**
     * Returns an Iterator for fine-grained changes for modifying styled text.
     * Skips non-changes.
     * @return an Iterator that separates adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getFineChangesIterator() {
        return new Iterator(array, length, true, false);
    }

    /**
     * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
     * @return an Iterator that separates adjacent changes.
     * @draft ICU 59
     * @provisional This API might change or be removed in a future release.
     */
    public Iterator getFineIterator() {
        return new Iterator(array, length, false, false);
    }
}
|
@ -3866,7 +3866,6 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
int n = getRangeCount();
|
||||
int result;
|
||||
StringBuilder full = new StringBuilder();
|
||||
int locCache[] = new int[1];
|
||||
|
||||
for (int i=0; i<n; ++i) {
|
||||
int start = getRangeStart(i);
|
||||
@ -3881,13 +3880,13 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (int cp=start; cp<=end; ++cp) {
|
||||
result = csp.toFullLower(cp, null, full, root, locCache);
|
||||
result = csp.toFullLower(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullTitle(cp, null, full, root, locCache);
|
||||
result = csp.toFullTitle(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullUpper(cp, null, full, root, locCache);
|
||||
result = csp.toFullUpper(cp, null, full, UCaseProps.LOC_ROOT);
|
||||
addCaseMapping(foldSet, result, full);
|
||||
|
||||
result = csp.toFullFolding(cp, full, 0);
|
||||
@ -3906,6 +3905,7 @@ public class UnicodeSet extends UnicodeFilter implements Iterable<String>, Compa
|
||||
} else {
|
||||
BreakIterator bi = BreakIterator.getWordInstance(root);
|
||||
for (String str : strings) {
|
||||
// TODO: call lower-level functions
|
||||
foldSet.add(UCharacter.toLowerCase(root, str));
|
||||
foldSet.add(UCharacter.toTitleCase(root, str, bi));
|
||||
foldSet.add(UCharacter.toUpperCase(root, str));
|
||||
|
@ -44,7 +44,7 @@ class LowercaseTransliterator extends Transliterator{
|
||||
private final UCaseProps csp;
|
||||
private ReplaceableContextIterator iter;
|
||||
private StringBuilder result;
|
||||
private int[] locCache;
|
||||
private int caseLocale;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
@ -56,8 +56,7 @@ class LowercaseTransliterator extends Transliterator{
|
||||
csp=UCaseProps.INSTANCE;
|
||||
iter=new ReplaceableContextIterator();
|
||||
result = new StringBuilder();
|
||||
locCache = new int[1];
|
||||
locCache[0]=0;
|
||||
caseLocale = UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -85,7 +84,7 @@ class LowercaseTransliterator extends Transliterator{
|
||||
iter.setLimit(offsets.limit);
|
||||
iter.setContextLimits(offsets.contextStart, offsets.contextLimit);
|
||||
while((c=iter.nextCaseMapCP())>=0) {
|
||||
c=csp.toFullLower(c, iter, result, locale, locCache);
|
||||
c=csp.toFullLower(c, iter, result, caseLocale);
|
||||
|
||||
if(iter.didReachLimit() && isIncremental) {
|
||||
// the case mapping function tried to look beyond the context limit
|
||||
|
@ -42,7 +42,7 @@ class TitlecaseTransliterator extends Transliterator {
|
||||
private final UCaseProps csp;
|
||||
private ReplaceableContextIterator iter;
|
||||
private StringBuilder result;
|
||||
private int[] locCache;
|
||||
private int caseLocale;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
@ -55,8 +55,7 @@ class TitlecaseTransliterator extends Transliterator {
|
||||
csp=UCaseProps.INSTANCE;
|
||||
iter=new ReplaceableContextIterator();
|
||||
result = new StringBuilder();
|
||||
locCache = new int[1];
|
||||
locCache[0]=0;
|
||||
caseLocale = UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -119,9 +118,9 @@ class TitlecaseTransliterator extends Transliterator {
|
||||
type=csp.getTypeOrIgnorable(c);
|
||||
if(type>=0) { // not case-ignorable
|
||||
if(doTitle) {
|
||||
c=csp.toFullTitle(c, iter, result, locale, locCache);
|
||||
c=csp.toFullTitle(c, iter, result, caseLocale);
|
||||
} else {
|
||||
c=csp.toFullLower(c, iter, result, locale, locCache);
|
||||
c=csp.toFullLower(c, iter, result, caseLocale);
|
||||
}
|
||||
doTitle = type==0; // doTitle=isUncased
|
||||
|
||||
|
@ -41,7 +41,7 @@ class UppercaseTransliterator extends Transliterator {
|
||||
private final UCaseProps csp;
|
||||
private ReplaceableContextIterator iter;
|
||||
private StringBuilder result;
|
||||
private int[] locCache;
|
||||
private int caseLocale;
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
@ -52,8 +52,7 @@ class UppercaseTransliterator extends Transliterator {
|
||||
csp=UCaseProps.INSTANCE;
|
||||
iter=new ReplaceableContextIterator();
|
||||
result = new StringBuilder();
|
||||
locCache = new int[1];
|
||||
locCache[0]=0;
|
||||
caseLocale = UCaseProps.getCaseLocale(locale);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -81,7 +80,7 @@ class UppercaseTransliterator extends Transliterator {
|
||||
iter.setLimit(offsets.limit);
|
||||
iter.setContextLimits(offsets.contextStart, offsets.contextLimit);
|
||||
while((c=iter.nextCaseMapCP())>=0) {
|
||||
c=csp.toFullUpper(c, iter, result, locale, locCache);
|
||||
c=csp.toFullUpper(c, iter, result, caseLocale);
|
||||
|
||||
if(iter.didReachLimit() && isIncremental) {
|
||||
// the case mapping function tried to look beyond the context limit
|
||||
|
@ -24,6 +24,8 @@ import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.CaseMap;
|
||||
import com.ibm.icu.text.Edits;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
@ -708,6 +710,191 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
assertGreekUpper("ρωμέικα", "ΡΩΜΕΪΚΑ");
|
||||
}
|
||||
|
||||
private static final class EditChange {
|
||||
private boolean change;
|
||||
private int oldLength, newLength;
|
||||
EditChange(boolean change, int oldLength, int newLength) {
|
||||
this.change = change;
|
||||
this.oldLength = oldLength;
|
||||
this.newLength = newLength;
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkEditsIter(
|
||||
String name, Edits.Iterator ei1, Edits.Iterator ei2, // two equal iterators
|
||||
EditChange[] expected, boolean withUnchanged) {
|
||||
assertFalse(name, ei2.findSourceIndex(-1));
|
||||
|
||||
int expSrcIndex = 0;
|
||||
int expDestIndex = 0;
|
||||
int expReplIndex = 0;
|
||||
for (int expIndex = 0; expIndex < expected.length; ++expIndex) {
|
||||
EditChange expect = expected[expIndex];
|
||||
String msg = name + ' ' + expIndex;
|
||||
if (withUnchanged || expect.change) {
|
||||
assertTrue(msg, ei1.next());
|
||||
assertEquals(msg, expect.change, ei1.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei1.oldLength());
|
||||
assertEquals(msg, expect.newLength, ei1.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei1.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei1.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei1.replacementIndex());
|
||||
}
|
||||
|
||||
if (expect.oldLength > 0) {
|
||||
assertTrue(msg, ei2.findSourceIndex(expSrcIndex));
|
||||
assertEquals(msg, expect.change, ei2.hasChange());
|
||||
assertEquals(msg, expect.oldLength, ei2.oldLength());
|
||||
assertEquals(msg, expect.newLength, ei2.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei2.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei2.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei2.replacementIndex());
|
||||
if (!withUnchanged) {
|
||||
// For some iterators, move past the current range
|
||||
// so that findSourceIndex() has to look before the current index.
|
||||
ei2.next();
|
||||
ei2.next();
|
||||
}
|
||||
}
|
||||
|
||||
expSrcIndex += expect.oldLength;
|
||||
expDestIndex += expect.newLength;
|
||||
if (expect.change) {
|
||||
expReplIndex += expect.newLength;
|
||||
}
|
||||
}
|
||||
String msg = name + " end";
|
||||
assertFalse(msg, ei1.next());
|
||||
assertFalse(msg, ei1.hasChange());
|
||||
assertEquals(msg, 0, ei1.oldLength());
|
||||
assertEquals(msg, 0, ei1.newLength());
|
||||
assertEquals(msg, expSrcIndex, ei1.sourceIndex());
|
||||
assertEquals(msg, expDestIndex, ei1.destinationIndex());
|
||||
assertEquals(msg, expReplIndex, ei1.replacementIndex());
|
||||
|
||||
assertFalse(name, ei2.findSourceIndex(expSrcIndex));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestEdits() {
|
||||
Edits edits = new Edits();
|
||||
assertFalse("new Edits", edits.hasChanges());
|
||||
assertEquals("new Edits", 0, edits.lengthDelta());
|
||||
edits.addUnchanged(1); // multiple unchanged ranges are combined
|
||||
edits.addUnchanged(10000); // too long, and they are split
|
||||
edits.addReplace(0, 0);
|
||||
edits.addUnchanged(2);
|
||||
assertFalse("unchanged 10003", edits.hasChanges());
|
||||
assertEquals("unchanged 10003", 0, edits.lengthDelta());
|
||||
edits.addReplace(1, 1); // multiple short equal-length edits are compressed
|
||||
edits.addUnchanged(0);
|
||||
edits.addReplace(1, 1);
|
||||
edits.addReplace(1, 1);
|
||||
edits.addReplace(0, 10);
|
||||
edits.addReplace(100, 0);
|
||||
edits.addReplace(3000, 4000); // variable-length encoding
|
||||
edits.addReplace(100000, 100000);
|
||||
assertTrue("some edits", edits.hasChanges());
|
||||
assertEquals("some edits", 10 - 100 + 1000, edits.lengthDelta());
|
||||
|
||||
EditChange[] coarseExpectedChanges = new EditChange[] {
|
||||
new EditChange(false, 10003, 10003),
|
||||
new EditChange(true, 103103, 104013)
|
||||
};
|
||||
checkEditsIter("coarse",
|
||||
edits.getCoarseIterator(), edits.getCoarseIterator(),
|
||||
coarseExpectedChanges, true);
|
||||
checkEditsIter("coarse changes",
|
||||
edits.getCoarseChangesIterator(), edits.getCoarseChangesIterator(),
|
||||
coarseExpectedChanges, false);
|
||||
|
||||
EditChange[] fineExpectedChanges = new EditChange[] {
|
||||
new EditChange(false, 10003, 10003),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 0, 10),
|
||||
new EditChange(true, 100, 0),
|
||||
new EditChange(true, 3000, 4000),
|
||||
new EditChange(true, 100000, 100000)
|
||||
};
|
||||
checkEditsIter("fine",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
fineExpectedChanges, true);
|
||||
checkEditsIter("fine changes",
|
||||
edits.getFineChangesIterator(), edits.getFineChangesIterator(),
|
||||
fineExpectedChanges, false);
|
||||
|
||||
edits.reset();
|
||||
assertFalse("reset", edits.hasChanges());
|
||||
assertEquals("reset", 0, edits.lengthDelta());
|
||||
Edits.Iterator ei = edits.getCoarseChangesIterator();
|
||||
assertFalse("reset then iterator", ei.next());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestCaseMapWithEdits() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Edits edits = new Edits();
|
||||
|
||||
sb = CaseMap.toLower().omitUnchangedText().apply(TURKISH_LOCALE_, "IstanBul", sb, edits);
|
||||
assertEquals("toLower(Istanbul)", "ıb", sb.toString());
|
||||
EditChange[] lowerExpectedChanges = new EditChange[] {
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(false, 4, 4),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(false, 2, 2)
|
||||
};
|
||||
checkEditsIter("toLower(Istanbul)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
lowerExpectedChanges, true);
|
||||
|
||||
sb.delete(0, sb.length());
|
||||
edits.reset();
|
||||
sb = CaseMap.toUpper().omitUnchangedText().apply(GREEK_LOCALE_, "Πατάτα", sb, edits);
|
||||
assertEquals("toUpper(Πατάτα)", "ΑΤΑΤΑ", sb.toString());
|
||||
EditChange[] upperExpectedChanges = new EditChange[] {
|
||||
new EditChange(false, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 1)
|
||||
};
|
||||
checkEditsIter("toUpper(Πατάτα)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
upperExpectedChanges, true);
|
||||
|
||||
sb.delete(0, sb.length());
|
||||
edits.reset();
|
||||
sb = CaseMap.toTitle().omitUnchangedText().noBreakAdjustment().noLowercase().apply(
|
||||
new Locale("nl"), null, "IjssEL IglOo", sb, edits);
|
||||
assertEquals("toTitle(IjssEL IglOo)", "J", sb.toString());
|
||||
EditChange[] titleExpectedChanges = new EditChange[] {
|
||||
new EditChange(false, 1, 1),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(false, 10, 10)
|
||||
};
|
||||
checkEditsIter("toTitle(IjssEL IglOo)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
titleExpectedChanges, true);
|
||||
|
||||
sb.delete(0, sb.length());
|
||||
edits.reset();
|
||||
sb = CaseMap.fold().omitUnchangedText().turkic().apply("IßtanBul", sb, edits);
|
||||
assertEquals("fold(IßtanBul)", "ıssb", sb.toString());
|
||||
EditChange[] foldExpectedChanges = new EditChange[] {
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(true, 1, 2),
|
||||
new EditChange(false, 3, 3),
|
||||
new EditChange(true, 1, 1),
|
||||
new EditChange(false, 2, 2)
|
||||
};
|
||||
checkEditsIter("fold(IßtanBul)",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
foldExpectedChanges, true);
|
||||
}
|
||||
|
||||
// private data members - test data --------------------------------------
|
||||
|
||||
private static final Locale TURKISH_LOCALE_ = new Locale("tr", "TR");
|
||||
@ -945,7 +1132,7 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
// private methods -------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Converting the hex numbers represented betwee n ';' to Unicode strings
|
||||
* Converting the hex numbers represented between ';' to Unicode strings
|
||||
* @param str string to break up into Unicode strings
|
||||
* @return array of Unicode strings ending with a null
|
||||
*/
|
||||
|
Loading…
Reference in New Issue
Block a user