ICU-7736 add Normalizer2::getDecomposition(c)
X-SVN-Rev: 28161
This commit is contained in:
parent
200c0ff239
commit
82160e104c
@ -148,6 +148,11 @@ FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
|
||||
return first;
|
||||
}
|
||||
|
||||
/*
 * Gets the decomposition mapping of c from the wrapped normalizer, restricted
 * by the filter set.
 *
 * The filter set names the code points that this normalizer applies to, so
 * only a code point contained in the set is passed on to norm2. For any other
 * code point this returns FALSE without calling norm2, and decomposition is
 * left untouched.
 *
 * (The previous condition was inverted: it rejected code points inside the
 * set and delegated for those outside of it.)
 *
 * @param c code point
 * @param decomposition set to c's decomposition mapping, if there is one
 * @return TRUE if c is in the filter set and norm2 reports a decomposition
 */
UBool
FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    return set.contains(c) && norm2.getDecomposition(c, decomposition);
}
|
||||
|
||||
UBool
|
||||
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
uprv_checkCanGetBuffer(s, errorCode);
|
||||
|
@ -75,6 +75,10 @@ class NoopNormalizer2 : public Normalizer2 {
|
||||
return first;
|
||||
}
|
||||
// Always reports "no decomposition"; neither argument is read or modified.
virtual UBool
getDecomposition(UChar32 /*c*/, UnicodeString & /*decomposition*/) const { return FALSE; }
|
||||
// Always reports the string as normalized; neither argument is inspected.
virtual UBool
isNormalized(const UnicodeString & /*s*/, UErrorCode & /*errorCode*/) const { return TRUE; }
|
||||
@ -160,6 +164,21 @@ public:
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
// Looks up the decomposition mapping of c through impl and stores it in
// decomposition. Returns FALSE, leaving decomposition untouched, when impl
// returns NULL; returns TRUE otherwise.
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const {
    UChar scratch[4];
    int32_t mappingLength;
    const UChar *mapping=impl.getDecomposition(c, scratch, mappingLength);
    if(mapping!=NULL) {
        if(mapping!=scratch) {
            // The mapping was not written into the local buffer:
            // wrap it as a read-only alias instead of copying it.
            decomposition.setTo(FALSE, mapping, mappingLength);
        } else {
            // The mapping was written into the local buffer
            // (Jamos from Hangul syllable c), so the string must copy it.
            decomposition.setTo(scratch, mappingLength);
        }
        return TRUE;
    }
    return FALSE;
}
|
||||
|
||||
// quick checks
|
||||
virtual UBool
|
||||
@ -630,8 +649,8 @@ unorm2_normalize(const UNormalizer2 *norm2,
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if( (src==NULL && length!=0) || length<-1 ||
|
||||
capacity<0 || (dest==NULL && capacity>0) ||
|
||||
if( (src==NULL ? length!=0 : length<-1) ||
|
||||
(dest==NULL ? capacity!=0 : capacity<0) ||
|
||||
(src==dest && src!=NULL)
|
||||
) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
@ -665,8 +684,9 @@ normalizeSecondAndAppend(const UNormalizer2 *norm2,
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if( (second==NULL && secondLength!=0) || secondLength<-1 ||
|
||||
firstCapacity<0 || (first==NULL && firstCapacity>0) || firstLength<-1 ||
|
||||
if( (second==NULL ? secondLength!=0 : secondLength<-1) ||
|
||||
(first==NULL ? (firstCapacity!=0 || firstLength!=0) :
|
||||
(firstCapacity<0 || firstLength<-1)) ||
|
||||
(first==second && first!=NULL)
|
||||
) {
|
||||
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
@ -718,6 +738,25 @@ unorm2_append(const UNormalizer2 *norm2,
|
||||
FALSE, pErrorCode);
|
||||
}
|
||||
|
||||
/*
 * C API wrapper around Normalizer2::getDecomposition().
 *
 * Returns 0 immediately if *pErrorCode already indicates a failure.
 * Sets U_ILLEGAL_ARGUMENT_ERROR and returns 0 if decomposition is NULL with a
 * non-zero capacity, or if decomposition is non-NULL with a negative capacity.
 * Returns -1 if c has no decomposition. Otherwise returns whatever
 * UnicodeString::extract() returns for the caller's buffer, with any
 * extraction error reported through *pErrorCode.
 */
U_DRAFT int32_t U_EXPORT2
unorm2_getDecomposition(const UNormalizer2 *norm2,
                        UChar32 c, UChar *decomposition, int32_t capacity,
                        UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    // A NULL buffer is only acceptable together with capacity 0;
    // a real buffer needs a non-negative capacity.
    UBool badArguments= decomposition==NULL ? capacity!=0 : capacity<0;
    if(badArguments) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    const Normalizer2 *n2=reinterpret_cast<const Normalizer2 *>(norm2);
    // Start with an empty string constructed over the caller's buffer.
    UnicodeString result(decomposition, 0, capacity);
    if(!n2->getDecomposition(c, result)) {
        return -1;
    }
    return result.extract(decomposition, capacity, *pErrorCode);
}
|
||||
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
unorm2_isNormalized(const UNormalizer2 *norm2,
|
||||
const UChar *s, int32_t length,
|
||||
|
@ -174,6 +174,19 @@ public:
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const = 0;
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
|
||||
* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
|
||||
* This function is independent of the mode of the Normalizer2.
|
||||
* @param c code point
|
||||
* @param decomposition String object which will be set to c's
|
||||
* decomposition mapping, if there is one.
|
||||
* @return TRUE if c has a decomposition, otherwise FALSE
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* Internally, in cases where the quickCheck() method would return "maybe"
|
||||
@ -365,6 +378,19 @@ public:
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Equivalent to normalize(UnicodeString(c))
|
||||
* on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster.
|
||||
* This function is independent of the mode of the Normalizer2.
|
||||
* @param c code point
|
||||
* @param decomposition String object which will be set to c's
|
||||
* decomposition mapping, if there is one.
|
||||
* @return TRUE if c has a decomposition, otherwise FALSE
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
@ -259,6 +259,27 @@ unorm2_append(const UNormalizer2 *norm2,
|
||||
const UChar *second, int32_t secondLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c. Equivalent to unorm2_normalize(string(c))
|
||||
* on a UNORM2_DECOMPOSE UNormalizer2 instance, but much faster.
|
||||
* This function is independent of the mode of the UNormalizer2.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c code point
|
||||
* @param decomposition String buffer which will be set to c's
|
||||
* decomposition mapping, if there is one.
|
||||
* @param capacity number of UChars that can be written to decomposition
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the non-negative length of c's decomposition, if there is one; otherwise a negative value
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_getDecomposition(const UNormalizer2 *norm2,
|
||||
UChar32 c, UChar *decomposition, int32_t capacity,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* Internally, in cases where the quickCheck() method would return "maybe"
|
||||
|
@ -1401,7 +1401,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
UnicodeSet foldSet(*this);
|
||||
UnicodeString str;
|
||||
USetAdder sa = {
|
||||
(USet *)&foldSet,
|
||||
foldSet.toUSet(),
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString,
|
||||
|
@ -31,6 +31,7 @@
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/unorm2.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "unormimp.h"
|
||||
#include "uprops.h"
|
||||
@ -58,7 +59,9 @@ unorm_quickCheckWithOptions(const UChar *src, int32_t srcLength,
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
|
||||
return unorm2_quickCheck((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
|
||||
return unorm2_quickCheck(
|
||||
reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
|
||||
src, srcLength, pErrorCode);
|
||||
} else {
|
||||
return unorm2_quickCheck((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
|
||||
}
|
||||
@ -79,7 +82,9 @@ unorm_isNormalizedWithOptions(const UChar *src, int32_t srcLength,
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
|
||||
return unorm2_isNormalized((const UNormalizer2 *)&fn2, src, srcLength, pErrorCode);
|
||||
return unorm2_isNormalized(
|
||||
reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
|
||||
src, srcLength, pErrorCode);
|
||||
} else {
|
||||
return unorm2_isNormalized((const UNormalizer2 *)n2, src, srcLength, pErrorCode);
|
||||
}
|
||||
@ -96,7 +101,8 @@ unorm_normalize(const UChar *src, int32_t srcLength,
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, *pErrorCode);
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2 fn2(*n2, *uniset_getUnicode32Instance(*pErrorCode));
|
||||
return unorm2_normalize((const UNormalizer2 *)&fn2,
|
||||
return unorm2_normalize(
|
||||
reinterpret_cast<const UNormalizer2 *>(static_cast<Normalizer2 *>(&fn2)),
|
||||
src, srcLength, dest, destCapacity, pErrorCode);
|
||||
} else {
|
||||
return unorm2_normalize((const UNormalizer2 *)n2,
|
||||
|
@ -269,29 +269,22 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
||||
}
|
||||
} else if(column==UPROPS_SRC_CASE_AND_NORM) {
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
UChar nfdBuffer[4];
|
||||
const UChar *nfd;
|
||||
int32_t nfdLength;
|
||||
UnicodeString nfd;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
switch(which) {
|
||||
case UCHAR_CHANGES_WHEN_CASEFOLDED:
|
||||
nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength);
|
||||
if(nfd!=NULL) {
|
||||
if(nfcNorm2->getDecomposition(c, nfd)) {
|
||||
/* c has a decomposition */
|
||||
if(nfdLength==1) {
|
||||
if(nfd.length()==1) {
|
||||
c=nfd[0]; /* single BMP code point */
|
||||
} else if(nfdLength<=U16_MAX_LENGTH) {
|
||||
int32_t i=0;
|
||||
U16_NEXT(nfd, i, nfdLength, c);
|
||||
if(i==nfdLength) {
|
||||
/* single supplementary code point */
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else if(nfd.length()<=U16_MAX_LENGTH &&
|
||||
nfd.length()==U16_LENGTH(c=nfd.char32At(0))
|
||||
) {
|
||||
/* single supplementary code point */
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
@ -308,8 +301,12 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
||||
/* guess some large but stack-friendly capacity */
|
||||
UChar dest[2*UCASE_MAX_STRING_LENGTH];
|
||||
int32_t destLength;
|
||||
destLength=u_strFoldCase(dest, LENGTHOF(dest), nfd, nfdLength, U_FOLD_CASE_DEFAULT, &errorCode);
|
||||
return (UBool)(U_SUCCESS(errorCode) && 0!=u_strCompare(nfd, nfdLength, dest, destLength, FALSE));
|
||||
destLength=u_strFoldCase(dest, LENGTHOF(dest),
|
||||
nfd.getBuffer(), nfd.length(),
|
||||
U_FOLD_CASE_DEFAULT, &errorCode);
|
||||
return (UBool)(U_SUCCESS(errorCode) &&
|
||||
0!=u_strCompare(nfd.getBuffer(), nfd.length(),
|
||||
dest, destLength, FALSE));
|
||||
}
|
||||
default:
|
||||
break;
|
||||
|
@ -61,6 +61,9 @@ TestComposition(void);
|
||||
static void
|
||||
TestFCD(void);
|
||||
|
||||
static void
|
||||
TestGetDecomposition(void);
|
||||
|
||||
static const char* const canonTests[][3] = {
|
||||
/* Input*/ /*Decomposed*/ /*Composed*/
|
||||
{ "cat", "cat", "cat" },
|
||||
@ -147,6 +150,7 @@ void addNormTest(TestNode** root)
|
||||
addTest(root, &TestNextPrevious, "tsnorm/cnormtst/TestNextPrevious");
|
||||
addTest(root, &TestFCNFKCClosure, "tsnorm/cnormtst/TestFCNFKCClosure");
|
||||
addTest(root, &TestComposition, "tsnorm/cnormtst/TestComposition");
|
||||
/* Register the getDecomposition test itself (this path was previously bound to TestComposition). */
addTest(root, &TestGetDecomposition, "tsnorm/cnormtst/TestGetDecomposition");
|
||||
}
|
||||
|
||||
static const char* const modeStrings[]={
|
||||
@ -1464,4 +1468,47 @@ TestComposition(void) {
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
TestGetDecomposition() {
|
||||
UChar decomp[32];
|
||||
int32_t length;
|
||||
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const UNormalizer2 *n2=unorm2_getInstance(NULL, "nfc", UNORM2_COMPOSE_CONTIGUOUS, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err_status(errorCode, "unorm2_getInstance(nfc/FCC) failed: %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
length=unorm2_getDecomposition(n2, 0x20, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length>=0) {
|
||||
log_err("unorm2_getDecomposition(space) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xe4, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x61 || decomp[1]!=0x308 || decomp[2]!=0) {
|
||||
log_err("unorm2_getDecomposition(a-umlaut) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, decomp, LENGTHOF(decomp), &errorCode);
|
||||
if(U_FAILURE(errorCode) || length!=3 || decomp[0]!=0x1100 || decomp[1]!=0x1161 || decomp[2]!=0x11a8 || decomp[3]!=0) {
|
||||
log_err("unorm2_getDecomposition(Hangul syllable U+AC01) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, NULL, 0, &errorCode);
|
||||
if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3) {
|
||||
log_err("unorm2_getDecomposition(Hangul syllable U+AC01) overflow failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, decomp, -1, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(capacity<0) failed\n");
|
||||
}
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=unorm2_getDecomposition(n2, 0xac01, NULL, 4, &errorCode);
|
||||
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
|
||||
log_err("unorm2_getDecomposition(decomposition=NULL) failed\n");
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
@ -1106,7 +1106,6 @@ BasicNormalizerTest::TestCompare() {
|
||||
UnicodeSet iSet, set;
|
||||
|
||||
UnicodeString s1, s2;
|
||||
UChar32 start, end;
|
||||
|
||||
const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
if(U_FAILURE(errorCode) || !nfcImpl->ensureCanonIterData(errorCode)) {
|
||||
@ -1123,46 +1122,48 @@ BasicNormalizerTest::TestCompare() {
|
||||
}
|
||||
|
||||
// test all of these precomposed characters
|
||||
const Normalizer2 *nfcNorm2=Normalizer2Factory::getNFCInstance(errorCode);
|
||||
UnicodeSetIterator it(set);
|
||||
while(it.nextRange() && !it.isString()) {
|
||||
start=it.getCodepoint();
|
||||
end=it.getCodepointEnd();
|
||||
while(start<=end) {
|
||||
s1.setTo(start);
|
||||
while(it.next() && !it.isString()) {
|
||||
UChar32 c=it.getCodepoint();
|
||||
if(!nfcNorm2->getDecomposition(c, s2)) {
|
||||
dataerrln("NFC.getDecomposition(i-composite U+%04lx) failed", (long)c);
|
||||
return;
|
||||
}
|
||||
|
||||
s1.setTo(c);
|
||||
for(k=0; k<LENGTHOF(opt); ++k) {
|
||||
// test Normalizer::compare
|
||||
errorCode=U_ZERO_ERROR;
|
||||
Normalizer::decompose(s1, FALSE, 0, s2, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dataerrln("Normalizer::decompose(U+%04x) failed: %s", start, u_errorName(errorCode));
|
||||
return;
|
||||
result=_norm_compare(s1, s2, opt[k].options, errorCode);
|
||||
refResult=ref_norm_compare(s1, s2, opt[k].options, errorCode);
|
||||
if(_sign(result)!=_sign(refResult)) {
|
||||
errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s",
|
||||
c, opt[k].name, _signString(result), _signString(refResult),
|
||||
U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
|
||||
}
|
||||
|
||||
for(k=0; k<LENGTHOF(opt); ++k) {
|
||||
// test Normalizer::compare
|
||||
// test UnicodeString::caseCompare - same internal implementation function
|
||||
if(opt[k].options&U_COMPARE_IGNORE_CASE) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
result=_norm_compare(s1, s2, opt[k].options, errorCode);
|
||||
refResult=ref_norm_compare(s1, s2, opt[k].options, errorCode);
|
||||
result=s1.caseCompare(s2, opt[k].options);
|
||||
refResult=ref_case_compare(s1, s2, opt[k].options);
|
||||
if(_sign(result)!=_sign(refResult)) {
|
||||
errln("Normalizer::compare(U+%04x with its NFD, %s)%s should be %s %s",
|
||||
start, opt[k].name, _signString(result), _signString(refResult),
|
||||
errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s",
|
||||
c, opt[k].name, _signString(result), _signString(refResult),
|
||||
U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
|
||||
}
|
||||
|
||||
// test UnicodeString::caseCompare - same internal implementation function
|
||||
if(opt[k].options&U_COMPARE_IGNORE_CASE) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
result=s1.caseCompare(s2, opt[k].options);
|
||||
refResult=ref_case_compare(s1, s2, opt[k].options);
|
||||
if(_sign(result)!=_sign(refResult)) {
|
||||
errln("UniStr::caseCompare(U+%04x with its NFD, %s)%s should be %s %s",
|
||||
start, opt[k].name, _signString(result), _signString(refResult),
|
||||
U_SUCCESS(errorCode) ? "" : u_errorName(errorCode));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
// test getDecomposition() for some characters that do not decompose
|
||||
if( nfcNorm2->getDecomposition(0x20, s2) ||
|
||||
nfcNorm2->getDecomposition(0x4e00, s2) ||
|
||||
nfcNorm2->getDecomposition(0x20002, s2)
|
||||
) {
|
||||
errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
|
||||
}
|
||||
}
|
||||
|
||||
// verify that case-folding does not un-FCD strings
|
||||
|
Loading…
Reference in New Issue
Block a user