ICU-3432 move uniset.cpp data for case closure to ucase.icu; have gencase build case closure data; ucase.c use it; UnicodeSet::closeOver() call that
X-SVN-Rev: 16902
This commit is contained in:
parent
67f46c57e8
commit
ca77616509
@ -33,6 +33,7 @@ struct UCaseProps {
|
||||
UDataMemory *mem;
|
||||
const int32_t *indexes;
|
||||
const uint16_t *exceptions;
|
||||
const UChar *unfold;
|
||||
|
||||
UTrie trie;
|
||||
uint8_t formatVersion[4];
|
||||
@ -68,38 +69,50 @@ static UCaseProps *
|
||||
ucase_openData(UCaseProps *cspProto,
|
||||
const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
|
||||
UCaseProps *csp;
|
||||
int32_t size, trieSize;
|
||||
int32_t size;
|
||||
|
||||
cspProto->indexes=(const int32_t *)bin;
|
||||
if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 ||
|
||||
(length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH])
|
||||
if( (length>=0 && length<16*4) ||
|
||||
cspProto->indexes[UCASE_IX_INDEX_TOP]<16
|
||||
) {
|
||||
/* length or indexes[] too short for minimum indexes[] length of 16 */
|
||||
*pErrorCode=U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* get the trie address, after indexes[] */
|
||||
size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
|
||||
bin+=size;
|
||||
if(length>=0 && (length-=size)<16) {
|
||||
*pErrorCode=U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
if(length>=0) {
|
||||
if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
|
||||
length-=size;
|
||||
} else {
|
||||
/* length too short for indexes[] or for the whole data length */
|
||||
*pErrorCode=U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
bin+=size;
|
||||
/* from here on, assume that the sizes of the items fit into the total length */
|
||||
|
||||
/* unserialize the trie */
|
||||
trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE];
|
||||
trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode);
|
||||
/* unserialize the trie, after indexes[] */
|
||||
size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
|
||||
utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
bin+=size;
|
||||
|
||||
/* get exceptions[] */
|
||||
bin+=trieSize;
|
||||
if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) {
|
||||
*pErrorCode=U_INVALID_FORMAT_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
|
||||
cspProto->exceptions=(const uint16_t *)bin;
|
||||
bin+=size;
|
||||
|
||||
/* get unfold[] */
|
||||
size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
|
||||
if(size!=0) {
|
||||
cspProto->unfold=(const UChar *)bin;
|
||||
bin+=size;
|
||||
} else {
|
||||
cspProto->unfold=NULL;
|
||||
}
|
||||
|
||||
/* allocate, copy, and return the new UCaseProps */
|
||||
csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
|
||||
@ -322,8 +335,8 @@ ucase_swap(const UDataSwapper *ds,
|
||||
utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
|
||||
offset+=count;
|
||||
|
||||
/* swap the uint16_t exceptions[] */
|
||||
count=indexes[UCASE_IX_EXC_LENGTH]*2;
|
||||
/* swap the uint16_t exceptions[] and unfold[] */
|
||||
count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2;
|
||||
ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
|
||||
offset+=count;
|
||||
|
||||
@ -338,13 +351,13 @@ ucase_swap(const UDataSwapper *ds,
|
||||
static UBool U_CALLCONV
|
||||
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
|
||||
/* add the start code point to the USet */
|
||||
USetAdder *sa=(USetAdder *)context;
|
||||
const USetAdder *sa=(const USetAdder *)context;
|
||||
sa->add(sa->set, start);
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
@ -368,8 +381,6 @@ ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pError
|
||||
#define GET_PROPS(csp, c, result) \
|
||||
UTRIE_GET16(&(csp)->trie, c, result);
|
||||
|
||||
#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
|
||||
#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
|
||||
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
|
||||
|
||||
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
|
||||
@ -423,8 +434,8 @@ ucase_tolower(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
@ -441,8 +452,8 @@ ucase_toupper(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
|
||||
c+=GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
@ -459,8 +470,8 @@ ucase_totitle(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
|
||||
c+=GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
@ -478,12 +489,231 @@ ucase_totitle(const UCaseProps *csp, UChar32 c) {
|
||||
return c;
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
|
||||
uint16_t props;
|
||||
|
||||
/*
|
||||
* Hardcode the case closure of i and its relatives and ignore the
|
||||
* data file data for these characters.
|
||||
* The Turkic dotless i and dotted I with their case mapping conditions
|
||||
* and case folding option make the related characters behave specially.
|
||||
* This code matches their closure behavior to their case folding behavior.
|
||||
*/
|
||||
static const UChar
|
||||
iDot[2]= { 0x69, 0x307 };
|
||||
|
||||
switch(c) {
|
||||
case 0x49:
|
||||
/* regular i and I are in one equivalence class */
|
||||
sa->add(sa->set, 0x69);
|
||||
return;
|
||||
case 0x69:
|
||||
sa->add(sa->set, 0x49);
|
||||
return;
|
||||
case 0x130:
|
||||
/* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
|
||||
sa->addString(sa->set, iDot, 2);
|
||||
return;
|
||||
case 0x131:
|
||||
/* dotless i is in a class by itself */
|
||||
return;
|
||||
default:
|
||||
/* otherwise use the data file data */
|
||||
break;
|
||||
}
|
||||
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
/* add the one simple case mapping, no matter what type it is */
|
||||
int32_t delta=UCASE_GET_DELTA(props);
|
||||
if(delta!=0) {
|
||||
sa->add(sa->set, c+delta);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* c has exceptions, so there may be multiple simple and/or
|
||||
* full case mappings. Add them all.
|
||||
*/
|
||||
const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
|
||||
const UChar *closure;
|
||||
uint16_t excWord=*pe++;
|
||||
int32_t index, closureLength, fullLength, length;
|
||||
|
||||
pe0=pe;
|
||||
|
||||
/* add all simple case mappings */
|
||||
for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
|
||||
if(HAS_SLOT(excWord, index)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, index, pe, c);
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
}
|
||||
|
||||
/* get the closure string pointer & length */
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
|
||||
closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
|
||||
closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
|
||||
} else {
|
||||
closureLength=0;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* add all full case mappings */
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
|
||||
++pe;
|
||||
fullLength&=0xffff; /* bits 16 and higher are reserved */
|
||||
while(fullLength!=0) {
|
||||
length=fullLength&0xf;
|
||||
if(length!=0) {
|
||||
sa->addString(sa->set, (const UChar *)pe, length);
|
||||
pe+=length;
|
||||
}
|
||||
fullLength>>=4;
|
||||
}
|
||||
closure=(const UChar *)pe; /* behind full case mappings */
|
||||
}
|
||||
#endif
|
||||
|
||||
/* add the full case folding */
|
||||
if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
|
||||
pe=pe0;
|
||||
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
|
||||
|
||||
/* start of full case mapping strings */
|
||||
++pe;
|
||||
|
||||
fullLength&=0xffff; /* bits 16 and higher are reserved */
|
||||
|
||||
/* skip the lowercase result string */
|
||||
pe+=fullLength&UCASE_FULL_LOWER;
|
||||
fullLength>>=4;
|
||||
|
||||
/* add the full case folding string */
|
||||
length=fullLength&0xf;
|
||||
if(length!=0) {
|
||||
sa->addString(sa->set, (const UChar *)pe, length);
|
||||
pe+=length;
|
||||
}
|
||||
|
||||
/* skip the uppercase and titlecase strings */
|
||||
fullLength>>=4;
|
||||
pe+=fullLength&0xf;
|
||||
fullLength>>=4;
|
||||
pe+=fullLength;
|
||||
|
||||
closure=(const UChar *)pe; /* behind full case mappings */
|
||||
}
|
||||
|
||||
/* add each code point in the closure string */
|
||||
for(index=0; index<closureLength;) {
|
||||
U16_NEXT_UNSAFE(closure, index, c);
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* compare s, which has a length, with t, which has a maximum length or is NUL-terminated
|
||||
* Preconditions: length>0, max>0, and length<=max.
|
||||
*/
|
||||
static U_INLINE int32_t
|
||||
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
|
||||
int32_t c1, c2;
|
||||
|
||||
max-=length; /* we require length<=max, so no need to decrement max in the loop */
|
||||
do {
|
||||
c1=*s++;
|
||||
c2=*t++;
|
||||
if(c2==0) {
|
||||
return 1; /* reached the end of t but not of s */
|
||||
}
|
||||
c1-=c2;
|
||||
if(c1!=0) {
|
||||
return c1; /* return difference result */
|
||||
}
|
||||
} while(--length>0);
|
||||
/* ends with length==0 */
|
||||
|
||||
if(max==0 || *t==0) {
|
||||
return 0; /* equal to length of both strings */
|
||||
} else {
|
||||
return -max; /* return length difference */
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
|
||||
const UChar *unfold, *p;
|
||||
int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth, unfoldCPWidth;
|
||||
|
||||
if(csp->unfold==NULL || s==NULL) {
|
||||
return FALSE; /* no reverse case folding data, or no string */
|
||||
}
|
||||
if(length<=1) {
|
||||
/* the string is too short to find any match */
|
||||
/*
|
||||
* more precise would be:
|
||||
* if(!u_strHasMoreChar32Than(s, length, 1))
|
||||
* but this does not make much practical difference because
|
||||
* a single supplementary code point would just not be found
|
||||
*/
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
unfold=csp->unfold;
|
||||
unfoldRows=unfold[UCASE_UNFOLD_ROWS];
|
||||
unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
|
||||
unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
|
||||
unfoldCPWidth=unfoldRowWidth-unfoldStringWidth;
|
||||
unfold+=unfoldRowWidth;
|
||||
|
||||
if(length>unfoldStringWidth) {
|
||||
/* the string is too long to find any match */
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* do a binary search for the string */
|
||||
start=0;
|
||||
limit=unfoldRows;
|
||||
while(start<limit) {
|
||||
i=(start+limit)/2;
|
||||
p=unfold+(i*unfoldRowWidth);
|
||||
result=strcmpMax(s, length, p, unfoldStringWidth);
|
||||
|
||||
if(result==0) {
|
||||
/* found the string: add each code point, and its case closure */
|
||||
UChar32 c;
|
||||
|
||||
for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
|
||||
U16_NEXT_UNSAFE(p, i, c);
|
||||
sa->add(sa->set, c);
|
||||
ucase_addCaseClosure(csp, c, sa);
|
||||
}
|
||||
return TRUE;
|
||||
} else if(result<0) {
|
||||
limit=i;
|
||||
} else /* result>0 */ {
|
||||
start=i+1;
|
||||
}
|
||||
}
|
||||
|
||||
return FALSE; /* string not found */
|
||||
}
|
||||
|
||||
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getType(const UCaseProps *csp, UChar32 c) {
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
return GET_CASE_TYPE(props);
|
||||
return UCASE_GET_TYPE(props);
|
||||
}
|
||||
|
||||
/** @return same as ucase_getType(), or <0 if c is case-ignorable */
|
||||
@ -492,7 +722,7 @@ ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
|
||||
int32_t type;
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
type=GET_CASE_TYPE(props);
|
||||
type=UCASE_GET_TYPE(props);
|
||||
if(type!=UCASE_NONE) {
|
||||
return type;
|
||||
} else if(
|
||||
@ -775,7 +1005,7 @@ isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void
|
||||
|
||||
for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
|
||||
GET_PROPS(csp, c, props);
|
||||
if(GET_CASE_TYPE(props)!=UCASE_NONE) {
|
||||
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
|
||||
return TRUE; /* followed by cased letter */
|
||||
} else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
|
||||
/* case-ignorable, continue with the loop */
|
||||
@ -934,8 +1164,8 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
|
||||
result=c;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
@ -1081,8 +1311,8 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
|
||||
result=c;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
|
||||
result=c+GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
@ -1236,8 +1466,8 @@ ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) {
|
||||
uint16_t props;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
c+=UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
|
||||
@ -1305,8 +1535,8 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
|
||||
result=c;
|
||||
GET_PROPS(csp, c, props);
|
||||
if(!PROPS_HAS_EXCEPTION(props)) {
|
||||
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+GET_SIGNED_DELTA(props);
|
||||
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
|
||||
result=c+UCASE_GET_DELTA(props);
|
||||
}
|
||||
} else {
|
||||
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
|
||||
|
@ -51,7 +51,7 @@ ucase_swap(const UDataSwapper *ds,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode);
|
||||
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Bit mask for getting just the options from a string compare options word
|
||||
@ -83,6 +83,33 @@ ucase_totitle(const UCaseProps *csp, UChar32 c);
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options);
|
||||
|
||||
/**
|
||||
* Adds all simple case mappings and the full case folding for c to sa,
|
||||
* and also adds special case closure mappings.
|
||||
* c itself is not added.
|
||||
* For example, the mappings
|
||||
* - for s include long s
|
||||
* - for sharp s include ss
|
||||
* - for k include the Kelvin sign
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
|
||||
|
||||
/**
|
||||
* Maps the string to single code points and adds the associated case closure
|
||||
* mappings.
|
||||
* The string is mapped to code points if it is their full case folding string.
|
||||
* In other words, this performs a reverse full case folding and then
|
||||
* adds the case closure items of the resulting code points.
|
||||
* If the string is found and its closure applied, then
|
||||
* the string itself is added as well as part of its code points' closure.
|
||||
* Precondition: length>=0.
|
||||
*
|
||||
* @return TRUE if the string was found
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
|
||||
|
||||
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucase_getType(const UCaseProps *csp, UChar32 c);
|
||||
@ -211,6 +238,7 @@ enum {
|
||||
UCASE_IX_LENGTH,
|
||||
UCASE_IX_TRIE_SIZE,
|
||||
UCASE_IX_EXC_LENGTH,
|
||||
UCASE_IX_UNFOLD_LENGTH,
|
||||
|
||||
UCASE_IX_MAX_FULL_LENGTH=15,
|
||||
UCASE_IX_TOP=16
|
||||
@ -227,6 +255,8 @@ enum {
|
||||
UCASE_TITLE
|
||||
};
|
||||
|
||||
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
|
||||
|
||||
#define UCASE_SENSITIVE 4
|
||||
#define UCASE_EXCEPTION 8
|
||||
|
||||
@ -264,7 +294,7 @@ enum {
|
||||
UCASE_EXC_TITLE,
|
||||
UCASE_EXC_4, /* reserved */
|
||||
UCASE_EXC_5, /* reserved */
|
||||
UCASE_EXC_6, /* reserved */
|
||||
UCASE_EXC_CLOSURE,
|
||||
UCASE_EXC_FULL_MAPPINGS,
|
||||
UCASE_EXC_ALL_SLOTS /* one past the last slot */
|
||||
};
|
||||
@ -296,6 +326,17 @@ enum {
|
||||
#define UCASE_FULL_UPPER 0xf00
|
||||
#define UCASE_FULL_TITLE 0xf000
|
||||
|
||||
/* maximum lengths */
|
||||
#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
|
||||
#define UCASE_CLOSURE_MAX_LENGTH 0xf
|
||||
|
||||
/* constants for reverse case folding ("unfold") data */
|
||||
enum {
|
||||
UCASE_UNFOLD_ROWS,
|
||||
UCASE_UNFOLD_ROW_WIDTH,
|
||||
UCASE_UNFOLD_STRING_WIDTH
|
||||
};
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
||||
|
@ -997,7 +997,7 @@ ublock_getCode(UChar32 c) {
|
||||
|
||||
/* for Hangul_Syllable_Type */
|
||||
U_CAPI void U_EXPORT2
|
||||
uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
UChar32 c;
|
||||
int32_t value, value2;
|
||||
|
||||
@ -1061,7 +1061,7 @@ uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
static UBool U_CALLCONV
|
||||
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
|
||||
/* add the start code point to the USet */
|
||||
USetAdder *sa=(USetAdder *)context;
|
||||
const USetAdder *sa=(const USetAdder *)context;
|
||||
sa->add(sa->set, start);
|
||||
return TRUE;
|
||||
}
|
||||
@ -1069,7 +1069,7 @@ _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint
|
||||
#define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1)
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -2979,7 +2979,7 @@ _ISO_2022_SafeClone(
|
||||
|
||||
static void
|
||||
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode)
|
||||
{
|
||||
|
@ -28,7 +28,7 @@
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
sa->addRange(sa->set, 0, 0x10ffff);
|
||||
@ -36,7 +36,7 @@ ucnv_getCompleteUnicodeSet(const UConverter *cnv,
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
sa->addRange(sa->set, 0, 0xd7ff);
|
||||
|
@ -171,7 +171,7 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
|
||||
* For more documentation, see ucnv_getUnicodeSet() in ucnv.h.
|
||||
*/
|
||||
typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
@ -246,13 +246,13 @@ U_CDECL_END
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
@ -932,7 +932,7 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
static void
|
||||
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
const int32_t *cx,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
int32_t minLength,
|
||||
UChar32 c,
|
||||
@ -989,7 +989,7 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
const int32_t *cx;
|
||||
|
@ -384,7 +384,7 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
@ -664,7 +664,7 @@ _LMBCSSafeClone(const UConverter *cnv,
|
||||
|
||||
static void
|
||||
_LMBCSGetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
/* all but U+F6xx, see LMBCS explanation above (search for F6xx) */
|
||||
|
@ -510,7 +510,7 @@ _HZ_SafeClone(const UConverter *cnv,
|
||||
|
||||
static void
|
||||
_HZ_GetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
/* the tilde '~' is hardcoded in the converter */
|
||||
|
@ -1332,7 +1332,7 @@ _ISCII_SafeClone(const UConverter *cnv,
|
||||
|
||||
static void
|
||||
_ISCIIGetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode)
|
||||
{
|
||||
|
@ -332,7 +332,7 @@ noMoreInput:
|
||||
|
||||
static void
|
||||
_Latin1GetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
sa->addRange(sa->set, 0, 0xff);
|
||||
@ -534,7 +534,7 @@ _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
|
||||
|
||||
static void
|
||||
_ASCIIGetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
sa->addRange(sa->set, 0, 0x7f);
|
||||
|
@ -344,7 +344,7 @@ gb18030Ranges[13][4]={
|
||||
static void
|
||||
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
|
||||
|
||||
@ -421,7 +421,7 @@ _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||
UErrorCode *pErrorCode) {
|
||||
@ -434,7 +434,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UConverterMBCSTable *mbcsTable;
|
||||
@ -571,7 +571,7 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
|
||||
static void
|
||||
ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
if(cnv->options&_MBCS_OPTION_GB18030) {
|
||||
|
@ -373,7 +373,7 @@ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||
UErrorCode *pErrorCode);
|
||||
@ -388,7 +388,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
USetAdder *sa,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
|
@ -1718,7 +1718,7 @@ uprv_getMaxISOCommentLength() {
|
||||
* @param uset USet to receive characters. Existing contents are deleted.
|
||||
*/
|
||||
static void
|
||||
charSetToUSet(uint32_t cset[8], USetAdder *sa) {
|
||||
charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
|
||||
UChar us[256];
|
||||
char cs[256];
|
||||
|
||||
@ -1755,7 +1755,7 @@ charSetToUSet(uint32_t cset[8], USetAdder *sa) {
|
||||
* @param set USet to receive characters.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getCharNameCharacters(USetAdder *sa) {
|
||||
uprv_getCharNameCharacters(const USetAdder *sa) {
|
||||
charSetToUSet(gNameSet, sa);
|
||||
}
|
||||
|
||||
@ -1769,7 +1769,7 @@ urename.h and uprops.h changed accordingly.
|
||||
* @param set USetAdder to receive characters.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getISOCommentCharacters(USetAdder *sa) {
|
||||
uprv_getISOCommentCharacters(const USetAdder *sa) {
|
||||
charSetToUSet(gISOCommentSet, sa);
|
||||
}
|
||||
#endif
|
||||
|
@ -19,7 +19,6 @@ U_NAMESPACE_BEGIN
|
||||
class ParsePosition;
|
||||
class SymbolTable;
|
||||
class UVector;
|
||||
class CaseEquivClass;
|
||||
class RuleCharacterIterator;
|
||||
|
||||
/**
|
||||
@ -1324,20 +1323,6 @@ private:
|
||||
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
|
||||
|
||||
friend class UnicodeSetIterator;
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Implementation: closeOver
|
||||
//----------------------------------------------------------------
|
||||
|
||||
void caseCloseOne(const UnicodeString& folded);
|
||||
|
||||
void caseCloseOne(const CaseEquivClass& c);
|
||||
|
||||
void caseCloseOne(UChar folded);
|
||||
|
||||
static const CaseEquivClass* getCaseMapOf(const UnicodeString& folded);
|
||||
|
||||
static const CaseEquivClass* getCaseMapOf(UChar folded);
|
||||
};
|
||||
|
||||
inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
|
||||
|
@ -57,6 +57,25 @@ enum {
|
||||
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
|
||||
* match all except 'a', 'A', 'b', and 'B'. This performs a full
|
||||
* closure over case mappings, e.g. U+017F for s.
|
||||
*
|
||||
* The resulting set is a superset of the input for the code points but
|
||||
* not for the strings.
|
||||
* It performs a case mapping closure of the code points and adds
|
||||
* full case folding strings for the code points, and reduces strings of
|
||||
* the original set to their full case folding equivalents.
|
||||
*
|
||||
* This is designed for case-insensitive matches, for example
|
||||
* in regular expressions. The full code point case closure allows checking
|
||||
* an input character directly against the closure set.
|
||||
* Strings are matched by comparing the case-folded form from the closure
|
||||
* set with an incremental case folding of the string in question.
|
||||
*
|
||||
* The closure set will also contain single code points if the original
|
||||
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
|
||||
* This is not necessary (that is, redundant) for the above matching method
|
||||
* but results in the same closure sets regardless of whether the original
|
||||
* set contained the code point or a string.
|
||||
*
|
||||
* @stable ICU 2.4
|
||||
*/
|
||||
USET_CASE_INSENSITIVE = 2,
|
||||
|
@ -28,7 +28,6 @@
|
||||
#include "uset_imp.h"
|
||||
#include "ruleiter.h"
|
||||
#include "cmemory.h"
|
||||
#include "uhash.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "util.h"
|
||||
#include "uvector.h"
|
||||
@ -42,7 +41,6 @@
|
||||
#include "mutex.h"
|
||||
#include "uassert.h"
|
||||
#include "hash.h"
|
||||
#include "ucmp8.h"
|
||||
|
||||
// initial storage. Must be >= 0
|
||||
// *** same as in uniset.cpp ! ***
|
||||
@ -157,10 +155,6 @@ U_NAMESPACE_BEGIN
|
||||
|
||||
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
|
||||
|
||||
static Hashtable* CASE_EQUIV_HASH = NULL; // for closeOver(USET_CASE)
|
||||
|
||||
static CompactByteArray* CASE_EQUIV_CBA = NULL; // for closeOver(USET_CASE)
|
||||
|
||||
// helper functions for matching of pattern syntax pieces ------------------ ***
|
||||
// these functions are parallel to the PERL_OPEN etc. strings above
|
||||
|
||||
@ -1318,16 +1312,6 @@ static UBool U_CALLCONV uset_cleanup(void) {
|
||||
}
|
||||
}
|
||||
|
||||
if (CASE_EQUIV_HASH != NULL) {
|
||||
delete CASE_EQUIV_HASH;
|
||||
CASE_EQUIV_HASH = NULL;
|
||||
}
|
||||
|
||||
if (CASE_EQUIV_CBA != NULL) {
|
||||
ucmp8_close(CASE_EQUIV_CBA);
|
||||
CASE_EQUIV_CBA = NULL;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -1406,33 +1390,26 @@ addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString
|
||||
}
|
||||
|
||||
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
if ((attribute & USET_CASE) != 0) {
|
||||
UnicodeSet foldSet;
|
||||
UnicodeString str;
|
||||
int32_t n = getRangeCount();
|
||||
for (int32_t i=0; i<n; ++i) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
UChar32 end = getRangeEnd(i);
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
str.truncate(0);
|
||||
str.append(u_foldCase(cp, U_FOLD_CASE_DEFAULT));
|
||||
foldSet.caseCloseOne(str);
|
||||
}
|
||||
}
|
||||
if (strings != NULL && strings->size() > 0) {
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
str = * (const UnicodeString*) strings->elementAt(j);
|
||||
foldSet.caseCloseOne(str.foldCase());
|
||||
}
|
||||
}
|
||||
*this = foldSet;
|
||||
}
|
||||
else if ((attribute & USET_ADD_CASE_MAPPINGS)) {
|
||||
UnicodeSet foldSet(*this);
|
||||
UnicodeString str;
|
||||
if (attribute & (USET_CASE | USET_ADD_CASE_MAPPINGS)) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UCaseProps *csp = ucase_getSingleton(&status);
|
||||
if (U_SUCCESS(status)) {
|
||||
UnicodeSet foldSet(*this);
|
||||
UnicodeString str;
|
||||
USetAdder sa = {
|
||||
(USet *)&foldSet,
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString
|
||||
};
|
||||
|
||||
// start with input set to guarantee inclusion
|
||||
// USET_CASE: remove strings because the strings will actually be reduced (folded);
|
||||
// therefore, start with no strings and add only those needed
|
||||
if (attribute & USET_CASE) {
|
||||
foldSet.strings->removeAllElements();
|
||||
}
|
||||
|
||||
int32_t n = getRangeCount();
|
||||
UChar32 result;
|
||||
const UChar *full;
|
||||
@ -1442,45 +1419,64 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
UChar32 start = getRangeStart(i);
|
||||
UChar32 end = getRangeEnd(i);
|
||||
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
if (attribute & USET_CASE) {
|
||||
// full case closure
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
ucase_addCaseClosure(csp, cp, &sa);
|
||||
}
|
||||
} else {
|
||||
// add case mappings
|
||||
// (does not add long s for regular s, or Kelvin for k, for example)
|
||||
for (UChar32 cp=start; cp<=end; ++cp) {
|
||||
result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
|
||||
result = ucase_toFullFolding(csp, cp, &full, 0);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
result = ucase_toFullFolding(csp, cp, &full, 0);
|
||||
addCaseMapping(foldSet, result, full, str);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (strings != NULL && strings->size() > 0) {
|
||||
Locale root("");
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
#endif
|
||||
if (U_SUCCESS(status)) {
|
||||
const UnicodeString *pStr;
|
||||
|
||||
if (attribute & USET_CASE) {
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
pStr = (const UnicodeString *) strings->elementAt(j);
|
||||
(str = *pStr).toLower(root);
|
||||
foldSet.add(str);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
(str = *pStr).toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
#endif
|
||||
(str = *pStr).toUpper(root);
|
||||
foldSet.add(str);
|
||||
(str = *pStr).foldCase();
|
||||
foldSet.add(str);
|
||||
str = *(const UnicodeString *) strings->elementAt(j);
|
||||
str.foldCase();
|
||||
if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
|
||||
foldSet.add(str); // does not map to code points: add the folded string itself
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Locale root("");
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
delete bi;
|
||||
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
|
||||
#endif
|
||||
if (U_SUCCESS(status)) {
|
||||
const UnicodeString *pStr;
|
||||
|
||||
for (int32_t j=0; j<strings->size(); ++j) {
|
||||
pStr = (const UnicodeString *) strings->elementAt(j);
|
||||
(str = *pStr).toLower(root);
|
||||
foldSet.add(str);
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
(str = *pStr).toTitle(bi, root);
|
||||
foldSet.add(str);
|
||||
#endif
|
||||
(str = *pStr).toUpper(root);
|
||||
foldSet.add(str);
|
||||
(str = *pStr).foldCase();
|
||||
foldSet.add(str);
|
||||
}
|
||||
}
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
delete bi;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
*this = foldSet;
|
||||
}
|
||||
@ -1488,525 +1484,4 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Case folding implementation
|
||||
//----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Data structure representing a case-fold equivalency class. It is a
|
||||
* SET containing 0 or more code units, and 0 or more strings of
|
||||
* length 2 code units or longer.
|
||||
*
|
||||
* This class is implemented as a 8-UChar buffer with a few
|
||||
* convenience methods on it. The format of the buffer:
|
||||
* - All single code units in this set, followed by a terminating
|
||||
* zero. If none, then just a terminating zero.
|
||||
* - Zero or more 0-terminated strings, each of length >= 2
|
||||
* code units.
|
||||
* - A single terminating (UChar)0.
|
||||
*
|
||||
* Usage:
|
||||
*
|
||||
* const CaseEquivClass& c = ...;
|
||||
* const UChar* p;
|
||||
* for (c.getStrings(p); *p; c.nextString(p)) {
|
||||
* foo(p);
|
||||
* }
|
||||
*/
|
||||
class CaseEquivClass {
|
||||
public:
|
||||
UChar data[8];
|
||||
|
||||
/**
|
||||
* Return the string of single code units. May be "". Will never
|
||||
* be NULL.
|
||||
*/
|
||||
const UChar* getSingles() const {
|
||||
return data;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the first multi-code-unit string. May be "" if there
|
||||
* are none. Will never be NULL.
|
||||
* @param p pointer to be set to point to the first string.
|
||||
*/
|
||||
void getStrings(const UChar*& p) const {
|
||||
p = data;
|
||||
nextString(p);
|
||||
}
|
||||
|
||||
/**
|
||||
* Advance a pointer from one multi-code-unit string to the next.
|
||||
* May advance 'p' to point to "" if there are no more.
|
||||
* Do NOT call if *p == 0.
|
||||
* @param p pointer to be advanced to point to the next string.
|
||||
*/
|
||||
static void nextString(const UChar*& p) {
|
||||
while (*p++) {}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* IMPORTANT: The following two static data arrays represent the
|
||||
* information used to do case closure. The first array is an array
|
||||
* of pairs. That is, for each even index e, entries [e] and [e+1]
|
||||
* form a pair of case equivalent code units. The entry at [e] is the
|
||||
* folded one, that is, the one for which u_foldCase(x)==x.
|
||||
*
|
||||
* The second static array is an array of CaseEquivClass objects.
|
||||
* Since these objects are just adorned UChar[] arrays, they can be
|
||||
* initialized in place in the array, and all of them can live in a
|
||||
* single piece of static memory, with no heap allocation.
|
||||
*/
|
||||
|
||||
// MACHINE-GENERATED: Do not edit (see com.ibm.icu.dev.tools.translit.UnicodeSetCloseOver)
|
||||
static const UChar CASE_PAIRS[] = {
|
||||
0x0061,0x0041,0x0062,0x0042,0x0063,0x0043,0x0064,0x0044,0x0065,0x0045,
|
||||
0x0066,0x0046,0x0067,0x0047,0x0068,0x0048,0x0069,0x0049,0x006A,0x004A,
|
||||
0x006C,0x004C,0x006D,0x004D,0x006E,0x004E,0x006F,0x004F,0x0070,0x0050,
|
||||
0x0071,0x0051,0x0072,0x0052,0x0074,0x0054,0x0075,0x0055,0x0076,0x0056,
|
||||
0x0077,0x0057,0x0078,0x0058,0x0079,0x0059,0x007A,0x005A,0x00E0,0x00C0,
|
||||
0x00E1,0x00C1,0x00E2,0x00C2,0x00E3,0x00C3,0x00E4,0x00C4,0x00E6,0x00C6,
|
||||
0x00E7,0x00C7,0x00E8,0x00C8,0x00E9,0x00C9,0x00EA,0x00CA,0x00EB,0x00CB,
|
||||
0x00EC,0x00CC,0x00ED,0x00CD,0x00EE,0x00CE,0x00EF,0x00CF,0x00F0,0x00D0,
|
||||
0x00F1,0x00D1,0x00F2,0x00D2,0x00F3,0x00D3,0x00F4,0x00D4,0x00F5,0x00D5,
|
||||
0x00F6,0x00D6,0x00F8,0x00D8,0x00F9,0x00D9,0x00FA,0x00DA,0x00FB,0x00DB,
|
||||
0x00FC,0x00DC,0x00FD,0x00DD,0x00FE,0x00DE,0x00FF,0x0178,0x0101,0x0100,
|
||||
0x0103,0x0102,0x0105,0x0104,0x0107,0x0106,0x0109,0x0108,0x010B,0x010A,
|
||||
0x010D,0x010C,0x010F,0x010E,0x0111,0x0110,0x0113,0x0112,0x0115,0x0114,
|
||||
0x0117,0x0116,0x0119,0x0118,0x011B,0x011A,0x011D,0x011C,0x011F,0x011E,
|
||||
0x0121,0x0120,0x0123,0x0122,0x0125,0x0124,0x0127,0x0126,0x0129,0x0128,
|
||||
0x012B,0x012A,0x012D,0x012C,0x012F,0x012E,0x0133,0x0132,0x0135,0x0134,
|
||||
0x0137,0x0136,0x013A,0x0139,0x013C,0x013B,0x013E,0x013D,0x0140,0x013F,
|
||||
0x0142,0x0141,0x0144,0x0143,0x0146,0x0145,0x0148,0x0147,0x014B,0x014A,
|
||||
0x014D,0x014C,0x014F,0x014E,0x0151,0x0150,0x0153,0x0152,0x0155,0x0154,
|
||||
0x0157,0x0156,0x0159,0x0158,0x015B,0x015A,0x015D,0x015C,0x015F,0x015E,
|
||||
0x0161,0x0160,0x0163,0x0162,0x0165,0x0164,0x0167,0x0166,0x0169,0x0168,
|
||||
0x016B,0x016A,0x016D,0x016C,0x016F,0x016E,0x0171,0x0170,0x0173,0x0172,
|
||||
0x0175,0x0174,0x0177,0x0176,0x017A,0x0179,0x017C,0x017B,0x017E,0x017D,
|
||||
0x0183,0x0182,0x0185,0x0184,0x0188,0x0187,0x018C,0x018B,0x0192,0x0191,
|
||||
0x0195,0x01F6,0x0199,0x0198,0x019E,0x0220,0x01A1,0x01A0,0x01A3,0x01A2,
|
||||
0x01A5,0x01A4,0x01A8,0x01A7,0x01AD,0x01AC,0x01B0,0x01AF,0x01B4,0x01B3,
|
||||
0x01B6,0x01B5,0x01B9,0x01B8,0x01BD,0x01BC,0x01BF,0x01F7,0x01CE,0x01CD,
|
||||
0x01D0,0x01CF,0x01D2,0x01D1,0x01D4,0x01D3,0x01D6,0x01D5,0x01D8,0x01D7,
|
||||
0x01DA,0x01D9,0x01DC,0x01DB,0x01DD,0x018E,0x01DF,0x01DE,0x01E1,0x01E0,
|
||||
0x01E3,0x01E2,0x01E5,0x01E4,0x01E7,0x01E6,0x01E9,0x01E8,0x01EB,0x01EA,
|
||||
0x01ED,0x01EC,0x01EF,0x01EE,0x01F5,0x01F4,0x01F9,0x01F8,0x01FB,0x01FA,
|
||||
0x01FD,0x01FC,0x01FF,0x01FE,0x0201,0x0200,0x0203,0x0202,0x0205,0x0204,
|
||||
0x0207,0x0206,0x0209,0x0208,0x020B,0x020A,0x020D,0x020C,0x020F,0x020E,
|
||||
0x0211,0x0210,0x0213,0x0212,0x0215,0x0214,0x0217,0x0216,0x0219,0x0218,
|
||||
0x021B,0x021A,0x021D,0x021C,0x021F,0x021E,0x0223,0x0222,0x0225,0x0224,
|
||||
0x0227,0x0226,0x0229,0x0228,0x022B,0x022A,0x022D,0x022C,0x022F,0x022E,
|
||||
0x0231,0x0230,0x0233,0x0232,0x0253,0x0181,0x0254,0x0186,0x0256,0x0189,
|
||||
0x0257,0x018A,0x0259,0x018F,0x025B,0x0190,0x0260,0x0193,0x0263,0x0194,
|
||||
0x0268,0x0197,0x0269,0x0196,0x026F,0x019C,0x0272,0x019D,0x0275,0x019F,
|
||||
0x0280,0x01A6,0x0283,0x01A9,0x0288,0x01AE,0x028A,0x01B1,0x028B,0x01B2,
|
||||
0x0292,0x01B7,0x03AC,0x0386,0x03AD,0x0388,0x03AE,0x0389,0x03AF,0x038A,
|
||||
0x03B1,0x0391,0x03B3,0x0393,0x03B4,0x0394,0x03B6,0x0396,0x03B7,0x0397,
|
||||
0x03BB,0x039B,0x03BD,0x039D,0x03BE,0x039E,0x03BF,0x039F,0x03C4,0x03A4,
|
||||
0x03C5,0x03A5,0x03C7,0x03A7,0x03C8,0x03A8,0x03CA,0x03AA,0x03CB,0x03AB,
|
||||
0x03CC,0x038C,0x03CD,0x038E,0x03CE,0x038F,0x03D9,0x03D8,0x03DB,0x03DA,
|
||||
0x03DD,0x03DC,0x03DF,0x03DE,0x03E1,0x03E0,0x03E3,0x03E2,0x03E5,0x03E4,
|
||||
0x03E7,0x03E6,0x03E9,0x03E8,0x03EB,0x03EA,0x03ED,0x03EC,0x03EF,0x03EE,
|
||||
0x0430,0x0410,0x0431,0x0411,0x0432,0x0412,0x0433,0x0413,0x0434,0x0414,
|
||||
0x0435,0x0415,0x0436,0x0416,0x0437,0x0417,0x0438,0x0418,0x0439,0x0419,
|
||||
0x043A,0x041A,0x043B,0x041B,0x043C,0x041C,0x043D,0x041D,0x043E,0x041E,
|
||||
0x043F,0x041F,0x0440,0x0420,0x0441,0x0421,0x0442,0x0422,0x0443,0x0423,
|
||||
0x0444,0x0424,0x0445,0x0425,0x0446,0x0426,0x0447,0x0427,0x0448,0x0428,
|
||||
0x0449,0x0429,0x044A,0x042A,0x044B,0x042B,0x044C,0x042C,0x044D,0x042D,
|
||||
0x044E,0x042E,0x044F,0x042F,0x0450,0x0400,0x0451,0x0401,0x0452,0x0402,
|
||||
0x0453,0x0403,0x0454,0x0404,0x0455,0x0405,0x0456,0x0406,0x0457,0x0407,
|
||||
0x0458,0x0408,0x0459,0x0409,0x045A,0x040A,0x045B,0x040B,0x045C,0x040C,
|
||||
0x045D,0x040D,0x045E,0x040E,0x045F,0x040F,0x0461,0x0460,0x0463,0x0462,
|
||||
0x0465,0x0464,0x0467,0x0466,0x0469,0x0468,0x046B,0x046A,0x046D,0x046C,
|
||||
0x046F,0x046E,0x0471,0x0470,0x0473,0x0472,0x0475,0x0474,0x0477,0x0476,
|
||||
0x0479,0x0478,0x047B,0x047A,0x047D,0x047C,0x047F,0x047E,0x0481,0x0480,
|
||||
0x048B,0x048A,0x048D,0x048C,0x048F,0x048E,0x0491,0x0490,0x0493,0x0492,
|
||||
0x0495,0x0494,0x0497,0x0496,0x0499,0x0498,0x049B,0x049A,0x049D,0x049C,
|
||||
0x049F,0x049E,0x04A1,0x04A0,0x04A3,0x04A2,0x04A5,0x04A4,0x04A7,0x04A6,
|
||||
0x04A9,0x04A8,0x04AB,0x04AA,0x04AD,0x04AC,0x04AF,0x04AE,0x04B1,0x04B0,
|
||||
0x04B3,0x04B2,0x04B5,0x04B4,0x04B7,0x04B6,0x04B9,0x04B8,0x04BB,0x04BA,
|
||||
0x04BD,0x04BC,0x04BF,0x04BE,0x04C2,0x04C1,0x04C4,0x04C3,0x04C6,0x04C5,
|
||||
0x04C8,0x04C7,0x04CA,0x04C9,0x04CC,0x04CB,0x04CE,0x04CD,0x04D1,0x04D0,
|
||||
0x04D3,0x04D2,0x04D5,0x04D4,0x04D7,0x04D6,0x04D9,0x04D8,0x04DB,0x04DA,
|
||||
0x04DD,0x04DC,0x04DF,0x04DE,0x04E1,0x04E0,0x04E3,0x04E2,0x04E5,0x04E4,
|
||||
0x04E7,0x04E6,0x04E9,0x04E8,0x04EB,0x04EA,0x04ED,0x04EC,0x04EF,0x04EE,
|
||||
0x04F1,0x04F0,0x04F3,0x04F2,0x04F5,0x04F4,0x04F9,0x04F8,0x0501,0x0500,
|
||||
0x0503,0x0502,0x0505,0x0504,0x0507,0x0506,0x0509,0x0508,0x050B,0x050A,
|
||||
0x050D,0x050C,0x050F,0x050E,0x0561,0x0531,0x0562,0x0532,0x0563,0x0533,
|
||||
0x0564,0x0534,0x0565,0x0535,0x0566,0x0536,0x0567,0x0537,0x0568,0x0538,
|
||||
0x0569,0x0539,0x056A,0x053A,0x056B,0x053B,0x056C,0x053C,0x056D,0x053D,
|
||||
0x056E,0x053E,0x056F,0x053F,0x0570,0x0540,0x0571,0x0541,0x0572,0x0542,
|
||||
0x0573,0x0543,0x0574,0x0544,0x0575,0x0545,0x0576,0x0546,0x0577,0x0547,
|
||||
0x0578,0x0548,0x0579,0x0549,0x057A,0x054A,0x057B,0x054B,0x057C,0x054C,
|
||||
0x057D,0x054D,0x057E,0x054E,0x057F,0x054F,0x0580,0x0550,0x0581,0x0551,
|
||||
0x0582,0x0552,0x0583,0x0553,0x0584,0x0554,0x0585,0x0555,0x0586,0x0556,
|
||||
0x1E01,0x1E00,0x1E03,0x1E02,0x1E05,0x1E04,0x1E07,0x1E06,0x1E09,0x1E08,
|
||||
0x1E0B,0x1E0A,0x1E0D,0x1E0C,0x1E0F,0x1E0E,0x1E11,0x1E10,0x1E13,0x1E12,
|
||||
0x1E15,0x1E14,0x1E17,0x1E16,0x1E19,0x1E18,0x1E1B,0x1E1A,0x1E1D,0x1E1C,
|
||||
0x1E1F,0x1E1E,0x1E21,0x1E20,0x1E23,0x1E22,0x1E25,0x1E24,0x1E27,0x1E26,
|
||||
0x1E29,0x1E28,0x1E2B,0x1E2A,0x1E2D,0x1E2C,0x1E2F,0x1E2E,0x1E31,0x1E30,
|
||||
0x1E33,0x1E32,0x1E35,0x1E34,0x1E37,0x1E36,0x1E39,0x1E38,0x1E3B,0x1E3A,
|
||||
0x1E3D,0x1E3C,0x1E3F,0x1E3E,0x1E41,0x1E40,0x1E43,0x1E42,0x1E45,0x1E44,
|
||||
0x1E47,0x1E46,0x1E49,0x1E48,0x1E4B,0x1E4A,0x1E4D,0x1E4C,0x1E4F,0x1E4E,
|
||||
0x1E51,0x1E50,0x1E53,0x1E52,0x1E55,0x1E54,0x1E57,0x1E56,0x1E59,0x1E58,
|
||||
0x1E5B,0x1E5A,0x1E5D,0x1E5C,0x1E5F,0x1E5E,0x1E63,0x1E62,0x1E65,0x1E64,
|
||||
0x1E67,0x1E66,0x1E69,0x1E68,0x1E6B,0x1E6A,0x1E6D,0x1E6C,0x1E6F,0x1E6E,
|
||||
0x1E71,0x1E70,0x1E73,0x1E72,0x1E75,0x1E74,0x1E77,0x1E76,0x1E79,0x1E78,
|
||||
0x1E7B,0x1E7A,0x1E7D,0x1E7C,0x1E7F,0x1E7E,0x1E81,0x1E80,0x1E83,0x1E82,
|
||||
0x1E85,0x1E84,0x1E87,0x1E86,0x1E89,0x1E88,0x1E8B,0x1E8A,0x1E8D,0x1E8C,
|
||||
0x1E8F,0x1E8E,0x1E91,0x1E90,0x1E93,0x1E92,0x1E95,0x1E94,0x1EA1,0x1EA0,
|
||||
0x1EA3,0x1EA2,0x1EA5,0x1EA4,0x1EA7,0x1EA6,0x1EA9,0x1EA8,0x1EAB,0x1EAA,
|
||||
0x1EAD,0x1EAC,0x1EAF,0x1EAE,0x1EB1,0x1EB0,0x1EB3,0x1EB2,0x1EB5,0x1EB4,
|
||||
0x1EB7,0x1EB6,0x1EB9,0x1EB8,0x1EBB,0x1EBA,0x1EBD,0x1EBC,0x1EBF,0x1EBE,
|
||||
0x1EC1,0x1EC0,0x1EC3,0x1EC2,0x1EC5,0x1EC4,0x1EC7,0x1EC6,0x1EC9,0x1EC8,
|
||||
0x1ECB,0x1ECA,0x1ECD,0x1ECC,0x1ECF,0x1ECE,0x1ED1,0x1ED0,0x1ED3,0x1ED2,
|
||||
0x1ED5,0x1ED4,0x1ED7,0x1ED6,0x1ED9,0x1ED8,0x1EDB,0x1EDA,0x1EDD,0x1EDC,
|
||||
0x1EDF,0x1EDE,0x1EE1,0x1EE0,0x1EE3,0x1EE2,0x1EE5,0x1EE4,0x1EE7,0x1EE6,
|
||||
0x1EE9,0x1EE8,0x1EEB,0x1EEA,0x1EED,0x1EEC,0x1EEF,0x1EEE,0x1EF1,0x1EF0,
|
||||
0x1EF3,0x1EF2,0x1EF5,0x1EF4,0x1EF7,0x1EF6,0x1EF9,0x1EF8,0x1F00,0x1F08,
|
||||
0x1F01,0x1F09,0x1F02,0x1F0A,0x1F03,0x1F0B,0x1F04,0x1F0C,0x1F05,0x1F0D,
|
||||
0x1F06,0x1F0E,0x1F07,0x1F0F,0x1F10,0x1F18,0x1F11,0x1F19,0x1F12,0x1F1A,
|
||||
0x1F13,0x1F1B,0x1F14,0x1F1C,0x1F15,0x1F1D,0x1F20,0x1F28,0x1F21,0x1F29,
|
||||
0x1F22,0x1F2A,0x1F23,0x1F2B,0x1F24,0x1F2C,0x1F25,0x1F2D,0x1F26,0x1F2E,
|
||||
0x1F27,0x1F2F,0x1F30,0x1F38,0x1F31,0x1F39,0x1F32,0x1F3A,0x1F33,0x1F3B,
|
||||
0x1F34,0x1F3C,0x1F35,0x1F3D,0x1F36,0x1F3E,0x1F37,0x1F3F,0x1F40,0x1F48,
|
||||
0x1F41,0x1F49,0x1F42,0x1F4A,0x1F43,0x1F4B,0x1F44,0x1F4C,0x1F45,0x1F4D,
|
||||
0x1F51,0x1F59,0x1F53,0x1F5B,0x1F55,0x1F5D,0x1F57,0x1F5F,0x1F60,0x1F68,
|
||||
0x1F61,0x1F69,0x1F62,0x1F6A,0x1F63,0x1F6B,0x1F64,0x1F6C,0x1F65,0x1F6D,
|
||||
0x1F66,0x1F6E,0x1F67,0x1F6F,0x1F70,0x1FBA,0x1F71,0x1FBB,0x1F72,0x1FC8,
|
||||
0x1F73,0x1FC9,0x1F74,0x1FCA,0x1F75,0x1FCB,0x1F76,0x1FDA,0x1F77,0x1FDB,
|
||||
0x1F78,0x1FF8,0x1F79,0x1FF9,0x1F7A,0x1FEA,0x1F7B,0x1FEB,0x1F7C,0x1FFA,
|
||||
0x1F7D,0x1FFB,0x1FB0,0x1FB8,0x1FB1,0x1FB9,0x1FD0,0x1FD8,0x1FD1,0x1FD9,
|
||||
0x1FE0,0x1FE8,0x1FE1,0x1FE9,0x1FE5,0x1FEC,0x2170,0x2160,0x2171,0x2161,
|
||||
0x2172,0x2162,0x2173,0x2163,0x2174,0x2164,0x2175,0x2165,0x2176,0x2166,
|
||||
0x2177,0x2167,0x2178,0x2168,0x2179,0x2169,0x217A,0x216A,0x217B,0x216B,
|
||||
0x217C,0x216C,0x217D,0x216D,0x217E,0x216E,0x217F,0x216F,0x24D0,0x24B6,
|
||||
0x24D1,0x24B7,0x24D2,0x24B8,0x24D3,0x24B9,0x24D4,0x24BA,0x24D5,0x24BB,
|
||||
0x24D6,0x24BC,0x24D7,0x24BD,0x24D8,0x24BE,0x24D9,0x24BF,0x24DA,0x24C0,
|
||||
0x24DB,0x24C1,0x24DC,0x24C2,0x24DD,0x24C3,0x24DE,0x24C4,0x24DF,0x24C5,
|
||||
0x24E0,0x24C6,0x24E1,0x24C7,0x24E2,0x24C8,0x24E3,0x24C9,0x24E4,0x24CA,
|
||||
0x24E5,0x24CB,0x24E6,0x24CC,0x24E7,0x24CD,0x24E8,0x24CE,0x24E9,0x24CF,
|
||||
0xFF41,0xFF21,0xFF42,0xFF22,0xFF43,0xFF23,0xFF44,0xFF24,0xFF45,0xFF25,
|
||||
0xFF46,0xFF26,0xFF47,0xFF27,0xFF48,0xFF28,0xFF49,0xFF29,0xFF4A,0xFF2A,
|
||||
0xFF4B,0xFF2B,0xFF4C,0xFF2C,0xFF4D,0xFF2D,0xFF4E,0xFF2E,0xFF4F,0xFF2F,
|
||||
0xFF50,0xFF30,0xFF51,0xFF31,0xFF52,0xFF32,0xFF53,0xFF33,0xFF54,0xFF34,
|
||||
0xFF55,0xFF35,0xFF56,0xFF36,0xFF57,0xFF37,0xFF58,0xFF38,0xFF59,0xFF39,
|
||||
0xFF5A,0xFF3A,
|
||||
};
|
||||
|
||||
// MACHINE-GENERATED: Do not edit (see com.ibm.icu.dev.tools.translit.UnicodeSetCloseOver)
|
||||
static const CaseEquivClass CASE_NONPAIRS[] = {
|
||||
{{0x1E9A,0, 0x0061,0x02BE,0, 0}},
|
||||
{{0xFB00,0, 0x0066,0x0066,0, 0}},
|
||||
{{0xFB03,0, 0x0066,0x0066,0x0069,0, 0}},
|
||||
{{0xFB04,0, 0x0066,0x0066,0x006C,0, 0}},
|
||||
{{0xFB01,0, 0x0066,0x0069,0, 0}},
|
||||
{{0xFB02,0, 0x0066,0x006C,0, 0}},
|
||||
{{0x1E96,0, 0x0068,0x0331,0, 0}},
|
||||
{{0x0130,0, 0x0069,0x0307,0, 0}},
|
||||
{{0x01F0,0, 0x006A,0x030C,0, 0}},
|
||||
{{0x004B,0x006B,0x212A,0, 0}},
|
||||
{{0x0053,0x0073,0x017F,0, 0}},
|
||||
{{0x00DF,0, 0x0073,0x0073,0, 0}},
|
||||
{{0xFB05,0xFB06,0, 0x0073,0x0074,0, 0}},
|
||||
{{0x1E97,0, 0x0074,0x0308,0, 0}},
|
||||
{{0x1E98,0, 0x0077,0x030A,0, 0}},
|
||||
{{0x1E99,0, 0x0079,0x030A,0, 0}},
|
||||
{{0x00C5,0x00E5,0x212B,0, 0}},
|
||||
{{0x01C4,0x01C5,0x01C6,0, 0}},
|
||||
{{0x01C7,0x01C8,0x01C9,0, 0}},
|
||||
{{0x01CA,0x01CB,0x01CC,0, 0}},
|
||||
{{0x01F1,0x01F2,0x01F3,0, 0}},
|
||||
{{0x0149,0, 0x02BC,0x006E,0, 0}},
|
||||
{{0x1FB4,0, 0x03AC,0x03B9,0, 0}},
|
||||
{{0x1FC4,0, 0x03AE,0x03B9,0, 0}},
|
||||
{{0x1FB6,0, 0x03B1,0x0342,0, 0}},
|
||||
{{0x1FB7,0, 0x03B1,0x0342,0x03B9,0, 0}},
|
||||
{{0x1FB3,0x1FBC,0, 0x03B1,0x03B9,0, 0}},
|
||||
{{0x0392,0x03B2,0x03D0,0, 0}},
|
||||
{{0x0395,0x03B5,0x03F5,0, 0}},
|
||||
{{0x1FC6,0, 0x03B7,0x0342,0, 0}},
|
||||
{{0x1FC7,0, 0x03B7,0x0342,0x03B9,0, 0}},
|
||||
{{0x1FC3,0x1FCC,0, 0x03B7,0x03B9,0, 0}},
|
||||
{{0x0398,0x03B8,0x03D1,0x03F4,0, 0}},
|
||||
{{0x0345,0x0399,0x03B9,0x1FBE,0, 0}},
|
||||
{{0x1FD2,0, 0x03B9,0x0308,0x0300,0, 0}},
|
||||
{{0x0390,0x1FD3,0, 0x03B9,0x0308,0x0301,0, 0}},
|
||||
{{0x1FD7,0, 0x03B9,0x0308,0x0342,0, 0}},
|
||||
{{0x1FD6,0, 0x03B9,0x0342,0, 0}},
|
||||
{{0x039A,0x03BA,0x03F0,0, 0}},
|
||||
{{0x00B5,0x039C,0x03BC,0, 0}},
|
||||
{{0x03A0,0x03C0,0x03D6,0, 0}},
|
||||
{{0x03A1,0x03C1,0x03F1,0, 0}},
|
||||
{{0x1FE4,0, 0x03C1,0x0313,0, 0}},
|
||||
{{0x03A3,0x03C2,0x03C3,0x03F2,0, 0}},
|
||||
{{0x1FE2,0, 0x03C5,0x0308,0x0300,0, 0}},
|
||||
{{0x03B0,0x1FE3,0, 0x03C5,0x0308,0x0301,0, 0}},
|
||||
{{0x1FE7,0, 0x03C5,0x0308,0x0342,0, 0}},
|
||||
{{0x1F50,0, 0x03C5,0x0313,0, 0}},
|
||||
{{0x1F52,0, 0x03C5,0x0313,0x0300,0, 0}},
|
||||
{{0x1F54,0, 0x03C5,0x0313,0x0301,0, 0}},
|
||||
{{0x1F56,0, 0x03C5,0x0313,0x0342,0, 0}},
|
||||
{{0x1FE6,0, 0x03C5,0x0342,0, 0}},
|
||||
{{0x03A6,0x03C6,0x03D5,0, 0}},
|
||||
{{0x03A9,0x03C9,0x2126,0, 0}},
|
||||
{{0x1FF6,0, 0x03C9,0x0342,0, 0}},
|
||||
{{0x1FF7,0, 0x03C9,0x0342,0x03B9,0, 0}},
|
||||
{{0x1FF3,0x1FFC,0, 0x03C9,0x03B9,0, 0}},
|
||||
{{0x1FF4,0, 0x03CE,0x03B9,0, 0}},
|
||||
{{0x0587,0, 0x0565,0x0582,0, 0}},
|
||||
{{0xFB14,0, 0x0574,0x0565,0, 0}},
|
||||
{{0xFB15,0, 0x0574,0x056B,0, 0}},
|
||||
{{0xFB17,0, 0x0574,0x056D,0, 0}},
|
||||
{{0xFB13,0, 0x0574,0x0576,0, 0}},
|
||||
{{0xFB16,0, 0x057E,0x0576,0, 0}},
|
||||
{{0x1E60,0x1E61,0x1E9B,0, 0}},
|
||||
{{0x1F80,0x1F88,0, 0x1F00,0x03B9,0, 0}},
|
||||
{{0x1F81,0x1F89,0, 0x1F01,0x03B9,0, 0}},
|
||||
{{0x1F82,0x1F8A,0, 0x1F02,0x03B9,0, 0}},
|
||||
{{0x1F83,0x1F8B,0, 0x1F03,0x03B9,0, 0}},
|
||||
{{0x1F84,0x1F8C,0, 0x1F04,0x03B9,0, 0}},
|
||||
{{0x1F85,0x1F8D,0, 0x1F05,0x03B9,0, 0}},
|
||||
{{0x1F86,0x1F8E,0, 0x1F06,0x03B9,0, 0}},
|
||||
{{0x1F87,0x1F8F,0, 0x1F07,0x03B9,0, 0}},
|
||||
{{0x1F90,0x1F98,0, 0x1F20,0x03B9,0, 0}},
|
||||
{{0x1F91,0x1F99,0, 0x1F21,0x03B9,0, 0}},
|
||||
{{0x1F92,0x1F9A,0, 0x1F22,0x03B9,0, 0}},
|
||||
{{0x1F93,0x1F9B,0, 0x1F23,0x03B9,0, 0}},
|
||||
{{0x1F94,0x1F9C,0, 0x1F24,0x03B9,0, 0}},
|
||||
{{0x1F95,0x1F9D,0, 0x1F25,0x03B9,0, 0}},
|
||||
{{0x1F96,0x1F9E,0, 0x1F26,0x03B9,0, 0}},
|
||||
{{0x1F97,0x1F9F,0, 0x1F27,0x03B9,0, 0}},
|
||||
{{0x1FA0,0x1FA8,0, 0x1F60,0x03B9,0, 0}},
|
||||
{{0x1FA1,0x1FA9,0, 0x1F61,0x03B9,0, 0}},
|
||||
{{0x1FA2,0x1FAA,0, 0x1F62,0x03B9,0, 0}},
|
||||
{{0x1FA3,0x1FAB,0, 0x1F63,0x03B9,0, 0}},
|
||||
{{0x1FA4,0x1FAC,0, 0x1F64,0x03B9,0, 0}},
|
||||
{{0x1FA5,0x1FAD,0, 0x1F65,0x03B9,0, 0}},
|
||||
{{0x1FA6,0x1FAE,0, 0x1F66,0x03B9,0, 0}},
|
||||
{{0x1FA7,0x1FAF,0, 0x1F67,0x03B9,0, 0}},
|
||||
{{0x1FB2,0, 0x1F70,0x03B9,0, 0}},
|
||||
{{0x1FC2,0, 0x1F74,0x03B9,0, 0}},
|
||||
{{0x1FF2,0, 0x1F7C,0x03B9,0, 0}},
|
||||
{{0, 0xD801,0xDC00,0, 0xD801,0xDC28,0, 0}},
|
||||
{{0, 0xD801,0xDC01,0, 0xD801,0xDC29,0, 0}},
|
||||
{{0, 0xD801,0xDC02,0, 0xD801,0xDC2A,0, 0}},
|
||||
{{0, 0xD801,0xDC03,0, 0xD801,0xDC2B,0, 0}},
|
||||
{{0, 0xD801,0xDC04,0, 0xD801,0xDC2C,0, 0}},
|
||||
{{0, 0xD801,0xDC05,0, 0xD801,0xDC2D,0, 0}},
|
||||
{{0, 0xD801,0xDC06,0, 0xD801,0xDC2E,0, 0}},
|
||||
{{0, 0xD801,0xDC07,0, 0xD801,0xDC2F,0, 0}},
|
||||
{{0, 0xD801,0xDC08,0, 0xD801,0xDC30,0, 0}},
|
||||
{{0, 0xD801,0xDC09,0, 0xD801,0xDC31,0, 0}},
|
||||
{{0, 0xD801,0xDC0A,0, 0xD801,0xDC32,0, 0}},
|
||||
{{0, 0xD801,0xDC0B,0, 0xD801,0xDC33,0, 0}},
|
||||
{{0, 0xD801,0xDC0C,0, 0xD801,0xDC34,0, 0}},
|
||||
{{0, 0xD801,0xDC0D,0, 0xD801,0xDC35,0, 0}},
|
||||
{{0, 0xD801,0xDC0E,0, 0xD801,0xDC36,0, 0}},
|
||||
{{0, 0xD801,0xDC0F,0, 0xD801,0xDC37,0, 0}},
|
||||
{{0, 0xD801,0xDC10,0, 0xD801,0xDC38,0, 0}},
|
||||
{{0, 0xD801,0xDC11,0, 0xD801,0xDC39,0, 0}},
|
||||
{{0, 0xD801,0xDC12,0, 0xD801,0xDC3A,0, 0}},
|
||||
{{0, 0xD801,0xDC13,0, 0xD801,0xDC3B,0, 0}},
|
||||
{{0, 0xD801,0xDC14,0, 0xD801,0xDC3C,0, 0}},
|
||||
{{0, 0xD801,0xDC15,0, 0xD801,0xDC3D,0, 0}},
|
||||
{{0, 0xD801,0xDC16,0, 0xD801,0xDC3E,0, 0}},
|
||||
{{0, 0xD801,0xDC17,0, 0xD801,0xDC3F,0, 0}},
|
||||
{{0, 0xD801,0xDC18,0, 0xD801,0xDC40,0, 0}},
|
||||
{{0, 0xD801,0xDC19,0, 0xD801,0xDC41,0, 0}},
|
||||
{{0, 0xD801,0xDC1A,0, 0xD801,0xDC42,0, 0}},
|
||||
{{0, 0xD801,0xDC1B,0, 0xD801,0xDC43,0, 0}},
|
||||
{{0, 0xD801,0xDC1C,0, 0xD801,0xDC44,0, 0}},
|
||||
{{0, 0xD801,0xDC1D,0, 0xD801,0xDC45,0, 0}},
|
||||
{{0, 0xD801,0xDC1E,0, 0xD801,0xDC46,0, 0}},
|
||||
{{0, 0xD801,0xDC1F,0, 0xD801,0xDC47,0, 0}},
|
||||
{{0, 0xD801,0xDC20,0, 0xD801,0xDC48,0, 0}},
|
||||
{{0, 0xD801,0xDC21,0, 0xD801,0xDC49,0, 0}},
|
||||
{{0, 0xD801,0xDC22,0, 0xD801,0xDC4A,0, 0}},
|
||||
{{0, 0xD801,0xDC23,0, 0xD801,0xDC4B,0, 0}},
|
||||
{{0, 0xD801,0xDC24,0, 0xD801,0xDC4C,0, 0}},
|
||||
{{0, 0xD801,0xDC25,0, 0xD801,0xDC4D,0, 0}}
|
||||
};
|
||||
|
||||
#define CASE_PAIRS_LENGTH (sizeof(CASE_PAIRS)/sizeof(CASE_PAIRS[0]))
|
||||
#define CASE_NONPAIRS_LENGTH (sizeof(CASE_NONPAIRS)/sizeof(CASE_NONPAIRS[0]))
|
||||
|
||||
/**
|
||||
* Add to this set all members of the case fold equivalency class
|
||||
* that contains 'folded'.
|
||||
* @param folded a string within a case fold equivalency class.
|
||||
* It must have the property that UCharacter.foldCase(folded,
|
||||
* DEFAULT_CASE_MAP).equals(folded).
|
||||
*/
|
||||
void UnicodeSet::caseCloseOne(const UnicodeString& folded) {
|
||||
if (folded.length() == 1) {
|
||||
caseCloseOne(folded.charAt(0));
|
||||
return;
|
||||
}
|
||||
|
||||
const CaseEquivClass* c = getCaseMapOf(folded);
|
||||
if (c != NULL) {
|
||||
caseCloseOne(*c);
|
||||
return;
|
||||
}
|
||||
|
||||
// Add 'folded' itself; it belongs to no equivalency class.
|
||||
add(folded);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add to this set all members of the case fold equivalency class
|
||||
* that contains 'folded'.
|
||||
* @param folded a code UNIT within a case fold equivalency class.
|
||||
* It must have the property that uchar_foldCase(folded,
|
||||
* DEFAULT_CASE_MAP) == folded.
|
||||
*/
|
||||
void UnicodeSet::caseCloseOne(UChar folded) {
|
||||
// We must do a DOUBLE LOOKUP, first in the CompactByteArray that
|
||||
// indexes into CASE_NONPAIRS[] and then into the CASE_PAIRS[]
|
||||
// sorted array. A character will occur in one or the other, or
|
||||
// neither, but not both.
|
||||
|
||||
// Look in the CompactByteArray.
|
||||
const CaseEquivClass* c = getCaseMapOf(folded);
|
||||
if (c != NULL) {
|
||||
caseCloseOne(*c);
|
||||
return;
|
||||
}
|
||||
|
||||
// Binary search in pairs array, looking at only even entries.
|
||||
// The indices low, high, and x will be halved with respect to
|
||||
// CASE_PAIRS[]; that is, they must be doubled before indexing.
|
||||
|
||||
// CASE_PAIRS has 1312 elements, of 656 pairs, so the search
|
||||
// takes no more than 10 passes.
|
||||
int32_t low = 0;
|
||||
int32_t high = (CASE_PAIRS_LENGTH >> 1) - 1;
|
||||
int32_t x;
|
||||
do {
|
||||
x = (low + high) >> 1;
|
||||
UChar ch = CASE_PAIRS[(uint32_t)(x << 1)];
|
||||
if (folded < ch) {
|
||||
high = x - 1;
|
||||
} else if (folded > ch) {
|
||||
low = x + 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} while (low < high);
|
||||
|
||||
x = (low + high) & ~1; // ((low + high) >> 1) << 1
|
||||
if (folded == CASE_PAIRS[x]) {
|
||||
add(CASE_PAIRS[x]);
|
||||
add(CASE_PAIRS[x+1]);
|
||||
} else {
|
||||
// If the search fails, then add folded itself; it is a
|
||||
// case-unique code unit.
|
||||
add(folded);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add to this set all members of the given CaseEquivClass object.
|
||||
*/
|
||||
void UnicodeSet::caseCloseOne(const CaseEquivClass& c) {
|
||||
const UChar* p = c.getSingles();
|
||||
while (*p) {
|
||||
add(*p++); // add all single code units
|
||||
}
|
||||
for (c.getStrings(p); *p; c.nextString(p)) {
|
||||
add(p); // add all strings
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a folded string of length >= 2 code units, return the
|
||||
* CaseEquivClass containing this string, or NULL if none.
|
||||
*/
|
||||
const CaseEquivClass* UnicodeSet::getCaseMapOf(const UnicodeString& folded) {
|
||||
umtx_lock(NULL);
|
||||
UBool f = (CASE_EQUIV_HASH == NULL);
|
||||
umtx_unlock(NULL);
|
||||
|
||||
if (f) {
|
||||
// Create the Hashtable, which maps UnicodeStrings to index
|
||||
// values into CASE_NONPAIRS.
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
Hashtable* hash = new Hashtable();
|
||||
if (hash != NULL) {
|
||||
int32_t i;
|
||||
for (i=0; i<(int32_t)CASE_NONPAIRS_LENGTH; ++i) {
|
||||
const CaseEquivClass* c = &CASE_NONPAIRS[i];
|
||||
const UChar* p;
|
||||
for (c->getStrings(p); *p; c->nextString(p)) {
|
||||
hash->put(UnicodeString(p), (void*) c, ec);
|
||||
}
|
||||
}
|
||||
if (U_SUCCESS(ec)) {
|
||||
umtx_lock(NULL);
|
||||
if (CASE_EQUIV_HASH == NULL) {
|
||||
CASE_EQUIV_HASH = hash;
|
||||
hash = NULL;
|
||||
ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
}
|
||||
delete hash;
|
||||
}
|
||||
}
|
||||
|
||||
return (CASE_EQUIV_HASH != NULL) ?
|
||||
(const CaseEquivClass*) CASE_EQUIV_HASH->get(folded) : NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a folded code unit, return the CaseEquivClass containing it,
|
||||
* or NULL if none.
|
||||
*/
|
||||
const CaseEquivClass* UnicodeSet::getCaseMapOf(UChar folded) {
|
||||
umtx_lock(NULL);
|
||||
UBool f = (CASE_EQUIV_CBA == NULL);
|
||||
umtx_unlock(NULL);
|
||||
|
||||
if (f) {
|
||||
// Create the CompactByteArray, which maps single code units
|
||||
// to index values into CASE_NONPAIRS.
|
||||
CompactByteArray* cba = ucmp8_open(-1);
|
||||
if (ucmp8_isBogus(cba)) {
|
||||
ucmp8_close(cba);
|
||||
cba = NULL;
|
||||
} else {
|
||||
int32_t i;
|
||||
for (i=0; i<(int32_t)CASE_NONPAIRS_LENGTH; ++i) {
|
||||
const UChar* p = CASE_NONPAIRS[i].getSingles();
|
||||
UChar ch;
|
||||
while ((ch = *p++) != 0) {
|
||||
ucmp8_set(cba, ch, (int8_t) i);
|
||||
}
|
||||
}
|
||||
ucmp8_compact(cba, 256);
|
||||
}
|
||||
|
||||
umtx_lock(NULL);
|
||||
if (CASE_EQUIV_CBA == NULL) {
|
||||
CASE_EQUIV_CBA = cba;
|
||||
cba = NULL;
|
||||
ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
if (cba != NULL) {
|
||||
ucmp8_close(cba);
|
||||
}
|
||||
}
|
||||
|
||||
if (CASE_EQUIV_CBA != NULL) {
|
||||
int32_t index = ucmp8_getu(CASE_EQUIV_CBA, folded);
|
||||
if (index != 255) {
|
||||
return &CASE_NONPAIRS[index];
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -262,7 +262,7 @@ isAcceptable(void * /* context */,
|
||||
static UBool U_CALLCONV
|
||||
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
|
||||
/* add the start code point to the USet */
|
||||
USetAdder *sa=(USetAdder *)context;
|
||||
const USetAdder *sa=(const USetAdder *)context;
|
||||
sa->add(sa->set, start);
|
||||
return TRUE;
|
||||
}
|
||||
@ -1129,7 +1129,7 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
UChar c;
|
||||
|
||||
if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) {
|
||||
|
@ -452,7 +452,7 @@ unorm_getNX(int32_t options, UErrorCode *pErrorCode);
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
|
||||
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Swap unorm.icu. See udataswp.h.
|
||||
|
@ -400,7 +400,7 @@ uprops_getSource(UProperty which) {
|
||||
#if 0
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getInclusions(USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode) {
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -305,7 +305,7 @@ uprv_getMaxISOCommentLength();
|
||||
* @param sa USetAdder to receive characters.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getCharNameCharacters(USetAdder *sa);
|
||||
uprv_getCharNameCharacters(const USetAdder *sa);
|
||||
|
||||
#if 0
|
||||
/*
|
||||
@ -318,7 +318,7 @@ urename.h and unames.c changed accordingly.
|
||||
* @param sa USetAdder to receive characters.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getISOCommentCharacters(USetAdder *sa);
|
||||
uprv_getISOCommentCharacters(const USetAdder *sa);
|
||||
*/
|
||||
#endif
|
||||
|
||||
@ -360,14 +360,14 @@ uprops_getSource(UProperty which);
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
|
||||
uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Same as uchar_addPropertyStarts() but only for Hangul_Syllable_Type.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
|
||||
uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Return a set of characters for property enumeration.
|
||||
@ -378,7 +378,7 @@ uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_getInclusions(USetAdder *sa, UErrorCode *pErrorCode);
|
||||
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Swap the ICU Unicode properties file. See uchar.c.
|
||||
|
@ -963,6 +963,32 @@ void UnicodeSetTest::TestCloseOver() {
|
||||
CASE,
|
||||
"[ABC]","[A-Ca-c]",
|
||||
|
||||
CASE, "[i]", "[iI]",
|
||||
|
||||
CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
|
||||
CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
|
||||
|
||||
CASE, "[\\u0131]", "[\\u0131]", // dotless i
|
||||
|
||||
CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
|
||||
|
||||
CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
|
||||
|
||||
CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
|
||||
|
||||
CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
|
||||
|
||||
CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
|
||||
|
||||
CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
|
||||
CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
|
||||
|
||||
CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
|
||||
|
||||
CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
|
||||
|
||||
CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
|
||||
|
||||
CASE_MAPPINGS,
|
||||
"[aq\\u00DF{Bc}{bC}{Fi}]",
|
||||
"[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
|
||||
@ -980,6 +1006,7 @@ void UnicodeSetTest::TestCloseOver() {
|
||||
|
||||
UnicodeSet s;
|
||||
UnicodeSet t;
|
||||
UnicodeString buf;
|
||||
for (int32_t i=0; DATA[i]!=NULL; i+=3) {
|
||||
int32_t selector = DATA[i][0];
|
||||
UnicodeString pat(DATA[i+1]);
|
||||
@ -994,12 +1021,72 @@ void UnicodeSetTest::TestCloseOver() {
|
||||
if (s == t) {
|
||||
logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
|
||||
} else {
|
||||
UnicodeString buf;
|
||||
errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
|
||||
s.toPattern(buf, TRUE) + ", expected " + exp);
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
/*
|
||||
* Unused test code.
|
||||
* This was used to compare the old implementation (using USET_CASE)
|
||||
* with the new one (using 0x100 temporarily)
|
||||
* while transitioning from hardcoded case closure tables in uniset.cpp
|
||||
* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu
|
||||
* and using ucase.c functions for closure.
|
||||
* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
|
||||
*
|
||||
* Note: The old and new implementation never fully matched because
|
||||
* the old implementation turned out to not map U+0130 and U+0131 correctly
|
||||
* (dotted I and dotless i) and because the old implementation's data tables
|
||||
* were outdated compared to Unicode 4.0.1 at the time of the change to the
|
||||
* new implementation. (So sigmas and some other characters were not handled
|
||||
* according to the newer Unicode version.)
|
||||
*/
|
||||
UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
|
||||
UnicodeSetIterator si(sens);
|
||||
UnicodeString str, buf2;
|
||||
const UnicodeString *pStr;
|
||||
UChar32 c;
|
||||
while(si.next()) {
|
||||
if(!si.isString()) {
|
||||
c=si.getCodepoint();
|
||||
s.clear();
|
||||
s.add(c);
|
||||
|
||||
str.setTo(c);
|
||||
str.foldCase();
|
||||
sens2.add(str);
|
||||
|
||||
t=s;
|
||||
s.closeOver(USET_CASE);
|
||||
t.closeOver(0x100);
|
||||
if(s!=t) {
|
||||
errln("FAIL: closeOver(U+%04x) differs: ", c);
|
||||
errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
|
||||
}
|
||||
}
|
||||
}
|
||||
// remove all code points
|
||||
// should contain all full case folding mapping strings
|
||||
sens2.remove(0, 0x10ffff);
|
||||
si.reset(sens2);
|
||||
while(si.next()) {
|
||||
if(si.isString()) {
|
||||
pStr=&si.getString();
|
||||
s.clear();
|
||||
s.add(*pStr);
|
||||
t=s2=s;
|
||||
s.closeOver(USET_CASE);
|
||||
t.closeOver(0x100);
|
||||
if(s!=t) {
|
||||
errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
|
||||
errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Test the pattern API
|
||||
s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
|
@ -28,6 +28,21 @@ U_CDECL_BEGIN
|
||||
#define UGENCASE_EXC_SHIFT 16
|
||||
#define UGENCASE_EXC_MASK 0xffff0000
|
||||
|
||||
/*
|
||||
* Values for the ucase.icu unfold[] data array, see store.c.
|
||||
* The values are stored in ucase.icu so that the runtime code will work with
|
||||
* changing values, but they are hardcoded for gencase for simplicity.
|
||||
* They are optimized, that is, provide for minimal table column widths,
|
||||
* for the actual Unicode data, so that the table size is minimized.
|
||||
* Future versions of Unicode may require increases of some of these values.
|
||||
*/
|
||||
enum {
    /* number of UChars per row for the case folding string (left column) */
    UGENCASE_UNFOLD_STRING_WIDTH=3,

    /* number of UChars per row for the code points that fold to that string */
    UGENCASE_UNFOLD_CP_WIDTH=2,

    /* total number of UChars per row: string column followed by code point column */
    UGENCASE_UNFOLD_WIDTH=UGENCASE_UNFOLD_STRING_WIDTH+UGENCASE_UNFOLD_CP_WIDTH,

    /* number of rows that gencase reserves for its unfold[] array */
    UGENCASE_UNFOLD_MAX_ROWS=250
};
|
||||
|
||||
/* special casing data */
|
||||
typedef struct {
|
||||
UChar32 code;
|
||||
@ -45,6 +60,7 @@ typedef struct {
|
||||
/* case mapping properties */
|
||||
typedef struct {
|
||||
UChar32 code, lowerCase, upperCase, titleCase;
|
||||
UChar32 closure[8];
|
||||
SpecialCasing *specialCasing;
|
||||
CaseFolding *caseFolding;
|
||||
uint8_t gc, cc;
|
||||
|
@ -26,11 +26,14 @@
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "utrie.h"
|
||||
#include "uarrsort.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "propsvec.h"
|
||||
#include "gencase.h"
|
||||
|
||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/* Unicode case mapping properties file format ---------------------------------
|
||||
|
||||
The file format prepared and written here contains several data
|
||||
@ -41,7 +44,9 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
||||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 1 .
|
||||
The following is a description of format version 1.1 .
|
||||
|
||||
Format version 1.1 adds data for case closure.
|
||||
|
||||
The file contains the following structures:
|
||||
|
||||
@ -52,16 +57,19 @@ The file contains the following structures:
|
||||
i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
|
||||
i2 trieSize; -- size in bytes of the case mapping properties trie
|
||||
i3 exceptionsLength; -- length in uint16_t of the exceptions array
|
||||
i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
|
||||
|
||||
i4..i14 reservedIndexes; -- reserved values; 0 for now
|
||||
i5..i14 reservedIndexes; -- reserved values; 0 for now
|
||||
|
||||
i15 maxFullLength; -- maximum length of a full case mapping/folding string
|
||||
|
||||
|
||||
Serizalied trie, see utrie.h;
|
||||
Serialized trie, see utrie.h;
|
||||
|
||||
const uint16_t exceptions[exceptionsLength];
|
||||
|
||||
const UChar unfold[unfoldLength];
|
||||
|
||||
|
||||
Trie data word:
|
||||
Bits
|
||||
@ -117,12 +125,24 @@ Optional-value slots:
|
||||
1 case folding (code point)
|
||||
2 uppercase mapping (code point)
|
||||
3 titlecase mapping (code point)
|
||||
4..6 reserved
|
||||
4 reserved
|
||||
5 reserved
|
||||
6 closure mappings (new in format version 1.1)
|
||||
7 there is at least one full (string) case mapping
|
||||
the length of each is encoded in a nibble of this optional value,
|
||||
and the strings follow this optional value in the same order:
|
||||
lower/fold/upper/title
|
||||
|
||||
The optional closure mappings value is used as follows:
|
||||
Bits 0..3 contain the length of a string of code points for case closure.
|
||||
The string immediately follows the full case mappings, or the closure value
|
||||
slot if there are no full case mappings.
|
||||
Bits 4..15 are reserved and could be used in the future to indicate the
|
||||
number of strings for case closure.
|
||||
Complete case closure for a code point is given by the union of all simple
|
||||
and full case mappings and foldings, plus the case closure code points
|
||||
(and potentially, in the future, case closure strings).
|
||||
|
||||
For space saving, some values are not stored. Lookups are as follows:
|
||||
- If special casing is conditional, then no full lower/upper/title mapping
|
||||
strings are stored.
|
||||
@ -135,6 +155,28 @@ For space saving, some values are not stored. Lookups are as follows:
|
||||
simple title->simple upper
|
||||
finally, the original code point (no mapping)
|
||||
|
||||
This fallback order is strict:
|
||||
In particular, the fallback from full case folding is to simple case folding,
|
||||
not to full lowercase mapping.
|
||||
|
||||
Reverse case folding data ("unfold") array: (new in format version 1.1)
|
||||
|
||||
This array stores some miscellaneous values followed by a table. The data maps
|
||||
back from multi-character strings to their original code points, for use
|
||||
in case closure.
|
||||
|
||||
The table contains two columns of strings.
|
||||
The string in the first column is the case folding of each of the code points
|
||||
in the second column. The strings are terminated with NUL or by the end of the
|
||||
column, whichever comes first.
|
||||
|
||||
The miscellaneous data takes up one pseudo-row and includes:
|
||||
- number of rows
|
||||
- number of UChars per row
|
||||
- number of UChars in the left (folding string) column
|
||||
|
||||
The table is sorted by its first column. Values in the first column are unique.
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
@ -149,7 +191,7 @@ static UDataInfo dataInfo={
|
||||
|
||||
/* dataFormat="cAsE" */
|
||||
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
|
||||
{ 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 4, 0, 1, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
@ -167,6 +209,13 @@ static uint16_t exceptionsCount=0;
|
||||
/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
|
||||
static int32_t maxFullLength=U16_MAX_LENGTH;
|
||||
|
||||
/* reverse case folding ("unfold") data */
|
||||
static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
|
||||
0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
|
||||
};
|
||||
static uint16_t unfoldRows=0;
|
||||
static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
@ -176,6 +225,29 @@ setUnicodeVersion(const char *v) {
|
||||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||
}
|
||||
|
||||
static void
|
||||
addUnfolding(UChar32 c, const UChar *s, int32_t length) {
|
||||
int32_t i;
|
||||
|
||||
if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
|
||||
fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
|
||||
(long)length, UGENCASE_UNFOLD_STRING_WIDTH);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
if(unfoldTop>=LENGTHOF(unfold)) {
|
||||
fprintf(stderr, "gencase error: too many multi-character case foldings\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH);
|
||||
u_memcpy(unfold+unfoldTop, s, length);
|
||||
|
||||
i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH;
|
||||
U16_APPEND_UNSAFE(unfold, i, c);
|
||||
|
||||
++unfoldRows;
|
||||
unfoldTop+=UGENCASE_UNFOLD_WIDTH;
|
||||
}
|
||||
|
||||
/* store a character's properties ------------------------------------------- */
|
||||
|
||||
extern void
|
||||
@ -214,6 +286,9 @@ setProps(Props *p) {
|
||||
if(p->upperCase!=p->titleCase) {
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
if(p->closure[0]!=0) {
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
if(p->specialCasing!=NULL) {
|
||||
value|=UCASE_EXCEPTION;
|
||||
}
|
||||
@ -286,6 +361,14 @@ setProps(Props *p) {
|
||||
u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/* add the multi-character case folding to the "unfold" data */
|
||||
if(p->caseFolding!=NULL) {
|
||||
int32_t length=p->caseFolding->full[0];
|
||||
if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
|
||||
addUnfolding(p->code, p->caseFolding->full+1, length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
@ -298,13 +381,368 @@ addCaseSensitive(UChar32 first, UChar32 last) {
|
||||
}
|
||||
}
|
||||
|
||||
/* finalize reverse case folding ("unfold") data ---------------------------- */
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
compareUnfold(const void *context, const void *left, const void *right) {
|
||||
return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
|
||||
}
|
||||
|
||||
static void
|
||||
makeUnfoldData() {
|
||||
static const UChar
|
||||
iDot[2]= { 0x69, 0x307 };
|
||||
|
||||
UChar *p, *q;
|
||||
int32_t i, j, k;
|
||||
UErrorCode errorCode;
|
||||
|
||||
/*
|
||||
* add a case folding that we missed because it's conditional:
|
||||
* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||||
*/
|
||||
addUnfolding(0x130, iDot, 2);
|
||||
|
||||
/* sort the data */
|
||||
errorCode=U_ZERO_ERROR;
|
||||
uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
|
||||
compareUnfold, NULL, FALSE, &errorCode);
|
||||
|
||||
/* make unique-string rows by merging adjacent ones' code point columns */
|
||||
|
||||
/* make p point to row i-1 */
|
||||
p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
|
||||
|
||||
for(i=1; i<unfoldRows;) {
|
||||
if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
|
||||
/* concatenate code point columns */
|
||||
q=p+UGENCASE_UNFOLD_STRING_WIDTH;
|
||||
for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
|
||||
for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
|
||||
q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
|
||||
}
|
||||
if(j>UGENCASE_UNFOLD_CP_WIDTH) {
|
||||
fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
|
||||
(long)j, UGENCASE_UNFOLD_CP_WIDTH);
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
|
||||
/* move following rows up one */
|
||||
--unfoldRows;
|
||||
unfoldTop-=UGENCASE_UNFOLD_WIDTH;
|
||||
u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
|
||||
} else {
|
||||
p+=UGENCASE_UNFOLD_WIDTH;
|
||||
++i;
|
||||
}
|
||||
}
|
||||
|
||||
unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
|
||||
|
||||
if(beVerbose) {
|
||||
puts("unfold data:");
|
||||
|
||||
p=(UChar *)unfold;
|
||||
for(i=0; i<unfoldRows; ++i) {
|
||||
p+=UGENCASE_UNFOLD_WIDTH;
|
||||
printf("[%2d] %04x %04x %04x <- %04x %04x\n",
|
||||
i, p[0], p[1], p[2], p[3], p[4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* case closure ------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
addClosureMapping(UChar32 src, UChar32 dest) {
|
||||
uint32_t value;
|
||||
|
||||
if(beVerbose) {
|
||||
printf("add closure mapping U+%04lx->U+%04lx\n",
|
||||
(unsigned long)src, (unsigned long)dest);
|
||||
}
|
||||
|
||||
value=upvec_getValue(pv, src, 0);
|
||||
if(value&UCASE_EXCEPTION) {
|
||||
Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
|
||||
int32_t i;
|
||||
|
||||
/* append dest to src's closure array */
|
||||
for(i=0;; ++i) {
|
||||
if(i==LENGTHOF(p->closure)) {
|
||||
fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
|
||||
(unsigned long)src, (unsigned long)dest);
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
} else if(p->closure[i]==dest) {
|
||||
break; /* do not store duplicates */
|
||||
} else if(p->closure[i]==0) {
|
||||
p->closure[i]=dest;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Props p2={ 0 };
|
||||
UChar32 next;
|
||||
UErrorCode errorCode;
|
||||
|
||||
/*
|
||||
* decode value into p2 (enough for makeException() to work properly),
|
||||
* add the closure mapping,
|
||||
* and set the new exception for src
|
||||
*/
|
||||
p2.code=src;
|
||||
p2.closure[0]=dest;
|
||||
|
||||
if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
|
||||
/* one simple case mapping, don't care which one */
|
||||
next=src+((int16_t)value>>UCASE_DELTA_SHIFT);
|
||||
if(next!=src) {
|
||||
if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
|
||||
p2.upperCase=p2.titleCase=next;
|
||||
} else {
|
||||
p2.lowerCase=next;
|
||||
}
|
||||
}
|
||||
} else if(value&UCASE_DELTA_MASK) {
|
||||
fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
|
||||
(unsigned long)src);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
|
||||
value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */
|
||||
value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
|
||||
value|=UCASE_EXCEPTION;
|
||||
uprv_memcpy(excProps+exceptionsCount, &p2, sizeof(p2));
|
||||
if(++exceptionsCount==MAX_EXC_COUNT) {
|
||||
fprintf(stderr, "gencase: too many exceptions\n");
|
||||
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
if(!upvec_setValue(pv, src, src+1, 0, value, 0xffffffff, &errorCode)) {
|
||||
fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
|
||||
u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Find missing case mapping relationships and add mappings for case closure.
|
||||
* This function starts from an "original" code point and recursively
|
||||
* finds its case mappings and the case mappings of where it maps to.
|
||||
*
|
||||
* The recursion depth is capped at 3 nested calls of this function.
|
||||
* In each call, the current code point is c, and the function enumerates
|
||||
* all of c's simple (single-code point) case mappings.
|
||||
* prev is the code point that case-mapped to c.
|
||||
* prev2 is the code point that case-mapped to prev.
|
||||
*
|
||||
* The initial function call has prev2<0, prev<0, and c==orig
|
||||
* (marking no code points).
|
||||
* It enumerates c's case mappings and recurses without further action.
|
||||
*
|
||||
* The second-level function call has prev2<0, prev==orig, and c is
|
||||
* the destination code point of one of prev's case mappings.
|
||||
* The function checks if any of c's case mappings go back to orig
|
||||
* and adds a closure mapping if not.
|
||||
* In other words, it turns a case mapping relationship of
|
||||
* orig->c
|
||||
* into
|
||||
* orig<->c
|
||||
*
|
||||
* The third-level function call has prev2==orig, prev>=0, and c is
|
||||
* the destination code point of one of prev's case mappings.
|
||||
* (And prev is the destination of one of prev2's case mappings.)
|
||||
* The function checks if any of c's case mappings go back to orig
|
||||
* and adds a closure mapping if not.
|
||||
* In other words, it turns case mapping relationships of
|
||||
* orig->prev->c or orig->prev<->c
|
||||
* into
|
||||
* orig->prev->c->orig or orig->prev<->c->orig
|
||||
* etc.
|
||||
* (Graphically, this closes a triangle.)
|
||||
*
|
||||
* With repeated application on all code points until no more closure mappings
|
||||
* are added, all case equivalence groups get complete mappings.
|
||||
* That is, in each group of code points with case relationships
|
||||
* each code point will in the end have some mapping to each other
|
||||
* code point in the group.
|
||||
*
|
||||
* @return TRUE if a closure mapping was added
|
||||
*/
|
||||
static UBool
|
||||
addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
|
||||
UChar32 next;
|
||||
UBool someMappingsAdded=FALSE;
|
||||
|
||||
if(c!=orig) {
|
||||
/* get the properties for c */
|
||||
value=upvec_getValue(pv, c, 0);
|
||||
}
|
||||
/* else if c==orig then c's value was passed in */
|
||||
|
||||
if(value&UCASE_EXCEPTION) {
|
||||
UChar32 set[32];
|
||||
int32_t i, count=0;
|
||||
|
||||
Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);
|
||||
|
||||
/*
|
||||
* marker for whether any of c's mappings goes to orig
|
||||
* c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
|
||||
*/
|
||||
UBool mapsToOrig=(UBool)(c==orig);
|
||||
|
||||
/* collect c's case mapping destinations in set[] */
|
||||
if((next=p->upperCase)!=0 && next!=c) {
|
||||
set[count++]=next;
|
||||
}
|
||||
if((next=p->lowerCase)!=0 && next!=c) {
|
||||
set[count++]=next;
|
||||
}
|
||||
if(p->upperCase!=(next=p->titleCase) && next!=c) {
|
||||
set[count++]=next;
|
||||
}
|
||||
if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
|
||||
set[count++]=next;
|
||||
}
|
||||
|
||||
/* append c's current closure mappings to set[] */
|
||||
for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
|
||||
set[count++]=next;
|
||||
}
|
||||
|
||||
/* process all code points to which c case-maps */
|
||||
for(i=0; i<count; ++i) {
|
||||
next=set[i]; /* next!=c */
|
||||
|
||||
if(next==orig) {
|
||||
mapsToOrig=TRUE; /* remember that we map to orig */
|
||||
} else if(prev2<0 && next!=prev) {
|
||||
/*
|
||||
* recurse unless
|
||||
* we have reached maximum depth (prev2>=0) or
|
||||
* this is a mapping to one of the previous code points (orig, prev, c)
|
||||
*/
|
||||
someMappingsAdded|=addClosure(orig, prev, c, next, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if(!mapsToOrig) {
|
||||
addClosureMapping(c, orig);
|
||||
return TRUE;
|
||||
}
|
||||
} else {
|
||||
if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
|
||||
/* one simple case mapping, don't care which one */
|
||||
next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
|
||||
if(next!=c) {
|
||||
/*
|
||||
* recurse unless
|
||||
* we have reached maximum depth (prev2>=0) or
|
||||
* this is a mapping to one of the previous code points (orig, prev, c)
|
||||
*/
|
||||
if(prev2<0 && next!=orig && next!=prev) {
|
||||
someMappingsAdded|=addClosure(orig, prev, c, next, 0);
|
||||
}
|
||||
|
||||
if(c!=orig && next!=orig) {
|
||||
/* c does not map to orig, add a closure mapping c->orig */
|
||||
addClosureMapping(c, orig);
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return someMappingsAdded;
|
||||
}
|
||||
|
||||
extern void
|
||||
makeCaseClosure() {
|
||||
/* TODO */
|
||||
UChar *p;
|
||||
uint32_t *row;
|
||||
uint32_t value;
|
||||
UChar32 start, limit, c, c2;
|
||||
int32_t i, j;
|
||||
UBool someMappingsAdded;
|
||||
|
||||
/*
|
||||
* finalize the "unfold" data because we need to use it to add closure mappings
|
||||
* for situations like FB05->"st"<-FB06
|
||||
* where we would otherwise miss the FB05<->FB06 relationship
|
||||
*/
|
||||
makeUnfoldData();
|
||||
|
||||
/* use the "unfold" data to add mappings */
|
||||
|
||||
/* p always points to the code points; this loop ignores the strings completely */
|
||||
p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
|
||||
|
||||
for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
|
||||
j=0;
|
||||
U16_NEXT_UNSAFE(p, j, c);
|
||||
while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
|
||||
U16_NEXT_UNSAFE(p, j, c2);
|
||||
addClosure(c, U_SENTINEL, c, c2, 0);
|
||||
}
|
||||
}
|
||||
|
||||
if(beVerbose) {
|
||||
puts("---- ---- ---- ---- (done with closures from unfolding)");
|
||||
}
|
||||
|
||||
/* add further closure mappings from analyzing simple mappings */
|
||||
do {
|
||||
someMappingsAdded=FALSE;
|
||||
|
||||
i=0;
|
||||
while((row=upvec_getRow(pv, i, &start, &limit))!=NULL) {
|
||||
value=*row;
|
||||
if(value!=0) {
|
||||
while(start<limit) {
|
||||
if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
|
||||
someMappingsAdded=TRUE;
|
||||
|
||||
/*
|
||||
* stop this loop because pv was changed and row is not valid any more
|
||||
* skip all rows below the current start
|
||||
*/
|
||||
while((row=upvec_getRow(pv, i, NULL, &limit))!=NULL && start>=limit) {
|
||||
++i;
|
||||
}
|
||||
row=NULL; /* signal to continue with outer loop, without further ++i */
|
||||
break;
|
||||
}
|
||||
++start;
|
||||
}
|
||||
if(row==NULL) {
|
||||
continue; /* see row=NULL above */
|
||||
}
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
if(beVerbose && someMappingsAdded) {
|
||||
puts("---- ---- ---- ----");
|
||||
}
|
||||
} while(someMappingsAdded);
|
||||
}
|
||||
|
||||
/* exceptions --------------------------------------------------------------- */
|
||||
|
||||
/* get the string length from zero-terminated code points in a limited-length array */
|
||||
static int32_t
|
||||
getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
|
||||
int32_t i, length;
|
||||
|
||||
for(i=length=0; i<maxLength && s[i]!=0; ++i) {
|
||||
length+=U16_LENGTH(s[i]);
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
static UBool
|
||||
fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
|
||||
int32_t i, length;
|
||||
@ -441,6 +879,15 @@ makeException(uint32_t value, Props *p) {
|
||||
excWord|=U_MASK(UCASE_EXC_TITLE);
|
||||
}
|
||||
|
||||
/* length of case closure */
|
||||
if(p->closure[0]!=0) {
|
||||
length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
|
||||
slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
|
||||
slotBits|=slots[count];
|
||||
++count;
|
||||
excWord|=U_MASK(UCASE_EXC_CLOSURE);
|
||||
}
|
||||
|
||||
/* lengths of full case mapping strings, stored in the last slot */
|
||||
fullLengths=0;
|
||||
if(p->specialCasing!=NULL) {
|
||||
@ -493,6 +940,15 @@ makeException(uint32_t value, Props *p) {
|
||||
excTop+=length;
|
||||
}
|
||||
|
||||
/* write the closure data */
|
||||
if(p->closure[0]!=0) {
|
||||
UChar32 c;
|
||||
|
||||
for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
|
||||
U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
|
||||
}
|
||||
}
|
||||
|
||||
exceptionsTop=excTop;
|
||||
|
||||
/* write the main exceptions word */
|
||||
@ -559,7 +1015,8 @@ generateData(const char *dataDir) {
|
||||
|
||||
indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
|
||||
indexes[UCASE_IX_TRIE_SIZE]=trieSize;
|
||||
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;
|
||||
indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
|
||||
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
|
||||
|
||||
indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
|
||||
|
||||
@ -567,6 +1024,7 @@ generateData(const char *dataDir) {
|
||||
printf("trie size in bytes: %5d\n", (int)trieSize);
|
||||
printf("number of code points with exceptions: %5d\n", exceptionsCount);
|
||||
printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
|
||||
printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop);
|
||||
printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
|
||||
}
|
||||
|
||||
@ -581,6 +1039,7 @@ generateData(const char *dataDir) {
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, trieBlock, trieSize);
|
||||
udata_writeBlock(pData, exceptions, 2*exceptionsTop);
|
||||
udata_writeBlock(pData, unfold, 2*unfoldTop);
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
|
Loading…
Reference in New Issue
Block a user