ICU-3432 move uniset.cpp data for case closure to ucase.icu; have gencase build case closure data; ucase.c use it; UnicodeSet::closeOver() call that

X-SVN-Rev: 16902
This commit is contained in:
Markus Scherer 2004-12-02 04:18:35 +00:00
parent 67f46c57e8
commit ca77616509
25 changed files with 1002 additions and 690 deletions

View File

@ -33,6 +33,7 @@ struct UCaseProps {
UDataMemory *mem;
const int32_t *indexes;
const uint16_t *exceptions;
const UChar *unfold;
UTrie trie;
uint8_t formatVersion[4];
@ -68,38 +69,50 @@ static UCaseProps *
ucase_openData(UCaseProps *cspProto,
const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
UCaseProps *csp;
int32_t size, trieSize;
int32_t size;
cspProto->indexes=(const int32_t *)bin;
if( cspProto->indexes[UCASE_IX_INDEX_TOP]<16 ||
(length>=0 && length<cspProto->indexes[UCASE_IX_LENGTH])
if( (length>=0 && length<16*4) ||
cspProto->indexes[UCASE_IX_INDEX_TOP]<16
) {
/* length or indexes[] too short for minimum indexes[] length of 16 */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
/* get the trie address, after indexes[] */
size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
bin+=size;
if(length>=0 && (length-=size)<16) {
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
if(length>=0) {
if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
length-=size;
} else {
/* length too short for indexes[] or for the whole data length */
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
}
bin+=size;
/* from here on, assume that the sizes of the items fit into the total length */
/* unserialize the trie */
trieSize=cspProto->indexes[UCASE_IX_TRIE_SIZE];
trieSize=utrie_unserialize(&cspProto->trie, bin, length>=0 ? length : trieSize, pErrorCode);
/* unserialize the trie, after indexes[] */
size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return NULL;
}
bin+=size;
/* get exceptions[] */
bin+=trieSize;
if(length>=0 && (length-=trieSize)<2*cspProto->indexes[UCASE_IX_EXC_LENGTH]) {
*pErrorCode=U_INVALID_FORMAT_ERROR;
return NULL;
}
size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
cspProto->exceptions=(const uint16_t *)bin;
bin+=size;
/* get unfold[] */
size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
if(size!=0) {
cspProto->unfold=(const UChar *)bin;
bin+=size;
} else {
cspProto->unfold=NULL;
}
/* allocate, copy, and return the new UCaseProps */
csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
@ -322,8 +335,8 @@ ucase_swap(const UDataSwapper *ds,
utrie_swap(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
offset+=count;
/* swap the uint16_t exceptions[] */
count=indexes[UCASE_IX_EXC_LENGTH]*2;
/* swap the uint16_t exceptions[] and unfold[] */
count=(indexes[UCASE_IX_EXC_LENGTH]+indexes[UCASE_IX_UNFOLD_LENGTH])*2;
ds->swapArray16(ds, inBytes+offset, count, outBytes+offset, pErrorCode);
offset+=count;
@ -338,13 +351,13 @@ ucase_swap(const UDataSwapper *ds,
/*
 * Range callback used while collecting property start points.
 * context is the USetAdder handed to the enumeration; only the first code
 * point of the reported range is recorded, limit and value are not used.
 * Always returns TRUE (presumably "keep enumerating" -- confirm against the
 * enumerator's contract).
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    /* add the start code point to the USet */
    const USetAdder *sa=(const USetAdder *)context;
    sa->add(sa->set, start);
    return TRUE;
}
U_CAPI void U_EXPORT2
ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode) {
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return;
}
@ -368,8 +381,6 @@ ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pError
#define GET_PROPS(csp, c, result) \
UTRIE_GET16(&(csp)->trie, c, result);
#define GET_CASE_TYPE(props) ((props)&UCASE_TYPE_MASK)
#define GET_SIGNED_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
@ -423,8 +434,8 @@ ucase_tolower(const UCaseProps *csp, UChar32 c) {
uint16_t props;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
c+=GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
c+=UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
@ -441,8 +452,8 @@ ucase_toupper(const UCaseProps *csp, UChar32 c) {
uint16_t props;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
c+=GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
@ -459,8 +470,8 @@ ucase_totitle(const UCaseProps *csp, UChar32 c) {
uint16_t props;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
c+=GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
c+=UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
@ -478,12 +489,231 @@ ucase_totitle(const UCaseProps *csp, UChar32 c) {
return c;
}
/*
 * Adds the case closure of the single code point c to the set behind sa:
 * its simple case mappings, its full case folding string (if any), and the
 * code points stored in its closure string (if any).
 * All additions go through sa->add() / sa->addString().
 *
 * @param csp case properties data to look c up in
 * @param c   the code point whose closure is requested
 * @param sa  receives the closure items
 */
U_CAPI void U_EXPORT2
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    uint16_t props;

    /*
     * Hardcode the case closure of i and its relatives and ignore the
     * data file data for these characters.
     * The Turkic dotless i and dotted I with their case mapping conditions
     * and case folding option make the related characters behave specially.
     * This code matches their closure behavior to their case folding behavior.
     */
    static const UChar
        iDot[2]= { 0x69, 0x307 };

    switch(c) {
    case 0x49:
        /* regular i and I are in one equivalence class */
        sa->add(sa->set, 0x69);
        return;
    case 0x69:
        sa->add(sa->set, 0x49);
        return;
    case 0x130:
        /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
        sa->addString(sa->set, iDot, 2);
        return;
    case 0x131:
        /* dotless i is in a class by itself */
        return;
    default:
        /* otherwise use the data file data */
        break;
    }

    GET_PROPS(csp, c, props);
    if(!PROPS_HAS_EXCEPTION(props)) {
        if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
            /* add the one simple case mapping, no matter what type it is */
            int32_t delta=UCASE_GET_DELTA(props);
            if(delta!=0) {
                sa->add(sa->set, c+delta);
            }
        }
    } else {
        /*
         * c has exceptions, so there may be multiple simple and/or
         * full case mappings. Add them all.
         */
        const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
        const UChar *closure;
        uint16_t excWord=*pe++;
        int32_t index, closureLength, fullLength, length;

        /* remember the start of the slots; every slot lookup restarts from here */
        pe0=pe;

        /*
         * add all simple case mappings
         * (c is reused as scratch from here on; the input value is no longer needed)
         */
        for(index=UCASE_EXC_LOWER; index<=UCASE_EXC_TITLE; ++index) {
            if(HAS_SLOT(excWord, index)) {
                pe=pe0;
                GET_SLOT_VALUE(excWord, index, pe, c);
                sa->add(sa->set, c);
            }
        }

        /* get the closure string pointer & length */
        if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
            closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
            closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
        } else {
            closureLength=0;
            /* never dereferenced with closureLength==0; set so the pointer is always defined */
            closure=NULL;
        }

#if 0
        /* add all full case mappings */
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            while(fullLength!=0) {
                length=fullLength&0xf;
                if(length!=0) {
                    sa->addString(sa->set, (const UChar *)pe, length);
                    pe+=length;
                }
                fullLength>>=4;
            }

            closure=(const UChar *)pe; /* behind full case mappings */
        }
#endif

        /* add the full case folding */
        if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            pe=pe0;
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);

            /* start of full case mapping strings */
            ++pe;

            fullLength&=0xffff; /* bits 16 and higher are reserved */

            /* skip the lowercase result string */
            pe+=fullLength&UCASE_FULL_LOWER;
            fullLength>>=4;

            /* add the full case folding string */
            length=fullLength&0xf;
            if(length!=0) {
                sa->addString(sa->set, (const UChar *)pe, length);
                pe+=length;
            }

            /* skip the uppercase and titlecase strings */
            fullLength>>=4;
            pe+=fullLength&0xf;
            fullLength>>=4;
            pe+=fullLength;

            closure=(const UChar *)pe; /* behind full case mappings */
        }

        /* add each code point in the closure string */
        for(index=0; index<closureLength;) {
            U16_NEXT_UNSAFE(closure, index, c);
            sa->add(sa->set, c);
        }
    }
}
/*
 * Compares s, given with an explicit length, against t, which ends either at
 * its first NUL or after max code units, whichever comes first.
 * Preconditions: length>0, max>0 and length<=max.
 *
 * Returns 0 if both strings are equal, a positive value if s sorts after t
 * (including the case where t ends first), and a negative value if s sorts
 * before t (including the case where s is a proper prefix of t).
 */
static U_INLINE int32_t
strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    /* code units t may still have after the first length units */
    int32_t tail=max-length;
    int32_t pos=0;
    int32_t sUnit, tUnit, diff;

    do {
        sUnit=s[pos];
        tUnit=t[pos];
        if(tUnit==0) {
            /* t ended before s did */
            return 1;
        }
        diff=sUnit-tUnit;
        if(diff!=0) {
            /* first differing code unit decides */
            return diff;
        }
        ++pos;
    } while(pos<length);

    /* all of s matched; t is equal only if it ends right here as well */
    if(tail==0 || t[pos]==0) {
        return 0;
    }
    /* t is longer: report the length difference */
    return -tail;
}
/*
 * Looks up the string s[0..length[ in the reverse case folding ("unfold")
 * table and, if it is found, adds every code point stored in the matching row
 * to sa, together with that code point's own case closure.
 *
 * Table layout as read here: the first row of csp->unfold is a header holding
 * the row count, the row width and the string width (all in UChars) at the
 * UCASE_UNFOLD_* indexes. The data rows follow. Each row starts with a string
 * field of unfoldStringWidth units (shorter strings are NUL-terminated),
 * followed by code points up to the end of the row or the first NUL.
 * Rows are binary-searched, so they are assumed to be sorted in the order
 * defined by strcmpMax().
 *
 * @return TRUE if the string was found; FALSE if there is no unfold data,
 *         s is NULL, length<=1, the string is longer than any table entry,
 *         or the string is not in the table
 */
U_CAPI UBool U_EXPORT2
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
    const UChar *unfold, *p;
    int32_t i, j, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;

    if(csp->unfold==NULL || s==NULL) {
        return FALSE; /* no reverse case folding data, or no string */
    }
    if(length<=1) {
        /* the string is too short to find any match */
        /*
         * more precise would be:
         * if(!u_strHasMoreChar32Than(s, length, 1))
         * but this does not make much practical difference because
         * a single supplementary code point would just not be found
         */
        return FALSE;
    }

    unfold=csp->unfold;
    unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    /* skip the header row; unfold now points at the first data row */
    unfold+=unfoldRowWidth;

    if(length>unfoldStringWidth) {
        /* the string is too long to find any match */
        return FALSE;
    }

    /* do a binary search for the string */
    start=0;
    limit=unfoldRows;
    while(start<limit) {
        i=(start+limit)/2;
        p=unfold+(i*unfoldRowWidth);
        result=strcmpMax(s, length, p, unfoldStringWidth);

        if(result==0) {
            /* found the string: add each code point, and its case closure */
            UChar32 c;

            /* the code points start right behind the string field */
            for(j=unfoldStringWidth; j<unfoldRowWidth && p[j]!=0;) {
                U16_NEXT_UNSAFE(p, j, c);
                sa->add(sa->set, c);
                ucase_addCaseClosure(csp, c, sa);
            }
            return TRUE;
        } else if(result<0) {
            limit=i;
        } else /* result>0 */ {
            start=i+1;
        }
    }

    return FALSE; /* string not found */
}
/**
 * Looks up the properties word for c and extracts its case type bits.
 *
 * @param csp case properties data to look c up in
 * @param c   the code point to classify
 * @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE
 */
U_CAPI int32_t U_EXPORT2
ucase_getType(const UCaseProps *csp, UChar32 c) {
    uint16_t props;
    GET_PROPS(csp, c, props);
    return UCASE_GET_TYPE(props);
}
/** @return same as ucase_getType(), or <0 if c is case-ignorable */
@ -492,7 +722,7 @@ ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
int32_t type;
uint16_t props;
GET_PROPS(csp, c, props);
type=GET_CASE_TYPE(props);
type=UCASE_GET_TYPE(props);
if(type!=UCASE_NONE) {
return type;
} else if(
@ -775,7 +1005,7 @@ isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void
for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
GET_PROPS(csp, c, props);
if(GET_CASE_TYPE(props)!=UCASE_NONE) {
if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
return TRUE; /* followed by cased letter */
} else if(c==0x307 || (props&(UCASE_EXCEPTION|UCASE_CASE_IGNORABLE))==UCASE_CASE_IGNORABLE) {
/* case-ignorable, continue with the loop */
@ -934,8 +1164,8 @@ ucase_toFullLower(const UCaseProps *csp, UChar32 c,
result=c;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
result=c+GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
result=c+UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
@ -1081,8 +1311,8 @@ toUpperOrTitle(const UCaseProps *csp, UChar32 c,
result=c;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)==UCASE_LOWER) {
result=c+GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
result=c+UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
@ -1236,8 +1466,8 @@ ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options) {
uint16_t props;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
c+=GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
c+=UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props);
@ -1305,8 +1535,8 @@ ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
result=c;
GET_PROPS(csp, c, props);
if(!PROPS_HAS_EXCEPTION(props)) {
if(GET_CASE_TYPE(props)>=UCASE_UPPER) {
result=c+GET_SIGNED_DELTA(props);
if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
result=c+UCASE_GET_DELTA(props);
}
} else {
const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;

View File

@ -51,7 +51,7 @@ ucase_swap(const UDataSwapper *ds,
UErrorCode *pErrorCode);
U_CAPI void U_EXPORT2
ucase_addPropertyStarts(const UCaseProps *csp, USetAdder *sa, UErrorCode *pErrorCode);
ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
/**
* Bit mask for getting just the options from a string compare options word
@ -83,6 +83,33 @@ ucase_totitle(const UCaseProps *csp, UChar32 c);
U_CAPI UChar32 U_EXPORT2
ucase_fold(UCaseProps *csp, UChar32 c, uint32_t options);
/**
* Adds all simple case mappings and the full case folding for c to sa,
* and also adds special case closure mappings.
* c itself is not added.
* For example, the mappings
* - for s include long s
* - for sharp s include ss
* - for k include the Kelvin sign
*/
U_CAPI void U_EXPORT2
ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
/**
* Maps the string to single code points and adds the associated case closure
* mappings.
* The string is mapped to code points if it is their full case folding string.
* In other words, this performs a reverse full case folding and then
* adds the case closure items of the resulting code points.
* If the string is found and its closure applied, then
* the string itself is added as well as part of its code points' closure.
* It must be length>=0.
*
* @return TRUE if the string was found
*/
U_CAPI UBool U_EXPORT2
ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
/** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
U_CAPI int32_t U_EXPORT2
ucase_getType(const UCaseProps *csp, UChar32 c);
@ -211,6 +238,7 @@ enum {
UCASE_IX_LENGTH,
UCASE_IX_TRIE_SIZE,
UCASE_IX_EXC_LENGTH,
UCASE_IX_UNFOLD_LENGTH,
UCASE_IX_MAX_FULL_LENGTH=15,
UCASE_IX_TOP=16
@ -227,6 +255,8 @@ enum {
UCASE_TITLE
};
#define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
#define UCASE_SENSITIVE 4
#define UCASE_EXCEPTION 8
@ -264,7 +294,7 @@ enum {
UCASE_EXC_TITLE,
UCASE_EXC_4, /* reserved */
UCASE_EXC_5, /* reserved */
UCASE_EXC_6, /* reserved */
UCASE_EXC_CLOSURE,
UCASE_EXC_FULL_MAPPINGS,
UCASE_EXC_ALL_SLOTS /* one past the last slot */
};
@ -296,6 +326,17 @@ enum {
#define UCASE_FULL_UPPER 0xf00
#define UCASE_FULL_TITLE 0xf000
/* maximum lengths */
#define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
#define UCASE_CLOSURE_MAX_LENGTH 0xf
/* constants for reverse case folding ("unfold") data */
enum {
UCASE_UNFOLD_ROWS,
UCASE_UNFOLD_ROW_WIDTH,
UCASE_UNFOLD_STRING_WIDTH
};
U_CDECL_END
#endif

View File

@ -997,7 +997,7 @@ ublock_getCode(UChar32 c) {
/* for Hangul_Syllable_Type */
U_CAPI void U_EXPORT2
uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
UChar32 c;
int32_t value, value2;
@ -1061,7 +1061,7 @@ uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
/*
 * Range callback used while collecting property start points.
 * context is the USetAdder handed to the enumeration; only the first code
 * point of the reported range is recorded, limit and value are not used.
 * Always returns TRUE (presumably "keep enumerating" -- confirm against the
 * enumerator's contract).
 */
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    /* add the start code point to the USet */
    const USetAdder *sa=(const USetAdder *)context;
    sa->add(sa->set, start);
    return TRUE;
}
@ -1069,7 +1069,7 @@ _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 limit, uint
#define USET_ADD_CP_AND_NEXT(sa, cp) sa->add(sa->set, cp); sa->add(sa->set, cp+1)
U_CAPI void U_EXPORT2
uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return;
}

View File

@ -2979,7 +2979,7 @@ _ISO_2022_SafeClone(
static void
_ISO_2022_GetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode)
{

View File

@ -28,7 +28,7 @@
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
sa->addRange(sa->set, 0, 0x10ffff);
@ -36,7 +36,7 @@ ucnv_getCompleteUnicodeSet(const UConverter *cnv,
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
sa->addRange(sa->set, 0, 0xd7ff);

View File

@ -171,7 +171,7 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
* For more documentation, see ucnv_getUnicodeSet() in ucnv.h.
*/
typedef void (*UConverterGetUnicodeSet) (const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
@ -246,13 +246,13 @@ U_CDECL_END
U_CFUNC void
ucnv_getCompleteUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
U_CFUNC void
ucnv_getNonSurrogateUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);

View File

@ -932,7 +932,7 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
static void
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
const int32_t *cx,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
int32_t minLength,
UChar32 c,
@ -989,7 +989,7 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
const int32_t *cx;

View File

@ -384,7 +384,7 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
U_CFUNC void
ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);

View File

@ -664,7 +664,7 @@ _LMBCSSafeClone(const UConverter *cnv,
static void
_LMBCSGetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
/* all but U+F6xx, see LMBCS explanation above (search for F6xx) */

View File

@ -510,7 +510,7 @@ _HZ_SafeClone(const UConverter *cnv,
static void
_HZ_GetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
/* the tilde '~' is hardcoded in the converter */

View File

@ -1332,7 +1332,7 @@ _ISCII_SafeClone(const UConverter *cnv,
static void
_ISCIIGetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode)
{

View File

@ -332,7 +332,7 @@ noMoreInput:
static void
_Latin1GetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
sa->addRange(sa->set, 0, 0xff);
@ -534,7 +534,7 @@ _ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
static void
_ASCIIGetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
sa->addRange(sa->set, 0, 0x7f);

View File

@ -344,7 +344,7 @@ gb18030Ranges[13][4]={
static void
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
@ -421,7 +421,7 @@ _getUnicodeSetForBytes(const UConverterSharedData *sharedData,
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode) {
@ -434,7 +434,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
U_CFUNC void
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
const UConverterMBCSTable *mbcsTable;
@ -571,7 +571,7 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
static void
ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode) {
if(cnv->options&_MBCS_OPTION_GB18030) {

View File

@ -373,7 +373,7 @@ ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode);
@ -388,7 +388,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
USetAdder *sa,
const USetAdder *sa,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);

View File

@ -1718,7 +1718,7 @@ uprv_getMaxISOCommentLength() {
* @param uset USet to receive characters. Existing contents are deleted.
*/
static void
charSetToUSet(uint32_t cset[8], USetAdder *sa) {
charSetToUSet(uint32_t cset[8], const USetAdder *sa) {
UChar us[256];
char cs[256];
@ -1755,7 +1755,7 @@ charSetToUSet(uint32_t cset[8], USetAdder *sa) {
* @param set USet to receive characters.
*/
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(USetAdder *sa) {
uprv_getCharNameCharacters(const USetAdder *sa) {
charSetToUSet(gNameSet, sa);
}
@ -1769,7 +1769,7 @@ urename.h and uprops.h changed accordingly.
* @param set USetAdder to receive characters.
*/
U_CAPI void U_EXPORT2
uprv_getISOCommentCharacters(USetAdder *sa) {
uprv_getISOCommentCharacters(const USetAdder *sa) {
charSetToUSet(gISOCommentSet, sa);
}
#endif

View File

@ -19,7 +19,6 @@ U_NAMESPACE_BEGIN
class ParsePosition;
class SymbolTable;
class UVector;
class CaseEquivClass;
class RuleCharacterIterator;
/**
@ -1324,20 +1323,6 @@ private:
static const UnicodeSet* getInclusions(int32_t src, UErrorCode &errorCode);
friend class UnicodeSetIterator;
//----------------------------------------------------------------
// Implementation: closeOver
//----------------------------------------------------------------
void caseCloseOne(const UnicodeString& folded);
void caseCloseOne(const CaseEquivClass& c);
void caseCloseOne(UChar folded);
static const CaseEquivClass* getCaseMapOf(const UnicodeString& folded);
static const CaseEquivClass* getCaseMapOf(UChar folded);
};
inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {

View File

@ -57,6 +57,25 @@ enum {
* will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will
* match all except 'a', 'A', 'b', and 'B'. This performs a full
* closure over case mappings, e.g. U+017F for s.
*
* The resulting set is a superset of the input for the code points but
* not for the strings.
* It performs a case mapping closure of the code points and adds
* full case folding strings for the code points, and reduces strings of
* the original set to their full case folding equivalents.
*
* This is designed for case-insensitive matches, for example
* in regular expressions. The full code point case closure makes it possible
* to check an input character directly against the closure set.
* Strings are matched by comparing the case-folded form from the closure
* set with an incremental case folding of the string in question.
*
* The closure set will also contain single code points if the original
* set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
* This is not necessary (that is, redundant) for the above matching method
* but results in the same closure sets regardless of whether the original
* set contained the code point or a string.
*
* @stable ICU 2.4
*/
USET_CASE_INSENSITIVE = 2,

View File

@ -28,7 +28,6 @@
#include "uset_imp.h"
#include "ruleiter.h"
#include "cmemory.h"
#include "uhash.h"
#include "ucln_cmn.h"
#include "util.h"
#include "uvector.h"
@ -42,7 +41,6 @@
#include "mutex.h"
#include "uassert.h"
#include "hash.h"
#include "ucmp8.h"
// initial storage. Must be >= 0
// *** same as in uniset.cpp ! ***
@ -157,10 +155,6 @@ U_NAMESPACE_BEGIN
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
static Hashtable* CASE_EQUIV_HASH = NULL; // for closeOver(USET_CASE)
static CompactByteArray* CASE_EQUIV_CBA = NULL; // for closeOver(USET_CASE)
// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions are parallel to the PERL_OPEN etc. strings above
@ -1318,16 +1312,6 @@ static UBool U_CALLCONV uset_cleanup(void) {
}
}
if (CASE_EQUIV_HASH != NULL) {
delete CASE_EQUIV_HASH;
CASE_EQUIV_HASH = NULL;
}
if (CASE_EQUIV_CBA != NULL) {
ucmp8_close(CASE_EQUIV_CBA);
CASE_EQUIV_CBA = NULL;
}
return TRUE;
}
@ -1406,33 +1390,26 @@ addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString
}
UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
if ((attribute & USET_CASE) != 0) {
UnicodeSet foldSet;
UnicodeString str;
int32_t n = getRangeCount();
for (int32_t i=0; i<n; ++i) {
UChar32 start = getRangeStart(i);
UChar32 end = getRangeEnd(i);
for (UChar32 cp=start; cp<=end; ++cp) {
str.truncate(0);
str.append(u_foldCase(cp, U_FOLD_CASE_DEFAULT));
foldSet.caseCloseOne(str);
}
}
if (strings != NULL && strings->size() > 0) {
for (int32_t j=0; j<strings->size(); ++j) {
str = * (const UnicodeString*) strings->elementAt(j);
foldSet.caseCloseOne(str.foldCase());
}
}
*this = foldSet;
}
else if ((attribute & USET_ADD_CASE_MAPPINGS)) {
UnicodeSet foldSet(*this);
UnicodeString str;
if (attribute & (USET_CASE | USET_ADD_CASE_MAPPINGS)) {
UErrorCode status = U_ZERO_ERROR;
UCaseProps *csp = ucase_getSingleton(&status);
if (U_SUCCESS(status)) {
UnicodeSet foldSet(*this);
UnicodeString str;
USetAdder sa = {
(USet *)&foldSet,
_set_add,
_set_addRange,
_set_addString
};
// start with input set to guarantee inclusion
// USET_CASE: remove strings because the strings will actually be reduced (folded);
// therefore, start with no strings and add only those needed
if (attribute & USET_CASE) {
foldSet.strings->removeAllElements();
}
int32_t n = getRangeCount();
UChar32 result;
const UChar *full;
@ -1442,45 +1419,64 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
UChar32 start = getRangeStart(i);
UChar32 end = getRangeEnd(i);
for (UChar32 cp=start; cp<=end; ++cp) {
result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
if (attribute & USET_CASE) {
// full case closure
for (UChar32 cp=start; cp<=end; ++cp) {
ucase_addCaseClosure(csp, cp, &sa);
}
} else {
// add case mappings
// (does not add long s for regular s, or Kelvin for k, for example)
for (UChar32 cp=start; cp<=end; ++cp) {
result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullFolding(csp, cp, &full, 0);
addCaseMapping(foldSet, result, full, str);
result = ucase_toFullFolding(csp, cp, &full, 0);
addCaseMapping(foldSet, result, full, str);
}
}
}
if (strings != NULL && strings->size() > 0) {
Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
#endif
if (U_SUCCESS(status)) {
const UnicodeString *pStr;
if (attribute & USET_CASE) {
for (int32_t j=0; j<strings->size(); ++j) {
pStr = (const UnicodeString *) strings->elementAt(j);
(str = *pStr).toLower(root);
foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
(str = *pStr).toTitle(bi, root);
foldSet.add(str);
#endif
(str = *pStr).toUpper(root);
foldSet.add(str);
(str = *pStr).foldCase();
foldSet.add(str);
str = *(const UnicodeString *) strings->elementAt(j);
str.foldCase();
if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
foldSet.add(str); // does not map to code points: add the folded string itself
}
}
}
} else {
Locale root("");
#if !UCONFIG_NO_BREAK_ITERATION
delete bi;
BreakIterator *bi = BreakIterator::createWordInstance(root, status);
#endif
if (U_SUCCESS(status)) {
const UnicodeString *pStr;
for (int32_t j=0; j<strings->size(); ++j) {
pStr = (const UnicodeString *) strings->elementAt(j);
(str = *pStr).toLower(root);
foldSet.add(str);
#if !UCONFIG_NO_BREAK_ITERATION
(str = *pStr).toTitle(bi, root);
foldSet.add(str);
#endif
(str = *pStr).toUpper(root);
foldSet.add(str);
(str = *pStr).foldCase();
foldSet.add(str);
}
}
#if !UCONFIG_NO_BREAK_ITERATION
delete bi;
#endif
}
}
*this = foldSet;
}
@ -1488,525 +1484,4 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
return *this;
}
//----------------------------------------------------------------
// Case folding implementation
//----------------------------------------------------------------
/**
* Data structure representing a case-fold equivalency class. It is a
* SET containing 0 or more code units, and 0 or more strings of
* length 2 code units or longer.
*
* This class is implemented as a 8-UChar buffer with a few
* convenience methods on it. The format of the buffer:
* - All single code units in this set, followed by a terminating
* zero. If none, then just a terminating zero.
* - Zero or more 0-terminated strings, each of length >= 2
* code units.
* - A single terminating (UChar)0.
*
* Usage:
*
* const CaseEquivClass& c = ...;
* const UChar* p;
* for (c.getStrings(p); *p; c.nextString(p)) {
* foo(p);
* }
*/
class CaseEquivClass {
public:
UChar data[8];
/**
* Return the string of single code units. May be "". Will never
* be NULL.
*/
const UChar* getSingles() const {
return data;
}
/**
* Return the first multi-code-unit string. May be "" if there
* are none. Will never be NULL.
* @param p pointer to be set to point to the first string.
*/
void getStrings(const UChar*& p) const {
p = data;
nextString(p);
}
/**
* Advance a pointer from one multi-code-unit string to the next.
* May advance 'p' to point to "" if there are no more.
* Do NOT call if *p == 0.
* @param p pointer to be advanced to point to the next string.
*/
static void nextString(const UChar*& p) {
while (*p++) {}
}
};
/**
* IMPORTANT: The following two static data arrays represent the
* information used to do case closure. The first array is an array
* of pairs. That is, for each even index e, entries [e] and [e+1]
* form a pair of case equivalent code units. The entry at [e] is the
* folded one, that is, the one for which u_foldCase(x)==x.
*
* The second static array is an array of CaseEquivClass objects.
* Since these objects are just adorned UChar[] arrays, they can be
* initialized in place in the array, and all of them can live in a
* single piece of static memory, with no heap allocation.
*/
// MACHINE-GENERATED: Do not edit (see com.ibm.icu.dev.tools.translit.UnicodeSetCloseOver)
static const UChar CASE_PAIRS[] = {
0x0061,0x0041,0x0062,0x0042,0x0063,0x0043,0x0064,0x0044,0x0065,0x0045,
0x0066,0x0046,0x0067,0x0047,0x0068,0x0048,0x0069,0x0049,0x006A,0x004A,
0x006C,0x004C,0x006D,0x004D,0x006E,0x004E,0x006F,0x004F,0x0070,0x0050,
0x0071,0x0051,0x0072,0x0052,0x0074,0x0054,0x0075,0x0055,0x0076,0x0056,
0x0077,0x0057,0x0078,0x0058,0x0079,0x0059,0x007A,0x005A,0x00E0,0x00C0,
0x00E1,0x00C1,0x00E2,0x00C2,0x00E3,0x00C3,0x00E4,0x00C4,0x00E6,0x00C6,
0x00E7,0x00C7,0x00E8,0x00C8,0x00E9,0x00C9,0x00EA,0x00CA,0x00EB,0x00CB,
0x00EC,0x00CC,0x00ED,0x00CD,0x00EE,0x00CE,0x00EF,0x00CF,0x00F0,0x00D0,
0x00F1,0x00D1,0x00F2,0x00D2,0x00F3,0x00D3,0x00F4,0x00D4,0x00F5,0x00D5,
0x00F6,0x00D6,0x00F8,0x00D8,0x00F9,0x00D9,0x00FA,0x00DA,0x00FB,0x00DB,
0x00FC,0x00DC,0x00FD,0x00DD,0x00FE,0x00DE,0x00FF,0x0178,0x0101,0x0100,
0x0103,0x0102,0x0105,0x0104,0x0107,0x0106,0x0109,0x0108,0x010B,0x010A,
0x010D,0x010C,0x010F,0x010E,0x0111,0x0110,0x0113,0x0112,0x0115,0x0114,
0x0117,0x0116,0x0119,0x0118,0x011B,0x011A,0x011D,0x011C,0x011F,0x011E,
0x0121,0x0120,0x0123,0x0122,0x0125,0x0124,0x0127,0x0126,0x0129,0x0128,
0x012B,0x012A,0x012D,0x012C,0x012F,0x012E,0x0133,0x0132,0x0135,0x0134,
0x0137,0x0136,0x013A,0x0139,0x013C,0x013B,0x013E,0x013D,0x0140,0x013F,
0x0142,0x0141,0x0144,0x0143,0x0146,0x0145,0x0148,0x0147,0x014B,0x014A,
0x014D,0x014C,0x014F,0x014E,0x0151,0x0150,0x0153,0x0152,0x0155,0x0154,
0x0157,0x0156,0x0159,0x0158,0x015B,0x015A,0x015D,0x015C,0x015F,0x015E,
0x0161,0x0160,0x0163,0x0162,0x0165,0x0164,0x0167,0x0166,0x0169,0x0168,
0x016B,0x016A,0x016D,0x016C,0x016F,0x016E,0x0171,0x0170,0x0173,0x0172,
0x0175,0x0174,0x0177,0x0176,0x017A,0x0179,0x017C,0x017B,0x017E,0x017D,
0x0183,0x0182,0x0185,0x0184,0x0188,0x0187,0x018C,0x018B,0x0192,0x0191,
0x0195,0x01F6,0x0199,0x0198,0x019E,0x0220,0x01A1,0x01A0,0x01A3,0x01A2,
0x01A5,0x01A4,0x01A8,0x01A7,0x01AD,0x01AC,0x01B0,0x01AF,0x01B4,0x01B3,
0x01B6,0x01B5,0x01B9,0x01B8,0x01BD,0x01BC,0x01BF,0x01F7,0x01CE,0x01CD,
0x01D0,0x01CF,0x01D2,0x01D1,0x01D4,0x01D3,0x01D6,0x01D5,0x01D8,0x01D7,
0x01DA,0x01D9,0x01DC,0x01DB,0x01DD,0x018E,0x01DF,0x01DE,0x01E1,0x01E0,
0x01E3,0x01E2,0x01E5,0x01E4,0x01E7,0x01E6,0x01E9,0x01E8,0x01EB,0x01EA,
0x01ED,0x01EC,0x01EF,0x01EE,0x01F5,0x01F4,0x01F9,0x01F8,0x01FB,0x01FA,
0x01FD,0x01FC,0x01FF,0x01FE,0x0201,0x0200,0x0203,0x0202,0x0205,0x0204,
0x0207,0x0206,0x0209,0x0208,0x020B,0x020A,0x020D,0x020C,0x020F,0x020E,
0x0211,0x0210,0x0213,0x0212,0x0215,0x0214,0x0217,0x0216,0x0219,0x0218,
0x021B,0x021A,0x021D,0x021C,0x021F,0x021E,0x0223,0x0222,0x0225,0x0224,
0x0227,0x0226,0x0229,0x0228,0x022B,0x022A,0x022D,0x022C,0x022F,0x022E,
0x0231,0x0230,0x0233,0x0232,0x0253,0x0181,0x0254,0x0186,0x0256,0x0189,
0x0257,0x018A,0x0259,0x018F,0x025B,0x0190,0x0260,0x0193,0x0263,0x0194,
0x0268,0x0197,0x0269,0x0196,0x026F,0x019C,0x0272,0x019D,0x0275,0x019F,
0x0280,0x01A6,0x0283,0x01A9,0x0288,0x01AE,0x028A,0x01B1,0x028B,0x01B2,
0x0292,0x01B7,0x03AC,0x0386,0x03AD,0x0388,0x03AE,0x0389,0x03AF,0x038A,
0x03B1,0x0391,0x03B3,0x0393,0x03B4,0x0394,0x03B6,0x0396,0x03B7,0x0397,
0x03BB,0x039B,0x03BD,0x039D,0x03BE,0x039E,0x03BF,0x039F,0x03C4,0x03A4,
0x03C5,0x03A5,0x03C7,0x03A7,0x03C8,0x03A8,0x03CA,0x03AA,0x03CB,0x03AB,
0x03CC,0x038C,0x03CD,0x038E,0x03CE,0x038F,0x03D9,0x03D8,0x03DB,0x03DA,
0x03DD,0x03DC,0x03DF,0x03DE,0x03E1,0x03E0,0x03E3,0x03E2,0x03E5,0x03E4,
0x03E7,0x03E6,0x03E9,0x03E8,0x03EB,0x03EA,0x03ED,0x03EC,0x03EF,0x03EE,
0x0430,0x0410,0x0431,0x0411,0x0432,0x0412,0x0433,0x0413,0x0434,0x0414,
0x0435,0x0415,0x0436,0x0416,0x0437,0x0417,0x0438,0x0418,0x0439,0x0419,
0x043A,0x041A,0x043B,0x041B,0x043C,0x041C,0x043D,0x041D,0x043E,0x041E,
0x043F,0x041F,0x0440,0x0420,0x0441,0x0421,0x0442,0x0422,0x0443,0x0423,
0x0444,0x0424,0x0445,0x0425,0x0446,0x0426,0x0447,0x0427,0x0448,0x0428,
0x0449,0x0429,0x044A,0x042A,0x044B,0x042B,0x044C,0x042C,0x044D,0x042D,
0x044E,0x042E,0x044F,0x042F,0x0450,0x0400,0x0451,0x0401,0x0452,0x0402,
0x0453,0x0403,0x0454,0x0404,0x0455,0x0405,0x0456,0x0406,0x0457,0x0407,
0x0458,0x0408,0x0459,0x0409,0x045A,0x040A,0x045B,0x040B,0x045C,0x040C,
0x045D,0x040D,0x045E,0x040E,0x045F,0x040F,0x0461,0x0460,0x0463,0x0462,
0x0465,0x0464,0x0467,0x0466,0x0469,0x0468,0x046B,0x046A,0x046D,0x046C,
0x046F,0x046E,0x0471,0x0470,0x0473,0x0472,0x0475,0x0474,0x0477,0x0476,
0x0479,0x0478,0x047B,0x047A,0x047D,0x047C,0x047F,0x047E,0x0481,0x0480,
0x048B,0x048A,0x048D,0x048C,0x048F,0x048E,0x0491,0x0490,0x0493,0x0492,
0x0495,0x0494,0x0497,0x0496,0x0499,0x0498,0x049B,0x049A,0x049D,0x049C,
0x049F,0x049E,0x04A1,0x04A0,0x04A3,0x04A2,0x04A5,0x04A4,0x04A7,0x04A6,
0x04A9,0x04A8,0x04AB,0x04AA,0x04AD,0x04AC,0x04AF,0x04AE,0x04B1,0x04B0,
0x04B3,0x04B2,0x04B5,0x04B4,0x04B7,0x04B6,0x04B9,0x04B8,0x04BB,0x04BA,
0x04BD,0x04BC,0x04BF,0x04BE,0x04C2,0x04C1,0x04C4,0x04C3,0x04C6,0x04C5,
0x04C8,0x04C7,0x04CA,0x04C9,0x04CC,0x04CB,0x04CE,0x04CD,0x04D1,0x04D0,
0x04D3,0x04D2,0x04D5,0x04D4,0x04D7,0x04D6,0x04D9,0x04D8,0x04DB,0x04DA,
0x04DD,0x04DC,0x04DF,0x04DE,0x04E1,0x04E0,0x04E3,0x04E2,0x04E5,0x04E4,
0x04E7,0x04E6,0x04E9,0x04E8,0x04EB,0x04EA,0x04ED,0x04EC,0x04EF,0x04EE,
0x04F1,0x04F0,0x04F3,0x04F2,0x04F5,0x04F4,0x04F9,0x04F8,0x0501,0x0500,
0x0503,0x0502,0x0505,0x0504,0x0507,0x0506,0x0509,0x0508,0x050B,0x050A,
0x050D,0x050C,0x050F,0x050E,0x0561,0x0531,0x0562,0x0532,0x0563,0x0533,
0x0564,0x0534,0x0565,0x0535,0x0566,0x0536,0x0567,0x0537,0x0568,0x0538,
0x0569,0x0539,0x056A,0x053A,0x056B,0x053B,0x056C,0x053C,0x056D,0x053D,
0x056E,0x053E,0x056F,0x053F,0x0570,0x0540,0x0571,0x0541,0x0572,0x0542,
0x0573,0x0543,0x0574,0x0544,0x0575,0x0545,0x0576,0x0546,0x0577,0x0547,
0x0578,0x0548,0x0579,0x0549,0x057A,0x054A,0x057B,0x054B,0x057C,0x054C,
0x057D,0x054D,0x057E,0x054E,0x057F,0x054F,0x0580,0x0550,0x0581,0x0551,
0x0582,0x0552,0x0583,0x0553,0x0584,0x0554,0x0585,0x0555,0x0586,0x0556,
0x1E01,0x1E00,0x1E03,0x1E02,0x1E05,0x1E04,0x1E07,0x1E06,0x1E09,0x1E08,
0x1E0B,0x1E0A,0x1E0D,0x1E0C,0x1E0F,0x1E0E,0x1E11,0x1E10,0x1E13,0x1E12,
0x1E15,0x1E14,0x1E17,0x1E16,0x1E19,0x1E18,0x1E1B,0x1E1A,0x1E1D,0x1E1C,
0x1E1F,0x1E1E,0x1E21,0x1E20,0x1E23,0x1E22,0x1E25,0x1E24,0x1E27,0x1E26,
0x1E29,0x1E28,0x1E2B,0x1E2A,0x1E2D,0x1E2C,0x1E2F,0x1E2E,0x1E31,0x1E30,
0x1E33,0x1E32,0x1E35,0x1E34,0x1E37,0x1E36,0x1E39,0x1E38,0x1E3B,0x1E3A,
0x1E3D,0x1E3C,0x1E3F,0x1E3E,0x1E41,0x1E40,0x1E43,0x1E42,0x1E45,0x1E44,
0x1E47,0x1E46,0x1E49,0x1E48,0x1E4B,0x1E4A,0x1E4D,0x1E4C,0x1E4F,0x1E4E,
0x1E51,0x1E50,0x1E53,0x1E52,0x1E55,0x1E54,0x1E57,0x1E56,0x1E59,0x1E58,
0x1E5B,0x1E5A,0x1E5D,0x1E5C,0x1E5F,0x1E5E,0x1E63,0x1E62,0x1E65,0x1E64,
0x1E67,0x1E66,0x1E69,0x1E68,0x1E6B,0x1E6A,0x1E6D,0x1E6C,0x1E6F,0x1E6E,
0x1E71,0x1E70,0x1E73,0x1E72,0x1E75,0x1E74,0x1E77,0x1E76,0x1E79,0x1E78,
0x1E7B,0x1E7A,0x1E7D,0x1E7C,0x1E7F,0x1E7E,0x1E81,0x1E80,0x1E83,0x1E82,
0x1E85,0x1E84,0x1E87,0x1E86,0x1E89,0x1E88,0x1E8B,0x1E8A,0x1E8D,0x1E8C,
0x1E8F,0x1E8E,0x1E91,0x1E90,0x1E93,0x1E92,0x1E95,0x1E94,0x1EA1,0x1EA0,
0x1EA3,0x1EA2,0x1EA5,0x1EA4,0x1EA7,0x1EA6,0x1EA9,0x1EA8,0x1EAB,0x1EAA,
0x1EAD,0x1EAC,0x1EAF,0x1EAE,0x1EB1,0x1EB0,0x1EB3,0x1EB2,0x1EB5,0x1EB4,
0x1EB7,0x1EB6,0x1EB9,0x1EB8,0x1EBB,0x1EBA,0x1EBD,0x1EBC,0x1EBF,0x1EBE,
0x1EC1,0x1EC0,0x1EC3,0x1EC2,0x1EC5,0x1EC4,0x1EC7,0x1EC6,0x1EC9,0x1EC8,
0x1ECB,0x1ECA,0x1ECD,0x1ECC,0x1ECF,0x1ECE,0x1ED1,0x1ED0,0x1ED3,0x1ED2,
0x1ED5,0x1ED4,0x1ED7,0x1ED6,0x1ED9,0x1ED8,0x1EDB,0x1EDA,0x1EDD,0x1EDC,
0x1EDF,0x1EDE,0x1EE1,0x1EE0,0x1EE3,0x1EE2,0x1EE5,0x1EE4,0x1EE7,0x1EE6,
0x1EE9,0x1EE8,0x1EEB,0x1EEA,0x1EED,0x1EEC,0x1EEF,0x1EEE,0x1EF1,0x1EF0,
0x1EF3,0x1EF2,0x1EF5,0x1EF4,0x1EF7,0x1EF6,0x1EF9,0x1EF8,0x1F00,0x1F08,
0x1F01,0x1F09,0x1F02,0x1F0A,0x1F03,0x1F0B,0x1F04,0x1F0C,0x1F05,0x1F0D,
0x1F06,0x1F0E,0x1F07,0x1F0F,0x1F10,0x1F18,0x1F11,0x1F19,0x1F12,0x1F1A,
0x1F13,0x1F1B,0x1F14,0x1F1C,0x1F15,0x1F1D,0x1F20,0x1F28,0x1F21,0x1F29,
0x1F22,0x1F2A,0x1F23,0x1F2B,0x1F24,0x1F2C,0x1F25,0x1F2D,0x1F26,0x1F2E,
0x1F27,0x1F2F,0x1F30,0x1F38,0x1F31,0x1F39,0x1F32,0x1F3A,0x1F33,0x1F3B,
0x1F34,0x1F3C,0x1F35,0x1F3D,0x1F36,0x1F3E,0x1F37,0x1F3F,0x1F40,0x1F48,
0x1F41,0x1F49,0x1F42,0x1F4A,0x1F43,0x1F4B,0x1F44,0x1F4C,0x1F45,0x1F4D,
0x1F51,0x1F59,0x1F53,0x1F5B,0x1F55,0x1F5D,0x1F57,0x1F5F,0x1F60,0x1F68,
0x1F61,0x1F69,0x1F62,0x1F6A,0x1F63,0x1F6B,0x1F64,0x1F6C,0x1F65,0x1F6D,
0x1F66,0x1F6E,0x1F67,0x1F6F,0x1F70,0x1FBA,0x1F71,0x1FBB,0x1F72,0x1FC8,
0x1F73,0x1FC9,0x1F74,0x1FCA,0x1F75,0x1FCB,0x1F76,0x1FDA,0x1F77,0x1FDB,
0x1F78,0x1FF8,0x1F79,0x1FF9,0x1F7A,0x1FEA,0x1F7B,0x1FEB,0x1F7C,0x1FFA,
0x1F7D,0x1FFB,0x1FB0,0x1FB8,0x1FB1,0x1FB9,0x1FD0,0x1FD8,0x1FD1,0x1FD9,
0x1FE0,0x1FE8,0x1FE1,0x1FE9,0x1FE5,0x1FEC,0x2170,0x2160,0x2171,0x2161,
0x2172,0x2162,0x2173,0x2163,0x2174,0x2164,0x2175,0x2165,0x2176,0x2166,
0x2177,0x2167,0x2178,0x2168,0x2179,0x2169,0x217A,0x216A,0x217B,0x216B,
0x217C,0x216C,0x217D,0x216D,0x217E,0x216E,0x217F,0x216F,0x24D0,0x24B6,
0x24D1,0x24B7,0x24D2,0x24B8,0x24D3,0x24B9,0x24D4,0x24BA,0x24D5,0x24BB,
0x24D6,0x24BC,0x24D7,0x24BD,0x24D8,0x24BE,0x24D9,0x24BF,0x24DA,0x24C0,
0x24DB,0x24C1,0x24DC,0x24C2,0x24DD,0x24C3,0x24DE,0x24C4,0x24DF,0x24C5,
0x24E0,0x24C6,0x24E1,0x24C7,0x24E2,0x24C8,0x24E3,0x24C9,0x24E4,0x24CA,
0x24E5,0x24CB,0x24E6,0x24CC,0x24E7,0x24CD,0x24E8,0x24CE,0x24E9,0x24CF,
0xFF41,0xFF21,0xFF42,0xFF22,0xFF43,0xFF23,0xFF44,0xFF24,0xFF45,0xFF25,
0xFF46,0xFF26,0xFF47,0xFF27,0xFF48,0xFF28,0xFF49,0xFF29,0xFF4A,0xFF2A,
0xFF4B,0xFF2B,0xFF4C,0xFF2C,0xFF4D,0xFF2D,0xFF4E,0xFF2E,0xFF4F,0xFF2F,
0xFF50,0xFF30,0xFF51,0xFF31,0xFF52,0xFF32,0xFF53,0xFF33,0xFF54,0xFF34,
0xFF55,0xFF35,0xFF56,0xFF36,0xFF57,0xFF37,0xFF58,0xFF38,0xFF59,0xFF39,
0xFF5A,0xFF3A,
};
// MACHINE-GENERATED: Do not edit (see com.ibm.icu.dev.tools.translit.UnicodeSetCloseOver)
static const CaseEquivClass CASE_NONPAIRS[] = {
{{0x1E9A,0, 0x0061,0x02BE,0, 0}},
{{0xFB00,0, 0x0066,0x0066,0, 0}},
{{0xFB03,0, 0x0066,0x0066,0x0069,0, 0}},
{{0xFB04,0, 0x0066,0x0066,0x006C,0, 0}},
{{0xFB01,0, 0x0066,0x0069,0, 0}},
{{0xFB02,0, 0x0066,0x006C,0, 0}},
{{0x1E96,0, 0x0068,0x0331,0, 0}},
{{0x0130,0, 0x0069,0x0307,0, 0}},
{{0x01F0,0, 0x006A,0x030C,0, 0}},
{{0x004B,0x006B,0x212A,0, 0}},
{{0x0053,0x0073,0x017F,0, 0}},
{{0x00DF,0, 0x0073,0x0073,0, 0}},
{{0xFB05,0xFB06,0, 0x0073,0x0074,0, 0}},
{{0x1E97,0, 0x0074,0x0308,0, 0}},
{{0x1E98,0, 0x0077,0x030A,0, 0}},
{{0x1E99,0, 0x0079,0x030A,0, 0}},
{{0x00C5,0x00E5,0x212B,0, 0}},
{{0x01C4,0x01C5,0x01C6,0, 0}},
{{0x01C7,0x01C8,0x01C9,0, 0}},
{{0x01CA,0x01CB,0x01CC,0, 0}},
{{0x01F1,0x01F2,0x01F3,0, 0}},
{{0x0149,0, 0x02BC,0x006E,0, 0}},
{{0x1FB4,0, 0x03AC,0x03B9,0, 0}},
{{0x1FC4,0, 0x03AE,0x03B9,0, 0}},
{{0x1FB6,0, 0x03B1,0x0342,0, 0}},
{{0x1FB7,0, 0x03B1,0x0342,0x03B9,0, 0}},
{{0x1FB3,0x1FBC,0, 0x03B1,0x03B9,0, 0}},
{{0x0392,0x03B2,0x03D0,0, 0}},
{{0x0395,0x03B5,0x03F5,0, 0}},
{{0x1FC6,0, 0x03B7,0x0342,0, 0}},
{{0x1FC7,0, 0x03B7,0x0342,0x03B9,0, 0}},
{{0x1FC3,0x1FCC,0, 0x03B7,0x03B9,0, 0}},
{{0x0398,0x03B8,0x03D1,0x03F4,0, 0}},
{{0x0345,0x0399,0x03B9,0x1FBE,0, 0}},
{{0x1FD2,0, 0x03B9,0x0308,0x0300,0, 0}},
{{0x0390,0x1FD3,0, 0x03B9,0x0308,0x0301,0, 0}},
{{0x1FD7,0, 0x03B9,0x0308,0x0342,0, 0}},
{{0x1FD6,0, 0x03B9,0x0342,0, 0}},
{{0x039A,0x03BA,0x03F0,0, 0}},
{{0x00B5,0x039C,0x03BC,0, 0}},
{{0x03A0,0x03C0,0x03D6,0, 0}},
{{0x03A1,0x03C1,0x03F1,0, 0}},
{{0x1FE4,0, 0x03C1,0x0313,0, 0}},
{{0x03A3,0x03C2,0x03C3,0x03F2,0, 0}},
{{0x1FE2,0, 0x03C5,0x0308,0x0300,0, 0}},
{{0x03B0,0x1FE3,0, 0x03C5,0x0308,0x0301,0, 0}},
{{0x1FE7,0, 0x03C5,0x0308,0x0342,0, 0}},
{{0x1F50,0, 0x03C5,0x0313,0, 0}},
{{0x1F52,0, 0x03C5,0x0313,0x0300,0, 0}},
{{0x1F54,0, 0x03C5,0x0313,0x0301,0, 0}},
{{0x1F56,0, 0x03C5,0x0313,0x0342,0, 0}},
{{0x1FE6,0, 0x03C5,0x0342,0, 0}},
{{0x03A6,0x03C6,0x03D5,0, 0}},
{{0x03A9,0x03C9,0x2126,0, 0}},
{{0x1FF6,0, 0x03C9,0x0342,0, 0}},
{{0x1FF7,0, 0x03C9,0x0342,0x03B9,0, 0}},
{{0x1FF3,0x1FFC,0, 0x03C9,0x03B9,0, 0}},
{{0x1FF4,0, 0x03CE,0x03B9,0, 0}},
{{0x0587,0, 0x0565,0x0582,0, 0}},
{{0xFB14,0, 0x0574,0x0565,0, 0}},
{{0xFB15,0, 0x0574,0x056B,0, 0}},
{{0xFB17,0, 0x0574,0x056D,0, 0}},
{{0xFB13,0, 0x0574,0x0576,0, 0}},
{{0xFB16,0, 0x057E,0x0576,0, 0}},
{{0x1E60,0x1E61,0x1E9B,0, 0}},
{{0x1F80,0x1F88,0, 0x1F00,0x03B9,0, 0}},
{{0x1F81,0x1F89,0, 0x1F01,0x03B9,0, 0}},
{{0x1F82,0x1F8A,0, 0x1F02,0x03B9,0, 0}},
{{0x1F83,0x1F8B,0, 0x1F03,0x03B9,0, 0}},
{{0x1F84,0x1F8C,0, 0x1F04,0x03B9,0, 0}},
{{0x1F85,0x1F8D,0, 0x1F05,0x03B9,0, 0}},
{{0x1F86,0x1F8E,0, 0x1F06,0x03B9,0, 0}},
{{0x1F87,0x1F8F,0, 0x1F07,0x03B9,0, 0}},
{{0x1F90,0x1F98,0, 0x1F20,0x03B9,0, 0}},
{{0x1F91,0x1F99,0, 0x1F21,0x03B9,0, 0}},
{{0x1F92,0x1F9A,0, 0x1F22,0x03B9,0, 0}},
{{0x1F93,0x1F9B,0, 0x1F23,0x03B9,0, 0}},
{{0x1F94,0x1F9C,0, 0x1F24,0x03B9,0, 0}},
{{0x1F95,0x1F9D,0, 0x1F25,0x03B9,0, 0}},
{{0x1F96,0x1F9E,0, 0x1F26,0x03B9,0, 0}},
{{0x1F97,0x1F9F,0, 0x1F27,0x03B9,0, 0}},
{{0x1FA0,0x1FA8,0, 0x1F60,0x03B9,0, 0}},
{{0x1FA1,0x1FA9,0, 0x1F61,0x03B9,0, 0}},
{{0x1FA2,0x1FAA,0, 0x1F62,0x03B9,0, 0}},
{{0x1FA3,0x1FAB,0, 0x1F63,0x03B9,0, 0}},
{{0x1FA4,0x1FAC,0, 0x1F64,0x03B9,0, 0}},
{{0x1FA5,0x1FAD,0, 0x1F65,0x03B9,0, 0}},
{{0x1FA6,0x1FAE,0, 0x1F66,0x03B9,0, 0}},
{{0x1FA7,0x1FAF,0, 0x1F67,0x03B9,0, 0}},
{{0x1FB2,0, 0x1F70,0x03B9,0, 0}},
{{0x1FC2,0, 0x1F74,0x03B9,0, 0}},
{{0x1FF2,0, 0x1F7C,0x03B9,0, 0}},
{{0, 0xD801,0xDC00,0, 0xD801,0xDC28,0, 0}},
{{0, 0xD801,0xDC01,0, 0xD801,0xDC29,0, 0}},
{{0, 0xD801,0xDC02,0, 0xD801,0xDC2A,0, 0}},
{{0, 0xD801,0xDC03,0, 0xD801,0xDC2B,0, 0}},
{{0, 0xD801,0xDC04,0, 0xD801,0xDC2C,0, 0}},
{{0, 0xD801,0xDC05,0, 0xD801,0xDC2D,0, 0}},
{{0, 0xD801,0xDC06,0, 0xD801,0xDC2E,0, 0}},
{{0, 0xD801,0xDC07,0, 0xD801,0xDC2F,0, 0}},
{{0, 0xD801,0xDC08,0, 0xD801,0xDC30,0, 0}},
{{0, 0xD801,0xDC09,0, 0xD801,0xDC31,0, 0}},
{{0, 0xD801,0xDC0A,0, 0xD801,0xDC32,0, 0}},
{{0, 0xD801,0xDC0B,0, 0xD801,0xDC33,0, 0}},
{{0, 0xD801,0xDC0C,0, 0xD801,0xDC34,0, 0}},
{{0, 0xD801,0xDC0D,0, 0xD801,0xDC35,0, 0}},
{{0, 0xD801,0xDC0E,0, 0xD801,0xDC36,0, 0}},
{{0, 0xD801,0xDC0F,0, 0xD801,0xDC37,0, 0}},
{{0, 0xD801,0xDC10,0, 0xD801,0xDC38,0, 0}},
{{0, 0xD801,0xDC11,0, 0xD801,0xDC39,0, 0}},
{{0, 0xD801,0xDC12,0, 0xD801,0xDC3A,0, 0}},
{{0, 0xD801,0xDC13,0, 0xD801,0xDC3B,0, 0}},
{{0, 0xD801,0xDC14,0, 0xD801,0xDC3C,0, 0}},
{{0, 0xD801,0xDC15,0, 0xD801,0xDC3D,0, 0}},
{{0, 0xD801,0xDC16,0, 0xD801,0xDC3E,0, 0}},
{{0, 0xD801,0xDC17,0, 0xD801,0xDC3F,0, 0}},
{{0, 0xD801,0xDC18,0, 0xD801,0xDC40,0, 0}},
{{0, 0xD801,0xDC19,0, 0xD801,0xDC41,0, 0}},
{{0, 0xD801,0xDC1A,0, 0xD801,0xDC42,0, 0}},
{{0, 0xD801,0xDC1B,0, 0xD801,0xDC43,0, 0}},
{{0, 0xD801,0xDC1C,0, 0xD801,0xDC44,0, 0}},
{{0, 0xD801,0xDC1D,0, 0xD801,0xDC45,0, 0}},
{{0, 0xD801,0xDC1E,0, 0xD801,0xDC46,0, 0}},
{{0, 0xD801,0xDC1F,0, 0xD801,0xDC47,0, 0}},
{{0, 0xD801,0xDC20,0, 0xD801,0xDC48,0, 0}},
{{0, 0xD801,0xDC21,0, 0xD801,0xDC49,0, 0}},
{{0, 0xD801,0xDC22,0, 0xD801,0xDC4A,0, 0}},
{{0, 0xD801,0xDC23,0, 0xD801,0xDC4B,0, 0}},
{{0, 0xD801,0xDC24,0, 0xD801,0xDC4C,0, 0}},
{{0, 0xD801,0xDC25,0, 0xD801,0xDC4D,0, 0}}
};
#define CASE_PAIRS_LENGTH (sizeof(CASE_PAIRS)/sizeof(CASE_PAIRS[0]))
#define CASE_NONPAIRS_LENGTH (sizeof(CASE_NONPAIRS)/sizeof(CASE_NONPAIRS[0]))
/**
* Add to this set all members of the case fold equivalency class
* that contains 'folded'.
* @param folded a string within a case fold equivalency class.
* It must have the property that UCharacter.foldCase(folded,
* DEFAULT_CASE_MAP).equals(folded).
*/
void UnicodeSet::caseCloseOne(const UnicodeString& folded) {
if (folded.length() == 1) {
caseCloseOne(folded.charAt(0));
return;
}
const CaseEquivClass* c = getCaseMapOf(folded);
if (c != NULL) {
caseCloseOne(*c);
return;
}
// Add 'folded' itself; it belongs to no equivalency class.
add(folded);
}
/**
 * Add to this set all members of the case fold equivalency class
 * that contains 'folded'.
 *
 * The lookup is done in two steps: first getCaseMapOf() is asked for
 * a CASE_NONPAIRS[] class, then the sorted CASE_PAIRS[] array is
 * searched. A character occurs in one or the other, or neither, but
 * not both. If neither lookup succeeds, 'folded' is a case-unique
 * code unit and only it is added.
 *
 * @param folded a code UNIT within a case fold equivalency class.
 * It must have the property that uchar_foldCase(folded,
 * DEFAULT_CASE_MAP) == folded.
 */
void UnicodeSet::caseCloseOne(UChar folded) {
    // Step 1: classes with more than two members or with strings.
    const CaseEquivClass* c = getCaseMapOf(folded);
    if (c != NULL) {
        caseCloseOne(*c);
        return;
    }

    // Step 2: binary search in the pairs array, looking only at the
    // even (folded) entries. low, high and mid count PAIRS, so they
    // are doubled before indexing CASE_PAIRS[].
    //
    // The search keeps the invariant low <= mid <= high, so no index
    // outside the table is ever formed, whatever the table length is.
    // (The earlier form recomputed the index after the loop as
    // (low + high) & ~1, which is -2 once high has dropped to -1.)
    int32_t low = 0;
    int32_t high = (int32_t)(CASE_PAIRS_LENGTH >> 1) - 1;
    while (low <= high) {
        int32_t mid = low + ((high - low) >> 1);
        int32_t pairIndex = mid << 1;
        UChar ch = CASE_PAIRS[pairIndex];
        if (folded < ch) {
            high = mid - 1;
        } else if (folded > ch) {
            low = mid + 1;
        } else {
            // Found: add the folded code unit and its partner.
            add(CASE_PAIRS[pairIndex]);
            add(CASE_PAIRS[pairIndex + 1]);
            return;
        }
    }

    // The search failed; 'folded' is a case-unique code unit.
    add(folded);
}
/**
* Add to this set all members of the given CaseEquivClass object.
*/
void UnicodeSet::caseCloseOne(const CaseEquivClass& c) {
const UChar* p = c.getSingles();
while (*p) {
add(*p++); // add all single code units
}
for (c.getStrings(p); *p; c.nextString(p)) {
add(p); // add all strings
}
}
/**
* Given a folded string of length >= 2 code units, return the
* CaseEquivClass containing this string, or NULL if none.
*
* The lookup table CASE_EQUIV_HASH is built on first use from the
* multi-code-unit strings of every CASE_NONPAIRS entry. NULL is also
* returned when that table could not be created.
*
* @param folded the string to look up.
* @return a pointer to an entry of CASE_NONPAIRS, or NULL.
*/
const CaseEquivClass* UnicodeSet::getCaseMapOf(const UnicodeString& folded) {
// Sample the shared pointer while holding the umtx_lock(NULL) mutex.
umtx_lock(NULL);
UBool f = (CASE_EQUIV_HASH == NULL);
umtx_unlock(NULL);
if (f) {
// Create the Hashtable, which maps UnicodeStrings to pointers to
// the CASE_NONPAIRS entries that contain them.
UErrorCode ec = U_ZERO_ERROR;
Hashtable* hash = new Hashtable();
if (hash != NULL) {
int32_t i;
for (i=0; i<(int32_t)CASE_NONPAIRS_LENGTH; ++i) {
const CaseEquivClass* c = &CASE_NONPAIRS[i];
const UChar* p;
// Register each multi-code-unit string of this class as a key.
for (c->getStrings(p); *p; c->nextString(p)) {
hash->put(UnicodeString(p), (void*) c, ec);
}
}
if (U_SUCCESS(ec)) {
// Install the new table only if no table has been installed
// since the check above.
umtx_lock(NULL);
if (CASE_EQUIV_HASH == NULL) {
CASE_EQUIV_HASH = hash;
// The global now refers to the table; do not delete it below.
hash = NULL;
ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
}
umtx_unlock(NULL);
}
// Still non-NULL here if a put() failed or another table was
// installed first; discard the unused table.
delete hash;
}
}
// NOTE(review): this final read of CASE_EQUIV_HASH happens without the
// mutex held -- confirm that this is intended.
return (CASE_EQUIV_HASH != NULL) ?
(const CaseEquivClass*) CASE_EQUIV_HASH->get(folded) : NULL;
}
/**
* Given a folded code unit, return the CaseEquivClass containing it,
* or NULL if none.
*
* The lookup table CASE_EQUIV_CBA is built on first use from the
* single code units of every CASE_NONPAIRS entry. NULL is also
* returned when that table could not be created.
*
* @param folded the code unit to look up.
* @return a pointer to an entry of CASE_NONPAIRS, or NULL.
*/
const CaseEquivClass* UnicodeSet::getCaseMapOf(UChar folded) {
// Sample the shared pointer while holding the umtx_lock(NULL) mutex.
umtx_lock(NULL);
UBool f = (CASE_EQUIV_CBA == NULL);
umtx_unlock(NULL);
if (f) {
// Create the CompactByteArray, which maps single code units
// to index values into CASE_NONPAIRS.
// NOTE(review): the -1 passed here is presumably the value returned
// for code units that are never set; it would read back as 255
// through ucmp8_getu(), matching the test at the end -- confirm.
CompactByteArray* cba = ucmp8_open(-1);
if (ucmp8_isBogus(cba)) {
ucmp8_close(cba);
cba = NULL;
} else {
int32_t i;
for (i=0; i<(int32_t)CASE_NONPAIRS_LENGTH; ++i) {
const UChar* p = CASE_NONPAIRS[i].getSingles();
UChar ch;
while ((ch = *p++) != 0) {
// NOTE(review): the index is narrowed to int8_t and later read
// back with ucmp8_getu(); indexes above 127 presumably rely on
// an unsigned round trip, and 255 is reserved as "no entry",
// so CASE_NONPAIRS must stay below 255 entries -- confirm.
ucmp8_set(cba, ch, (int8_t) i);
}
}
ucmp8_compact(cba, 256);
}
// Install the new array only if no array has been installed since the
// check above. cba may be NULL here if creation failed.
umtx_lock(NULL);
if (CASE_EQUIV_CBA == NULL) {
CASE_EQUIV_CBA = cba;
// The global now refers to the array; do not close it below.
cba = NULL;
ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
}
umtx_unlock(NULL);
// Still non-NULL here if another array was installed first.
if (cba != NULL) {
ucmp8_close(cba);
}
}
// NOTE(review): CASE_EQUIV_CBA is read here without the mutex held --
// confirm that this is intended.
if (CASE_EQUIV_CBA != NULL) {
int32_t index = ucmp8_getu(CASE_EQUIV_CBA, folded);
// 255 is treated as "this code unit is in no CASE_NONPAIRS class".
if (index != 255) {
return &CASE_NONPAIRS[index];
}
}
return NULL;
}
U_NAMESPACE_END

View File

@ -262,7 +262,7 @@ isAcceptable(void * /* context */,
static UBool U_CALLCONV
_enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*limit*/, uint32_t /*value*/) {
/* add the start code point to the USet */
USetAdder *sa=(USetAdder *)context;
const USetAdder *sa=(const USetAdder *)context;
sa->add(sa->set, start);
return TRUE;
}
@ -1129,7 +1129,7 @@ unorm_isNFSkippable(UChar32 c, UNormalizationMode mode) {
}
U_CAPI void U_EXPORT2
unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode) {
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
UChar c;
if(U_FAILURE(*pErrorCode) || !_haveData(*pErrorCode)) {

View File

@ -452,7 +452,7 @@ unorm_getNX(int32_t options, UErrorCode *pErrorCode);
* @internal
*/
U_CAPI void U_EXPORT2
unorm_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
/**
* Swap unorm.icu. See udataswp.h.

View File

@ -400,7 +400,7 @@ uprops_getSource(UProperty which) {
#if 0
U_CAPI void U_EXPORT2
uprv_getInclusions(USetAdder *sa, UErrorCode *pErrorCode) {
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode) {
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}

View File

@ -305,7 +305,7 @@ uprv_getMaxISOCommentLength();
* @param sa USetAdder to receive characters.
*/
U_CAPI void U_EXPORT2
uprv_getCharNameCharacters(USetAdder *sa);
uprv_getCharNameCharacters(const USetAdder *sa);
#if 0
/*
@ -318,7 +318,7 @@ urename.h and unames.c changed accordingly.
* @param sa USetAdder to receive characters.
*/
U_CAPI void U_EXPORT2
uprv_getISOCommentCharacters(USetAdder *sa);
uprv_getISOCommentCharacters(const USetAdder *sa);
*/
#endif
@ -360,14 +360,14 @@ uprops_getSource(UProperty which);
* @internal
*/
U_CAPI void U_EXPORT2
uchar_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
/**
* Same as uchar_addPropertyStarts() but only for Hangul_Syllable_Type.
* @internal
*/
U_CAPI void U_EXPORT2
uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
uhst_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
/**
* Return a set of characters for property enumeration.
@ -378,7 +378,7 @@ uhst_addPropertyStarts(USetAdder *sa, UErrorCode *pErrorCode);
* @internal
*/
U_CAPI void U_EXPORT2
uprv_getInclusions(USetAdder *sa, UErrorCode *pErrorCode);
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
/**
* Swap the ICU Unicode properties file. See uchar.c.

View File

@ -963,6 +963,32 @@ void UnicodeSetTest::TestCloseOver() {
CASE,
"[ABC]","[A-Ca-c]",
CASE, "[i]", "[iI]",
CASE, "[\\u0130]", "[\\u0130{i\\u0307}]", // dotted I
CASE, "[{i\\u0307}]", "[\\u0130{i\\u0307}]", // i with dot
CASE, "[\\u0131]", "[\\u0131]", // dotless i
CASE, "[\\u0390]", "[\\u0390\\u1FD3{\\u03B9\\u0308\\u0301}]",
CASE, "[\\u03c2]", "[\\u03a3\\u03c2\\u03c3]", // sigmas
CASE, "[\\u03f2]", "[\\u03f2\\u03f9]", // lunate sigmas
CASE, "[\\u03f7]", "[\\u03f7\\u03f8]",
CASE, "[\\u1fe3]", "[\\u03b0\\u1fe3{\\u03c5\\u0308\\u0301}]",
CASE, "[\\ufb05]", "[\\ufb05\\ufb06{st}]",
CASE, "[{st}]", "[\\ufb05\\ufb06{st}]",
CASE, "[\\U0001044F]", "[\\U00010427\\U0001044F]",
CASE, "[{a\\u02BE}]", "[\\u1E9A{a\\u02BE}]", // first in sorted table
CASE, "[{\\u1f7c\\u03b9}]", "[\\u1ff2{\\u1f7c\\u03b9}]", // last in sorted table
CASE_MAPPINGS,
"[aq\\u00DF{Bc}{bC}{Fi}]",
"[aAqQ\\u00DF{ss}{Ss}{SS}{Bc}{BC}{bC}{bc}{FI}{Fi}{fi}]",
@ -980,6 +1006,7 @@ void UnicodeSetTest::TestCloseOver() {
UnicodeSet s;
UnicodeSet t;
UnicodeString buf;
for (int32_t i=0; DATA[i]!=NULL; i+=3) {
int32_t selector = DATA[i][0];
UnicodeString pat(DATA[i+1]);
@ -994,12 +1021,72 @@ void UnicodeSetTest::TestCloseOver() {
if (s == t) {
logln((UnicodeString)"Ok: " + pat + ".closeOver(" + selector + ") => " + exp);
} else {
UnicodeString buf;
errln((UnicodeString)"FAIL: " + pat + ".closeOver(" + selector + ") => " +
s.toPattern(buf, TRUE) + ", expected " + exp);
}
}
#if 0
/*
* Unused test code.
* This was used to compare the old implementation (using USET_CASE)
* with the new one (using 0x100 temporarily)
* while transitioning from hardcoded case closure tables in uniset.cpp
* (moved to uniset_props.cpp) to building the data by gencase into ucase.icu.
* and using ucase.c functions for closure.
* See Jitterbug 3432 RFE: Move uniset.cpp data to a data file
*
* Note: The old and new implementation never fully matched because
* the old implementation turned out to not map U+0130 and U+0131 correctly
* (dotted I and dotless i) and because the old implementation's data tables
* were outdated compared to Unicode 4.0.1 at the time of the change to the
* new implementation. (So sigmas and some other characters were not handled
* according to the newer Unicode version.)
*/
UnicodeSet sens("[:case_sensitive:]", ec), sens2, s2;
UnicodeSetIterator si(sens);
UnicodeString str, buf2;
const UnicodeString *pStr;
UChar32 c;
while(si.next()) {
if(!si.isString()) {
c=si.getCodepoint();
s.clear();
s.add(c);
str.setTo(c);
str.foldCase();
sens2.add(str);
t=s;
s.closeOver(USET_CASE);
t.closeOver(0x100);
if(s!=t) {
errln("FAIL: closeOver(U+%04x) differs: ", c);
errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
}
}
}
// remove all code points
// should contain all full case folding mapping strings
sens2.remove(0, 0x10ffff);
si.reset(sens2);
while(si.next()) {
if(si.isString()) {
pStr=&si.getString();
s.clear();
s.add(*pStr);
t=s2=s;
s.closeOver(USET_CASE);
t.closeOver(0x100);
if(s!=t) {
errln((UnicodeString)"FAIL: closeOver("+s2.toPattern(buf, TRUE)+") differs: ");
errln((UnicodeString)"old "+s.toPattern(buf, TRUE)+" new: "+t.toPattern(buf2, TRUE));
}
}
}
#endif
// Test the pattern API
s.applyPattern("[abc]", USET_CASE_INSENSITIVE, NULL, ec);
if (U_FAILURE(ec)) {

View File

@ -28,6 +28,21 @@ U_CDECL_BEGIN
#define UGENCASE_EXC_SHIFT 16
#define UGENCASE_EXC_MASK 0xffff0000
/*
* Values for the ucase.icu unfold[] data array, see store.c.
* The values are stored in ucase.icu so that the runtime code will work with
* changing values, but they are hardcoded for gencase for simplicity.
* They are optimized, that is, provide for minimal table column widths,
* for the actual Unicode data, so that the table size is minimized.
* Future versions of Unicode may require increases of some of these values.
*/
enum {
/* UChars per row for the case folding string column; store.c rejects longer folding strings */
UGENCASE_UNFOLD_STRING_WIDTH=3,
/* UChars per row for the code point column; store.c reports an error if merged rows need more */
UGENCASE_UNFOLD_CP_WIDTH=2,
/* total UChars per row: string column followed by code point column */
UGENCASE_UNFOLD_WIDTH=UGENCASE_UNFOLD_STRING_WIDTH+UGENCASE_UNFOLD_CP_WIDTH,
/* number of rows that store.c allocates for unfold[], including its leading pseudo-row of miscellaneous values */
UGENCASE_UNFOLD_MAX_ROWS=250
};
/* special casing data */
typedef struct {
UChar32 code;
@ -45,6 +60,7 @@ typedef struct {
/* case mapping properties */
typedef struct {
UChar32 code, lowerCase, upperCase, titleCase;
UChar32 closure[8];
SpecialCasing *specialCasing;
CaseFolding *caseFolding;
uint8_t gc, cc;

View File

@ -26,11 +26,14 @@
#include "cstring.h"
#include "filestrm.h"
#include "utrie.h"
#include "uarrsort.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "propsvec.h"
#include "gencase.h"
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
/* Unicode case mapping properties file format ---------------------------------
The file format prepared and written here contains several data
@ -41,7 +44,9 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 1 .
The following is a description of format version 1.1 .
Format version 1.1 adds data for case closure.
The file contains the following structures:
@ -52,16 +57,19 @@ The file contains the following structures:
i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
i2 trieSize; -- size in bytes of the case mapping properties trie
i3 exceptionsLength; -- length in uint16_t of the exceptions array
i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
i4..i14 reservedIndexes; -- reserved values; 0 for now
i5..i14 reservedIndexes; -- reserved values; 0 for now
i15 maxFullLength; -- maximum length of a full case mapping/folding string
Serizalied trie, see utrie.h;
Serialized trie, see utrie.h;
const uint16_t exceptions[exceptionsLength];
const UChar unfold[unfoldLength];
Trie data word:
Bits
@ -117,12 +125,24 @@ Optional-value slots:
1 case folding (code point)
2 uppercase mapping (code point)
3 titlecase mapping (code point)
4..6 reserved
4 reserved
5 reserved
6 closure mappings (new in format version 1.1)
7 there is at least one full (string) case mapping
the length of each is encoded in a nibble of this optional value,
and the strings follow this optional value in the same order:
lower/fold/upper/title
The optional closure mappings value is used as follows:
Bits 0..3 contain the length of a string of code points for case closure.
The string immediately follows the full case mappings, or the closure value
slot if there are no full case mappings.
Bits 4..15 are reserved and could be used in the future to indicate the
number of strings for case closure.
Complete case closure for a code point is given by the union of all simple
and full case mappings and foldings, plus the case closure code points
(and potentially, in the future, case closure strings).
For space saving, some values are not stored. Lookups are as follows:
- If special casing is conditional, then no full lower/upper/title mapping
strings are stored.
@ -135,6 +155,28 @@ For space saving, some values are not stored. Lookups are as follows:
simple title->simple upper
finally, the original code point (no mapping)
This fallback order is strict:
In particular, the fallback from full case folding is to simple case folding,
not to full lowercase mapping.
Reverse case folding data ("unfold") array: (new in format version 1.1)
This array stores some miscellaneous values followed by a table. The data maps
back from multi-character strings to their original code points, for use
in case closure.
The table contains two columns of strings.
The string in the first column is the case folding of each of the code points
in the second column. The strings are terminated with NUL or by the end of the
column, whichever comes first.
The miscellaneous data takes up one pseudo-row and includes:
- number of rows
- number of UChars per row
- number of UChars in the left (folding string) column
The table is sorted by its first column. Values in the first column are unique.
----------------------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
@ -149,7 +191,7 @@ static UDataInfo dataInfo={
/* dataFormat="cAsE" */
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
{ 1, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 1, 1, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 4, 0, 1, 0 } /* dataVersion */
};
@ -167,6 +209,13 @@ static uint16_t exceptionsCount=0;
/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
static int32_t maxFullLength=U16_MAX_LENGTH;
/*
 * reverse case folding ("unfold") data
 *
 * unfold[] is a table of rows of UGENCASE_UNFOLD_WIDTH UChars each.
 * The first row is a pseudo-row of miscellaneous values; it is initialized
 * here with a row count of 0, the row width and the string column width.
 * The real row count is stored into unfold[UCASE_UNFOLD_ROWS] by
 * makeUnfoldData() (presumably index 0 -- TODO confirm against ucase.h).
 */
static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={
0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0
};
/* number of data rows added so far (the pseudo-row is not counted) */
static uint16_t unfoldRows=0;
/* index into unfold[] of the next free row; starts just after the pseudo-row */
static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH;
/* -------------------------------------------------------------------------- */
extern void
@ -176,6 +225,29 @@ setUnicodeVersion(const char *v) {
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
/*
 * Append one row to the unfold[] table.
 * The row gets the case folding string s (length UChars) in its string column
 * and the code point c, written with U16_APPEND_UNSAFE(), in its code point
 * column; the rest of the row is zero-filled.
 *
 * Exits the program if the string is too long for the string column
 * or if unfold[] has no room for another row.
 */
static void
addUnfolding(UChar32 c, const UChar *s, int32_t length) {
    UChar *row;
    int32_t cpIndex;

    if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
        fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
                (long)length, UGENCASE_UNFOLD_STRING_WIDTH);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    if(unfoldTop>=LENGTHOF(unfold)) {
        fprintf(stderr, "gencase error: too many multi-character case foldings\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* clear the new row, then fill in its two columns */
    row=unfold+unfoldTop;
    u_memset(row, 0, UGENCASE_UNFOLD_WIDTH);
    u_memcpy(row, s, length);
    cpIndex=UGENCASE_UNFOLD_STRING_WIDTH;
    U16_APPEND_UNSAFE(row, cpIndex, c);

    /* account for the new row */
    unfoldTop+=UGENCASE_UNFOLD_WIDTH;
    ++unfoldRows;
}
/* store a character's properties ------------------------------------------- */
extern void
@ -214,6 +286,9 @@ setProps(Props *p) {
if(p->upperCase!=p->titleCase) {
value|=UCASE_EXCEPTION;
}
if(p->closure[0]!=0) {
value|=UCASE_EXCEPTION;
}
if(p->specialCasing!=NULL) {
value|=UCASE_EXCEPTION;
}
@ -286,6 +361,14 @@ setProps(Props *p) {
u_errorName(errorCode));
exit(errorCode);
}
/* add the multi-character case folding to the "unfold" data */
if(p->caseFolding!=NULL) {
int32_t length=p->caseFolding->full[0];
if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) {
addUnfolding(p->code, p->caseFolding->full+1, length);
}
}
}
extern void
@ -298,13 +381,368 @@ addCaseSensitive(UChar32 first, UChar32 last) {
}
}
/* finalize reverse case folding ("unfold") data ---------------------------- */
/*
 * Comparator for uprv_sortArray(): orders two unfold[] rows by comparing
 * all UGENCASE_UNFOLD_WIDTH UChars of each row. The context pointer is not used.
 */
static int32_t U_CALLCONV
compareUnfold(const void *context, const void *left, const void *right) {
    const UChar *leftRow=(const UChar *)left;
    const UChar *rightRow=(const UChar *)right;

    (void)context;
    return u_memcmp(leftRow, rightRow, UGENCASE_UNFOLD_WIDTH);
}
static void
makeUnfoldData() {
static const UChar
iDot[2]= { 0x69, 0x307 };
UChar *p, *q;
int32_t i, j, k;
UErrorCode errorCode;
/*
* add a case folding that we missed because it's conditional:
* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
*/
addUnfolding(0x130, iDot, 2);
/* sort the data */
errorCode=U_ZERO_ERROR;
uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
compareUnfold, NULL, FALSE, &errorCode);
/* make unique-string rows by merging adjacent ones' code point columns */
/* make p point to row i-1 */
p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH;
for(i=1; i<unfoldRows;) {
if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
/* concatenate code point columns */
q=p+UGENCASE_UNFOLD_STRING_WIDTH;
for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
}
if(j>UGENCASE_UNFOLD_CP_WIDTH) {
fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
(long)j, UGENCASE_UNFOLD_CP_WIDTH);
exit(U_BUFFER_OVERFLOW_ERROR);
}
/* move following rows up one */
--unfoldRows;
unfoldTop-=UGENCASE_UNFOLD_WIDTH;
u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
} else {
p+=UGENCASE_UNFOLD_WIDTH;
++i;
}
}
unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
if(beVerbose) {
puts("unfold data:");
p=(UChar *)unfold;
for(i=0; i<unfoldRows; ++i) {
p+=UGENCASE_UNFOLD_WIDTH;
printf("[%2d] %04x %04x %04x <- %04x %04x\n",
i, p[0], p[1], p[2], p[3], p[4]);
}
}
}
/* case closure ------------------------------------------------------------- */
/*
 * Record a case closure mapping src->dest.
 *
 * If src already has an exception entry, dest is appended to that entry's
 * closure[] list (duplicates are not stored again).
 * Otherwise a new exception entry is created for src, seeded from the
 * simple mapping encoded in src's properties word, and the properties word
 * is rewritten to refer to the new entry.
 *
 * Terminates the program on closure[] or exceptions overflow, on a
 * case-ignorable src with a non-zero delta, or if the value cannot be stored.
 */
static void
addClosureMapping(UChar32 src, UChar32 dest) {
    uint32_t value;

    if(beVerbose) {
        printf("add closure mapping U+%04lx->U+%04lx\n",
                (unsigned long)src, (unsigned long)dest);
    }

    value=upvec_getValue(pv, src, 0);
    if(value&UCASE_EXCEPTION) {
        Props *exc=excProps+(value>>UGENCASE_EXC_SHIFT);
        int32_t slot=0;

        /* find dest itself or the first free (zero) slot in src's closure array */
        while(slot<LENGTHOF(exc->closure) && exc->closure[slot]!=0 && exc->closure[slot]!=dest) {
            ++slot;
        }
        if(slot==LENGTHOF(exc->closure)) {
            fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n",
                            (unsigned long)src, (unsigned long)dest);
            exit(U_BUFFER_OVERFLOW_ERROR);
        }
        if(exc->closure[slot]==0) {
            /* free slot: append; otherwise dest is already there and nothing is stored */
            exc->closure[slot]=dest;
        }
    } else {
        Props newProps={ 0 };
        uint32_t type=value&UCASE_TYPE_MASK;
        UErrorCode errorCode;

        /*
         * decode value into newProps (enough for makeException() to work properly),
         * add the closure mapping,
         * and set the new exception for src
         */
        newProps.code=src;
        newProps.closure[0]=dest;
        if(type>UCASE_NONE) {
            /* one simple case mapping, don't care which one */
            UChar32 mapped=src+((int16_t)value>>UCASE_DELTA_SHIFT);
            if(mapped!=src) {
                if(type==UCASE_LOWER) {
                    newProps.upperCase=newProps.titleCase=mapped;
                } else {
                    newProps.lowerCase=mapped;
                }
            }
        } else if(value&UCASE_DELTA_MASK) {
            fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n",
                            (unsigned long)src);
            exit(U_INTERNAL_PROGRAM_ERROR);
        }

        /* replace the previous simple mapping with the index of the new exception entry */
        value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK);
        value|=(uint32_t)exceptionsCount<<UGENCASE_EXC_SHIFT;
        value|=UCASE_EXCEPTION;

        uprv_memcpy(excProps+exceptionsCount, &newProps, sizeof(newProps));
        if(++exceptionsCount==MAX_EXC_COUNT) {
            fprintf(stderr, "gencase: too many exceptions\n");
            exit(U_INDEX_OUTOFBOUNDS_ERROR);
        }

        errorCode=U_ZERO_ERROR;
        if(!upvec_setValue(pv, src, src+1, 0, value, 0xffffffff, &errorCode)) {
            fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n",
                            u_errorName(errorCode));
            exit(errorCode);
        }
    }
}
/*
 * Find missing case mapping relationships and add mappings for case closure.
 * This function starts from an "original" code point and recursively
 * finds its case mappings and the case mappings of where it maps to.
 *
 * The recursion depth is capped at 3 nested calls of this function.
 * In each call, the current code point is c, and the function enumerates
 * all of c's simple (single-code point) case mappings.
 * prev is the code point that case-mapped to c.
 * prev2 is the code point that case-mapped to prev.
 *
 * The initial function call has prev2<0, prev<0, and c==orig
 * (marking no code points).
 * It enumerates c's case mappings and recurses without further action.
 *
 * The second-level function call has prev2<0, prev==orig, and c is
 * the destination code point of one of prev's case mappings.
 * The function checks if any of c's case mappings go back to orig
 * and adds a closure mapping if not.
 * In other words, it turns a case mapping relationship of
 *   orig->c
 * into
 *   orig<->c
 *
 * The third-level function call has prev2==orig, prev>=0, and c is
 * the destination code point of one of prev's case mappings.
 * (And prev is the destination of one of prev2's case mappings.)
 * The function checks if any of c's case mappings go back to orig
 * and adds a closure mapping if not.
 * In other words, it turns case mapping relationships of
 *   orig->prev->c or orig->prev<->c
 * into
 *   orig->prev->c->orig or orig->prev<->c->orig
 * etc.
 * (Graphically, this closes a triangle.)
 *
 * With repeated application on all code points until no more closure mappings
 * are added, all case equivalence groups get complete mappings.
 * That is, in each group of code points with case relationships
 * each code point will in the end have some mapping to each other
 * code point in the group.
 *
 * @param orig  the code point from which the enumeration started
 * @param prev2 the code point that case-mapped to prev, or negative
 * @param prev  the code point that case-mapped to c, or negative
 * @param c     the current code point
 * @param value c's properties word; only used if c==orig,
 *              otherwise it is fetched from pv
 * @return TRUE if a closure mapping was added
 */
static UBool
addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) {
    UChar32 next;
    UBool someMappingsAdded=FALSE;

    if(c!=orig) {
        /* get the properties for c */
        value=upvec_getValue(pv, c, 0);
    }
    /* else if c==orig then c's value was passed in */

    if(value&UCASE_EXCEPTION) {
        UChar32 set[32];
        int32_t i, count=0;

        Props *p=excProps+(value>>UGENCASE_EXC_SHIFT);

        /*
         * marker for whether any of c's mappings goes to orig
         * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
         */
        UBool mapsToOrig=(UBool)(c==orig);

        /* collect c's case mapping destinations in set[]; 0 means "no mapping stored" */
        if((next=p->upperCase)!=0 && next!=c) {
            set[count++]=next;
        }
        if((next=p->lowerCase)!=0 && next!=c) {
            set[count++]=next;
        }
        /*
         * The title mapping is only of interest if it differs from the upper mapping.
         * Skip titleCase==0 like the other fields: it is a "nothing stored" sentinel
         * (presumably meaning that c titlecases to itself -- confirm against
         * makeException()), not a mapping to U+0000.
         */
        if(p->upperCase!=(next=p->titleCase) && next!=0 && next!=c) {
            set[count++]=next;
        }
        if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) {
            set[count++]=next;
        }

        /* append c's current closure mappings to set[] */
        for(i=0; i<LENGTHOF(p->closure) && (next=p->closure[i])!=0; ++i) {
            set[count++]=next;
        }

        /* process all code points to which c case-maps */
        for(i=0; i<count; ++i) {
            next=set[i]; /* next!=c */

            if(next==orig) {
                mapsToOrig=TRUE; /* remember that we map to orig */
            } else if(prev2<0 && next!=prev) {
                /*
                 * recurse unless
                 * we have reached maximum depth (prev2>=0) or
                 * this is a mapping to one of the previous code points (orig, prev, c)
                 */
                someMappingsAdded|=addClosure(orig, prev, c, next, 0);
            }
        }

        if(!mapsToOrig) {
            addClosureMapping(c, orig);
            return TRUE;
        }
    } else {
        if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
            /* one simple case mapping, don't care which one */
            next=c+((int16_t)value>>UCASE_DELTA_SHIFT);
            if(next!=c) {
                /*
                 * recurse unless
                 * we have reached maximum depth (prev2>=0) or
                 * this is a mapping to one of the previous code points (orig, prev, c)
                 */
                if(prev2<0 && next!=orig && next!=prev) {
                    someMappingsAdded|=addClosure(orig, prev, c, next, 0);
                }

                if(c!=orig && next!=orig) {
                    /* c does not map to orig, add a closure mapping c->orig */
                    addClosureMapping(c, orig);
                    return TRUE;
                }
            }
        }
    }

    return someMappingsAdded;
}
extern void
makeCaseClosure() {
/* TODO */
UChar *p;
uint32_t *row;
uint32_t value;
UChar32 start, limit, c, c2;
int32_t i, j;
UBool someMappingsAdded;
/*
* finalize the "unfold" data because we need to use it to add closure mappings
* for situations like FB05->"st"<-FB06
* where we would otherwise miss the FB05<->FB06 relationship
*/
makeUnfoldData();
/* use the "unfold" data to add mappings */
/* p always points to the code points; this loop ignores the strings completely */
p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
for(i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
j=0;
U16_NEXT_UNSAFE(p, j, c);
while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
U16_NEXT_UNSAFE(p, j, c2);
addClosure(c, U_SENTINEL, c, c2, 0);
}
}
if(beVerbose) {
puts("---- ---- ---- ---- (done with closures from unfolding)");
}
/* add further closure mappings from analyzing simple mappings */
do {
someMappingsAdded=FALSE;
i=0;
while((row=upvec_getRow(pv, i, &start, &limit))!=NULL) {
value=*row;
if(value!=0) {
while(start<limit) {
if(addClosure(start, U_SENTINEL, U_SENTINEL, start, value)) {
someMappingsAdded=TRUE;
/*
* stop this loop because pv was changed and row is not valid any more
* skip all rows below the current start
*/
while((row=upvec_getRow(pv, i, NULL, &limit))!=NULL && start>=limit) {
++i;
}
row=NULL; /* signal to continue with outer loop, without further ++i */
break;
}
++start;
}
if(row==NULL) {
continue; /* see row=NULL above */
}
}
++i;
}
if(beVerbose && someMappingsAdded) {
puts("---- ---- ---- ----");
}
} while(someMappingsAdded);
}
/* exceptions --------------------------------------------------------------- */
/*
 * Count the UTF-16 code units needed for the code points in s.
 * Counting stops at the first zero code point or after maxLength
 * code points, whichever comes first.
 */
static int32_t
getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) {
    int32_t units=0;
    int32_t remaining;

    for(remaining=maxLength; remaining>0 && *s!=0; --remaining, ++s) {
        units+=U16_LENGTH(*s);
    }
    return units;
}
static UBool
fullMappingEqualsSimple(const UChar *s, UChar32 simple, UChar32 c) {
int32_t i, length;
@ -441,6 +879,15 @@ makeException(uint32_t value, Props *p) {
excWord|=U_MASK(UCASE_EXC_TITLE);
}
/* length of case closure */
if(p->closure[0]!=0) {
length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure));
slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
slotBits|=slots[count];
++count;
excWord|=U_MASK(UCASE_EXC_CLOSURE);
}
/* lengths of full case mapping strings, stored in the last slot */
fullLengths=0;
if(p->specialCasing!=NULL) {
@ -493,6 +940,15 @@ makeException(uint32_t value, Props *p) {
excTop+=length;
}
/* write the closure data */
if(p->closure[0]!=0) {
UChar32 c;
for(i=0; i<LENGTHOF(p->closure) && (c=p->closure[i])!=0; ++i) {
U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c);
}
}
exceptionsTop=excTop;
/* write the main exceptions word */
@ -559,7 +1015,8 @@ generateData(const char *dataDir) {
indexes[UCASE_IX_EXC_LENGTH]=exceptionsTop;
indexes[UCASE_IX_TRIE_SIZE]=trieSize;
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop;
indexes[UCASE_IX_UNFOLD_LENGTH]=unfoldTop;
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptionsTop+2*unfoldTop;
indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
@ -567,6 +1024,7 @@ generateData(const char *dataDir) {
printf("trie size in bytes: %5d\n", (int)trieSize);
printf("number of code points with exceptions: %5d\n", exceptionsCount);
printf("size in bytes of exceptions: %5d\n", 2*exceptionsTop);
printf("size in bytes of reverse foldings: %5d\n", 2*unfoldTop);
printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
}
@ -581,6 +1039,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, trieBlock, trieSize);
udata_writeBlock(pData, exceptions, 2*exceptionsTop);
udata_writeBlock(pData, unfold, 2*unfoldTop);
/* finish up */
dataLength=udata_finish(pData, &errorCode);