ICU-1586 add common trie implementation

X-SVN-Rev: 7327
This commit is contained in:
Markus Scherer 2001-12-06 01:23:47 +00:00
parent 5760c7b55b
commit 268abe3937
11 changed files with 2540 additions and 644 deletions

View File

@ -332,6 +332,10 @@ SOURCE=.\utf_impl.c
# End Source File
# Begin Source File
SOURCE=.\utrie.c
# End Source File
# Begin Source File
SOURCE=.\uvector.cpp
# End Source File
# End Group
@ -1407,6 +1411,10 @@ InputPath=.\unicode\utf8.h
# End Source File
# Begin Source File
SOURCE=.\utrie.h
# End Source File
# Begin Source File
SOURCE=.\unicode\utypes.h
!IF "$(CFG)" == "common - Win32 Release"

View File

@ -25,8 +25,9 @@
#include "unicode/uloc.h"
#include "umutex.h"
#include "cmemory.h"
#include "ustr_imp.h"
#include "ucln_cmn.h"
#include "utrie.h"
#include "ustr_imp.h"
/*
* Since genprops overrides the general category for some control codes,
@ -213,36 +214,31 @@ static UDataMemory *propsData=NULL;
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UVersionInfo dataVersion={ 3, 0, 0, 0 };
static const uint16_t *propsTable=NULL;
#define props32Table ((uint32_t *)propsTable)
static UTrie propsTrie={ 0 };
static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL;
static const UChar *ucharsTable=NULL;
static int8_t havePropsData=0;
/* index values loaded from uprops.dat */
static uint16_t indexes[8];
static int32_t indexes[16];
enum {
INDEX_STAGE_2_BITS,
INDEX_STAGE_3_BITS,
INDEX_EXCEPTIONS,
INDEX_STAGE_3_INDEX,
INDEX_PROPS,
INDEX_UCHARS
INDEX_EXCEPTIONS,
INDEX_UCHARS,
INDEX_RESERVED /* contains the uint32_t offset to the top of the known data */
};
#ifdef UCHAR_VARIABLE_TRIE_BITS
/* access values calculated from indexes */
static uint16_t stage23Bits, stage2Mask, stage3Mask;
# define stage3Bits indexes[INDEX_STAGE_3_BITS]
#else
/* We are now hardcoding the bit distribution for the trie table access. */
# define stage23Bits 10
# define stage2Mask 0x3f
# define stage3Mask 0xf
# define stage3Bits 4
#endif
/*
 * Folding-offset callback for the properties trie.
 * A 16-bit trie result with bit 15 set carries the folding offset in
 * bits 14..0; any other value yields 0.
 */
static int32_t U_CALLCONV
getFoldingPropsOffset(uint32_t data) {
    return (data&0x8000)!=0 ? (int32_t)(data&0x7fff) : 0;
}
static UBool
isAcceptable(void *context,
@ -256,7 +252,9 @@ isAcceptable(void *context,
pInfo->dataFormat[1]==0x50 &&
pInfo->dataFormat[2]==0x72 &&
pInfo->dataFormat[3]==0x6f &&
pInfo->formatVersion[0]==1
pInfo->formatVersion[0]==2 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
) {
uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
@ -271,11 +269,13 @@ uchar_cleanup()
{
if (propsData) {
udata_close(propsData);
propsData = NULL;
propsData=NULL;
}
propsTable = NULL;
ucharsTable = NULL;
havePropsData = FALSE;
pData32=NULL;
props32Table=NULL;
exceptionsTable=NULL;
ucharsTable=NULL;
havePropsData=FALSE;
return TRUE;
}
@ -283,9 +283,11 @@ static int8_t
loadPropsData() {
/* load Unicode character properties data from file if necessary */
if(havePropsData==0) {
UTrie trie={ 0 };
UErrorCode errorCode=U_ZERO_ERROR;
UDataMemory *data;
const uint16_t *p=NULL;
const uint32_t *p=NULL;
int32_t length;
/* open the data outside the mutex block */
data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
@ -293,38 +295,33 @@ loadPropsData() {
return havePropsData=-1;
}
p=(const uint16_t *)udata_getMemory(data);
p=(const uint32_t *)udata_getMemory(data);
#ifndef UCHAR_VARIABLE_TRIE_BITS
/*
* We are now hardcoding the bit distribution for the trie table access.
* Check that the file is stored accordingly.
*/
if(p[INDEX_STAGE_2_BITS]!=6 || p[INDEX_STAGE_3_BITS]!=4) {
/* unserialize the trie; it is directly after the int32_t indexes[16] */
length=(*(int32_t *)p)*4;
length=utrie_unserialize(&trie, (const uint8_t *)(p+16), length-64, &errorCode);
if(U_FAILURE(errorCode)) {
udata_close(data);
errorCode=U_INVALID_FORMAT_ERROR;
return havePropsData=-1;
}
#endif
trie.getFoldingOffset=getFoldingPropsOffset;
/* in the mutex block, set the data for this process */
umtx_lock(NULL);
if(propsData==NULL) {
propsData=data;
data=NULL;
propsTable=p;
pData32=p;
p=NULL;
uprv_memcpy(&propsTrie, &trie, sizeof(trie));
}
umtx_unlock(NULL);
/* initialize some variables */
uprv_memcpy(indexes, propsTable, 16);
#ifdef UCHAR_VARIABLE_TRIE_BITS
stage23Bits=(uint16_t)(indexes[INDEX_STAGE_2_BITS]+indexes[INDEX_STAGE_3_BITS]);
stage2Mask=(uint16_t)((1<<indexes[INDEX_STAGE_2_BITS])-1);
stage3Mask=(uint16_t)((1<<indexes[INDEX_STAGE_3_BITS])-1);
#endif
ucharsTable=(const UChar *)(props32Table+indexes[INDEX_UCHARS]);
uprv_memcpy(indexes, pData32, sizeof(indexes));
props32Table=pData32+indexes[INDEX_PROPS];
exceptionsTable=pData32+indexes[INDEX_EXCEPTIONS];
ucharsTable=(const UChar *)(pData32+indexes[INDEX_UCHARS]);
havePropsData=1;
/* if a different thread set it first, then close the extra data */
@ -361,28 +358,22 @@ enum {
/* getting a uint32_t properties word from the data */
#define HAVE_DATA (havePropsData>0 || (havePropsData==0 && loadPropsData()>0))
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
#define GET_PROPS_UNSAFE(c) \
props32Table[ \
propsTable[ \
propsTable[ \
propsTable[8+((c)>>stage23Bits)]+ \
(((c)>>stage3Bits)&stage2Mask)]+ \
((c)&stage3Mask) \
] \
]
#define GET_PROPS(c) \
(((uint32_t)(c))<=0x10ffff ? \
HAVE_DATA ? \
GET_PROPS_UNSAFE(c) \
: (c)<=0x9f ? \
staticProps32Table[c] \
: 0 \
: 0)
/*
 * Get the properties word for c without checking that the data is loaded:
 * look up the 16-bit value for c in propsTrie, then use that value as an
 * index into props32Table[] and store the 32-bit word in result.
 * result is used both as the intermediate index and as the output.
 *
 * The two statements are wrapped in one block so that the macro behaves as a
 * single statement when it is the body of an unbraced if/else or loop.
 */
#define GET_PROPS_UNSAFE(c, result) { \
    UTRIE_GET16(&propsTrie, c, result); \
    (result)=props32Table[(result)]; \
}
/*
 * Get the 32-bit properties word for code point c into result.
 * - If the properties data is available (HAVE_DATA), use the trie lookup.
 * - Otherwise fall back to staticProps32Table[] for c in 0..0x9f.
 * - Otherwise the result is 0.
 *
 * The range test for the fallback is done on (uint32_t)(c), like VALIDATE()
 * does, so that a negative c (where the code point type is signed) cannot be
 * used as an index into staticProps32Table[].
 *
 * This expands to an if/else-if/else statement chain, not an expression.
 */
#define GET_PROPS(c, result) \
    if(HAVE_DATA) { \
        GET_PROPS_UNSAFE(c, result); \
    } else if((uint32_t)(c)<=0x9f) { \
        (result)=staticProps32Table[(c)]; \
    } else { \
        (result)=0; \
    }
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&(1UL<<EXCEPTION_SHIFT))
#define GET_CATEGORY(props) ((props)&0x1f)
#define GET_UNSIGNED_VALUE(props) ((props)>>VALUE_SHIFT)
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>VALUE_SHIFT)
#define GET_EXCEPTIONS(props) (props32Table+indexes[INDEX_EXCEPTIONS]+GET_UNSIGNED_VALUE(props))
#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props))
/* finding an exception value */
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
@ -427,31 +418,41 @@ uprv_haveProperties() {
/* Gets the Unicode character's general category.*/
U_CAPI int8_t U_EXPORT2
u_charType(UChar32 c) {
return (int8_t)GET_CATEGORY(GET_PROPS(c));
uint32_t props;
GET_PROPS(c, props);
return (int8_t)GET_CATEGORY(props);
}
/* Checks if ch is a lower case letter.*/
U_CAPI UBool U_EXPORT2
u_islower(UChar32 c) {
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_LOWERCASE_LETTER);
uint32_t props;
GET_PROPS(c, props);
return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
}
/* Checks if ch is an upper case letter.*/
U_CAPI UBool U_EXPORT2
u_isupper(UChar32 c) {
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_UPPERCASE_LETTER);
uint32_t props;
GET_PROPS(c, props);
return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
}
/* Checks if ch is a title case letter; usually upper case letters.*/
U_CAPI UBool U_EXPORT2
u_istitle(UChar32 c) {
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_TITLECASE_LETTER);
uint32_t props;
GET_PROPS(c, props);
return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
}
/* Checks if ch is a decimal digit. */
U_CAPI UBool U_EXPORT2
u_isdigit(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER)
)!=0);
}
@ -459,7 +460,9 @@ u_isdigit(UChar32 c) {
/* Checks if the Unicode character is a letter.*/
U_CAPI UBool U_EXPORT2
u_isalpha(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
)!=0);
}
@ -467,7 +470,9 @@ u_isalpha(UChar32 c) {
/* Checks if ch is a letter or a decimal digit */
U_CAPI UBool U_EXPORT2
u_isalnum(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
)!=0);
@ -476,13 +481,17 @@ u_isalnum(UChar32 c) {
/* Checks if ch is a unicode character with assigned character type.*/
U_CAPI UBool U_EXPORT2
u_isdefined(UChar32 c) {
return (UBool)(GET_PROPS(c)!=0);
uint32_t props;
GET_PROPS(c, props);
return (UBool)(GET_CATEGORY(props)!=0);
}
/* Checks if the Unicode character is a base form character that can take a diacritic.*/
U_CAPI UBool U_EXPORT2
u_isbase(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
1UL<<U_NON_SPACING_MARK|1UL<<U_ENCLOSING_MARK|1UL<<U_COMBINING_SPACING_MARK)
@ -492,17 +501,24 @@ u_isbase(UChar32 c) {
/* Checks if the Unicode character is a control character.*/
U_CAPI UBool U_EXPORT2
u_iscntrl(UChar32 c) {
return (UBool)(
IS_ISO_8_CONTROL(c) ||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0);
if(IS_ISO_8_CONTROL(c)) {
return TRUE;
} else {
uint32_t props;
GET_PROPS(c, props);
return (UBool)(
((1UL<<GET_CATEGORY(props))&
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0);
}
}
/* Checks if the Unicode character is a space character.*/
U_CAPI UBool U_EXPORT2
u_isspace(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0);
}
@ -510,7 +526,9 @@ u_isspace(UChar32 c) {
/* Checks if the Unicode character is a whitespace character.*/
U_CAPI UBool U_EXPORT2
u_isWhitespace(UChar32 c) {
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
)!=0 &&
c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
@ -519,20 +537,27 @@ u_isWhitespace(UChar32 c) {
/* Checks if the Unicode character is printable.*/
U_CAPI UBool U_EXPORT2
u_isprint(UChar32 c) {
return (UBool)(
!IS_ISO_8_CONTROL(c) &&
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
~(1UL<<U_UNASSIGNED|
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
)!=0);
if(IS_ISO_8_CONTROL(c)) {
return FALSE;
} else {
uint32_t props;
GET_PROPS(c, props);
return (UBool)(
((1UL<<GET_CATEGORY(props))&
~(1UL<<U_UNASSIGNED|
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
)!=0);
}
}
/* Checks if the Unicode character can start a Unicode identifier.*/
U_CAPI UBool U_EXPORT2
u_isIDStart(UChar32 c) {
/* same as u_isalpha() */
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
uint32_t props;
GET_PROPS(c, props);
return (UBool)(((1UL<<GET_CATEGORY(props))&
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
)!=0);
}
@ -541,8 +566,10 @@ u_isIDStart(UChar32 c) {
identifier.*/
U_CAPI UBool U_EXPORT2
u_isIDPart(UChar32 c) {
uint32_t props;
GET_PROPS(c, props);
return (UBool)(
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
((1UL<<GET_CATEGORY(props))&
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
1UL<<U_CONNECTOR_PUNCTUATION|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_NON_SPACING_MARK)
@ -564,8 +591,10 @@ u_isIDIgnorable(UChar32 c) {
/*Checks if the Unicode character can start a Java identifier.*/
U_CAPI UBool U_EXPORT2
u_isJavaIDStart(UChar32 c) {
uint32_t props;
GET_PROPS(c, props);
return (UBool)(
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
((1UL<<GET_CATEGORY(props))&
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION)
)!=0);
@ -576,8 +605,10 @@ u_isJavaIDStart(UChar32 c) {
*/
U_CAPI UBool U_EXPORT2
u_isJavaIDPart(UChar32 c) {
uint32_t props;
GET_PROPS(c, props);
return (UBool)(
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
((1UL<<GET_CATEGORY(props))&
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION|
@ -589,13 +620,14 @@ u_isJavaIDPart(UChar32 c) {
/* Transforms the Unicode character to its lower case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_tolower(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
return c+GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
int i=EXC_LOWERCASE;
@ -610,13 +642,14 @@ u_tolower(UChar32 c) {
/* Transforms the Unicode character to its upper case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_toupper(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
return c-GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
int i=EXC_UPPERCASE;
@ -631,14 +664,15 @@ u_toupper(UChar32 c) {
/* Transforms the Unicode character to its title case equivalent.*/
U_CAPI UChar32 U_EXPORT2
u_totitle(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
/* here, titlecase is same as uppercase */
return c-GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_TITLECASE)) {
int i=EXC_TITLECASE;
@ -658,13 +692,14 @@ u_totitle(UChar32 c) {
U_CAPI int32_t U_EXPORT2
u_charDigitValue(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER) {
return GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DIGIT_VALUE)) {
int32_t value;
@ -697,7 +732,8 @@ u_charDigitValue(UChar32 c) {
/* Gets the character's linguistic directionality.*/
U_CAPI UCharDirection U_EXPORT2
u_charDirection(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(props!=0) {
return (UCharDirection)((props>>BIDI_SHIFT)&0x1f);
} else {
@ -707,19 +743,22 @@ u_charDirection(UChar32 c) {
U_CAPI UBool U_EXPORT2
u_isMirrored(UChar32 c) {
return (UBool)(GET_PROPS(c)&(1UL<<MIRROR_SHIFT) ? TRUE : FALSE);
uint32_t props;
GET_PROPS(c, props);
return (UBool)(props&(1UL<<MIRROR_SHIFT) ? TRUE : FALSE);
}
U_CAPI UChar32 U_EXPORT2
u_charMirror(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if((props&(1UL<<MIRROR_SHIFT))==0) {
/* not mirrored - the value is not a mirror offset */
return c;
} else if(!PROPS_VALUE_IS_EXCEPTION(props)) {
return c+GET_SIGNED_VALUE(props);
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_MIRROR_MAPPING)) {
int i=EXC_MIRROR_MAPPING;
@ -734,7 +773,8 @@ u_charMirror(UChar32 c) {
U_CFUNC uint8_t
u_internalGetCombiningClass(UChar32 c) {
uint32_t props=GET_PROPS_UNSAFE(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_NON_SPACING_MARK) {
return (uint8_t)GET_UNSIGNED_VALUE(props);
@ -749,7 +789,8 @@ u_internalGetCombiningClass(UChar32 c) {
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_NON_SPACING_MARK) {
return (uint8_t)GET_UNSIGNED_VALUE(props);
@ -1091,7 +1132,7 @@ isFollowedByCasedLetter(const UChar *src, UTextOffset srcIndex, int32_t srcLengt
while(srcIndex<srcLength) {
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
props=GET_PROPS_UNSAFE(c);
GET_PROPS_UNSAFE(c, props);
category=GET_CATEGORY(props);
if((1UL<<category)&(1UL<<U_LOWERCASE_LETTER|1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
return TRUE; /* followed by cased letter */
@ -1112,7 +1153,7 @@ isPrecededByCasedLetter(const UChar *src, UTextOffset srcIndex) {
while(0<srcIndex) {
UTF_PREV_CHAR(src, 0, srcIndex, c);
props=GET_PROPS_UNSAFE(c);
GET_PROPS_UNSAFE(c, props);
category=GET_CATEGORY(props);
if((1UL<<category)&(1UL<<U_LOWERCASE_LETTER|1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
return TRUE; /* preceded by cased letter */
@ -1216,7 +1257,7 @@ u_internalStrToLower(UChar *dest, int32_t destCapacity,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
UChar buffer[8];
uint32_t *pe;
const uint32_t *pe;
const UChar *u;
uint32_t props, firstExceptionValue, specialCasing;
int32_t srcIndex, destIndex, i, loc;
@ -1262,7 +1303,7 @@ u_internalStrToLower(UChar *dest, int32_t destCapacity,
srcIndex=destIndex=0;
while(srcIndex<srcLength) {
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
props=GET_PROPS_UNSAFE(c);
GET_PROPS_UNSAFE(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
c+=GET_SIGNED_VALUE(props);
@ -1432,7 +1473,7 @@ u_internalStrToUpper(UChar *dest, int32_t destCapacity,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
UChar buffer[8];
uint32_t *pe;
const uint32_t *pe;
const UChar *u;
uint32_t props, firstExceptionValue, specialCasing;
int32_t srcIndex, destIndex, i, loc;
@ -1478,7 +1519,7 @@ u_internalStrToUpper(UChar *dest, int32_t destCapacity,
srcIndex=destIndex=0;
while(srcIndex<srcLength) {
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
props=GET_PROPS_UNSAFE(c);
GET_PROPS_UNSAFE(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
c-=GET_SIGNED_VALUE(props);
@ -1594,11 +1635,12 @@ notSpecial:
/* internal */
U_CAPI int32_t U_EXPORT2
u_internalTitleCase(UChar32 c, UChar *dest, int32_t destCapacity, const char *locale) {
uint32_t props=GET_PROPS(c);
uint32_t props;
UChar32 title;
int32_t i, length;
title=c;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
/* here, titlecase is same as uppercase */
@ -1606,7 +1648,7 @@ u_internalTitleCase(UChar32 c, UChar *dest, int32_t destCapacity, const char *lo
}
} else if(HAVE_DATA) {
const UChar *u;
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe, specialCasing;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_SPECIAL_CASING)) {
i=EXC_SPECIAL_CASING;
@ -1704,16 +1746,17 @@ single:
/* return the simple case folding mapping for c */
U_CAPI UChar32 U_EXPORT2
u_foldCase(UChar32 c, uint32_t options) {
uint32_t props=GET_PROPS(c);
uint32_t props;
GET_PROPS(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
return c+GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
uint32_t *oldPE=pe;
const uint32_t *oldPE=pe;
int i=EXC_CASE_FOLDING;
++pe;
ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
@ -1753,15 +1796,16 @@ u_foldCase(UChar32 c, uint32_t options) {
/* internal, return the full case folding mapping for c, must be used only if uprv_haveProperties() is true */
U_CFUNC int32_t
u_internalFoldCase(UChar32 c, UChar dest[32], uint32_t options) {
uint32_t props=GET_PROPS_UNSAFE(c);
uint32_t props;
int32_t i;
GET_PROPS_UNSAFE(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
c+=GET_SIGNED_VALUE(props);
}
} else {
uint32_t *pe=GET_EXCEPTIONS(props);
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
i=EXC_CASE_FOLDING;
@ -1810,7 +1854,7 @@ u_internalStrFoldCase(UChar *dest, int32_t destCapacity,
UGrowBuffer *growBuffer, void *context,
UErrorCode *pErrorCode) {
UChar buffer[UTF_MAX_CHAR_LENGTH];
uint32_t *pe;
const uint32_t *pe;
const UChar *uchars, *u;
uint32_t props, firstExceptionValue;
int32_t srcIndex, destIndex, i;
@ -1857,7 +1901,7 @@ u_internalStrFoldCase(UChar *dest, int32_t destCapacity,
srcIndex=destIndex=0;
while(srcIndex<srcLength) {
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
props=GET_PROPS_UNSAFE(c);
GET_PROPS_UNSAFE(c, props);
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
c+=GET_SIGNED_VALUE(props);

1025
icu4c/source/common/utrie.c Normal file

File diff suppressed because it is too large Load Diff

656
icu4c/source/common/utrie.h Normal file
View File

@ -0,0 +1,656 @@
/*
******************************************************************************
*
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utrie.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001nov08
* created by: Markus W. Scherer
*/
#ifndef __UTRIE_H__
#define __UTRIE_H__
#include "unicode/utypes.h"
U_CDECL_BEGIN
/**
* \file
*
* This is a common implementation of a "folded" trie.
* It is a kind of compressed, serializable table of 16- or 32-bit values associated with
* Unicode code points (0..0x10ffff).
*
* This implementation is optimized for getting values while walking forward
* through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
*
* The _FROM_BMP() macros are a little more complicated; they get values
* even for lead surrogate code _points_, while the _FROM_LEAD() macros
* get special "folded" values for lead surrogate code _units_ if
* there is relevant data associated with them.
* From such a folded value, an offset needs to be extracted to supply
* to the _FROM_OFFSET_TRAIL() macros.
*
* Most of the more complex (and more convenient) functions call a callback function
* to get that offset from the folded value for a lead surrogate unit.
*/
/**
* Trie constants, defining shift widths, index array lengths, etc.
* Most values are derived from UTRIE_SHIFT and UTRIE_INDEX_SHIFT.
*/
enum {
/** Shift size for shifting right the input index. 1..9 */
UTRIE_SHIFT=5,
/** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 (here: 32) */
UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
/** Mask for getting the lower bits from the input index (here: 0x1f). */
UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
/**
* Lead surrogate code points' index displacement in the index array.
* 0x10000-0xd800=0x2800, shifted right by UTRIE_SHIFT (here: 0x140)
*/
UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
* 0..UTRIE_SHIFT
*/
UTRIE_INDEX_SHIFT=2,
/** The alignment size of a stage 2 data block (here: 4). Also the granularity for compaction. */
UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
/** Number of bits of a trail surrogate that are used in index table lookups (here: 5). */
UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
/**
* Number of index (stage 1) entries per lead surrogate.
* Same as the number of index entries for 1024 trail surrogates,
* ==0x400>>UTRIE_SHIFT (here: 32)
*/
UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
/** Length of the BMP portion of the index (stage 1) array (here: 0x800). */
UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
};
/**
* Length of the index (stage 1) array before folding.
* Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT
* (0x8800 entries with UTRIE_SHIFT==5).
*/
#define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
/**
* Maximum length of the runtime data (stage 2) array.
* Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT
* (0x40000 values with UTRIE_INDEX_SHIFT==2).
*/
#define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
/**
* Maximum length of the build-time data (stage 2) array.
* The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
* (Number of Unicode code points + one all-zero block +
* possible duplicate entries for 1024 lead surrogates.)
*/
#define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
/**
* Runtime UTrie callback function.
* Extract from a lead surrogate's data the
* index array offset of the indexes for that lead surrogate.
*
* The pair-lookup macros call this with the trie value that was looked up
* for the lead surrogate code unit, and only use the returned offset
* if it is greater than 0.
*
* @param data the trie value of a lead surrogate code unit
* @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
*/
typedef int32_t U_CALLCONV
UTrieGetFoldingOffset(uint32_t data);
/**
* Run-time Trie structure.
*
* Either the data table is 16 bits wide and accessed via the index
* pointer, with each index item increased by indexLength;
* in this case, data32==NULL.
*
* Or the data table is 32 bits wide and accessed via the data32 pointer.
*/
struct UTrie {
/* index (stage 1) array; in the 16-bit case the data values are reached through this pointer as well */
const uint16_t *index;
const uint32_t *data32; /* NULL if 16b data is used via index */
/**
* This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
* If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
* @see UTrieGetFoldingOffset
*/
UTrieGetFoldingOffset *getFoldingOffset;
/* presumably the numbers of entries in the index and data arrays -- confirm against the unserialize code */
int32_t indexLength, dataLength;
/* the _LATIN1 pointer macros must only be used when this flag is set */
UBool isLatin1Linear;
};
typedef struct UTrie UTrie;
/**
* Internal trie getter from an index offset and a 16-bit unit.
* The offset is 0 if c16 is a BMP code point or a lead surrogate code unit.
*
* Reads the index entry at offset+(c16>>UTRIE_SHIFT), shifts it left by
* UTRIE_INDEX_SHIFT to get the start of the data block, and adds the low
* UTRIE_SHIFT bits of c16 as the position inside the block.
*
* "data" is the name of the UTrie member to read the value from
* (index for 16-bit values, data32 for 32-bit values).
* Expands to an expression; trie and c16 are each evaluated twice.
*/
#define _UTRIE_GET_RAW(trie, data, offset, c16) \
(trie)->data[ \
((int32_t)((trie)->index[(offset)+((c16)>>UTRIE_SHIFT)])<<UTRIE_INDEX_SHIFT)+ \
((c16)&UTRIE_MASK) \
]
/**
* Internal trie getter from a pair of surrogates.
*
* First looks up the value for the lead surrogate unit c, then passes that
* value to trie->getFoldingOffset() to get the index offset for this lead
* surrogate. If the offset is greater than 0, the final value is looked up
* from that offset and the low 10 bits of c2; otherwise the result is 0.
*
* Expands to a { } block (a statement, not an expression).
* result is also used as a temporary for the lead surrogate's value.
* trie->getFoldingOffset must be set.
*/
#define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result) { \
int32_t __offset; \
\
/* get data for lead surrogate */ \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
__offset=(trie)->getFoldingOffset(result); \
\
/* get the real data from the folded lead/trail units */ \
if(__offset>0) { \
(result)=_UTRIE_GET_RAW((trie), data, __offset, (c2)&0x3ff); \
} else { \
(result)=0; \
} \
}
/**
* Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point.
*
* For c16 in 0xd800..0xdbff the index lookup is displaced by
* UTRIE_LEAD_INDEX_DISP so that the value for the lead surrogate code _point_
* is returned; for all other c16 the displacement is 0.
*
* Expands to an expression (no trailing semicolon) so that it can be used
* inside larger expressions as well as on the right side of an assignment.
* c16 is evaluated more than once.
*/
#define _UTRIE_GET_FROM_BMP(trie, data, c16) \
    _UTRIE_GET_RAW(trie, data, (0xd800<=(c16) && (c16)<=0xdbff ? UTRIE_LEAD_INDEX_DISP : 0), c16)
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
*
* - c32<=0xffff: BMP lookup; lead surrogates are treated as code points.
* - c32<=0x10ffff: the lead surrogate is computed from c32 and the value is
*   fetched via the pair lookup, which calls trie->getFoldingOffset().
* - anything else (compared as uint32_t): result is set to 0.
*
* Expands to an if/else-if/else statement chain, not an expression.
* c32 is evaluated more than once.
*/
#define _UTRIE_GET(trie, data, c32, result) \
if((uint32_t)(c32)<=0xffff) { \
/* BMP code points */ \
(result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
} else if((uint32_t)(c32)<=0x10ffff) { \
/* supplementary code point */ \
UChar __lead16=UTF16_LEAD(c32); \
_UTRIE_GET_FROM_PAIR(trie, data, __lead16, c32, result); \
} else { \
/* out of range */ \
(result)=0; \
}
/**
* Internal next-post-increment: get the next code point (c, c2) and its data.
*
* Reads one unit from *src unconditionally and increments src; the limit is
* only checked before reading a possible trail surrogate, so the caller
* must make sure that there is at least one unit to read.
*
* - c is not a lead surrogate: c2=0, plain lookup for c.
* - c is a lead surrogate followed by a trail surrogate: the trail is
*   consumed into c2 and the value comes from the pair lookup.
* - c is an unpaired lead surrogate: c2=0, and the value for the lead
*   surrogate code _point_ is looked up (index displaced by UTRIE_LEAD_INDEX_DISP).
*
* Expands to a { } block (a statement, not an expression).
*/
#define _UTRIE_NEXT(trie, data, src, limit, c, c2, result) { \
(c)=*(src)++; \
if(!UTF_IS_LEAD(c)) { \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} else if((src)!=(limit) && UTF_IS_TRAIL((c2)=*(src))) { \
++(src); \
_UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result)); \
} else { \
/* unpaired lead surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
} \
}
/**
* Internal previous: get the previous code point (c, c2) and its data.
*
* Pre-decrements src and reads one unit unconditionally; start is only
* checked before reading a possible preceding lead surrogate, so the caller
* must make sure that src is not already at start.
*
* - c is not a surrogate: c2=0, plain lookup for c.
* - c is a trail surrogate preceded by a lead surrogate: src is decremented
*   again, c and c2 are swapped (using result as the temporary) so that
*   c is the lead and c2 the trail, and the value comes from the pair lookup.
* - c is an unpaired trail surrogate: c2=0, plain lookup for c.
* - c is a lead surrogate (necessarily unpaired here): c2=0, and the value for
*   the lead surrogate code _point_ is looked up (index displaced by
*   UTRIE_LEAD_INDEX_DISP).
*
* Expands to a { } block (a statement, not an expression).
*/
#define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result) { \
(c)=*--(src); \
if(!UTF_IS_SURROGATE(c)) { \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} else if(!UTF_IS_SURROGATE_FIRST(c)) { \
/* trail surrogate */ \
if((start)!=(src) && UTF_IS_LEAD((c2)=*((src)-1))) { \
--(src); \
(result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
_UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result)); \
} else { \
/* unpaired trail surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
} \
} else { \
/* unpaired lead surrogate code point */ \
(c2)=0; \
(result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
} \
}
/* Public UTrie API ---------------------------------------------------------*/
/**
* Get a pointer to the contiguous part of the data array
* for the Latin-1 range (U+0000..U+00ff).
* Must be used only if the Latin-1 range is in fact linear
* (trie->isLatin1Linear).
*
* The pointer is computed as index+indexLength (where the 16-bit data is
* reached via the index pointer) plus one data block
* (presumably skipping an initial all-zero block -- confirm against the builder).
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @return (const uint16_t *) pointer to values for Latin-1 code points
*/
#define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
/**
* Get a pointer to the contiguous part of the data array
* for the Latin-1 range (U+0000..U+00ff).
* Must be used only if the Latin-1 range is in fact linear
* (trie->isLatin1Linear).
*
* The pointer is computed as data32 plus one data block
* (presumably skipping an initial all-zero block -- confirm against the builder).
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @return (const uint32_t *) pointer to values for Latin-1 code points
*/
#define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
/**
* Get a 16-bit trie value from a BMP code point or lead surrogate code unit (UChar, <=U+ffff).
* c16 may be a lead surrogate, which may have a value including a folding offset.
* The lookup always uses index offset 0; trie->getFoldingOffset is not called.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point or lead surrogate code unit
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
/**
* Get a 32-bit trie value from a BMP code point or lead surrogate code unit (UChar, <=U+ffff).
* c16 may be a lead surrogate, which may have a value including a folding offset.
* The lookup always uses index offset 0; trie->getFoldingOffset is not called.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point or lead surrogate code unit
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
/**
* Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
* c16 is evaluated more than once.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
/**
* Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
* c16 is evaluated more than once.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c16 (UChar, in) the input BMP code point
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
/**
* Get a 16-bit trie value from a code point.
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Unlike the _FROM_BMP() and _FROM_LEAD() macros, this one is used as a
* statement: the lookup result is delivered through the result argument,
* not as the value of the macro.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c32 (UChar32, in) the input code point
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result)
/**
* Get a 32-bit trie value from a code point.
* Even lead surrogate code points are treated as normal code points,
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
*
* Unlike the _FROM_BMP() and _FROM_LEAD() macros, this one is used as a
* statement: the lookup result is delivered through the result argument,
* not as the value of the macro.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c32 (UChar32, in) the input code point
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result)
/**
* Get the next code point (c, c2), post-increment src,
* and get a 16-bit value from the trie.
*
* Statement macro: c, c2, result and src are all written by the macro.
* c2 is set to 0 when the code point consists of the single code unit c.
* NOTE(review): the effect of passing limit==NULL is not spelled out here -
* presumably no end-of-text check is done for a trail surrogate; confirm
* against the _UTRIE_NEXT() definition.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param src (const UChar *, in/out) the source text pointer
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result)
/**
* Get the next code point (c, c2), post-increment src,
* and get a 32-bit value from the trie.
*
* Statement macro: c, c2, result and src are all written by the macro.
* c2 is set to 0 when the code point consists of the single code unit c.
* NOTE(review): the effect of passing limit==NULL is not spelled out here -
* presumably no end-of-text check is done for a trail surrogate; confirm
* against the _UTRIE_NEXT() definition.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param src (const UChar *, in/out) the source text pointer
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result)
/**
* Get the previous code point (c, c2), pre-decrement src,
* and get a 16-bit value from the trie.
*
* Statement macro: c, c2, result and src are all written by the macro.
* Afterwards src points to the first code unit of the code point that was read
* (c for a single unit, the lead surrogate c for a pair).
* NOTE(review): the effect of passing start==NULL is not spelled out here -
* presumably no start-of-text check is done for a lead surrogate; confirm
* against the _UTRIE_PREVIOUS() definition.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param start (const UChar *, in) the start pointer for the text, or NULL
* @param src (const UChar *, in/out) the source text pointer
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result)
/**
* Get the previous code point (c, c2), pre-decrement src,
* and get a 32-bit value from the trie.
*
* Statement macro: c, c2, result and src are all written by the macro.
* Afterwards src points to the first code unit of the code point that was read
* (c for a single unit, the lead surrogate c for a pair).
* NOTE(review): the effect of passing start==NULL is not spelled out here -
* presumably no start-of-text check is done for a lead surrogate; confirm
* against the _UTRIE_PREVIOUS() definition.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param start (const UChar *, in) the start pointer for the text, or NULL
* @param src (const UChar *, in/out) the source text pointer
* @param c (UChar, out) variable for the BMP or lead code unit
* @param c2 (UChar, out) variable for 0 or the trail code unit
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result)
/**
* Get a 16-bit trie value from a pair of surrogates.
*
* Statement macro: the lookup result is delivered through the result
* argument, not as the value of the macro.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c (UChar, in) a lead surrogate
* @param c2 (UChar, in) a trail surrogate
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
*/
#define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result)
/**
* Get a 32-bit trie value from a pair of surrogates.
*
* Statement macro: the lookup result is delivered through the result
* argument, not as the value of the macro.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param c (UChar, in) a lead surrogate
* @param c2 (UChar, in) a trail surrogate
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
*/
#define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result)
/**
* Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.
*
* Expression macro: expands to _UTRIE_GET_RAW() with the given offset and
* with c2 masked to its 10 low bits.
* The offset is the one that trie->getFoldingOffset() extracts from the
* _FROM_LEAD() value of the lead surrogate. The ICU test code only performs
* this lookup when that offset is greater than 0 and otherwise uses the value 0.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
* @return (uint16_t) trie lookup result
*/
#define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
/**
* Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
* and a trail surrogate.
*
* Expression macro: expands to _UTRIE_GET_RAW() with the given offset and
* with c2 masked to its 10 low bits.
* The offset is the one that trie->getFoldingOffset() extracts from the
* _FROM_LEAD() value of the lead surrogate. The ICU test code only performs
* this lookup when that offset is greater than 0 and otherwise uses the value 0.
*
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
* @return (uint32_t) trie lookup result
*/
#define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
/* enumeration callback types */
/**
* Callback from utrie_enum(), extracts a uint32_t value from a
* trie value. This value will be passed on to the UTrieEnumRange function.
*
* This typedef names a function type (not a pointer type);
* utrie_enum() takes a pointer to such a function.
*
* @param context an opaque pointer, as passed into utrie_enum()
* @param value a value from the trie
* @return the value that is to be passed on to the UTrieEnumRange function
*/
typedef uint32_t U_CALLCONV
UTrieEnumValue(const void *context, uint32_t value);
/**
* Callback from utrie_enum(), is called for each contiguous range
* of code points with the same value as retrieved from the trie and
* transformed by the UTrieEnumValue function.
*
* This typedef names a function type (not a pointer type);
* utrie_enum() takes a pointer to such a function.
* The callback returns nothing, so it has no way to stop the enumeration.
*
* @param context an opaque pointer, as passed into utrie_enum()
* @param start the first code point in a contiguous range with value
* @param limit one past the last code point in a contiguous range with value
* @param value the value that is set for all code points in [start..limit[
*/
typedef void U_CALLCONV
UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
/**
* Enumerate efficiently all values in a trie.
* For each entry in the trie, the value to be delivered is passed through
* the UTrieEnumValue function.
* The value is unchanged if that function pointer is NULL.
*
* For each contiguous range of code points with a given value,
* the UTrieEnumRange function is called.
* The function itself returns nothing; all results are delivered
* through the enumRange callback.
*
* @param trie a pointer to the runtime trie structure
* (NOTE(review): declared non-const although enumeration presumably
* only reads the trie - confirm against the implementation)
* @param enumValue a pointer to a function that may transform the trie entry value,
* or NULL if the values from the trie are to be used directly
* @param enumRange a pointer to a function that is called for each contiguous range
* of code points with the same value
* @param context an opaque pointer that is passed on to the callback functions
*/
U_CAPI void U_EXPORT2
utrie_enum(UTrie *trie,
UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
/**
* Unserialize a trie from 32-bit-aligned memory.
* Inverse of utrie_serialize().
* Fills the UTrie runtime trie structure with the settings for the trie data.
*
* NOTE(review): the ICU test code assigns trie->getFoldingOffset itself after
* calling this function, so presumably that callback is not set here and
* must be supplied by the caller - confirm against the implementation.
* NOTE(review): the return value in the failure case is not specified here;
* callers should test *pErrorCode with U_FAILURE() rather than rely on it.
*
* @param trie a pointer to the runtime trie structure
* @param data a pointer to 32-bit-aligned memory containing trie data
* @param length the number of bytes available at data
* @param pErrorCode an in/out ICU UErrorCode
* @return the number of bytes at data taken up by the trie data
*/
U_CAPI int32_t U_EXPORT2
utrie_unserialize(UTrie *trie, const uint8_t *data, int32_t length, UErrorCode *pErrorCode);
/* Building a trie ----------------------------------------------------------*/
/**
* Build-time trie structure.
* Opaque definition, here only to make fillIn parameters possible
* for utrie_open() and utrie_clone().
* Client code should use the utrie_...() build-time functions
* instead of accessing the fields directly.
*/
struct UNewTrie {
/**
* Index values at build-time are 32 bits wide for easier processing.
* Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
*/
int32_t index[UTRIE_MAX_INDEX_LENGTH];
/* the build-time data array of 32-bit values, see utrie_getData() */
uint32_t *data;
/*
* presumably: number of index entries in use, capacity of the data array,
* and number of data entries in use - TODO confirm in utrie.c
*/
int32_t indexLength, dataCapacity, dataLength;
/*
* presumably: whether this structure / the data array were allocated by
* utrie_open() or utrie_clone() (rather than passed in as fillIn / aliasData)
* and therefore have to be released by utrie_close() - TODO confirm in utrie.c
*/
UBool isAllocated, isDataAllocated;
/*
* isLatin1Linear corresponds to the latin1Linear argument of utrie_open();
* isCompacted presumably records that utrie_compact() has run - TODO confirm
*/
UBool isLatin1Linear, isCompacted;
/**
* Map of adjusted indexes, used in utrie_compact().
* Maps from original indexes to new ones.
*/
int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
};
typedef struct UNewTrie UNewTrie;
/**
* Build-time trie callback function, used with utrie_serialize().
* This function calculates a lead surrogate's value including a folding offset
* from the 1024 supplementary code points [start..start+1024[ .
* It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
*
* The folding offset is provided by the caller.
* It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
* Instead of the offset itself, n can be stored in 10 bits -
* or fewer if it can be assumed that few lead surrogates have associated data.
*
* The returned value must be
* - not zero if and only if there is relevant data
* for the corresponding 1024 supplementary code points
* - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
*
* This typedef names a function type (not a pointer type);
* utrie_serialize() takes a pointer to such a function.
*
* @param trie the build-time trie; its values for [start..start+1024[ can be
* inspected with utrie_get32()
* @param start the first of the 1024 supplementary code points that
* belong to the lead surrogate
* @param offset the folding offset that is to be encoded in the returned value
* @return a folded value, or 0 if there is no relevant data for the lead surrogate.
*/
typedef uint32_t U_CALLCONV
UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
/**
* Open a build-time trie structure.
* The size of the build-time data array is specified to avoid allocating a large
* array in all cases. The array itself can also be passed in.
*
* Although the trie is never fully expanded to a linear array, especially when
* utrie_setRange32() is used, the data array could be large during build time.
* The maximum length is
* UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
* (Number of Unicode code points + one all-zero block +
* possible duplicate entries for 1024 lead surrogates.)
* (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
*
* The returned trie is to be released with utrie_close().
* NOTE(review): the return value in the failure case (for example when an
* allocation fails) is not specified here - presumably NULL; confirm in utrie.c.
*
* @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
* NULL if one is to be allocated
* @param aliasData a pointer to a data array to be used (will not be released), or
* NULL if one is to be allocated
* @param maxDataLength the capacity of aliasData (if not NULL) or
* the length of the data array to be allocated
* (counted in uint32_t entries, not bytes)
* @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
* kept in a linear, contiguous part of the data array
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
*/
U_CAPI UNewTrie * U_EXPORT2
utrie_open(UNewTrie *fillIn, uint32_t *aliasData, int32_t maxDataLength, UBool latin1Linear);
/**
* Clone a build-time trie structure with all entries.
* The returned trie is to be released with utrie_close().
*
* @param fillIn like in utrie_open()
* @param other the build-time trie structure to clone
* @param aliasData like in utrie_open(),
* used if aliasDataLength>=(capacity of other's data array)
* (NOTE(review): presumably a data array is allocated
* instead when aliasData is too short - confirm in utrie.c)
* @param aliasDataLength the length of aliasData
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
*/
U_CAPI UNewTrie * U_EXPORT2
utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
/**
* Close a build-time trie structure, and release memory
* that was allocated by utrie_open() or utrie_clone().
* Memory that was passed in as fillIn or aliasData is documented
* there as "will not be released".
* NOTE(review): whether trie==NULL is accepted is not stated - confirm in utrie.c.
*
* @param trie the build-time trie
*/
U_CAPI void U_EXPORT2
utrie_close(UNewTrie *trie);
/**
* Get the data array of a build-time trie.
* The data may be modified, but entries that are equal before
* must still be equal after modification.
*
* The returned pointer refers to the trie's own array (no copy is made
* as far as this declaration shows), so it should not be used
* after utrie_close().
*
* @param trie the build-time trie
* @param pLength (out) a pointer to a variable that receives the number
* of entries in the data array
* @return the data array
*/
U_CAPI uint32_t * U_EXPORT2
utrie_getData(UNewTrie *trie, int32_t *pLength);
/**
* Set a value for a code point.
* Single-code point counterpart of utrie_setRange32();
* an existing value is replaced.
*
* @param trie the build-time trie
* @param c the code point
* @param value the value
* @return FALSE if a failure occurred (illegal argument or data array overrun)
*/
U_CAPI UBool U_EXPORT2
utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
/**
* Get a value from a code point as stored in the build-time trie.
*
* The pInBlockZero flag lets a caller skip ahead by
* UTRIE_DATA_BLOCK_LENGTH code points when a whole block is known
* to contain only the initial value 0.
*
* @param trie the build-time trie
* @param c the code point
* @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE
* iff the value is retrieved from block 0;
* block 0 is the all-zero initial block
* @return the value
*/
U_CAPI uint32_t U_EXPORT2
utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
/**
* Set a value in a range of code points [start..limit[.
* All code points c with start<=c<limit will get the value if
* overwrite is TRUE or if the old value is 0.
* In other words, with overwrite==FALSE only code points that
* still have the value 0 are changed.
*
* @param trie the build-time trie
* @param start the first code point to get the value
* @param limit one past the last code point to get the value
* @param value the value
* @param overwrite flag for whether old non-zero values are to be overwritten
* @return FALSE if a failure occurred (illegal argument or data array overrun)
*/
U_CAPI UBool U_EXPORT2
utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
/**
* Compact the build-time trie after all values are set, and then
* serialize it into 32-bit aligned memory.
*
* After this, the trie can only be serialized again and/or closed;
* no further values can be added.
*
* The serialized form can be turned into a runtime UTrie
* with utrie_unserialize().
*
* @see utrie_unserialize()
*
* @param trie the build-time trie
* @param data a pointer to 32-bit-aligned memory for the trie data
* @param capacity the number of bytes available at data
* @param getFoldedValue a callback function that calculates the value for
* a lead surrogate from all of its supplementary code points
* and the folding offset
* @param reduceTo16Bits flag for whether the values are to be reduced to a
* width of 16 bits for serialization and runtime
* @param pErrorCode a UErrorCode argument; among other possible error codes:
* - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
* - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
* - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
*
* @return the number of bytes written for the trie
*/
U_CAPI int32_t U_EXPORT2
utrie_serialize(UNewTrie *trie, uint8_t *data, int32_t capacity,
UNewTrieGetFoldedValue *getFoldedValue,
UBool reduceTo16Bits,
UErrorCode *pErrorCode);
U_CDECL_END
#endif

View File

@ -292,6 +292,10 @@ SOURCE=.\susctest.c
# End Source File
# Begin Source File
SOURCE=.\trietest.c
# End Source File
# Begin Source File
SOURCE=.\ucmptst.c
# End Source File
# Begin Source File

View File

@ -634,7 +634,7 @@ static void TestUnicodeData()
/* sanity check on repeated properties */
for(c=0xfffe; c<=0x10ffff;) {
if(u_charType(c)!=U_UNASSIGNED) {
log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED\n", c);
log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
}
if((c&0xffff)==0xfffe) {
++c;

View File

@ -22,11 +22,13 @@ void addSCSUTest(TestNode** root);
void addHashtableTest(TestNode** root);
void addCStringTest(TestNode** root);
void addMemoryStreamTest(TestNode** root);
void addTrieTest(TestNode** root);
void addUtility(TestNode** root);
void addUtility(TestNode** root)
{
addTrieTest(root);
addLocaleTest(root);
addUnicodeTest(root);
addResourceBundleTest(root);
@ -36,4 +38,3 @@ void addUtility(TestNode** root)
addCStringTest(root);
addMemoryStreamTest(root);
}

View File

@ -0,0 +1,567 @@
/*
******************************************************************************
*
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: trietest.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001nov20
* created by: Markus W. Scherer
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "utrie.h"
#include "cstring.h"
#if 1
#include "cintltst.h"
#else
/* definitions from standalone utrie development */
#define log_err printf
#define log_verbose printf
#undef u_errorName
#define u_errorName(errorCode) "some error code"
#endif
/*
 * Number of elements of a true array (must not be used with a pointer).
 * The argument is parenthesized in both places so that it can safely be
 * an expression, not just a plain identifier.
 */
#define ARRAY_LENGTH(array) (sizeof(array)/sizeof((array)[0]))
/*
* Values for setting possibly overlapping, out-of-order ranges of values.
* Each entry describes one call: the value is set for [start..limit[;
* overwrite is the flag that is passed on to utrie_setRange32().
* testTrieRanges() uses utrie_set32() instead when the range has
* length 1 and overwrite is TRUE.
*/
typedef struct SetRange {
UChar32 start, limit;
uint32_t value;
UBool overwrite;
} SetRange;
/*
* Values for testing:
* value is set from the previous boundary's limit to before
* this boundary's limit
*
* The tables of CheckRange entries start with a dummy entry (limit 0)
* so that _testEnumRange() can always look at the preceding entry.
*/
typedef struct CheckRange {
UChar32 limit;
uint32_t value;
} CheckRange;
/*
* Buffer that receives the serialized trie in testTrieRanges() and
* from which the runtime trie is unserialized again.
* NOTE(review): utrie_serialize() and utrie_unserialize() are documented as
* requiring 32-bit-aligned memory; a uint8_t array does not by itself
* guarantee that alignment - confirm, or align the buffer explicitly.
*/
static uint8_t storage[100000];
/*
 * UNewTrieGetFoldedValue callback for the 32-bit test tries.
 * ORs together the values of the 1024 code points [start..start+0x400[,
 * skipping whole data blocks that utrie_get32() reports as block 0.
 * Returns 0 if nothing was set; otherwise the folding offset in the
 * upper 16 bits combined with the ORed values.
 */
static uint32_t U_CALLCONV
_testFoldedValue32(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 end=start+0x400;
    uint32_t combined=0;
    UChar32 cp=start;
    UBool isBlockZero;

    while(cp<end) {
        uint32_t v=utrie_get32(trie, cp, &isBlockZero);
        if(!isBlockZero) {
            combined|=v;
            ++cp;
            continue;
        }
        /* all-zero initial block: nothing to fold, skip the whole block */
        cp+=UTRIE_DATA_BLOCK_LENGTH;
    }

    if(combined==0) {
        return 0;
    }
    return ((uint32_t)offset<<16)|combined;
}
/*
 * getFoldingOffset callback for the 32-bit test tries:
 * the folding offset is stored in the upper 16 bits of the value
 * (see _testFoldedValue32()).
 */
static int32_t U_CALLCONV
_testFoldingOffset32(uint32_t data) {
    uint32_t upperHalf=data>>16;

    return (int32_t)upperHalf;
}
/*
 * UNewTrieGetFoldedValue callback for the 16-bit test tries.
 * Checks whether any of the 1024 code points [start..start+0x400[ has a
 * non-zero value, skipping whole data blocks that utrie_get32() reports
 * as block 0.
 * Returns 0 if nothing was set; otherwise the folding offset with
 * bit 15 set as a marker.
 */
static uint32_t U_CALLCONV
_testFoldedValue16(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 end=start+0x400;
    uint32_t anyBits=0;
    UChar32 cp;
    UBool isBlockZero;

    for(cp=start; cp<end;) {
        uint32_t v=utrie_get32(trie, cp, &isBlockZero);
        if(isBlockZero) {
            /* all-zero initial block: skip it as a whole */
            cp+=UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        anyBits|=v;
        ++cp;
    }

    return anyBits!=0 ? (uint32_t)(offset|0x8000) : 0;
}
/*
 * getFoldingOffset callback for the 16-bit test tries:
 * if marker bit 15 is set, the folding offset is in bits 14..0,
 * otherwise there is no folding offset (0).
 */
static int32_t U_CALLCONV
_testFoldingOffset16(uint32_t data) {
    return (data&0x8000)!=0 ? (int32_t)(data&0x7fff) : 0;
}
/*
 * UTrieEnumValue callback for the test: scrambles each trie value with
 * a fixed XOR mask; _testEnumRange() applies the same mask again to
 * get the original value back. The context is not used.
 */
static uint32_t U_CALLCONV
_testEnumValue(const void *context, uint32_t value) {
    (void)context;
    return 0x5555^value;
}
/*
 * UTrieEnumRange callback for the test.
 * context points to a cursor into a CheckRange table; each call consumes
 * one entry and compares the delivered range with
 * [previous entry's limit..this entry's limit[ and this entry's value.
 * The cursor must never point to the first table entry, which is why the
 * tables start with a dummy entry.
 */
static void U_CALLCONV
_testEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    const CheckRange **cursor=(const CheckRange **)context;
    const CheckRange *expected=*cursor;
    const CheckRange *previous=expected-1;

    /* advance the caller's cursor for the next callback */
    *cursor=expected+1;

    /* undo the transformation from _testEnumValue() */
    value^=0x5555;

    if(!(start==previous->limit && limit==expected->limit && value==expected->value)) {
        log_err("error: utrie_enum() delivers wrong range [U+%04lx..U+%04lx[.0x%lx instead of [U+%04lx..U+%04lx[.0x%lx\n",
                start, limit, value,
                previous->limit, expected->limit, expected->value);
    }
}
static void
testTrieIteration(const char *testName,
const UTrie *trie,
const CheckRange checkRanges[], int32_t countCheckRanges) {
UChar s[100];
uint32_t values[30];
const UChar *p, *limit;
uint32_t value;
UChar32 c;
int32_t i, length, countValues;
UChar c2;
/* write a string */
length=countValues=0;
for(i=0; i<countCheckRanges; ++i) {
c=checkRanges[i].limit;
if(c!=0) {
--c;
UTF_APPEND_CHAR_UNSAFE(s, length, c);
values[countValues++]=checkRanges[i].value;
}
}
limit=s+length;
/* try forward */
p=s;
i=0;
while(p<limit) {
c=c2=0x33;
if(trie->data32!=NULL) {
UTRIE_NEXT32(trie, p, limit, c, c2, value);
} else {
UTRIE_NEXT16(trie, p, limit, c, c2, value);
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE_NEXT(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
testName, c, c2, value, values[i]);
}
if(
c2==0 ?
c!=*(p-1) :
!UTF_IS_LEAD(c) || !UTF_IS_TRAIL(c2) || c!=*(p-2) || c2!=*(p-1)
) {
log_err("error: wrong (c, c2) from UTRIE_NEXT(%s): (U+%04lx, U+%04lx)\n",
testName, c, c2);
continue;
}
if(c2!=0) {
int32_t offset;
if(trie->data32==NULL) {
value=UTRIE_GET16_FROM_LEAD(trie, c);
offset=trie->getFoldingOffset(value);
if(offset>0) {
value=UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2);
} else {
value=0;
}
} else {
value=UTRIE_GET32_FROM_LEAD(trie, c);
offset=trie->getFoldingOffset(value);
if(offset>0) {
value=UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2);
} else {
value=0;
}
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE_GETXX_FROM_OFFSET_TRAIL(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
testName, c, c2, value, values[i]);
}
}
if(c2!=0) {
value=0x44;
if(trie->data32==NULL) {
UTRIE_GET16_FROM_PAIR(trie, c, c2, value);
} else {
UTRIE_GET32_FROM_PAIR(trie, c, c2, value);
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE_GETXX_FROM_PAIR(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
testName, c, c2, value, values[i]);
}
}
++i;
}
/* try backward */
p=limit;
i=countValues;
while(s<p) {
--i;
c=c2=0x33;
if(trie->data32!=NULL) {
UTRIE_PREVIOUS32(trie, s, p, c, c2, value);
} else {
UTRIE_PREVIOUS16(trie, s, p, c, c2, value);
}
if(value!=values[i]) {
log_err("error: wrong value from UTRIE_PREVIOUS(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
testName, c, c2, value, values[i]);
}
if(
c2==0 ?
c!=*p:
!UTF_IS_LEAD(c) || !UTF_IS_TRAIL(c2) || c!=*p || c2!=*(p+1)
) {
log_err("error: wrong (c, c2) from UTRIE_PREVIOUS(%s): (U+%04lx, U+%04lx)\n",
testName, c, c2);
}
}
}
/*
 * Build a trie from setRanges[], verify it against checkRanges[],
 * serialize and unserialize it, and verify the runtime trie with all
 * lookup macros, utrie_enum(), the linear Latin-1 access and the
 * string iteration macros.
 *
 * @param testName name of the test data, used in messages
 * @param setRanges the (possibly overlapping) ranges to set
 * @param countSetRanges number of entries in setRanges
 * @param checkRanges the expected resulting ranges, starting with a dummy entry
 * @param countCheckRanges number of entries in checkRanges
 * @param dataIs32 TRUE to serialize with 32-bit values, FALSE for 16-bit values
 * @param latin1Linear passed on to utrie_open()
 */
static void
testTrieRanges(const char *testName,
               const SetRange setRanges[], int32_t countSetRanges,
               const CheckRange checkRanges[], int32_t countCheckRanges,
               UBool dataIs32, UBool latin1Linear) {
    UTrieGetFoldingOffset *getFoldingOffset;
    const CheckRange *enumRanges;
    UNewTrie *newTrie;
    UTrie trie={ 0 };
    uint32_t value, value2;
    UChar32 start, limit;
    int32_t i, length;
    UErrorCode errorCode;
    UBool overwrite, ok;

    log_verbose("\ntesting Trie '%s'\n", testName);
    newTrie=utrie_open(NULL, NULL, 2000, latin1Linear);
    if(newTrie==NULL) {
        /* nothing can be tested (and nothing must be dereferenced) without a trie */
        log_err("error: utrie_open() failed (%s)\n", testName);
        return;
    }

    /* set values from setRanges[] */
    ok=TRUE;
    for(i=0; i<countSetRanges; ++i) {
        start=setRanges[i].start;
        limit=setRanges[i].limit;
        value=setRanges[i].value;
        overwrite=setRanges[i].overwrite;
        if((limit-start)==1 && overwrite) {
            ok&=utrie_set32(newTrie, start, value);
        } else {
            ok&=utrie_setRange32(newTrie, start, limit, value, overwrite);
        }
    }
    if(!ok) {
        log_err("error: setting values into a trie failed (%s)\n", testName);
        /* do not leak the build-time trie on this early exit */
        utrie_close(newTrie);
        return;
    }

    /* verify that all these values are in the new Trie */
    start=0;
    for(i=0; i<countCheckRanges; ++i) {
        limit=checkRanges[i].limit;
        value=checkRanges[i].value;

        while(start<limit) {
            if(value!=utrie_get32(newTrie, start, NULL)) {
                log_err("error: newTrie(%s)[U+%04lx]==0x%lx instead of 0x%lx\n",
                        testName, start, utrie_get32(newTrie, start, NULL), value);
            }
            ++start;
        }
    }

    if(dataIs32) {
        getFoldingOffset=_testFoldingOffset32;
    } else {
        getFoldingOffset=_testFoldingOffset16;
    }

    errorCode=U_ZERO_ERROR;
    length=utrie_serialize(newTrie, storage, sizeof(storage),
                           dataIs32 ? _testFoldedValue32 : _testFoldedValue16,
                           (UBool)!dataIs32,
                           &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: utrie_serialize(%s) failed: %s\n", testName, u_errorName(errorCode));
        utrie_close(newTrie);
        return;
    }

    /* test linear Latin-1 range from utrie_getData() */
    if(latin1Linear) {
        uint32_t *data;
        int32_t dataLength;

        data=utrie_getData(newTrie, &dataLength);
        start=0;
        for(i=0; i<countCheckRanges && start<=0xff; ++i) {
            limit=checkRanges[i].limit;
            value=checkRanges[i].value;

            while(start<limit && start<=0xff) {
                if(value!=data[UTRIE_DATA_BLOCK_LENGTH+start]) {
                    log_err("error: newTrie(%s).latin1Data[U+%04lx]==0x%lx instead of 0x%lx\n",
                            testName, start, data[UTRIE_DATA_BLOCK_LENGTH+start], value);
                }
                ++start;
            }
        }
    }

    utrie_close(newTrie);

    errorCode=U_ZERO_ERROR;
    /*
     * A usable trie takes up more than 0 bytes; in addition, rely on the
     * error code because the failure return value is not documented.
     */
    if(utrie_unserialize(&trie, storage, length, &errorCode)<=0 || U_FAILURE(errorCode)) {
        log_err("error: utrie_unserialize() failed, %s\n", u_errorName(errorCode));
        return;
    }
    trie.getFoldingOffset=getFoldingOffset;

    if(dataIs32!=(trie.data32!=NULL)) {
        log_err("error: trie serialization (%s) did not preserve 32-bitness\n", testName);
    }
    if(latin1Linear!=trie.isLatin1Linear) {
        log_err("error: trie serialization (%s) did not preserve Latin-1-linearity\n", testName);
    }

    /* verify that all these values are in the unserialized Trie */
    start=0;
    for(i=0; i<countCheckRanges; ++i) {
        limit=checkRanges[i].limit;
        value=checkRanges[i].value;

        if(start==0xd800) {
            /* skip surrogates */
            start=limit;
            continue;
        }

        while(start<limit) {
            if(start<=0xffff) {
                if(dataIs32) {
                    value2=UTRIE_GET32_FROM_BMP(&trie, start);
                } else {
                    value2=UTRIE_GET16_FROM_BMP(&trie, start);
                }
                if(value!=value2) {
                    log_err("error: unserialized trie(%s).fromBMP(U+%04lx)==0x%lx instead of 0x%lx\n",
                            testName, start, value2, value);
                }
                /* lead surrogates may carry folded values, so only compare other code units */
                if(!UTF_IS_LEAD(start)) {
                    if(dataIs32) {
                        value2=UTRIE_GET32_FROM_LEAD(&trie, start);
                    } else {
                        value2=UTRIE_GET16_FROM_LEAD(&trie, start);
                    }
                    if(value!=value2) {
                        log_err("error: unserialized trie(%s).fromLead(U+%04lx)==0x%lx instead of 0x%lx\n",
                                testName, start, value2, value);
                    }
                }
            }
            if(dataIs32) {
                UTRIE_GET32(&trie, start, value2);
            } else {
                UTRIE_GET16(&trie, start, value2);
            }
            if(value!=value2) {
                log_err("error: unserialized trie(%s).get(U+%04lx)==0x%lx instead of 0x%lx\n",
                        testName, start, value2, value);
            }
            ++start;
        }
    }

    /* enumerate and verify all ranges */
    enumRanges=checkRanges+1; /* skip the dummy start entry */
    utrie_enum(&trie, _testEnumValue, _testEnumRange, &enumRanges);

    /* test linear Latin-1 range */
    if(trie.isLatin1Linear) {
        if(trie.data32!=NULL) {
            const uint32_t *latin1=UTRIE_GET32_LATIN1(&trie);

            for(start=0; start<0x100; ++start) {
                if(latin1[start]!=UTRIE_GET32_FROM_LEAD(&trie, start)) {
                    log_err("error: (%s) trie.latin1[U+%04lx]=0x%lx!=0x%lx=trie.get32(U+%04lx)\n",
                            testName, start, latin1[start], UTRIE_GET32_FROM_LEAD(&trie, start), start);
                }
            }
        } else {
            const uint16_t *latin1=UTRIE_GET16_LATIN1(&trie);

            for(start=0; start<0x100; ++start) {
                if(latin1[start]!=UTRIE_GET16_FROM_LEAD(&trie, start)) {
                    log_err("error: (%s) trie.latin1[U+%04lx]=0x%lx!=0x%lx=trie.get16(U+%04lx)\n",
                            testName, start, latin1[start], UTRIE_GET16_FROM_LEAD(&trie, start), start);
                }
            }
        }
    }

    testTrieIteration(testName, &trie, checkRanges, countCheckRanges);
}
/*
 * Run testTrieRanges() twice with the same data: first without and then
 * with a linear Latin-1 range. The second run uses testName with
 * "-latin1Linear" appended.
 * NOTE(review): the name is built in a 40-char buffer without a length
 * check, so testName must be short enough - confirm for new callers.
 */
static void
testTrieRanges2(const char *testName,
                const SetRange setRanges[], int32_t countSetRanges,
                const CheckRange checkRanges[], int32_t countCheckRanges,
                UBool dataIs32) {
    char linearName[40];

    /* prepare the name for the second run up front */
    uprv_strcpy(linearName, testName);
    uprv_strcat(linearName, "-latin1Linear");

    testTrieRanges(testName, setRanges, countSetRanges, checkRanges, countCheckRanges, dataIs32, FALSE);
    testTrieRanges(linearName, setRanges, countSetRanges, checkRanges, countCheckRanges, dataIs32, TRUE);
}
static void
testTrieRanges4(const char *testName,
const SetRange setRanges[], int32_t countSetRanges,
const CheckRange checkRanges[], int32_t countCheckRanges) {
char name[40];
uprv_strcpy(name, testName);
uprv_strcat(name, ".32");
testTrieRanges2(name,
setRanges, countSetRanges,
checkRanges, countCheckRanges,
TRUE);
uprv_strcpy(name, testName);
uprv_strcat(name, ".16");
testTrieRanges2(name,
setRanges, countSetRanges,
checkRanges, countCheckRanges,
FALSE);
}
/* test data ----------------------------------------------------------------*/

/*
 * Set consecutive ranges, even with value 0.
 * Each entry is { start, limit, value, overwrite }.
 */
static const SetRange
setRanges1[]={
    { 0,       0x20,     0,      FALSE },
    { 0x20,    0xa7,     0x1234, FALSE },
    { 0xa7,    0x3400,   0,      FALSE },
    { 0x3400,  0x9fa6,   0x6162, FALSE },
    { 0x9fa6,  0xdada,   0x3132, FALSE },
    { 0xdada,  0xeeee,   0x27,   FALSE },
    { 0xeeee,  0x11111,  1,      FALSE },
    { 0x11111, 0x44444,  0x6162, FALSE },
    { 0x44444, 0xf0003,  0,      FALSE },
    { 0xf0003, 0xf0004,  0xf,    FALSE },
    { 0xf0004, 0xf0006,  0x10,   FALSE },
    { 0xf0006, 0xf0007,  0x11,   FALSE },
    { 0xf0007, 0xf0020,  0x12,   FALSE },
    { 0xf0020, 0x110000, 0,      FALSE }
};
/*
 * Expected result of setRanges1: each entry is { limit, value }, where
 * the value applies from the previous entry's limit to before this limit.
 */
static const CheckRange
checkRanges1[]={
    { 0,        0 },      /* dummy start range to make _testEnumRange() simpler */
    { 0x20,     0 },
    { 0xa7,     0x1234 },
    { 0x3400,   0 },
    { 0x9fa6,   0x6162 },
    { 0xdada,   0x3132 },
    { 0xeeee,   0x27 },
    { 0x11111,  1 },
    { 0x44444,  0x6162 },
    { 0xf0003,  0 },
    { 0xf0004,  0xf },
    { 0xf0006,  0x10 },
    { 0xf0007,  0x11 },
    { 0xf0020,  0x12 },
    { 0x110000, 0 }
};
/*
 * Set some interesting overlapping ranges.
 * Each entry is { start, limit, value, overwrite }.
 */
static const SetRange
setRanges2[]={
    { 0x21,    0x7f,    0x5555, TRUE },
    { 0x2f800, 0x2fedc, 0x7a,   TRUE },
    { 0x72,    0xdd,    3,      TRUE },
    { 0xdd,    0xde,    4,      FALSE },
    { 0x2f987, 0x2fa98, 5,      TRUE },
    { 0x2f777, 0x2f833, 0,      TRUE },
    { 0x2f900, 0x2ffee, 1,      FALSE },
    { 0x2ffee, 0x2ffef, 2,      TRUE }
};
/*
 * Expected result of setRanges2: each entry is { limit, value }, where
 * the value applies from the previous entry's limit to before this limit.
 */
static const CheckRange
checkRanges2[]={
    { 0,        0 },      /* dummy start range to make _testEnumRange() simpler */
    { 0x21,     0 },
    { 0x72,     0x5555 },
    { 0xdd,     3 },
    { 0xde,     4 },
    { 0x2f833,  0 },
    { 0x2f987,  0x7a },
    { 0x2fa98,  5 },
    { 0x2fedc,  0x7a },
    { 0x2ffee,  1 },
    { 0x2ffef,  2 },
    { 0x110000, 0 }
};
/*
 * Test entry point: runs all trie tests (32-bit and 16-bit values, each
 * with and without a linear Latin-1 range) for both sets of test data.
 * Declared with (void) so that the definition is a real prototype.
 */
static void
TrieTest(void) {
    testTrieRanges4("set1",
        setRanges1, ARRAY_LENGTH(setRanges1),
        checkRanges1, ARRAY_LENGTH(checkRanges1));
    testTrieRanges4("set2",
        setRanges2, ARRAY_LENGTH(setRanges2),
        checkRanges2, ARRAY_LENGTH(checkRanges2));
}
#if 1

/* Register the trie test with the cintltst test tree. */
void
addTrieTest(TestNode** root) {
    addTest(root, TrieTest, "tsutil/TrieTest");
}

#else

/* standalone utrie development: run the test directly, without the test framework */
int main(int argc, const char *argv[]) {
    TrieTest();
    return 0;
}

#endif

View File

@ -180,10 +180,6 @@ main(int argc, char* argv[]) {
/* process parsed data */
if(U_SUCCESS(errorCode)) {
compactProps();
compactStage3();
compactStage2();
/* write the properties data file */
generateData(destDir);
}

View File

@ -74,15 +74,6 @@ addProps(uint32_t c, uint32_t props);
extern void
repeatProps(uint32_t first, uint32_t last, uint32_t props);
extern void
compactStage2(void);
extern void
compactStage3(void);
extern void
compactProps(void);
extern void
generateData(const char *dataDir);

View File

@ -24,6 +24,7 @@
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "genprops.h"
@ -40,8 +41,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 1.1 .
The following is a description of format version 2.0 .
Data contents:
@ -53,39 +53,28 @@ the properties, if any, for that code point. This means that the input
to the lookup are 21-bit unsigned integers, with not all of the
21-bit range used.
It is assumed that client code keeps a uint16_t pointer
It is assumed that client code keeps a uint32_t pointer
to the beginning of the data:
const uint16 *p16;
Some indexes assume 32-bit units; although client code should only
cast the above pointer to (const uint32_t *), it is easier here
to talk about the result of the indexing with the definition of
another pointer variable for this:
const uint32_t *p32=(const uint32_t *)p16;
const uint32_t *p32;
Formally, the file contains the following structures:
A0 const uint16_t STAGE_2_BITS(=6);
A1 const uint16_t STAGE_3_BITS(=4);
(STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS))
A2 const uint16_t exceptionsIndex; -- 32-bit unit index
A3 const uint16_t stage3Index; -- 16-bit unit index of stage3, new in formatVersion 1.1
A4 const uint16_t propsIndex; -- 32-bit unit index, new in formatVersion 1.1
A5 const uint16_t exceptionsTopIndex; -- 32-bit unit index to the first unit after exceptions units, new in formatVersion 1.1
A6 const uint16_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special casing
A7 const uint16_t reservedIndex;
indexes[16] with values i0..i15:
S1 const uint16_t stage1[0x440]; -- 0x440=0x110000>>10
S2 const uint16_t stage2[variable size];
S3 const uint16_t stage3[variable size];
(possible 1*uint16_t for padding to 4-alignment)
i0 const int32_t propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 const int32_t exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 const int32_t exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i3 const int32_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special mappings
i4..i15 const int32_t[] reservedIndex; -- reserved values; 0 for now
P const uint32_t props32[variable size];
E const uint32_t exceptions[variable size];
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
3-stage lookup and properties:
P const uint32_t props32[i1-i0];
E const uint32_t exceptions[i2-i1];
U const UChar uchars[2*(i3-i2)];
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
the Unicode code assignment are exploited:
@ -95,56 +84,30 @@ the Unicode code assignment are exploited:
- Inside blocks for scripts the properties are often repetitive.
- The 21-bit space is not fully used for Unicode.
The three-stage lookup organizes code points in groups of 16 in stage 3.
64 such groups are grouped again, resulting in blocks of 64 indexes
for a total of 1k code points in stage 2.
The first stage is limited according to all code points being <0x110000.
Each stage contains indexes to groups or blocks of the next stage
in an n:1 manner, i.e., multiple entries of one stage may index the same
group or block in the next one.
In the second and third stages, groups of 64 or 16 may partially or completely
overlap to save space with repetitive properties.
In the properties table, only unique 32-bit words are stored to exploit
non-adjacent overlapping. This is why the third stage does not directly
contain the 32-bit properties words but only indexes to them.
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie implementation.
The trie lookup result is a 16-bit index in the props32[] table where the
actual 32-bit properties word is stored. This is done to save space.
The indexes in each stage take the offset in the data of the next block into
account to save additional arithmetic in the access.
(There are thousands of 16-bit entries in the trie data table, but
only a few hundred unique 32-bit properties words.
If the trie data table contained 32-bit words directly, then that would be
larger because the length of the table would be the same as now but the
width would be 32 bits instead of 16. This saves more than 10kB.)
With a given Unicode code point
uint32_t c;
UChar32 c;
and 0<=c<0x110000, the lookup uses the three stage tables to
arrive at an index into the props32[] table containing the character
properties for c.
For some characters, not all of the properties can be efficiently encoded
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
array.
The first stage consumes the 11 most significant bits of the 21-bit code point
and results in an index into the second stage:
uint16_t i2=p16[8+(c>>10)];
The second stage consumes bits 9 to 4 of c and results in an index into the
third stage:
uint16_t i3=p16[i2+((c>>4)&0x3f)];
The third stage consumes bits 3 to 0 of c and results in a code point-
specific value, which itself is only an index into the props32[] table:
uint16_t i=p16[i3+(c&0xf)];
Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS
in p16[0..1].
There is finally the 32-bit encoded set of properties for c:
and 0<=c<0x110000, the lookup is done like this:
uint16_t i;
UTRIE_GET16(c, i);
uint32_t props=p32[i];
For some characters, this contains an index into the exceptions array:
For some characters, not all of the properties can be efficiently encoded
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
array:
if(props&EXCEPTION_BIT) {
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
@ -280,31 +243,16 @@ static UDataInfo dataInfo={
U_SIZEOF_UCHAR,
0,
{0x55, 0x50, 0x72, 0x6f}, /* dataFormat="UPro" */
{1, 3, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion */
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
{ 2, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 3, 0, 0, 0 } /* dataVersion */
};
/* definitions and arrays for the 3-stage lookup */
/* definitions of expected data size limits */
/*
 * Build-time constants: bit widths, shifts and block sizes of the 3-stage
 * lookup, followed by capacity limits for the statically sized tables.
 *
 * NOTE(review): MAX_EXCEPTIONS_COUNT is listed twice below and the
 * MAX_STAGE_2_COUNT line has no trailing comma; this looks like two versions
 * of the enumerator list interleaved -- confirm which lines are intended.
 */
enum {
STAGE_2_BITS=6, STAGE_3_BITS=4,
/* the three stages together consume a 21-bit code point */
STAGE_1_BITS=21-(STAGE_2_BITS+STAGE_3_BITS),
/* right-shift amounts that bring the stage 2 / stage 1 bits down to bit 0 */
STAGE_2_SHIFT=STAGE_3_BITS,
STAGE_1_SHIFT=(STAGE_2_SHIFT+STAGE_2_BITS),
/* number of entries per sub-table in each stage */
STAGE_1_BLOCK=0x110000>>STAGE_1_SHIFT,
STAGE_2_BLOCK=1<<STAGE_2_BITS,
STAGE_3_BLOCK=1<<STAGE_3_BITS,
/* number of code points per stage 1 index */
STAGE_2_3_AREA=1<<STAGE_1_SHIFT,
/* capacity limits for the properties, UChars and exceptions storage */
MAX_PROPS_COUNT=25000,
MAX_UCHAR_COUNT=10000,
MAX_EXCEPTIONS_COUNT=4096,
MAX_STAGE_2_COUNT=MAX_PROPS_COUNT
MAX_EXCEPTIONS_COUNT=4096
};
/* definitions for the properties words */
@ -321,15 +269,16 @@ enum {
/*
 * File-scope state of the properties store.
 *
 * NOTE(review): props and propsTop are each declared twice below with
 * different types (array vs. pointer, uint16_t vs. int32_t); this looks like
 * two versions of the declarations interleaved -- confirm which are intended.
 */
/* largest and smallest values of a signed field that is VALUE_BITS wide */
static const int32_t MAX_VALUE=(1L<<(VALUE_BITS-1))-1;
static const int32_t MIN_VALUE=-(1L<<(VALUE_BITS-1));
/* tables of the 3-stage lookup, plus map[] used as scratch space when compacting */
static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
/* build-time trie; created in initStore() with utrie_open() */
static UNewTrie *pTrie=NULL;
/* stage1Top=STAGE_1_BLOCK never changes, stage2Top starts after the empty-properties-group */
static uint16_t stage2Top=STAGE_2_BLOCK, stage3Top;
/* props32[] contains unique properties words after compacting the array of properties */
static uint32_t props32[MAX_PROPS_COUNT];
/* props[] is used before, props32[] after compacting the array of properties */
static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
static uint32_t *props;
/* length of props32[] after compaction */
static int32_t propsTop;
/* exceptions values */
static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
@ -344,36 +293,9 @@ static uint16_t exceptionsCount=0;
/* prototypes --------------------------------------------------------------- */
/* forward declarations of the file-local (static) helpers used by the store */

/* range-fill helpers called from repeatProps() */
static void
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x);
static void
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x);
/* common implementation behind compactStage2() and compactStage3() */
static uint16_t
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
uint16_t *parent, uint16_t parentTop);
/* comparison callback handed to qsort() in compactProps() */
static int
compareProps(const void *l, const void *r);
/* lookup helpers that are only compiled for debug output */
#if DO_DEBUG_OUT
static uint32_t
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4);
static uint32_t
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
#endif
static void
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
/* block allocators for stage2[] and props[] */
static uint16_t
allocStage2(void);
static uint16_t
allocProps(void);
static uint32_t
addUChars(const UChar *s, uint32_t length);
@ -388,11 +310,12 @@ setUnicodeVersion(const char *v) {
/*
 * Reset the store before any properties are added.
 *
 * Zero-fills the lookup tables and the properties arrays and creates the
 * build-time trie in pTrie. If utrie_open() returns NULL, an error message is
 * written to stderr and the program exits with U_MEMORY_ALLOCATION_ERROR.
 *
 * NOTE(review): this body both clears the stage1/stage2/stage3/map/props
 * tables and opens a trie; it looks like two versions of the initialization
 * interleaved -- confirm which statements are intended.
 */
extern void
initStore() {
uprv_memset(stage1, 0, sizeof(stage1));
uprv_memset(stage2, 0, sizeof(stage2));
uprv_memset(stage3, 0, sizeof(stage3));
uprv_memset(map, 0, sizeof(map));
uprv_memset(props, 0, sizeof(props));
/* MAX_PROPS_COUNT is presumably the data capacity of the new trie -- see utrie.h */
pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, FALSE);
if(pTrie==NULL) {
fprintf(stderr, "error: unable to create a UNewTrie\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
uprv_memset(props32, 0, sizeof(props32));
}
@ -702,292 +625,19 @@ makeProps(Props *p) {
/*
 * Store the properties word x for the single code point c.
 *
 * NOTE(review): the value is written twice here, once through setProps()
 * (whose three output indexes are discarded into notUsed) and once through
 * utrie_set32() on pTrie; this looks like two versions of the body
 * interleaved -- confirm which call is intended.
 */
extern void
addProps(uint32_t c, uint32_t x) {
uint16_t notUsed;
setProps(c, x, &notUsed, &notUsed, &notUsed);
utrie_set32(pTrie, (UChar32)c, x);
}
/* areas of same properties ------------------------------------------------- */
/*
 * Assign the properties word x to every code point in the inclusive range
 * first..last, sharing table blocks wherever possible.
 *
 * One stage 3 block filled with x is allocated up front (i3Repeat). Stage 1
 * entries that are still 0 and are covered completely all point to a single
 * shared stage 2 block (i2Repeat) whose entries all point to i3Repeat.
 * Stage 1 entries that already have a stage 2 block, and the partially
 * covered areas at either end, are delegated to repeatFromStage2().
 *
 * first, last: inclusive code point range
 * x:           properties word to store
 */
extern void
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
/*
* Set the repetitive properties for the big, known areas of all the same
* character properties. Most of those will share the same stage 2 and 3
* tables.
*
* Assumptions:
* - each area starts at a code point that is a multiple of 16
* - there may be some properties already stored for some code points,
* especially in the Private Use areas
*/
uint16_t i1, i2, j3, i1Limit, i2Repeat, i3Repeat;
uint32_t start, next, limit;
/* fill in the repetitive properties */
/* work with a half-open range [start, limit) from here on */
start=first;
limit=last+1;
/* allocate a stage 3 block and set all of its properties to x */
i3Repeat=allocProps();
for(j3=0; j3<STAGE_3_BLOCK; ++j3) {
props[i3Repeat+j3]=x;
}
/* we will need to allocate a stage 2 block if we use an entire one at all */
/* 0 means "shared stage 2 block not allocated yet" */
i2Repeat=0;
i1=(uint16_t)(start>>STAGE_1_SHIFT);
i1Limit=(uint16_t)(limit>>STAGE_1_SHIFT);
/*
* now there are up to three sub-areas:
* - a range of code points before the first full block for
* one stage 1 index
* - a (big) range of code points within full blocks for
* stage 1 indexes
* - a range of code points after the last full block for
* one stage 1 index
*/
if((start&(STAGE_2_3_AREA-1))!=0) {
/* incomplete stage 2 block at the beginning */
/* allocate the stage 2 block if necessary */
i2=stage1[i1];
if(i2==0) {
stage1[i1]=i2=allocStage2();
}
/* fill stages 2 & 3 of this sub-area */
if(i1<i1Limit) {
/* the stage 2 block goes to the end */
next=(i1+1)<<STAGE_1_SHIFT;
repeatFromStage2(i2, start, next, i3Repeat, x);
start=next;
/* advance i1 to the first full block */
++i1;
} else {
/* there is only one stage 2 block at all */
repeatFromStage2(i2, start, limit, i3Repeat, x);
return;
}
}
while(i1<i1Limit) {
/* fill complete stage 2 blocks */
next=start+STAGE_2_3_AREA;
i2=stage1[i1];
if(i2==0) {
/* set the index for common repeat block for stage 2 */
if(i2Repeat==0) {
/* allocate and fill a stage 2 block for this */
uint16_t j2;
i2Repeat=allocStage2();
for(j2=0; j2<STAGE_2_BLOCK; ++j2) {
stage2[i2Repeat+j2]=i3Repeat;
}
}
stage1[i1]=i2Repeat;
} else {
/* a stage 2 block already exists for this area: fill it entry by entry */
repeatFromStage2(i2, start, next, i3Repeat, x);
}
start=next;
++i1;
}
if(start<limit) {
/* fill the area after the last full block */
i2=stage1[i1];
if(i2==0) {
stage1[i1]=i2=allocStage2();
}
repeatFromStage2(i2, start, limit, i3Repeat, x);
}
}
/*
 * Set a section of a stage 2 table and its properties to x.
 *
 * i2:       index in stage2[] of the first entry of the stage 2 block
 * start:    first code point of the section; only the bits below
 *           STAGE_1_SHIFT are used
 * limit:    code point after the last one of the section; reduced to an
 *           offset in 1..STAGE_2_3_AREA within the block
 * i3Repeat: index of a props[] block that is completely filled with x; used
 *           for fully covered stage 2 entries that are still 0
 * x:        properties word to store
 *
 * NOTE(review): after the leading partial block, start is replaced by
 * next=(i2+1)<<STAGE_3_BITS, which is derived from the absolute stage2[]
 * index rather than from the masked start. repeatFromStage3() only uses the
 * low STAGE_3_BITS bits, but the final start<limit test compares the two --
 * confirm that this path is unreachable given the "multiple of 16"
 * assumption stated in repeatProps().
 */
static void
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x) {
uint32_t next;
uint16_t i2Limit, i3;
/* remove irrelevant bits from start and limit */
start&=STAGE_2_3_AREA-1;
limit=((limit-1)&(STAGE_2_3_AREA-1))+1;
/* turn the block start index into the indexes of the first and the limit entry */
i2Limit=(uint16_t)(i2+(limit>>STAGE_3_BITS));
i2+=(uint16_t)(start>>STAGE_3_BITS);
/* similar to repeatProps(), there may be 3 sub-areas */
if((start&(STAGE_3_BLOCK-1))!=0) {
/* incomplete stage 3 block at the beginning */
i3=stage2[i2];
if(i3==0) {
stage2[i2]=i3=allocProps();
}
if(i2<i2Limit) {
/* the stage 3 block goes to the end */
next=(i2+1)<<STAGE_3_BITS;
repeatFromStage3(i3, start, next, x);
start=next;
++i2;
} else {
/* there is only one stage 3 block at all */
repeatFromStage3(i3, start, limit, x);
return;
}
}
while(i2<i2Limit) {
/* fill complete stage 3 blocks */
next=start+STAGE_3_BLOCK;
i3=stage2[i2];
if(i3==0) {
/* nothing stored here yet: share the block that is already filled with x */
stage2[i2]=i3Repeat;
} else {
repeatFromStage3(i3, start, next, x);
}
start=next;
++i2;
}
if(start<limit) {
/* incomplete stage 3 block at the end */
i3=stage2[i2];
if(i3==0) {
stage2[i2]=i3=allocProps();
}
repeatFromStage3(i3, start, limit, x);
}
}
/*
 * Set the still-empty properties of one stage 3 block section to x.
 *
 * i3:    index in props[] of the first entry of the block
 * start: first code point of the section; only the low STAGE_3_BITS bits
 *        are used
 * limit: code point after the last one of the section; only the low
 *        STAGE_3_BITS bits of limit-1 are used
 * x:     properties word to store where props[] is still 0
 *
 * NOTE(review): the trailing utrie_setRange32() call uses first and last,
 * which are not parameters or locals of this function; it looks like a line
 * from a different version of repeatProps() fused into this body -- confirm.
 */
static void
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x) {
uint16_t i3End;
/* i3End is the index of the last entry to visit (inclusive) */
i3End=(uint16_t)(i3+((limit-1)&(STAGE_3_BLOCK-1)));
i3+=(uint16_t)(start&(STAGE_3_BLOCK-1));
while(i3<=i3End) {
/* some properties may be set in this stage 3 block */
if(props[i3]==0) {
props[i3]=x;
}
++i3;
}
utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, TRUE);
}
/* compacting --------------------------------------------------------------- */
/*
 * Compact stage2[] in place and update stage2Top.
 *
 * The work is done by compactStage(), which also rewrites the stage1[]
 * entries so that they index the moved stage 2 blocks. With beVerbose set,
 * the old and new sizes are printed.
 */
extern void
compactStage2(void) {
    const uint16_t oldTop=stage2Top;

    stage2Top=compactStage(stage2, oldTop, STAGE_2_BLOCK, stage1, STAGE_1_BLOCK);

    /* report how much space was saved */
    if(beVerbose) {
        printf("compactStage2() reduced stage2Top from %u to %u\n", oldTop, stage2Top);
    }

#if DO_DEBUG_OUT
    {
        /* debug output: sample the lookup at every 307th code point below 0xffff */
        uint16_t s1, s2, s3, s4;
        uint32_t cp=0;

        while(cp<0xffff) {
            printf("properties(0x%06x)=0x%06x\n", cp, getProps2(cp, &s1, &s2, &s3, &s4));
            cp+=307;
        }
    }
#endif
}
/*
 * Compact stage3[] in place and update stage3Top.
 *
 * The work is done by compactStage(), which also rewrites the first
 * stage2Top entries of stage2[] so that they index the moved stage 3 blocks.
 * With beVerbose set, the old and new sizes are printed.
 */
extern void
compactStage3(void) {
    const uint16_t oldTop=stage3Top;

    stage3Top=compactStage(stage3, oldTop, STAGE_3_BLOCK, stage2, stage2Top);

    /* report how much space was saved */
    if(beVerbose) {
        printf("compactStage3() reduced stage3Top from %u to %u\n", oldTop, stage3Top);
    }

#if DO_DEBUG_OUT
    {
        /* debug output: sample the lookup at every 307th code point below 0xffff */
        uint16_t s1, s2, s3, s4;
        uint32_t cp=0;

        while(cp<0xffff) {
            printf("properties(0x%06x)=0x%06x\n", cp, getProps2(cp, &s1, &s2, &s3, &s4));
            cp+=307;
        }
    }
#endif
}
/*
 * Compact one stage table in place by letting adjacent blocks overlap, and
 * remap the parent table to the new block positions.
 *
 * stage:     table to compact; modified in place
 * stageTop:  number of entries in stage[]
 * blockSize: number of stage[] entries per block
 * parent:    table whose entries are block start indexes into stage[];
 *            each entry is replaced with map[entry]
 * parentTop: number of entries in parent[]
 *
 * Returns the number of stage[] entries that remain in use.
 * Uses the file-level map[] array as scratch space.
 */
static uint16_t
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
uint16_t *parent, uint16_t parentTop) {
/*
* This function is the common implementation for compacting
* a stage table.
* There are stageTop entries (indexes) in stage[].
* stageTop is a multiple of blockSize, and there are always blockSize stage[] entries
* per parent stage entry which do not overlap - yet.
* The first blockSize stage[] entries are always the empty ones.
* We make the blocks overlap appropriately here and fill every blockSize-th entry in
* map[] with the mapping from old to new properties indexes
* in order to adjust the parent stage tables.
* This simple algorithm does not find arbitrary overlaps, but only those
* where the last i entries of the previous block and the first i of the
* current one all have the same value.
* This seems reasonable and yields linear performance.
*/
uint16_t i, start, prevEnd, newStart, x;
/* the first (empty) block stays where it is */
map[0]=0;
newStart=blockSize;
/* start is the read position (old layout), newStart the write position (new layout) */
for(start=newStart; start<stageTop;) {
prevEnd=(uint16_t)(newStart-1);
x=stage[start];
if(x==stage[prevEnd]) {
/* overlap by at least one */
/* count how many leading entries of this block equal x and match the tail of the previous one */
for(i=1; i<blockSize && x==stage[start+i] && x==stage[prevEnd-i]; ++i) {}
/* overlap by i */
map[start]=(uint16_t)(newStart-i);
/* move the non-overlapping indexes to their new positions */
start+=i;
for(i=(uint16_t)(blockSize-i); i>0; --i) {
stage[newStart++]=stage[start++];
}
} else if(newStart<start) {
/* move the indexes to their new positions */
map[start]=newStart;
for(i=blockSize; i>0; --i) {
stage[newStart++]=stage[start++];
}
} else /* no overlap && newStart==start */ {
/* nothing has been saved so far, the block stays in place */
map[start]=start;
newStart+=blockSize;
start=newStart;
}
}
/* now adjust the parent stage table */
for(i=0; i<parentTop; ++i) {
parent[i]=map[parent[i]];
}
/* we saved some space */
/* start-newStart is the number of entries removed by overlapping */
return (uint16_t)(stageTop-(start-newStart));
}
extern void
static void
compactProps(void) {
/*
* At this point, all the propsTop properties are in props[], but they
@ -1003,8 +653,11 @@ compactProps(void) {
* index table anyway and qsort() does not allow sorting two tables together
* directly. This will thus also reduce the amount of data moved around.
*/
uint16_t i, oldIndex, newIndex;
uint32_t x;
int32_t i, oldIndex, newIndex;
static uint16_t map[MAX_PROPS_COUNT];
#if DO_DEBUG_OUT
{
/* debug output */
@ -1016,14 +669,16 @@ compactProps(void) {
}
#endif
props=utrie_getData(pTrie, &propsTop);
/* build the index table */
for(i=propsTop; i>0;) {
--i;
map[i]=i;
map[i]=(uint16_t)i;
}
/* do not reorder the first, empty entries */
qsort(map+STAGE_3_BLOCK, propsTop-STAGE_3_BLOCK, 2, compareProps);
/* reorder */
qsort(map, propsTop, 2, compareProps);
/*
* Now invert the reordered table and compact it in the same step.
@ -1035,22 +690,22 @@ compactProps(void) {
/* set the first of a possible series of the same properties */
oldIndex=map[i];
props32[newIndex]=x=props[oldIndex];
stage3[oldIndex]=newIndex;
props[oldIndex]=newIndex;
/* set the following same properties only in stage3 */
while(++i<propsTop && x==props[map[i]]) {
stage3[map[i]]=newIndex;
props[map[i]]=newIndex;
}
++newIndex;
}
/* we saved some space */
stage3Top=propsTop;
propsTop=newIndex;
if(beVerbose) {
printf("compactProps() reduced propsTop from %u to %u\n", stage3Top, propsTop);
printf("compactProps() reduced propsTop from %u to %u\n", propsTop, newIndex);
}
propsTop=newIndex;
#if DO_DEBUG_OUT
{
/* debug output */
@ -1077,56 +732,77 @@ compareProps(const void *l, const void *r) {
/* generate output data ----------------------------------------------------- */
/*
 * Folding callback passed to utrie_serialize().
 *
 * Scans the 0x400 code points beginning at start. As soon as one of them
 * has a non-zero trie value outside of block zero, the folded value is the
 * 16-bit offset with bit 15 set; if none does, the folded value is 0.
 * Whole data blocks that utrie_get32() reports as block zero are skipped.
 */
static uint32_t U_CALLCONV
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 end=start+0x400;
    UChar32 c=start;
    UBool isZeroBlock;
    uint32_t word;

    while(c<end) {
        word=utrie_get32(trie, c, &isZeroBlock);
        if(!isZeroBlock) {
            if(word!=0) {
                /* at least one real entry: store the offset and flag it */
                return (uint32_t)(offset|0x8000);
            }
            ++c;
        } else {
            /* nothing was ever stored in this block, jump over all of it */
            c+=UTRIE_DATA_BLOCK_LENGTH;
        }
    }
    return 0;
}
extern void
generateData(const char *dataDir) {
static uint16_t indexes[8]={
STAGE_2_BITS, STAGE_3_BITS,
0, 0,
static int32_t indexes[16]={
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0
};
static uint8_t trieBlock[40000];
UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR;
uint32_t size;
int32_t trieSize, offset;
long dataLength;
uint16_t i, offset;
/* fix up the indexes in the stage tables to include the table offsets in the data */
offset=8+STAGE_1_BLOCK; /* uint16_t offset to stage2[] */
for(i=0; i<STAGE_1_BLOCK; ++i) {
stage1[i]+=offset;
compactProps();
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), trieSize);
exit(errorCode);
}
offset+=stage2Top; /* uint16_t offset to stage3[] */
indexes[3]=offset;
for(i=0; i<stage2Top; ++i) {
stage2[i]+=offset;
}
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
offset=(uint16_t)((offset+stage3Top+1)/2); /* uint32_t offset to props[], include padding */
indexes[4]=offset; /* uint32_t offset to props[] */
for(i=0; i<stage3Top; ++i) {
stage3[i]+=offset;
}
/* round up trie size to 4-alignment */
trieSize=(trieSize+3)&~3;
offset+=trieSize>>2;
indexes[0]=offset; /* uint32_t offset to props[] */
offset+=propsTop;
indexes[2]=offset; /* uint32_t offset to exceptions[] */
indexes[1]=offset; /* uint32_t offset to exceptions[] */
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
indexes[5]=offset;
indexes[2]=offset;
/* round up UChar count to 4-alignment */
ucharsTop=(ucharsTop+1)&~1;
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
indexes[6]=offset;
indexes[3]=offset;
size=4*offset; /* total size of data */
if(beVerbose) {
printf("number of stage 2 entries: %5u\n", stage2Top);
printf("number of stage 3 entries: %5u\n", stage3Top);
printf("trie size in bytes: %5u\n", trieSize);
printf("number of unique properties values: %5u\n", propsTop);
printf("number of code points with exceptions: %5u\n", exceptionsCount);
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
printf("number of UChars for special mappings: %5u\n", ucharsTop);
printf("data size: %6lu\n", (unsigned long)size);
}
@ -1134,15 +810,12 @@ generateData(const char *dataDir) {
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
exit(errorCode);
}
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, stage1, sizeof(stage1));
udata_writeBlock(pData, stage2, 2*stage2Top);
udata_writeBlock(pData, stage3, 2*stage3Top);
udata_writePadding(pData, 2*((stage2Top+stage3Top)&1));
udata_writeBlock(pData, trieBlock, trieSize);
udata_writeBlock(pData, props32, 4*propsTop);
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
udata_writeBlock(pData, uchars, 2*ucharsTop);
@ -1163,75 +836,6 @@ generateData(const char *dataDir) {
/* helpers ------------------------------------------------------------------ */
/* get properties after compacting them */
#if DO_DEBUG_OUT
/*
 * Look up the properties word of c through all three stages and the
 * stage3[] -> props32[] indirection.
 *
 * The index used at each step is also stored through pI1..pI4, in that order.
 */
static uint32_t
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4) {
    uint16_t idx1, idx2, idx3, idx4;

    /* stage 1: the top bits of the code point */
    idx1=(uint16_t)(c>>STAGE_1_SHIFT);
    *pI1=idx1;

    /* stage 2: block start from stage 1 plus the middle bits */
    idx2=(uint16_t)(stage1[idx1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
    *pI2=idx2;

    /* stage 3: block start from stage 2 plus the low bits */
    idx3=(uint16_t)(stage2[idx2]+(c&(STAGE_3_BLOCK-1)));
    *pI3=idx3;

    /* stage3[] holds the index of the properties word in props32[] */
    idx4=stage3[idx3];
    *pI4=idx4;

    return props32[idx4];
}
/*
 * Look up the properties word of c in props[] through stage1[] and stage2[];
 * this is the lookup for the state before the properties are compacted.
 *
 * The index used at each step is also stored through pI1..pI3, in that order.
 */
static uint32_t
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
    uint16_t idx1, idx2, idx3;

    /* stage 1: the top bits of the code point */
    idx1=(uint16_t)(c>>STAGE_1_SHIFT);
    *pI1=idx1;

    /* stage 2: block start from stage 1 plus the middle bits */
    idx2=(uint16_t)(stage1[idx1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
    *pI2=idx2;

    /* properties: block start from stage 2 plus the low bits */
    idx3=(uint16_t)(stage2[idx2]+(c&(STAGE_3_BLOCK-1)));
    *pI3=idx3;

    return props[idx3];
}
#endif
/*
 * Store the properties word x for code point c in props[]; this is the
 * setter for the state before the properties are compacted.
 *
 * A stage 2 block and a props[] block are allocated on demand when the
 * corresponding parent entry is still 0. The index used at each step is
 * stored through pI1..pI3, in that order.
 */
static void
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
    uint16_t idx1, idx2, idx3;

    /* stage 1: the top bits of the code point */
    idx1=(uint16_t)(c>>STAGE_1_SHIFT);
    *pI1=idx1;

    /* stage 2: make sure that there is a block, then step to the entry for c */
    idx2=stage1[idx1];
    if(idx2==0) {
        idx2=allocStage2();
        stage1[idx1]=idx2;
    }
    idx2=(uint16_t)(idx2+(uint16_t)((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
    *pI2=idx2;

    /* properties: make sure that there is a block, then step to the entry for c */
    idx3=stage2[idx2];
    if(idx3==0) {
        idx3=allocProps();
        stage2[idx2]=idx3;
    }
    idx3=(uint16_t)(idx3+(uint16_t)(c&(STAGE_3_BLOCK-1)));
    *pI3=idx3;

    props[idx3]=x;
}
/*
 * Reserve the next STAGE_2_BLOCK entries of stage2[].
 *
 * Returns the index of the first reserved entry. If the new top reaches
 * MAX_STAGE_2_COUNT, an error message is written to stderr and the program
 * exits with U_MEMORY_ALLOCATION_ERROR.
 */
static uint16_t
allocStage2(void) {
    const uint16_t blockStart=stage2Top;

    stage2Top+=STAGE_2_BLOCK;
    if(stage2Top<MAX_STAGE_2_COUNT) {
        return blockStart;
    }

    fprintf(stderr, "genprops: stage 2 overflow\n");
    exit(U_MEMORY_ALLOCATION_ERROR);
    return blockStart; /* not reached */
}
/*
 * Reserve the next STAGE_3_BLOCK entries of props[].
 *
 * Returns the index of the first reserved entry. If the new top reaches
 * MAX_PROPS_COUNT, an error message is written to stderr and the program
 * exits with U_MEMORY_ALLOCATION_ERROR.
 */
static uint16_t
allocProps(void) {
    const uint16_t blockStart=propsTop;

    propsTop+=STAGE_3_BLOCK;
    if(propsTop<MAX_PROPS_COUNT) {
        return blockStart;
    }

    fprintf(stderr, "genprops: properties overflow\n");
    exit(U_MEMORY_ALLOCATION_ERROR);
    return blockStart; /* not reached */
}
static uint32_t
addUChars(const UChar *s, uint32_t length) {
uint32_t top=(uint16_t)(ucharsTop+length);