ICU-1586 add common trie implementation
X-SVN-Rev: 7327
This commit is contained in:
parent
5760c7b55b
commit
268abe3937
@ -332,6 +332,10 @@ SOURCE=.\utf_impl.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\utrie.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\uvector.cpp
|
||||
# End Source File
|
||||
# End Group
|
||||
@ -1407,6 +1411,10 @@ InputPath=.\unicode\utf8.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\utrie.h
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\unicode\utypes.h
|
||||
|
||||
!IF "$(CFG)" == "common - Win32 Release"
|
||||
|
@ -25,8 +25,9 @@
|
||||
#include "unicode/uloc.h"
|
||||
#include "umutex.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "utrie.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/*
|
||||
* Since genprops overrides the general category for some control codes,
|
||||
@ -213,36 +214,31 @@ static UDataMemory *propsData=NULL;
|
||||
static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
|
||||
static UVersionInfo dataVersion={ 3, 0, 0, 0 };
|
||||
|
||||
static const uint16_t *propsTable=NULL;
|
||||
#define props32Table ((uint32_t *)propsTable)
|
||||
|
||||
static UTrie propsTrie={ 0 };
|
||||
static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL;
|
||||
static const UChar *ucharsTable=NULL;
|
||||
|
||||
static int8_t havePropsData=0;
|
||||
|
||||
/* index values loaded from uprops.dat */
|
||||
static uint16_t indexes[8];
|
||||
static int32_t indexes[16];
|
||||
|
||||
enum {
|
||||
INDEX_STAGE_2_BITS,
|
||||
INDEX_STAGE_3_BITS,
|
||||
INDEX_EXCEPTIONS,
|
||||
INDEX_STAGE_3_INDEX,
|
||||
INDEX_PROPS,
|
||||
INDEX_UCHARS
|
||||
INDEX_EXCEPTIONS,
|
||||
INDEX_UCHARS,
|
||||
INDEX_RESERVED /* contains the uint32_t offset to the top of the known data */
|
||||
};
|
||||
|
||||
#ifdef UCHAR_VARIABLE_TRIE_BITS
|
||||
/* access values calculated from indexes */
|
||||
static uint16_t stage23Bits, stage2Mask, stage3Mask;
|
||||
# define stage3Bits indexes[INDEX_STAGE_3_BITS]
|
||||
#else
|
||||
/* We are now hardcoding the bit distribution for the trie table access. */
|
||||
# define stage23Bits 10
|
||||
# define stage2Mask 0x3f
|
||||
# define stage3Mask 0xf
|
||||
# define stage3Bits 4
|
||||
#endif
|
||||
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
|
||||
static int32_t U_CALLCONV
|
||||
getFoldingPropsOffset(uint32_t data) {
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static UBool
|
||||
isAcceptable(void *context,
|
||||
@ -256,7 +252,9 @@ isAcceptable(void *context,
|
||||
pInfo->dataFormat[1]==0x50 &&
|
||||
pInfo->dataFormat[2]==0x72 &&
|
||||
pInfo->dataFormat[3]==0x6f &&
|
||||
pInfo->formatVersion[0]==1
|
||||
pInfo->formatVersion[0]==2 &&
|
||||
pInfo->formatVersion[2]==UTRIE_SHIFT &&
|
||||
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
|
||||
) {
|
||||
uprv_memcpy(formatVersion, pInfo->formatVersion, 4);
|
||||
uprv_memcpy(dataVersion, pInfo->dataVersion, 4);
|
||||
@ -271,11 +269,13 @@ uchar_cleanup()
|
||||
{
|
||||
if (propsData) {
|
||||
udata_close(propsData);
|
||||
propsData = NULL;
|
||||
propsData=NULL;
|
||||
}
|
||||
propsTable = NULL;
|
||||
ucharsTable = NULL;
|
||||
havePropsData = FALSE;
|
||||
pData32=NULL;
|
||||
props32Table=NULL;
|
||||
exceptionsTable=NULL;
|
||||
ucharsTable=NULL;
|
||||
havePropsData=FALSE;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
@ -283,9 +283,11 @@ static int8_t
|
||||
loadPropsData() {
|
||||
/* load Unicode character properties data from file if necessary */
|
||||
if(havePropsData==0) {
|
||||
UTrie trie={ 0 };
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
UDataMemory *data;
|
||||
const uint16_t *p=NULL;
|
||||
const uint32_t *p=NULL;
|
||||
int32_t length;
|
||||
|
||||
/* open the data outside the mutex block */
|
||||
data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &errorCode);
|
||||
@ -293,38 +295,33 @@ loadPropsData() {
|
||||
return havePropsData=-1;
|
||||
}
|
||||
|
||||
p=(const uint16_t *)udata_getMemory(data);
|
||||
p=(const uint32_t *)udata_getMemory(data);
|
||||
|
||||
#ifndef UCHAR_VARIABLE_TRIE_BITS
|
||||
/*
|
||||
* We are now hardcoding the bit distribution for the trie table access.
|
||||
* Check that the file is stored accordingly.
|
||||
*/
|
||||
if(p[INDEX_STAGE_2_BITS]!=6 || p[INDEX_STAGE_3_BITS]!=4) {
|
||||
/* unserialize the trie; it is directly after the int32_t indexes[16] */
|
||||
length=(*(int32_t *)p)*4;
|
||||
length=utrie_unserialize(&trie, (const uint8_t *)(p+16), length-64, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
udata_close(data);
|
||||
errorCode=U_INVALID_FORMAT_ERROR;
|
||||
return havePropsData=-1;
|
||||
}
|
||||
#endif
|
||||
trie.getFoldingOffset=getFoldingPropsOffset;
|
||||
|
||||
/* in the mutex block, set the data for this process */
|
||||
umtx_lock(NULL);
|
||||
if(propsData==NULL) {
|
||||
propsData=data;
|
||||
data=NULL;
|
||||
propsTable=p;
|
||||
pData32=p;
|
||||
p=NULL;
|
||||
uprv_memcpy(&propsTrie, &trie, sizeof(trie));
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
|
||||
/* initialize some variables */
|
||||
uprv_memcpy(indexes, propsTable, 16);
|
||||
#ifdef UCHAR_VARIABLE_TRIE_BITS
|
||||
stage23Bits=(uint16_t)(indexes[INDEX_STAGE_2_BITS]+indexes[INDEX_STAGE_3_BITS]);
|
||||
stage2Mask=(uint16_t)((1<<indexes[INDEX_STAGE_2_BITS])-1);
|
||||
stage3Mask=(uint16_t)((1<<indexes[INDEX_STAGE_3_BITS])-1);
|
||||
#endif
|
||||
ucharsTable=(const UChar *)(props32Table+indexes[INDEX_UCHARS]);
|
||||
uprv_memcpy(indexes, pData32, sizeof(indexes));
|
||||
props32Table=pData32+indexes[INDEX_PROPS];
|
||||
exceptionsTable=pData32+indexes[INDEX_EXCEPTIONS];
|
||||
ucharsTable=(const UChar *)(pData32+indexes[INDEX_UCHARS]);
|
||||
havePropsData=1;
|
||||
|
||||
/* if a different thread set it first, then close the extra data */
|
||||
@ -361,28 +358,22 @@ enum {
|
||||
/* getting a uint32_t properties word from the data */
|
||||
#define HAVE_DATA (havePropsData>0 || (havePropsData==0 && loadPropsData()>0))
|
||||
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
|
||||
#define GET_PROPS_UNSAFE(c) \
|
||||
props32Table[ \
|
||||
propsTable[ \
|
||||
propsTable[ \
|
||||
propsTable[8+((c)>>stage23Bits)]+ \
|
||||
(((c)>>stage3Bits)&stage2Mask)]+ \
|
||||
((c)&stage3Mask) \
|
||||
] \
|
||||
]
|
||||
#define GET_PROPS(c) \
|
||||
(((uint32_t)(c))<=0x10ffff ? \
|
||||
HAVE_DATA ? \
|
||||
GET_PROPS_UNSAFE(c) \
|
||||
: (c)<=0x9f ? \
|
||||
staticProps32Table[c] \
|
||||
: 0 \
|
||||
: 0)
|
||||
/*
 * Look up the 32-bit properties word for code point c and store it in result.
 * The trie lookup first puts an index into result, which is then replaced
 * by props32Table[index].
 * "Unsafe" because the macro itself does not test HAVE_DATA.
 * The do { } while(0) wrapper keeps both statements together when the macro
 * is used as the body of an unbraced if/else/loop.
 */
#define GET_PROPS_UNSAFE(c, result) \
    do { \
        UTRIE_GET16(&propsTrie, c, result); \
        (result)=props32Table[(result)]; \
    } while(0)
|
||||
/*
 * Get the 32-bit properties word for code point c into result:
 * - if HAVE_DATA is true, via GET_PROPS_UNSAFE();
 * - otherwise from staticProps32Table for c in 0..0x9f;
 * - otherwise result is set to 0.
 * c is compared as uint32_t in the fallback so that a negative value
 * (where c has a signed type) cannot index in front of staticProps32Table.
 * The do { } while(0) wrapper makes the macro behave like a single statement,
 * so it is safe inside an unbraced if/else.
 */
#define GET_PROPS(c, result) \
    do { \
        if(HAVE_DATA) { \
            GET_PROPS_UNSAFE(c, result); \
        } else if((uint32_t)(c)<=0x9f) { \
            (result)=staticProps32Table[c]; \
        } else { \
            (result)=0; \
        } \
    } while(0)
|
||||
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&(1UL<<EXCEPTION_SHIFT))
|
||||
#define GET_CATEGORY(props) ((props)&0x1f)
|
||||
#define GET_UNSIGNED_VALUE(props) ((props)>>VALUE_SHIFT)
|
||||
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>VALUE_SHIFT)
|
||||
#define GET_EXCEPTIONS(props) (props32Table+indexes[INDEX_EXCEPTIONS]+GET_UNSIGNED_VALUE(props))
|
||||
#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props))
|
||||
|
||||
/* finding an exception value */
|
||||
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
|
||||
@ -427,31 +418,41 @@ uprv_haveProperties() {
|
||||
/* Gets the Unicode character's general category.*/
|
||||
U_CAPI int8_t U_EXPORT2
|
||||
u_charType(UChar32 c) {
|
||||
return (int8_t)GET_CATEGORY(GET_PROPS(c));
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (int8_t)GET_CATEGORY(props);
|
||||
}
|
||||
|
||||
/* Checks if ch is a lower case letter.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_islower(UChar32 c) {
|
||||
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_LOWERCASE_LETTER);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(GET_CATEGORY(props)==U_LOWERCASE_LETTER);
|
||||
}
|
||||
|
||||
/* Checks if ch is an upper case letter.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isupper(UChar32 c) {
|
||||
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_UPPERCASE_LETTER);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(GET_CATEGORY(props)==U_UPPERCASE_LETTER);
|
||||
}
|
||||
|
||||
/* Checks if ch is a title case letter; usually upper case letters.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_istitle(UChar32 c) {
|
||||
return (UBool)(GET_CATEGORY(GET_PROPS(c))==U_TITLECASE_LETTER);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(GET_CATEGORY(props)==U_TITLECASE_LETTER);
|
||||
}
|
||||
|
||||
/* Checks if ch is a decimal digit. */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isdigit(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER)
|
||||
)!=0);
|
||||
}
|
||||
@ -459,7 +460,9 @@ u_isdigit(UChar32 c) {
|
||||
/* Checks if the Unicode character is a letter.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isalpha(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
|
||||
)!=0);
|
||||
}
|
||||
@ -467,7 +470,9 @@ u_isalpha(UChar32 c) {
|
||||
/* Checks if ch is a letter or a decimal digit */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isalnum(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
|
||||
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
|
||||
)!=0);
|
||||
@ -476,13 +481,17 @@ u_isalnum(UChar32 c) {
|
||||
/* Checks if ch is a unicode character with assigned character type.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isdefined(UChar32 c) {
|
||||
return (UBool)(GET_PROPS(c)!=0);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(GET_CATEGORY(props)!=0);
|
||||
}
|
||||
|
||||
/* Checks if the Unicode character is a base form character that can take a diacritic.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isbase(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_OTHER_NUMBER|1UL<<U_LETTER_NUMBER|
|
||||
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
|
||||
1UL<<U_NON_SPACING_MARK|1UL<<U_ENCLOSING_MARK|1UL<<U_COMBINING_SPACING_MARK)
|
||||
@ -492,17 +501,24 @@ u_isbase(UChar32 c) {
|
||||
/* Checks if the Unicode character is a control character.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_iscntrl(UChar32 c) {
|
||||
return (UBool)(
|
||||
IS_ISO_8_CONTROL(c) ||
|
||||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||
)!=0);
|
||||
if(IS_ISO_8_CONTROL(c)) {
|
||||
return TRUE;
|
||||
} else {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(
|
||||
((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||
)!=0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Checks if the Unicode character is a space character.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isspace(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||
)!=0);
|
||||
}
|
||||
@ -510,7 +526,9 @@ u_isspace(UChar32 c) {
|
||||
/* Checks if the Unicode character is a whitespace character.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isWhitespace(UChar32 c) {
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_SPACE_SEPARATOR|1UL<<U_LINE_SEPARATOR|1UL<<U_PARAGRAPH_SEPARATOR)
|
||||
)!=0 &&
|
||||
c!=0xa0 && c!=0x202f && c!=0xfeff); /* exclude no-break spaces */
|
||||
@ -519,20 +537,27 @@ u_isWhitespace(UChar32 c) {
|
||||
/* Checks if the Unicode character is printable.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isprint(UChar32 c) {
|
||||
return (UBool)(
|
||||
!IS_ISO_8_CONTROL(c) &&
|
||||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
~(1UL<<U_UNASSIGNED|
|
||||
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
|
||||
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
|
||||
)!=0);
|
||||
if(IS_ISO_8_CONTROL(c)) {
|
||||
return FALSE;
|
||||
} else {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(
|
||||
((1UL<<GET_CATEGORY(props))&
|
||||
~(1UL<<U_UNASSIGNED|
|
||||
1UL<<U_CONTROL_CHAR|1UL<<U_FORMAT_CHAR|1UL<<U_PRIVATE_USE_CHAR|1UL<<U_SURROGATE|
|
||||
1UL<<U_GENERAL_OTHER_TYPES|1UL<<31)
|
||||
)!=0);
|
||||
}
|
||||
}
|
||||
|
||||
/* Checks if the Unicode character can start a Unicode identifier.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isIDStart(UChar32 c) {
|
||||
/* same as u_isalpha() */
|
||||
return (UBool)(((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER)
|
||||
)!=0);
|
||||
}
|
||||
@ -541,8 +566,10 @@ u_isIDStart(UChar32 c) {
|
||||
identifier.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isIDPart(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(
|
||||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
|
||||
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
|
||||
1UL<<U_CONNECTOR_PUNCTUATION|1UL<<U_COMBINING_SPACING_MARK|1UL<<U_NON_SPACING_MARK)
|
||||
@ -564,8 +591,10 @@ u_isIDIgnorable(UChar32 c) {
|
||||
/*Checks if the Unicode character can start a Java identifier.*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isJavaIDStart(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(
|
||||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
|
||||
1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION)
|
||||
)!=0);
|
||||
@ -576,8 +605,10 @@ u_isJavaIDStart(UChar32 c) {
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isJavaIDPart(UChar32 c) {
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(
|
||||
((1UL<<GET_CATEGORY(GET_PROPS(c)))&
|
||||
((1UL<<GET_CATEGORY(props))&
|
||||
(1UL<<U_DECIMAL_DIGIT_NUMBER|1UL<<U_LETTER_NUMBER|
|
||||
1UL<<U_UPPERCASE_LETTER|1UL<<U_LOWERCASE_LETTER|1UL<<U_TITLECASE_LETTER|1UL<<U_MODIFIER_LETTER|1UL<<U_OTHER_LETTER|
|
||||
1UL<<U_CURRENCY_SYMBOL|1UL<<U_CONNECTOR_PUNCTUATION|
|
||||
@ -589,13 +620,14 @@ u_isJavaIDPart(UChar32 c) {
|
||||
/* Transforms the Unicode character to its lower case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_tolower(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
return c+GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_LOWERCASE)) {
|
||||
int i=EXC_LOWERCASE;
|
||||
@ -610,13 +642,14 @@ u_tolower(UChar32 c) {
|
||||
/* Transforms the Unicode character to its upper case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_toupper(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
|
||||
return c-GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_UPPERCASE)) {
|
||||
int i=EXC_UPPERCASE;
|
||||
@ -631,14 +664,15 @@ u_toupper(UChar32 c) {
|
||||
/* Transforms the Unicode character to its title case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_totitle(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
|
||||
/* here, titlecase is same as uppercase */
|
||||
return c-GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_TITLECASE)) {
|
||||
int i=EXC_TITLECASE;
|
||||
@ -658,13 +692,14 @@ u_totitle(UChar32 c) {
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_charDigitValue(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_DECIMAL_DIGIT_NUMBER) {
|
||||
return GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DIGIT_VALUE)) {
|
||||
int32_t value;
|
||||
@ -697,7 +732,8 @@ u_charDigitValue(UChar32 c) {
|
||||
/* Gets the character's linguistic directionality.*/
|
||||
U_CAPI UCharDirection U_EXPORT2
|
||||
u_charDirection(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(props!=0) {
|
||||
return (UCharDirection)((props>>BIDI_SHIFT)&0x1f);
|
||||
} else {
|
||||
@ -707,19 +743,22 @@ u_charDirection(UChar32 c) {
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
u_isMirrored(UChar32 c) {
|
||||
return (UBool)(GET_PROPS(c)&(1UL<<MIRROR_SHIFT) ? TRUE : FALSE);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
return (UBool)(props&(1UL<<MIRROR_SHIFT) ? TRUE : FALSE);
|
||||
}
|
||||
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_charMirror(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if((props&(1UL<<MIRROR_SHIFT))==0) {
|
||||
/* not mirrored - the value is not a mirror offset */
|
||||
return c;
|
||||
} else if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
return c+GET_SIGNED_VALUE(props);
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_MIRROR_MAPPING)) {
|
||||
int i=EXC_MIRROR_MAPPING;
|
||||
@ -734,7 +773,8 @@ u_charMirror(UChar32 c) {
|
||||
|
||||
U_CFUNC uint8_t
|
||||
u_internalGetCombiningClass(UChar32 c) {
|
||||
uint32_t props=GET_PROPS_UNSAFE(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_NON_SPACING_MARK) {
|
||||
return (uint8_t)GET_UNSIGNED_VALUE(props);
|
||||
@ -749,7 +789,8 @@ u_internalGetCombiningClass(UChar32 c) {
|
||||
|
||||
U_CAPI uint8_t U_EXPORT2
|
||||
u_getCombiningClass(UChar32 c) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_NON_SPACING_MARK) {
|
||||
return (uint8_t)GET_UNSIGNED_VALUE(props);
|
||||
@ -1091,7 +1132,7 @@ isFollowedByCasedLetter(const UChar *src, UTextOffset srcIndex, int32_t srcLengt
|
||||
|
||||
while(srcIndex<srcLength) {
|
||||
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
|
||||
props=GET_PROPS_UNSAFE(c);
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
category=GET_CATEGORY(props);
|
||||
if((1UL<<category)&(1UL<<U_LOWERCASE_LETTER|1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
return TRUE; /* followed by cased letter */
|
||||
@ -1112,7 +1153,7 @@ isPrecededByCasedLetter(const UChar *src, UTextOffset srcIndex) {
|
||||
|
||||
while(0<srcIndex) {
|
||||
UTF_PREV_CHAR(src, 0, srcIndex, c);
|
||||
props=GET_PROPS_UNSAFE(c);
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
category=GET_CATEGORY(props);
|
||||
if((1UL<<category)&(1UL<<U_LOWERCASE_LETTER|1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
return TRUE; /* preceded by cased letter */
|
||||
@ -1216,7 +1257,7 @@ u_internalStrToLower(UChar *dest, int32_t destCapacity,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[8];
|
||||
uint32_t *pe;
|
||||
const uint32_t *pe;
|
||||
const UChar *u;
|
||||
uint32_t props, firstExceptionValue, specialCasing;
|
||||
int32_t srcIndex, destIndex, i, loc;
|
||||
@ -1262,7 +1303,7 @@ u_internalStrToLower(UChar *dest, int32_t destCapacity,
|
||||
srcIndex=destIndex=0;
|
||||
while(srcIndex<srcLength) {
|
||||
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
|
||||
props=GET_PROPS_UNSAFE(c);
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
c+=GET_SIGNED_VALUE(props);
|
||||
@ -1432,7 +1473,7 @@ u_internalStrToUpper(UChar *dest, int32_t destCapacity,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[8];
|
||||
uint32_t *pe;
|
||||
const uint32_t *pe;
|
||||
const UChar *u;
|
||||
uint32_t props, firstExceptionValue, specialCasing;
|
||||
int32_t srcIndex, destIndex, i, loc;
|
||||
@ -1478,7 +1519,7 @@ u_internalStrToUpper(UChar *dest, int32_t destCapacity,
|
||||
srcIndex=destIndex=0;
|
||||
while(srcIndex<srcLength) {
|
||||
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
|
||||
props=GET_PROPS_UNSAFE(c);
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
|
||||
c-=GET_SIGNED_VALUE(props);
|
||||
@ -1594,11 +1635,12 @@ notSpecial:
|
||||
/* internal */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_internalTitleCase(UChar32 c, UChar *dest, int32_t destCapacity, const char *locale) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
UChar32 title;
|
||||
int32_t i, length;
|
||||
|
||||
title=c;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if(GET_CATEGORY(props)==U_LOWERCASE_LETTER) {
|
||||
/* here, titlecase is same as uppercase */
|
||||
@ -1606,7 +1648,7 @@ u_internalTitleCase(UChar32 c, UChar *dest, int32_t destCapacity, const char *lo
|
||||
}
|
||||
} else if(HAVE_DATA) {
|
||||
const UChar *u;
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe, specialCasing;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_SPECIAL_CASING)) {
|
||||
i=EXC_SPECIAL_CASING;
|
||||
@ -1704,16 +1746,17 @@ single:
|
||||
/* return the simple case folding mapping for c */
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_foldCase(UChar32 c, uint32_t options) {
|
||||
uint32_t props=GET_PROPS(c);
|
||||
uint32_t props;
|
||||
GET_PROPS(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
return c+GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
|
||||
uint32_t *oldPE=pe;
|
||||
const uint32_t *oldPE=pe;
|
||||
int i=EXC_CASE_FOLDING;
|
||||
++pe;
|
||||
ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
|
||||
@ -1753,15 +1796,16 @@ u_foldCase(UChar32 c, uint32_t options) {
|
||||
/* internal, return the full case folding mapping for c, must be used only if uprv_haveProperties() is true */
|
||||
U_CFUNC int32_t
|
||||
u_internalFoldCase(UChar32 c, UChar dest[32], uint32_t options) {
|
||||
uint32_t props=GET_PROPS_UNSAFE(c);
|
||||
uint32_t props;
|
||||
int32_t i;
|
||||
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
c+=GET_SIGNED_VALUE(props);
|
||||
}
|
||||
} else {
|
||||
uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
const uint32_t *pe=GET_EXCEPTIONS(props);
|
||||
uint32_t firstExceptionValue=*pe;
|
||||
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_CASE_FOLDING)) {
|
||||
i=EXC_CASE_FOLDING;
|
||||
@ -1810,7 +1854,7 @@ u_internalStrFoldCase(UChar *dest, int32_t destCapacity,
|
||||
UGrowBuffer *growBuffer, void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar buffer[UTF_MAX_CHAR_LENGTH];
|
||||
uint32_t *pe;
|
||||
const uint32_t *pe;
|
||||
const UChar *uchars, *u;
|
||||
uint32_t props, firstExceptionValue;
|
||||
int32_t srcIndex, destIndex, i;
|
||||
@ -1857,7 +1901,7 @@ u_internalStrFoldCase(UChar *dest, int32_t destCapacity,
|
||||
srcIndex=destIndex=0;
|
||||
while(srcIndex<srcLength) {
|
||||
UTF_NEXT_CHAR(src, srcIndex, srcLength, c);
|
||||
props=GET_PROPS_UNSAFE(c);
|
||||
GET_PROPS_UNSAFE(c, props);
|
||||
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
|
||||
if((1UL<<GET_CATEGORY(props))&(1UL<<U_UPPERCASE_LETTER|1UL<<U_TITLECASE_LETTER)) {
|
||||
c+=GET_SIGNED_VALUE(props);
|
||||
|
1025
icu4c/source/common/utrie.c
Normal file
1025
icu4c/source/common/utrie.c
Normal file
File diff suppressed because it is too large
Load Diff
656
icu4c/source/common/utrie.h
Normal file
656
icu4c/source/common/utrie.h
Normal file
@ -0,0 +1,656 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: utrie.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001nov08
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UTRIE_H__
|
||||
#define __UTRIE_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/**
|
||||
* \file
|
||||
*
|
||||
* This is a common implementation of a "folded" trie.
|
||||
* It is a kind of compressed, serializable table of 16- or 32-bit values associated with
|
||||
* Unicode code points (0..0x10ffff).
|
||||
*
|
||||
* This implementation is optimized for getting values while walking forward
|
||||
* through a UTF-16 string.
|
||||
* Therefore, the simplest and fastest access macros are the
|
||||
* _FROM_LEAD() and _FROM_OFFSET_TRAIL() macros.
|
||||
*
|
||||
* The _FROM_BMP() macros are a little more complicated; they get values
|
||||
* even for lead surrogate code _points_, while the _FROM_LEAD() macros
|
||||
* get special "folded" values for lead surrogate code _units_ if
|
||||
* there is relevant data associated with them.
|
||||
* From such a folded value, an offset needs to be extracted to supply
|
||||
* to the _FROM_OFFSET_TRAIL() macros.
|
||||
*
|
||||
* Most of the more complex (and more convenient) functions call a callback function
|
||||
* to get that offset from the folded value for a lead surrogate unit.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Trie constants, defining shift widths, index array lengths, etc.
|
||||
*/
|
||||
enum {
|
||||
/** Shift size for shifting right the input index. 1..9 */
|
||||
UTRIE_SHIFT=5,
|
||||
|
||||
/** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
|
||||
UTRIE_DATA_BLOCK_LENGTH=1<<UTRIE_SHIFT,
|
||||
|
||||
/** Mask for getting the lower bits from the input index. */
|
||||
UTRIE_MASK=UTRIE_DATA_BLOCK_LENGTH-1,
|
||||
|
||||
/**
|
||||
* Lead surrogate code points' index displacement in the index array.
|
||||
* 0x10000-0xd800=0x2800
|
||||
*/
|
||||
UTRIE_LEAD_INDEX_DISP=0x2800>>UTRIE_SHIFT,
|
||||
|
||||
/**
|
||||
* Shift size for shifting left the index array values.
|
||||
* Increases possible data size with 16-bit index values at the cost
|
||||
* of compactability.
|
||||
* This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
|
||||
* 0..UTRIE_SHIFT
|
||||
*/
|
||||
UTRIE_INDEX_SHIFT=2,
|
||||
|
||||
/** The alignment size of a stage 2 data block. Also the granularity for compaction. */
|
||||
UTRIE_DATA_GRANULARITY=1<<UTRIE_INDEX_SHIFT,
|
||||
|
||||
/** Number of bits of a trail surrogate that are used in index table lookups. */
|
||||
UTRIE_SURROGATE_BLOCK_BITS=10-UTRIE_SHIFT,
|
||||
|
||||
/**
|
||||
* Number of index (stage 1) entries per lead surrogate.
|
||||
* Same as the number of index entries for 1024 trail surrogates,
|
||||
* ==0x400>>UTRIE_SHIFT
|
||||
*/
|
||||
UTRIE_SURROGATE_BLOCK_COUNT=(1<<UTRIE_SURROGATE_BLOCK_BITS),
|
||||
|
||||
/** Length of the BMP portion of the index (stage 1) array. */
|
||||
UTRIE_BMP_INDEX_LENGTH=0x10000>>UTRIE_SHIFT
|
||||
};
|
||||
|
||||
/**
|
||||
* Length of the index (stage 1) array before folding.
|
||||
* Maximum number of Unicode code points (0x110000) shifted right by UTRIE_SHIFT.
|
||||
*/
|
||||
#define UTRIE_MAX_INDEX_LENGTH (0x110000>>UTRIE_SHIFT)
|
||||
|
||||
/**
|
||||
* Maximum length of the runtime data (stage 2) array.
|
||||
* Limited by 16-bit index values that are left-shifted by UTRIE_INDEX_SHIFT.
|
||||
*/
|
||||
#define UTRIE_MAX_DATA_LENGTH (0x10000<<UTRIE_INDEX_SHIFT)
|
||||
|
||||
/**
|
||||
* Maximum length of the build-time data (stage 2) array.
|
||||
* The maximum length is 0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
|
||||
* (Number of Unicode code points + one all-zero block +
|
||||
* possible duplicate entries for 1024 lead surrogates.)
|
||||
*/
|
||||
#define UTRIE_MAX_BUILD_TIME_DATA_LENGTH (0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400)
|
||||
|
||||
/**
|
||||
* Runtime UTrie callback function.
|
||||
* Extract from a lead surrogate's data the
|
||||
* index array offset of the indexes for that lead surrogate.
|
||||
*
|
||||
* @return offset>=UTRIE_BMP_INDEX_LENGTH, or 0 if there is no data for the lead surrogate
|
||||
*/
|
||||
typedef int32_t U_CALLCONV
|
||||
UTrieGetFoldingOffset(uint32_t data);
|
||||
|
||||
/**
|
||||
* Run-time Trie structure.
|
||||
*
|
||||
* Either the data table is 16 bits wide and accessed via the index
|
||||
* pointer, with each index item increased by indexLength;
|
||||
* in this case, data32==NULL.
|
||||
*
|
||||
* Or the data table is 32 bits wide and accessed via the data32 pointer.
|
||||
*/
|
||||
struct UTrie {
|
||||
const uint16_t *index;
|
||||
const uint32_t *data32; /* NULL if 16b data is used via index */
|
||||
|
||||
/**
|
||||
* This function is not used in _FROM_LEAD, _FROM_BMP, and _FROM_OFFSET_TRAIL macros.
|
||||
* If convenience macros like _GET16 or _NEXT32 are used, this function must be set.
|
||||
* @see UTrieGetFoldingOffset
|
||||
*/
|
||||
UTrieGetFoldingOffset *getFoldingOffset;
|
||||
|
||||
int32_t indexLength, dataLength;
|
||||
UBool isLatin1Linear;
|
||||
};
|
||||
|
||||
typedef struct UTrie UTrie;
|
||||
|
||||
/**
 * Internal two-stage trie getter from an index offset and a 16-bit unit.
 * The index entry at offset+(c16>>UTRIE_SHIFT) is shifted left by
 * UTRIE_INDEX_SHIFT to locate a data block; c16&UTRIE_MASK selects the
 * entry inside that block.
 *
 * @param trie   a pointer to the runtime trie structure
 * @param data   the UTrie member holding the values (index or data32)
 * @param offset index array offset; 0 if c16 is a BMP or lead unit
 * @param c16    the 16-bit unit to look up
 */
#define _UTRIE_GET_RAW(trie, data, offset, c16) \
    (trie)->data[ \
        ((c16)&UTRIE_MASK)+ \
        ((int32_t)((trie)->index[((c16)>>UTRIE_SHIFT)+(offset)])<<UTRIE_INDEX_SHIFT) \
    ]
|
||||
|
||||
/**
 * Internal trie getter from a pair of surrogates.
 * The lead surrogate's own value is read first and handed to
 * trie->getFoldingOffset(). A positive offset selects the index entries for
 * the trail surrogate (only its 10 low bits are used); otherwise the
 * result is 0.
 *
 * @param trie   a pointer to the runtime trie structure
 * @param data   the UTrie member holding the values (index or data32)
 * @param c      the lead surrogate
 * @param c2     the trail surrogate
 * @param result (out) variable that receives the trie lookup result
 */
#define _UTRIE_GET_FROM_PAIR(trie, data, c, c2, result) { \
    int32_t _utrie_offset; \
    \
    /* the lead surrogate's value carries the folding offset */ \
    (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    _utrie_offset=(trie)->getFoldingOffset(result); \
    \
    if(_utrie_offset<=0) { \
        /* no data for this lead surrogate */ \
        (result)=0; \
    } else { \
        /* the real data comes from the folded lead/trail units */ \
        (result)=_UTRIE_GET_RAW((trie), data, _utrie_offset, (c2)&0x3ff); \
    } \
}
|
||||
|
||||
/**
 * Internal trie getter from a BMP code point, treating a lead surrogate as a normal code point.
 * For c16 in U+d800..U+dbff the lookup uses the index entries at
 * UTRIE_LEAD_INDEX_DISP; for all other units it uses offset 0.
 *
 * This is an expression macro and deliberately has no trailing semicolon,
 * so that it can be used inside larger expressions.
 *
 * @param trie a pointer to the runtime trie structure
 * @param data the UTrie member holding the values (index or data32)
 * @param c16  the BMP code point
 */
#define _UTRIE_GET_FROM_BMP(trie, data, c16) \
    _UTRIE_GET_RAW(trie, data, (0xd800<=(c16) && (c16)<=0xdbff) ? UTRIE_LEAD_INDEX_DISP : 0, c16)
|
||||
|
||||
/**
 * Internal trie getter from a code point.
 * - c32<=U+ffff: looked up with _UTRIE_GET_FROM_BMP.
 * - c32<=U+10ffff: looked up through its lead surrogate with
 *   _UTRIE_GET_FROM_PAIR; c32 itself is passed as the trail unit because
 *   only the 10 low bits are used there.
 * - anything else (the unsigned comparison also catches negative values):
 *   result is 0.
 *
 * Could be faster(?) but longer with
 *   if((c32)<=0xd7ff) { (result)=_UTRIE_GET_RAW(trie, data, 0, c32); }
 */
#define _UTRIE_GET(trie, data, c32, result) \
    if((uint32_t)(c32)<=0xffff) { \
        /* BMP code point: a single 16-bit unit */ \
        (result)=_UTRIE_GET_FROM_BMP(trie, data, c32); \
    } else if((uint32_t)(c32)<=0x10ffff) { \
        /* supplementary code point: go through the lead surrogate */ \
        UChar _utrie_lead=UTF16_LEAD(c32); \
        _UTRIE_GET_FROM_PAIR(trie, data, _utrie_lead, c32, result); \
    } else { \
        /* not a code point */ \
        (result)=0; \
    }
|
||||
|
||||
/**
 * Internal next-post-increment: get the next code point (c, c2) and its data.
 * Reads one unit from src and advances src. If the unit is a lead surrogate
 * that is followed (before limit) by a trail surrogate, both units are
 * consumed and the value for the pair is returned. An unpaired lead
 * surrogate is looked up via UTRIE_LEAD_INDEX_DISP. c2 is 0 unless a
 * surrogate pair was read.
 */
#define _UTRIE_NEXT(trie, data, src, limit, c, c2, result) { \
    (c)=*(src)++; \
    if(UTF_IS_LEAD(c)) { \
        if((src)!=(limit) && UTF_IS_TRAIL((c2)=*(src))) { \
            /* complete surrogate pair */ \
            ++(src); \
            _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result)); \
        } else { \
            /* unpaired lead surrogate code point */ \
            (c2)=0; \
            (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
        } \
    } else { \
        /* any other 16-bit unit */ \
        (c2)=0; \
        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    } \
}
|
||||
|
||||
/**
 * Internal previous: get the previous code point (c, c2) and its data.
 * Decrements src and reads one unit. If that unit is a trail surrogate that
 * is preceded (after start) by a lead surrogate, src is decremented again,
 * (c, c2) become (lead, trail) and the value for the pair is returned.
 * An unpaired lead surrogate is looked up via UTRIE_LEAD_INDEX_DISP;
 * an unpaired trail surrogate is looked up like any other unit.
 * c2 is 0 unless a surrogate pair was read.
 */
#define _UTRIE_PREVIOUS(trie, data, start, src, c, c2, result) { \
    (c)=*--(src); \
    if(UTF_IS_SURROGATE(c)) { \
        if(UTF_IS_SURROGATE_FIRST(c)) { \
            /* unpaired lead surrogate code point */ \
            (c2)=0; \
            (result)=_UTRIE_GET_RAW((trie), data, UTRIE_LEAD_INDEX_DISP, (c)); \
        } else if((start)!=(src) && UTF_IS_LEAD((c2)=*((src)-1))) { \
            /* trail surrogate preceded by a lead surrogate */ \
            --(src); \
            (result)=(c); (c)=(c2); (c2)=(UChar)(result); /* swap c, c2 */ \
            _UTRIE_GET_FROM_PAIR((trie), data, (c), (c2), (result)); \
        } else { \
            /* unpaired trail surrogate code point */ \
            (c2)=0; \
            (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
        } \
    } else { \
        /* not a surrogate */ \
        (c2)=0; \
        (result)=_UTRIE_GET_RAW((trie), data, 0, (c)); \
    } \
}
|
||||
|
||||
/* Public UTrie API ---------------------------------------------------------*/
|
||||
|
||||
/**
|
||||
* Get a pointer to the contiguous part of the data array
|
||||
* for the Latin-1 range (U+0000..U+00ff).
|
||||
* Must be used only if the Latin-1 range is in fact linear
|
||||
* (trie->isLatin1Linear).
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @return (const uint16_t *) pointer to values for Latin-1 code points
|
||||
*/
|
||||
#define UTRIE_GET16_LATIN1(trie) ((trie)->index+(trie)->indexLength+UTRIE_DATA_BLOCK_LENGTH)
|
||||
|
||||
/**
|
||||
* Get a pointer to the contiguous part of the data array
|
||||
* for the Latin-1 range (U+0000..U+00ff).
|
||||
* Must be used only if the Latin-1 range is in fact linear
|
||||
* (trie->isLatin1Linear).
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @return (const uint32_t *) pointer to values for Latin-1 code points
|
||||
*/
|
||||
#define UTRIE_GET32_LATIN1(trie) ((trie)->data32+UTRIE_DATA_BLOCK_LENGTH)
|
||||
|
||||
/**
|
||||
* Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
|
||||
* c16 may be a lead surrogate, which may have a value including a folding offset.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c16 (UChar, in) the input BMP code point
|
||||
* @return (uint16_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET16_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, index, 0, c16)
|
||||
|
||||
/**
|
||||
* Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
|
||||
* c16 may be a lead surrogate, which may have a value including a folding offset.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c16 (UChar, in) the input BMP code point
|
||||
* @return (uint32_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET32_FROM_LEAD(trie, c16) _UTRIE_GET_RAW(trie, data32, 0, c16)
|
||||
|
||||
/**
|
||||
* Get a 16-bit trie value from a BMP code point (UChar, <=U+ffff).
|
||||
* Even lead surrogate code points are treated as normal code points,
|
||||
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c16 (UChar, in) the input BMP code point
|
||||
* @return (uint16_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET16_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, index, c16)
|
||||
|
||||
/**
|
||||
* Get a 32-bit trie value from a BMP code point (UChar, <=U+ffff).
|
||||
* Even lead surrogate code points are treated as normal code points,
|
||||
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c16 (UChar, in) the input BMP code point
|
||||
* @return (uint32_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET32_FROM_BMP(trie, c16) _UTRIE_GET_FROM_BMP(trie, data32, c16)
|
||||
|
||||
/**
|
||||
* Get a 16-bit trie value from a code point.
|
||||
* Even lead surrogate code points are treated as normal code points,
|
||||
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c32 (UChar32, in) the input code point
|
||||
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET16(trie, c32, result) _UTRIE_GET(trie, index, c32, result)
|
||||
|
||||
/**
|
||||
* Get a 32-bit trie value from a code point.
|
||||
* Even lead surrogate code points are treated as normal code points,
|
||||
* with unfolded values that may differ from _FROM_LEAD() macro results for them.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c32 (UChar32, in) the input code point
|
||||
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET32(trie, c32, result) _UTRIE_GET(trie, data32, c32, result)
|
||||
|
||||
/**
|
||||
* Get the next code point (c, c2), post-increment src,
|
||||
* and get a 16-bit value from the trie.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param src (const UChar *, in/out) the source text pointer
|
||||
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
|
||||
* @param c (UChar, out) variable for the BMP or lead code unit
|
||||
* @param c2 (UChar, out) variable for 0 or the trail code unit
|
||||
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_NEXT16(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, index, src, limit, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get the next code point (c, c2), post-increment src,
|
||||
* and get a 32-bit value from the trie.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param src (const UChar *, in/out) the source text pointer
|
||||
* @param limit (const UChar *, in) the limit pointer for the text, or NULL
|
||||
* @param c (UChar, out) variable for the BMP or lead code unit
|
||||
* @param c2 (UChar, out) variable for 0 or the trail code unit
|
||||
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_NEXT32(trie, src, limit, c, c2, result) _UTRIE_NEXT(trie, data32, src, limit, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get the previous code point (c, c2), pre-decrement src,
|
||||
* and get a 16-bit value from the trie.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param start (const UChar *, in) the start pointer for the text, or NULL
|
||||
* @param src (const UChar *, in/out) the source text pointer
|
||||
* @param c (UChar, out) variable for the BMP or lead code unit
|
||||
* @param c2 (UChar, out) variable for 0 or the trail code unit
|
||||
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_PREVIOUS16(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, index, start, src, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get the previous code point (c, c2), pre-decrement src,
|
||||
* and get a 32-bit value from the trie.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param start (const UChar *, in) the start pointer for the text, or NULL
|
||||
* @param src (const UChar *, in/out) the source text pointer
|
||||
* @param c (UChar, out) variable for the BMP or lead code unit
|
||||
* @param c2 (UChar, out) variable for 0 or the trail code unit
|
||||
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_PREVIOUS32(trie, start, src, c, c2, result) _UTRIE_PREVIOUS(trie, data32, start, src, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get a 16-bit trie value from a pair of surrogates.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c (UChar, in) a lead surrogate
|
||||
* @param c2 (UChar, in) a trail surrogate
|
||||
* @param result (uint16_t, out) uint16_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET16_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, index, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get a 32-bit trie value from a pair of surrogates.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param c (UChar, in) a lead surrogate
|
||||
* @param c2 (UChar, in) a trail surrogate
|
||||
* @param result (uint32_t, out) uint32_t variable for the trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET32_FROM_PAIR(trie, c, c2, result) _UTRIE_GET_FROM_PAIR(trie, data32, c, c2, result)
|
||||
|
||||
/**
|
||||
* Get a 16-bit trie value from a folding offset (from the value of a lead surrogate)
|
||||
* and a trail surrogate.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
|
||||
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
|
||||
* @return (uint16_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, index, offset, (c2)&0x3ff)
|
||||
|
||||
/**
|
||||
* Get a 32-bit trie value from a folding offset (from the value of a lead surrogate)
|
||||
* and a trail surrogate.
|
||||
*
|
||||
* @param trie (const UTrie *, in) a pointer to the runtime trie structure
|
||||
* @param offset (int32_t, in) the folding offset from the value of a lead surrogate
|
||||
* @param c2 (UChar, in) a trail surrogate (only the 10 low bits are significant)
|
||||
* @return (uint32_t) trie lookup result
|
||||
*/
|
||||
#define UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2) _UTRIE_GET_RAW(trie, data32, offset, (c2)&0x3ff)
|
||||
|
||||
/* enumeration callback types */
|
||||
|
||||
/**
|
||||
* Callback from utrie_enum(), extracts a uint32_t value from a
|
||||
* trie value. This value will be passed on to the UTrieEnumRange function.
|
||||
*
|
||||
* @param context an opaque pointer, as passed into utrie_enum()
|
||||
* @param value a value from the trie
|
||||
* @return the value that is to be passed on to the UTrieEnumRange function
|
||||
*/
|
||||
typedef uint32_t U_CALLCONV
|
||||
UTrieEnumValue(const void *context, uint32_t value);
|
||||
|
||||
/**
|
||||
* Callback from utrie_enum(), is called for each contiguous range
|
||||
* of code points with the same value as retrieved from the trie and
|
||||
* transformed by the UTrieEnumValue function.
|
||||
*
|
||||
* @param context an opaque pointer, as passed into utrie_enum()
|
||||
* @param start the first code point in a contiguous range with value
|
||||
* @param limit one past the last code point in a contiguous range with value
|
||||
* @param value the value that is set for all code points in [start..limit[
|
||||
*/
|
||||
typedef void U_CALLCONV
|
||||
UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value);
|
||||
|
||||
/**
|
||||
* Enumerate efficiently all values in a trie.
|
||||
* For each entry in the trie, the value to be delivered is passed through
|
||||
* the UTrieEnumValue function.
|
||||
* The value is unchanged if that function pointer is NULL.
|
||||
*
|
||||
* For each contiguous range of code points with a given value,
|
||||
* the UTrieEnumRange function is called.
|
||||
*
|
||||
* @param trie a pointer to the runtime trie structure
|
||||
* @param enumValue a pointer to a function that may transform the trie entry value,
|
||||
* or NULL if the values from the trie are to be used directly
|
||||
* @param enumRange a pointer to a function that is called for each contiguous range
|
||||
* of code points with the same value
|
||||
* @param context an opaque pointer that is passed on to the callback functions
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
utrie_enum(UTrie *trie,
|
||||
UTrieEnumValue *enumValue, UTrieEnumRange *enumRange, const void *context);
|
||||
|
||||
/**
|
||||
* Unserialize a trie from 32-bit-aligned memory.
|
||||
* Inverse of utrie_serialize().
|
||||
* Fills the UTrie runtime trie structure with the settings for the trie data.
|
||||
*
|
||||
* @param trie a pointer to the runtime trie structure
|
||||
* @param data a pointer to 32-bit-aligned memory containing trie data
|
||||
* @param length the number of bytes available at data
|
||||
* @param pErrorCode an in/out ICU UErrorCode
|
||||
* @return the number of bytes at data taken up by the trie data
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utrie_unserialize(UTrie *trie, const uint8_t *data, int32_t length, UErrorCode *pErrorCode);
|
||||
|
||||
/* Building a trie ----------------------------------------------------------*/
|
||||
|
||||
/**
|
||||
* Build-time trie structure.
|
||||
* Opaque definition, here only to make fillIn parameters possible
|
||||
* for utrie_open() and utrie_clone().
|
||||
*/
|
||||
struct UNewTrie {
|
||||
/**
|
||||
* Index values at build-time are 32 bits wide for easier processing.
|
||||
* Bit 31 is set if the data block is used by multiple index values (from utrie_setRange()).
|
||||
*/
|
||||
int32_t index[UTRIE_MAX_INDEX_LENGTH];
|
||||
uint32_t *data;
|
||||
|
||||
int32_t indexLength, dataCapacity, dataLength;
|
||||
UBool isAllocated, isDataAllocated;
|
||||
UBool isLatin1Linear, isCompacted;
|
||||
|
||||
/**
|
||||
* Map of adjusted indexes, used in utrie_compact().
|
||||
* Maps from original indexes to new ones.
|
||||
*/
|
||||
int32_t map[UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT];
|
||||
};
|
||||
|
||||
typedef struct UNewTrie UNewTrie;
|
||||
|
||||
/**
|
||||
* Build-time trie callback function, used with utrie_serialize().
|
||||
* This function calculates a lead surrogate's value including a folding offset
|
||||
* from the 1024 supplementary code points [start..start+1024[ .
|
||||
* It holds that U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
|
||||
*
|
||||
* The folding offset is provided by the caller.
|
||||
* It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
|
||||
* Instead of the offset itself, n can be stored in 10 bits -
|
||||
* or fewer if it can be assumed that few lead surrogates have associated data.
|
||||
*
|
||||
* The returned value must be
|
||||
* - not zero if and only if there is relevant data
|
||||
* for the corresponding 1024 supplementary code points
|
||||
* - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
|
||||
*
|
||||
* @return a folded value, or 0 if there is no relevant data for the lead surrogate.
|
||||
*/
|
||||
typedef uint32_t U_CALLCONV
|
||||
UNewTrieGetFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset);
|
||||
|
||||
/**
|
||||
* Open a build-time trie structure.
|
||||
* The size of the build-time data array is specified to avoid allocating a large
|
||||
* array in all cases. The array itself can also be passed in.
|
||||
*
|
||||
* Although the trie is never fully expanded to a linear array, especially when
|
||||
* utrie_setRange32() is used, the data array could be large during build time.
|
||||
* The maximum length is
|
||||
* UTRIE_MAX_BUILD_TIME_DATA_LENGTH=0x110000+UTRIE_DATA_BLOCK_LENGTH+0x400.
|
||||
* (Number of Unicode code points + one all-zero block +
|
||||
* possible duplicate entries for 1024 lead surrogates.)
|
||||
* (UTRIE_DATA_BLOCK_LENGTH<=0x200 in all cases.)
|
||||
*
|
||||
* @param fillIn a pointer to a UNewTrie structure to be initialized (will not be released), or
|
||||
* NULL if one is to be allocated
|
||||
* @param aliasData a pointer to a data array to be used (will not be released), or
|
||||
* NULL if one is to be allocated
|
||||
* @param maxDataLength the capacity of aliasData (if not NULL) or
|
||||
* the length of the data array to be allocated
|
||||
* @param latin1Linear a flag indicating whether the Latin-1 range is to be allocated and
|
||||
* kept in a linear, contiguous part of the data array
|
||||
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
|
||||
*/
|
||||
U_CAPI UNewTrie * U_EXPORT2
|
||||
utrie_open(UNewTrie *fillIn, uint32_t *aliasData, int32_t maxDataLength, UBool latin1Linear);
|
||||
|
||||
/**
|
||||
* Clone a build-time trie structure with all entries.
|
||||
*
|
||||
* @param fillIn like in utrie_open()
|
||||
* @param other the build-time trie structure to clone
|
||||
* @param aliasData like in utrie_open(),
|
||||
* used if aliasDataLength>=(capacity of other's data array)
|
||||
* @param aliasDataLength the length of aliasData
|
||||
* @return a pointer to the initialized fillIn or the allocated and initialized new UNewTrie
|
||||
*/
|
||||
U_CAPI UNewTrie * U_EXPORT2
|
||||
utrie_clone(UNewTrie *fillIn, const UNewTrie *other, uint32_t *aliasData, int32_t aliasDataLength);
|
||||
|
||||
/**
|
||||
* Close a build-time trie structure, and release memory
|
||||
* that was allocated by utrie_open() or utrie_clone().
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
utrie_close(UNewTrie *trie);
|
||||
|
||||
/**
|
||||
* Get the data array of a build-time trie.
|
||||
* The data may be modified, but entries that are equal before
|
||||
* must still be equal after modification.
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
* @param pLength (out) a pointer to a variable that receives the number
|
||||
* of entries in the data array
|
||||
* @return the data array
|
||||
*/
|
||||
U_CAPI uint32_t * U_EXPORT2
|
||||
utrie_getData(UNewTrie *trie, int32_t *pLength);
|
||||
|
||||
/**
|
||||
* Set a value for a code point.
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
* @param c the code point
|
||||
* @param value the value
|
||||
* @return FALSE if a failure occurred (illegal argument or data array overrun)
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
utrie_set32(UNewTrie *trie, UChar32 c, uint32_t value);
|
||||
|
||||
/**
|
||||
* Get a value from a code point as stored in the build-time trie.
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
* @param c the code point
|
||||
* @param pInBlockZero if not NULL, then *pInBlockZero is set to TRUE
|
||||
* iff the value is retrieved from block 0;
|
||||
* block 0 is the all-zero initial block
|
||||
* @return the value
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
utrie_get32(UNewTrie *trie, UChar32 c, UBool *pInBlockZero);
|
||||
|
||||
/**
|
||||
* Set a value in a range of code points [start..limit[.
|
||||
* All code points c with start<=c<limit will get the value if
|
||||
* overwrite is TRUE or if the old value is 0.
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
* @param start the first code point to get the value
|
||||
* @param limit one past the last code point to get the value
|
||||
* @param value the value
|
||||
* @param overwrite flag for whether old non-zero values are to be overwritten
|
||||
* @return FALSE if a failure occurred (illegal argument or data array overrun)
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
utrie_setRange32(UNewTrie *trie, UChar32 start, UChar32 limit, uint32_t value, UBool overwrite);
|
||||
|
||||
/**
|
||||
* Compact the build-time trie after all values are set, and then
|
||||
* serialize it into 32-bit aligned memory.
|
||||
*
|
||||
* After this, the trie can only be serialized again and/or closed;
|
||||
* no further values can be added.
|
||||
*
|
||||
* @see utrie_unserialize()
|
||||
*
|
||||
* @param trie the build-time trie
|
||||
* @param data a pointer to 32-bit-aligned memory for the trie data
|
||||
* @param capacity the number of bytes available at data
|
||||
* @param getFoldedValue a callback function that calculates the value for
|
||||
* a lead surrogate from all of its supplementary code points
|
||||
* and the folding offset
|
||||
* @param reduceTo16Bits flag for whether the values are to be reduced to a
|
||||
* width of 16 bits for serialization and runtime
|
||||
* @param pErrorCode a UErrorCode argument; among other possible error codes:
|
||||
* - U_BUFFER_OVERFLOW_ERROR if the data storage block is too small for serialization
|
||||
* - U_MEMORY_ALLOCATION_ERROR if the trie data array is too small
|
||||
* - U_INDEX_OUTOFBOUNDS_ERROR if the index or data arrays are too long after compaction for serialization
|
||||
*
|
||||
* @return the number of bytes written for the trie
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
utrie_serialize(UNewTrie *trie, uint8_t *data, int32_t capacity,
|
||||
UNewTrieGetFoldedValue *getFoldedValue,
|
||||
UBool reduceTo16Bits,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
#endif
|
@ -292,6 +292,10 @@ SOURCE=.\susctest.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\trietest.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\ucmptst.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
@ -634,7 +634,7 @@ static void TestUnicodeData()
|
||||
/* sanity check on repeated properties */
|
||||
for(c=0xfffe; c<=0x10ffff;) {
|
||||
if(u_charType(c)!=U_UNASSIGNED) {
|
||||
log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED\n", c);
|
||||
log_err("error: u_charType(U+%04lx)!=U_UNASSIGNED (returns %d)\n", c, u_charType(c));
|
||||
}
|
||||
if((c&0xffff)==0xfffe) {
|
||||
++c;
|
||||
|
@ -22,11 +22,13 @@ void addSCSUTest(TestNode** root);
|
||||
void addHashtableTest(TestNode** root);
|
||||
void addCStringTest(TestNode** root);
|
||||
void addMemoryStreamTest(TestNode** root);
|
||||
void addTrieTest(TestNode** root);
|
||||
|
||||
void addUtility(TestNode** root);
|
||||
|
||||
void addUtility(TestNode** root)
|
||||
{
|
||||
addTrieTest(root);
|
||||
addLocaleTest(root);
|
||||
addUnicodeTest(root);
|
||||
addResourceBundleTest(root);
|
||||
@ -36,4 +38,3 @@ void addUtility(TestNode** root)
|
||||
addCStringTest(root);
|
||||
addMemoryStreamTest(root);
|
||||
}
|
||||
|
||||
|
567
icu4c/source/test/cintltst/trietest.c
Normal file
567
icu4c/source/test/cintltst/trietest.c
Normal file
@ -0,0 +1,567 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: trietest.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001nov20
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "utrie.h"
|
||||
#include "cstring.h"
|
||||
|
||||
#if 1
|
||||
#include "cintltst.h"
|
||||
#else
|
||||
/* definitions from standalone utrie development */
|
||||
#define log_err printf
|
||||
#define log_verbose printf
|
||||
|
||||
#undef u_errorName
|
||||
#define u_errorName(errorCode) "some error code"
|
||||
#endif
|
||||
|
||||
/* Number of elements in a true array (not a pointer); the argument is parenthesized at every use. */
#define ARRAY_LENGTH(array) (sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
/* Values for setting possibly overlapping, out-of-order ranges of values */
|
||||
typedef struct SetRange {
|
||||
UChar32 start, limit;
|
||||
uint32_t value;
|
||||
UBool overwrite;
|
||||
} SetRange;
|
||||
|
||||
/*
|
||||
* Values for testing:
|
||||
* value is set from the previous boundary's limit to before
|
||||
* this boundary's limit
|
||||
*/
|
||||
typedef struct CheckRange {
|
||||
UChar32 limit;
|
||||
uint32_t value;
|
||||
} CheckRange;
|
||||
|
||||
static uint8_t storage[100000];
|
||||
|
||||
static uint32_t U_CALLCONV
|
||||
_testFoldedValue32(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t foldedValue, value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
foldedValue=0;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
foldedValue|=value;
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
if(foldedValue!=0) {
|
||||
return ((uint32_t)offset<<16)|foldedValue;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
_testFoldingOffset32(uint32_t data) {
|
||||
return (int32_t)(data>>16);
|
||||
}
|
||||
|
||||
static uint32_t U_CALLCONV
|
||||
_testFoldedValue16(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t foldedValue, value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
foldedValue=0;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
foldedValue|=value;
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
if(foldedValue!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static int32_t U_CALLCONV
|
||||
_testFoldingOffset16(uint32_t data) {
|
||||
if(data&0x8000) {
|
||||
return (int32_t)(data&0x7fff);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t U_CALLCONV
|
||||
_testEnumValue(const void *context, uint32_t value) {
|
||||
return value^0x5555;
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
_testEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
|
||||
const CheckRange **pb=(const CheckRange **)context;
|
||||
const CheckRange *b=(*pb)++;
|
||||
|
||||
value^=0x5555;
|
||||
if(start!=(b-1)->limit || limit!=b->limit || value!=b->value) {
|
||||
log_err("error: utrie_enum() delivers wrong range [U+%04lx..U+%04lx[.0x%lx instead of [U+%04lx..U+%04lx[.0x%lx\n",
|
||||
start, limit, value,
|
||||
(b-1)->limit, b->limit, b->value);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
testTrieIteration(const char *testName,
|
||||
const UTrie *trie,
|
||||
const CheckRange checkRanges[], int32_t countCheckRanges) {
|
||||
UChar s[100];
|
||||
uint32_t values[30];
|
||||
|
||||
const UChar *p, *limit;
|
||||
|
||||
uint32_t value;
|
||||
UChar32 c;
|
||||
int32_t i, length, countValues;
|
||||
UChar c2;
|
||||
|
||||
/* write a string */
|
||||
length=countValues=0;
|
||||
for(i=0; i<countCheckRanges; ++i) {
|
||||
c=checkRanges[i].limit;
|
||||
if(c!=0) {
|
||||
--c;
|
||||
UTF_APPEND_CHAR_UNSAFE(s, length, c);
|
||||
values[countValues++]=checkRanges[i].value;
|
||||
}
|
||||
}
|
||||
limit=s+length;
|
||||
|
||||
/* try forward */
|
||||
p=s;
|
||||
i=0;
|
||||
while(p<limit) {
|
||||
c=c2=0x33;
|
||||
if(trie->data32!=NULL) {
|
||||
UTRIE_NEXT32(trie, p, limit, c, c2, value);
|
||||
} else {
|
||||
UTRIE_NEXT16(trie, p, limit, c, c2, value);
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE_NEXT(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, c, c2, value, values[i]);
|
||||
}
|
||||
if(
|
||||
c2==0 ?
|
||||
c!=*(p-1) :
|
||||
!UTF_IS_LEAD(c) || !UTF_IS_TRAIL(c2) || c!=*(p-2) || c2!=*(p-1)
|
||||
) {
|
||||
log_err("error: wrong (c, c2) from UTRIE_NEXT(%s): (U+%04lx, U+%04lx)\n",
|
||||
testName, c, c2);
|
||||
continue;
|
||||
}
|
||||
if(c2!=0) {
|
||||
int32_t offset;
|
||||
|
||||
if(trie->data32==NULL) {
|
||||
value=UTRIE_GET16_FROM_LEAD(trie, c);
|
||||
offset=trie->getFoldingOffset(value);
|
||||
if(offset>0) {
|
||||
value=UTRIE_GET16_FROM_OFFSET_TRAIL(trie, offset, c2);
|
||||
} else {
|
||||
value=0;
|
||||
}
|
||||
} else {
|
||||
value=UTRIE_GET32_FROM_LEAD(trie, c);
|
||||
offset=trie->getFoldingOffset(value);
|
||||
if(offset>0) {
|
||||
value=UTRIE_GET32_FROM_OFFSET_TRAIL(trie, offset, c2);
|
||||
} else {
|
||||
value=0;
|
||||
}
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE_GETXX_FROM_OFFSET_TRAIL(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, c, c2, value, values[i]);
|
||||
}
|
||||
}
|
||||
if(c2!=0) {
|
||||
value=0x44;
|
||||
if(trie->data32==NULL) {
|
||||
UTRIE_GET16_FROM_PAIR(trie, c, c2, value);
|
||||
} else {
|
||||
UTRIE_GET32_FROM_PAIR(trie, c, c2, value);
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE_GETXX_FROM_PAIR(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, c, c2, value, values[i]);
|
||||
}
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
/* try backward */
|
||||
p=limit;
|
||||
i=countValues;
|
||||
while(s<p) {
|
||||
--i;
|
||||
c=c2=0x33;
|
||||
if(trie->data32!=NULL) {
|
||||
UTRIE_PREVIOUS32(trie, s, p, c, c2, value);
|
||||
} else {
|
||||
UTRIE_PREVIOUS16(trie, s, p, c, c2, value);
|
||||
}
|
||||
if(value!=values[i]) {
|
||||
log_err("error: wrong value from UTRIE_PREVIOUS(%s)(U+%04lx, U+%04lx): 0x%lx instead of 0x%lx\n",
|
||||
testName, c, c2, value, values[i]);
|
||||
}
|
||||
if(
|
||||
c2==0 ?
|
||||
c!=*p:
|
||||
!UTF_IS_LEAD(c) || !UTF_IS_TRAIL(c2) || c!=*p || c2!=*(p+1)
|
||||
) {
|
||||
log_err("error: wrong (c, c2) from UTRIE_PREVIOUS(%s): (U+%04lx, U+%04lx)\n",
|
||||
testName, c, c2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
testTrieRanges(const char *testName,
|
||||
const SetRange setRanges[], int32_t countSetRanges,
|
||||
const CheckRange checkRanges[], int32_t countCheckRanges,
|
||||
UBool dataIs32, UBool latin1Linear) {
|
||||
UTrieGetFoldingOffset *getFoldingOffset;
|
||||
const CheckRange *enumRanges;
|
||||
UNewTrie *newTrie;
|
||||
UTrie trie={ 0 };
|
||||
uint32_t value, value2;
|
||||
UChar32 start, limit;
|
||||
int32_t i, length;
|
||||
UErrorCode errorCode;
|
||||
UBool overwrite, ok;
|
||||
|
||||
log_verbose("\ntesting Trie '%s'\n", testName);
|
||||
newTrie=utrie_open(NULL, NULL, 2000, latin1Linear);
|
||||
|
||||
/* set values from setRanges[] */
|
||||
ok=TRUE;
|
||||
for(i=0; i<countSetRanges; ++i) {
|
||||
start=setRanges[i].start;
|
||||
limit=setRanges[i].limit;
|
||||
value=setRanges[i].value;
|
||||
overwrite=setRanges[i].overwrite;
|
||||
if((limit-start)==1 && overwrite) {
|
||||
ok&=utrie_set32(newTrie, start, value);
|
||||
} else {
|
||||
ok&=utrie_setRange32(newTrie, start, limit, value, overwrite);
|
||||
}
|
||||
}
|
||||
if(!ok) {
|
||||
log_err("error: setting values into a trie failed (%s)\n", testName);
|
||||
return;
|
||||
}
|
||||
|
||||
/* verify that all these values are in the new Trie */
|
||||
start=0;
|
||||
for(i=0; i<countCheckRanges; ++i) {
|
||||
limit=checkRanges[i].limit;
|
||||
value=checkRanges[i].value;
|
||||
|
||||
while(start<limit) {
|
||||
if(value!=utrie_get32(newTrie, start, NULL)) {
|
||||
log_err("error: newTrie(%s)[U+%04lx]==0x%lx instead of 0x%lx\n",
|
||||
testName, start, utrie_get32(newTrie, start, NULL), value);
|
||||
}
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
if(dataIs32) {
|
||||
getFoldingOffset=_testFoldingOffset32;
|
||||
} else {
|
||||
getFoldingOffset=_testFoldingOffset16;
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
length=utrie_serialize(newTrie, storage, sizeof(storage),
|
||||
dataIs32 ? _testFoldedValue32 : _testFoldedValue16,
|
||||
(UBool)!dataIs32,
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_err("error: utrie_serialize(%s) failed: %s\n", testName, u_errorName(errorCode));
|
||||
utrie_close(newTrie);
|
||||
return;
|
||||
}
|
||||
|
||||
/* test linear Latin-1 range from utrie_getData() */
|
||||
if(latin1Linear) {
|
||||
uint32_t *data;
|
||||
int32_t dataLength;
|
||||
|
||||
data=utrie_getData(newTrie, &dataLength);
|
||||
start=0;
|
||||
for(i=0; i<countCheckRanges && start<=0xff; ++i) {
|
||||
limit=checkRanges[i].limit;
|
||||
value=checkRanges[i].value;
|
||||
|
||||
while(start<limit && start<=0xff) {
|
||||
if(value!=data[UTRIE_DATA_BLOCK_LENGTH+start]) {
|
||||
log_err("error: newTrie(%s).latin1Data[U+%04lx]==0x%lx instead of 0x%lx\n",
|
||||
testName, start, data[UTRIE_DATA_BLOCK_LENGTH+start], value);
|
||||
}
|
||||
++start;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
utrie_close(newTrie);
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
if(!utrie_unserialize(&trie, storage, length, &errorCode)) {
|
||||
log_err("error: utrie_unserialize() failed, %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
trie.getFoldingOffset=getFoldingOffset;
|
||||
|
||||
if(dataIs32!=(trie.data32!=NULL)) {
|
||||
log_err("error: trie serialization (%s) did not preserve 32-bitness\n", testName);
|
||||
}
|
||||
if(latin1Linear!=trie.isLatin1Linear) {
|
||||
log_err("error: trie serialization (%s) did not preserve Latin-1-linearity\n", testName);
|
||||
}
|
||||
|
||||
/* verify that all these values are in the unserialized Trie */
|
||||
start=0;
|
||||
for(i=0; i<countCheckRanges; ++i) {
|
||||
limit=checkRanges[i].limit;
|
||||
value=checkRanges[i].value;
|
||||
|
||||
if(start==0xd800) {
|
||||
/* skip surrogates */
|
||||
start=limit;
|
||||
continue;
|
||||
}
|
||||
|
||||
while(start<limit) {
|
||||
if(start<=0xffff) {
|
||||
if(dataIs32) {
|
||||
value2=UTRIE_GET32_FROM_BMP(&trie, start);
|
||||
} else {
|
||||
value2=UTRIE_GET16_FROM_BMP(&trie, start);
|
||||
}
|
||||
if(value!=value2) {
|
||||
log_err("error: unserialized trie(%s).fromBMP(U+%04lx)==0x%lx instead of 0x%lx\n",
|
||||
testName, start, value2, value);
|
||||
}
|
||||
if(!UTF_IS_LEAD(start)) {
|
||||
if(dataIs32) {
|
||||
value2=UTRIE_GET32_FROM_LEAD(&trie, start);
|
||||
} else {
|
||||
value2=UTRIE_GET16_FROM_LEAD(&trie, start);
|
||||
}
|
||||
if(value!=value2) {
|
||||
log_err("error: unserialized trie(%s).fromLead(U+%04lx)==0x%lx instead of 0x%lx\n",
|
||||
testName, start, value2, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(dataIs32) {
|
||||
UTRIE_GET32(&trie, start, value2);
|
||||
} else {
|
||||
UTRIE_GET16(&trie, start, value2);
|
||||
}
|
||||
if(value!=value2) {
|
||||
log_err("error: unserialized trie(%s).get(U+%04lx)==0x%lx instead of 0x%lx\n",
|
||||
testName, start, value2, value);
|
||||
}
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
/* enumerate and verify all ranges */
|
||||
enumRanges=checkRanges+1;
|
||||
utrie_enum(&trie, _testEnumValue, _testEnumRange, &enumRanges);
|
||||
|
||||
/* test linear Latin-1 range */
|
||||
if(trie.isLatin1Linear) {
|
||||
if(trie.data32!=NULL) {
|
||||
const uint32_t *latin1=UTRIE_GET32_LATIN1(&trie);
|
||||
|
||||
for(start=0; start<0x100; ++start) {
|
||||
if(latin1[start]!=UTRIE_GET32_FROM_LEAD(&trie, start)) {
|
||||
log_err("error: (%s) trie.latin1[U+%04lx]=0x%lx!=0x%lx=trie.get32(U+%04lx)\n",
|
||||
testName, start, latin1[start], UTRIE_GET32_FROM_LEAD(&trie, start), start);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const uint16_t *latin1=UTRIE_GET16_LATIN1(&trie);
|
||||
|
||||
for(start=0; start<0x100; ++start) {
|
||||
if(latin1[start]!=UTRIE_GET16_FROM_LEAD(&trie, start)) {
|
||||
log_err("error: (%s) trie.latin1[U+%04lx]=0x%lx!=0x%lx=trie.get16(U+%04lx)\n",
|
||||
testName, start, latin1[start], UTRIE_GET16_FROM_LEAD(&trie, start), start);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
testTrieIteration(testName, &trie, checkRanges, countCheckRanges);
|
||||
}
|
||||
|
||||
static void
|
||||
testTrieRanges2(const char *testName,
|
||||
const SetRange setRanges[], int32_t countSetRanges,
|
||||
const CheckRange checkRanges[], int32_t countCheckRanges,
|
||||
UBool dataIs32) {
|
||||
char name[40];
|
||||
|
||||
testTrieRanges(testName,
|
||||
setRanges, countSetRanges,
|
||||
checkRanges, countCheckRanges,
|
||||
dataIs32, FALSE);
|
||||
|
||||
uprv_strcpy(name, testName);
|
||||
uprv_strcat(name, "-latin1Linear");
|
||||
testTrieRanges(name,
|
||||
setRanges, countSetRanges,
|
||||
checkRanges, countCheckRanges,
|
||||
dataIs32, TRUE);
|
||||
}
|
||||
|
||||
static void
|
||||
testTrieRanges4(const char *testName,
|
||||
const SetRange setRanges[], int32_t countSetRanges,
|
||||
const CheckRange checkRanges[], int32_t countCheckRanges) {
|
||||
char name[40];
|
||||
|
||||
uprv_strcpy(name, testName);
|
||||
uprv_strcat(name, ".32");
|
||||
testTrieRanges2(name,
|
||||
setRanges, countSetRanges,
|
||||
checkRanges, countCheckRanges,
|
||||
TRUE);
|
||||
|
||||
uprv_strcpy(name, testName);
|
||||
uprv_strcat(name, ".16");
|
||||
testTrieRanges2(name,
|
||||
setRanges, countSetRanges,
|
||||
checkRanges, countCheckRanges,
|
||||
FALSE);
|
||||
}
|
||||
|
||||
/* test data ----------------------------------------------------------------*/
|
||||
|
||||
/* set consecutive ranges, even with value 0 */
|
||||
static const SetRange
|
||||
setRanges1[]={
|
||||
0, 0x20, 0, FALSE,
|
||||
0x20, 0xa7, 0x1234, FALSE,
|
||||
0xa7, 0x3400, 0, FALSE,
|
||||
0x3400, 0x9fa6, 0x6162, FALSE,
|
||||
0x9fa6, 0xdada, 0x3132, FALSE,
|
||||
0xdada, 0xeeee, 0x27, FALSE,
|
||||
0xeeee, 0x11111, 1, FALSE,
|
||||
0x11111, 0x44444, 0x6162, FALSE,
|
||||
0x44444, 0xf0003, 0, FALSE,
|
||||
0xf0003, 0xf0004, 0xf, FALSE,
|
||||
0xf0004, 0xf0006, 0x10, FALSE,
|
||||
0xf0006, 0xf0007, 0x11, FALSE,
|
||||
0xf0007, 0xf0020, 0x12, FALSE,
|
||||
0xf0020, 0x110000, 0, FALSE
|
||||
};
|
||||
|
||||
static const CheckRange
|
||||
checkRanges1[]={
|
||||
0, 0, /* dummy start range to make _testEnumRange() simpler */
|
||||
0x20, 0,
|
||||
0xa7, 0x1234,
|
||||
0x3400, 0,
|
||||
0x9fa6, 0x6162,
|
||||
0xdada, 0x3132,
|
||||
0xeeee, 0x27,
|
||||
0x11111,1,
|
||||
0x44444,0x6162,
|
||||
0xf0003,0,
|
||||
0xf0004,0xf,
|
||||
0xf0006,0x10,
|
||||
0xf0007,0x11,
|
||||
0xf0020,0x12,
|
||||
0x110000, 0
|
||||
};
|
||||
|
||||
/* set some interesting overlapping ranges */
|
||||
static const SetRange
|
||||
setRanges2[]={
|
||||
0x21, 0x7f, 0x5555, TRUE,
|
||||
0x2f800,0x2fedc, 0x7a, TRUE,
|
||||
0x72, 0xdd, 3, TRUE,
|
||||
0xdd, 0xde, 4, FALSE,
|
||||
0x2f987,0x2fa98, 5, TRUE,
|
||||
0x2f777,0x2f833, 0, TRUE,
|
||||
0x2f900,0x2ffee, 1, FALSE,
|
||||
0x2ffee,0x2ffef, 2, TRUE
|
||||
};
|
||||
|
||||
static const CheckRange
|
||||
checkRanges2[]={
|
||||
0, 0, /* dummy start range to make _testEnumRange() simpler */
|
||||
0x21, 0,
|
||||
0x72, 0x5555,
|
||||
0xdd, 3,
|
||||
0xde, 4,
|
||||
0x2f833,0,
|
||||
0x2f987,0x7a,
|
||||
0x2fa98,5,
|
||||
0x2fedc,0x7a,
|
||||
0x2ffee,1,
|
||||
0x2ffef,2,
|
||||
0x110000, 0
|
||||
};
|
||||
|
||||
static void
|
||||
TrieTest() {
|
||||
testTrieRanges4("set1",
|
||||
setRanges1, ARRAY_LENGTH(setRanges1),
|
||||
checkRanges1, ARRAY_LENGTH(checkRanges1));
|
||||
testTrieRanges4("set2",
|
||||
setRanges2, ARRAY_LENGTH(setRanges2),
|
||||
checkRanges2, ARRAY_LENGTH(checkRanges2));
|
||||
}
|
||||
|
||||
#if 1
|
||||
void
|
||||
addTrieTest(TestNode** root) {
|
||||
addTest(root, &TrieTest, "tsutil/TrieTest");
|
||||
}
|
||||
#else
|
||||
/* standalone utrie development */
|
||||
int main(int argc, const char *argv[]) {
|
||||
TrieTest();
|
||||
return 0;
|
||||
}
|
||||
#endif
|
@ -180,10 +180,6 @@ main(int argc, char* argv[]) {
|
||||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
compactProps();
|
||||
compactStage3();
|
||||
compactStage2();
|
||||
|
||||
/* write the properties data file */
|
||||
generateData(destDir);
|
||||
}
|
||||
|
@ -74,15 +74,6 @@ addProps(uint32_t c, uint32_t props);
|
||||
extern void
|
||||
repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
||||
|
||||
extern void
|
||||
compactStage2(void);
|
||||
|
||||
extern void
|
||||
compactStage3(void);
|
||||
|
||||
extern void
|
||||
compactProps(void);
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir);
|
||||
|
||||
|
@ -24,6 +24,7 @@
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "utrie.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "genprops.h"
|
||||
@ -40,8 +41,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
||||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 1.1 .
|
||||
|
||||
The following is a description of format version 2.0 .
|
||||
|
||||
Data contents:
|
||||
|
||||
@ -53,39 +53,28 @@ the properties, if any, for that code point. This means that the input
|
||||
to the lookup are 21-bit unsigned integers, with not all of the
|
||||
21-bit range used.
|
||||
|
||||
It is assumed that client code keeps a uint16_t pointer
|
||||
It is assumed that client code keeps a uint32_t pointer
|
||||
to the beginning of the data:
|
||||
|
||||
const uint16 *p16;
|
||||
|
||||
Some indexes assume 32-bit units; although client code should only
|
||||
cast the above pointer to (const uint32_t *), it is easier here
|
||||
to talk about the result of the indexing with the definition of
|
||||
another pointer variable for this:
|
||||
|
||||
const uint32_t *p32=(const uint32_t *)p16;
|
||||
const uint32_t *p32;
|
||||
|
||||
Formally, the file contains the following structures:
|
||||
|
||||
A0 const uint16_t STAGE_2_BITS(=6);
|
||||
A1 const uint16_t STAGE_3_BITS(=4);
|
||||
(STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS))
|
||||
A2 const uint16_t exceptionsIndex; -- 32-bit unit index
|
||||
A3 const uint16_t stage3Index; -- 16-bit unit index of stage3, new in formatVersion 1.1
|
||||
A4 const uint16_t propsIndex; -- 32-bit unit index, new in formatVersion 1.1
|
||||
A5 const uint16_t exceptionsTopIndex; -- 32-bit unit index to the first unit after exceptions units, new in formatVersion 1.1
|
||||
A6 const uint16_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special casing
|
||||
A7 const uint16_t reservedIndex;
|
||||
indexes[16] with values i0..i15:
|
||||
|
||||
S1 const uint16_t stage1[0x440]; -- 0x440=0x110000>>10
|
||||
S2 const uint16_t stage2[variable size];
|
||||
S3 const uint16_t stage3[variable size];
|
||||
(possible 1*uint16_t for padding to 4-alignment)
|
||||
i0 const int32_t propsIndex; -- 32-bit unit index to the table of 32-bit properties words
|
||||
i1 const int32_t exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
|
||||
i2 const int32_t exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
|
||||
i3 const int32_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special mappings
|
||||
i4..i15 const int32_t[] reservedIndex; -- reserved values; 0 for now
|
||||
|
||||
P const uint32_t props32[variable size];
|
||||
E const uint32_t exceptions[variable size];
|
||||
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
|
||||
|
||||
3-stage lookup and properties:
|
||||
P const uint32_t props32[i1-i0];
|
||||
E const uint32_t exceptions[i2-i1];
|
||||
U const UChar uchars[2*(i3-i2)];
|
||||
|
||||
Trie lookup and properties:
|
||||
|
||||
In order to condense the data for the 21-bit code space, several properties of
|
||||
the Unicode code assignment are exploited:
|
||||
@ -95,56 +84,30 @@ the Unicode code assignment are exploited:
|
||||
- Inside blocks for scripts the properties are often repetitive.
|
||||
- The 21-bit space is not fully used for Unicode.
|
||||
|
||||
The three-stage lookup organizes code points in groups of 16 in stage 3.
|
||||
64 such groups are grouped again, resulting in blocks of 64 indexes
|
||||
for a total of 1k code points in stage 2.
|
||||
The first stage is limited according to all code points being <0x110000.
|
||||
Each stage contains indexes to groups or blocks of the next stage
|
||||
in an n:1 manner, i.e., multiple entries of one stage may index the same
|
||||
group or block in the next one.
|
||||
In the second and third stages, groups of 64 or 16 may partially or completely
|
||||
overlap to save space with repetitive properties.
|
||||
In the properties table, only unique 32-bit words are stored to exploit
|
||||
non-adjacent overlapping. This is why the third stage does not directly
|
||||
contain the 32-bit properties words but only indexes to them.
|
||||
The lookup of properties for a given code point is done with a trie lookup,
|
||||
using the UTrie implementation.
|
||||
The trie lookup result is a 16-bit index in the props32[] table where the
|
||||
actual 32-bit properties word is stored. This is done to save space.
|
||||
|
||||
The indexes in each stage take the offset in the data of the next block into
|
||||
account to save additional arithmetic in the access.
|
||||
(There are thousands of 16-bit entries in the trie data table, but
|
||||
only a few hundred unique 32-bit properties words.
|
||||
If the trie data table contained 32-bit words directly, then that would be
|
||||
larger because the length of the table would be the same as now but the
|
||||
width would be 32 bits instead of 16. This saves more than 10kB.)
|
||||
|
||||
With a given Unicode code point
|
||||
|
||||
uint32_t c;
|
||||
UChar32 c;
|
||||
|
||||
and 0<=c<0x110000, the lookup uses the three stage tables to
|
||||
arrive at an index into the props32[] table containing the character
|
||||
properties for c.
|
||||
For some characters, not all of the properties can be efficiently encoded
|
||||
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
|
||||
array.
|
||||
|
||||
The first stage consumes the 11 most significant bits of the 21-bit code point
|
||||
and results in an index into the second stage:
|
||||
|
||||
uint16_t i2=p16[8+c>>10];
|
||||
|
||||
The second stage consumes bits 9 to 4 of c and results in an index into the
|
||||
third stage:
|
||||
|
||||
uint16_t i3=p16[i2+((c>>4)&0x3f)];
|
||||
|
||||
The third stage consumes bits 3 to 0 of c and results in a code point-
|
||||
specific value, which itself is only an index into the props32[] table:
|
||||
|
||||
uint16_t i=p16[i3+(c&0xf)];
|
||||
|
||||
Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS
|
||||
in p16[0..1].
|
||||
|
||||
There is finally the 32-bit encoded set of properties for c:
|
||||
and 0<=c<0x110000, the lookup is done like this:
|
||||
|
||||
uint16_t i;
|
||||
UTRIE_GET16(&trie, c, i);
|
||||
uint32_t props=p32[i];
|
||||
|
||||
For some characters, this contains an index into the exceptions array:
|
||||
For some characters, not all of the properties can be efficiently encoded
|
||||
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
|
||||
array:
|
||||
|
||||
if(props&EXCEPTION_BIT) {
|
||||
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
|
||||
@ -280,31 +243,16 @@ static UDataInfo dataInfo={
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{0x55, 0x50, 0x72, 0x6f}, /* dataFormat="UPro" */
|
||||
{1, 3, 0, 0}, /* formatVersion */
|
||||
{3, 0, 0, 0} /* dataVersion */
|
||||
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
|
||||
{ 2, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 3, 0, 0, 0 } /* dataVersion */
|
||||
};
|
||||
|
||||
/* definitions and arrays for the 3-stage lookup */
|
||||
/* definitions of expected data size limits */
|
||||
enum {
|
||||
STAGE_2_BITS=6, STAGE_3_BITS=4,
|
||||
STAGE_1_BITS=21-(STAGE_2_BITS+STAGE_3_BITS),
|
||||
|
||||
STAGE_2_SHIFT=STAGE_3_BITS,
|
||||
STAGE_1_SHIFT=(STAGE_2_SHIFT+STAGE_2_BITS),
|
||||
|
||||
/* number of entries per sub-table in each stage */
|
||||
STAGE_1_BLOCK=0x110000>>STAGE_1_SHIFT,
|
||||
STAGE_2_BLOCK=1<<STAGE_2_BITS,
|
||||
STAGE_3_BLOCK=1<<STAGE_3_BITS,
|
||||
|
||||
/* number of code points per stage 1 index */
|
||||
STAGE_2_3_AREA=1<<STAGE_1_SHIFT,
|
||||
|
||||
MAX_PROPS_COUNT=25000,
|
||||
MAX_UCHAR_COUNT=10000,
|
||||
MAX_EXCEPTIONS_COUNT=4096,
|
||||
MAX_STAGE_2_COUNT=MAX_PROPS_COUNT
|
||||
MAX_EXCEPTIONS_COUNT=4096
|
||||
};
|
||||
|
||||
/* definitions for the properties words */
|
||||
@ -321,15 +269,16 @@ enum {
|
||||
static const int32_t MAX_VALUE=(1L<<(VALUE_BITS-1))-1;
|
||||
static const int32_t MIN_VALUE=-(1L<<(VALUE_BITS-1));
|
||||
|
||||
static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
|
||||
stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
|
||||
static UNewTrie *pTrie=NULL;
|
||||
|
||||
/* stage1Top=STAGE_1_BLOCK never changes, stage2Top starts after the empty-properties-group */
|
||||
static uint16_t stage2Top=STAGE_2_BLOCK, stage3Top;
|
||||
/* props32[] contains unique properties words after compacting the array of properties */
|
||||
static uint32_t props32[MAX_PROPS_COUNT];
|
||||
|
||||
/* props[] is used before, props32[] after compacting the array of properties */
|
||||
static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
|
||||
static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
|
||||
/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
|
||||
static uint32_t *props;
|
||||
|
||||
/* length of props32[] after compaction */
|
||||
static int32_t propsTop;
|
||||
|
||||
/* exceptions values */
|
||||
static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
|
||||
@ -344,36 +293,9 @@ static uint16_t exceptionsCount=0;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x);
|
||||
|
||||
static void
|
||||
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x);
|
||||
|
||||
static uint16_t
|
||||
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
|
||||
uint16_t *parent, uint16_t parentTop);
|
||||
|
||||
static int
|
||||
compareProps(const void *l, const void *r);
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
static uint32_t
|
||||
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4);
|
||||
|
||||
static uint32_t
|
||||
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
|
||||
#endif
|
||||
|
||||
static void
|
||||
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3);
|
||||
|
||||
static uint16_t
|
||||
allocStage2(void);
|
||||
|
||||
static uint16_t
|
||||
allocProps(void);
|
||||
|
||||
static uint32_t
|
||||
addUChars(const UChar *s, uint32_t length);
|
||||
|
||||
@ -388,11 +310,12 @@ setUnicodeVersion(const char *v) {
|
||||
|
||||
extern void
|
||||
initStore() {
|
||||
uprv_memset(stage1, 0, sizeof(stage1));
|
||||
uprv_memset(stage2, 0, sizeof(stage2));
|
||||
uprv_memset(stage3, 0, sizeof(stage3));
|
||||
uprv_memset(map, 0, sizeof(map));
|
||||
uprv_memset(props, 0, sizeof(props));
|
||||
pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, FALSE);
|
||||
if(pTrie==NULL) {
|
||||
fprintf(stderr, "error: unable to create a UNewTrie\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
uprv_memset(props32, 0, sizeof(props32));
|
||||
}
|
||||
|
||||
@ -702,292 +625,19 @@ makeProps(Props *p) {
|
||||
|
||||
extern void
|
||||
addProps(uint32_t c, uint32_t x) {
|
||||
uint16_t notUsed;
|
||||
|
||||
setProps(c, x, &notUsed, &notUsed, &notUsed);
|
||||
utrie_set32(pTrie, (UChar32)c, x);
|
||||
}
|
||||
|
||||
/* areas of same properties ------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
||||
/*
|
||||
* Set the repetitive properties for the big, known areas of all the same
|
||||
* character properties. Most of those will share the same stage 2 and 3
|
||||
* tables.
|
||||
*
|
||||
* Assumptions:
|
||||
* - each area starts at a code point that is a multiple of 16
|
||||
* - there may be some properties already stored for some code points,
|
||||
* especially in the Private Use areas
|
||||
*/
|
||||
|
||||
uint16_t i1, i2, j3, i1Limit, i2Repeat, i3Repeat;
|
||||
uint32_t start, next, limit;
|
||||
|
||||
/* fill in the repetitive properties */
|
||||
start=first;
|
||||
limit=last+1;
|
||||
|
||||
/* allocate a stage 3 block and set all of its properties to x */
|
||||
i3Repeat=allocProps();
|
||||
for(j3=0; j3<STAGE_3_BLOCK; ++j3) {
|
||||
props[i3Repeat+j3]=x;
|
||||
}
|
||||
|
||||
/* we will need to allocate a stage 2 block if we use an entire one at all */
|
||||
i2Repeat=0;
|
||||
|
||||
i1=(uint16_t)(start>>STAGE_1_SHIFT);
|
||||
i1Limit=(uint16_t)(limit>>STAGE_1_SHIFT);
|
||||
|
||||
/*
|
||||
* now there are up to three sub-areas:
|
||||
* - a range of code points before the first full block for
|
||||
* one stage 1 index
|
||||
* - a (big) range of code points within full blocks for
|
||||
* stage 1 indexes
|
||||
* - a range of code points after the last full block for
|
||||
* one stage 1 index
|
||||
*/
|
||||
|
||||
if((start&(STAGE_2_3_AREA-1))!=0) {
|
||||
/* incomplete stage 2 block at the beginning */
|
||||
/* allocate the stage 2 block if necessary */
|
||||
i2=stage1[i1];
|
||||
if(i2==0) {
|
||||
stage1[i1]=i2=allocStage2();
|
||||
}
|
||||
|
||||
/* fill stages 2 & 3 of this sub-area */
|
||||
if(i1<i1Limit) {
|
||||
/* the stage 2 block goes to the end */
|
||||
next=(i1+1)<<STAGE_1_SHIFT;
|
||||
repeatFromStage2(i2, start, next, i3Repeat, x);
|
||||
start=next;
|
||||
|
||||
/* advance i1 to the first full block */
|
||||
++i1;
|
||||
} else {
|
||||
/* there is only one stage 2 block at all */
|
||||
repeatFromStage2(i2, start, limit, i3Repeat, x);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
while(i1<i1Limit) {
|
||||
/* fill complete stage 2 blocks */
|
||||
next=start+STAGE_2_3_AREA;
|
||||
i2=stage1[i1];
|
||||
if(i2==0) {
|
||||
/* set the index for common repeat block for stage 2 */
|
||||
if(i2Repeat==0) {
|
||||
/* allocate and fill a stage 2 block for this */
|
||||
uint16_t j2;
|
||||
|
||||
i2Repeat=allocStage2();
|
||||
for(j2=0; j2<STAGE_2_BLOCK; ++j2) {
|
||||
stage2[i2Repeat+j2]=i3Repeat;
|
||||
}
|
||||
}
|
||||
stage1[i1]=i2Repeat;
|
||||
} else {
|
||||
repeatFromStage2(i2, start, next, i3Repeat, x);
|
||||
}
|
||||
start=next;
|
||||
++i1;
|
||||
}
|
||||
|
||||
if(start<limit) {
|
||||
/* fill the area after the last full block */
|
||||
i2=stage1[i1];
|
||||
if(i2==0) {
|
||||
stage1[i1]=i2=allocStage2();
|
||||
}
|
||||
|
||||
repeatFromStage2(i2, start, limit, i3Repeat, x);
|
||||
}
|
||||
}
|
||||
|
||||
/* set a section of a stage 2 table and its properties to x */
|
||||
static void
|
||||
repeatFromStage2(uint16_t i2, uint32_t start, uint32_t limit, uint16_t i3Repeat, uint32_t x) {
|
||||
uint32_t next;
|
||||
uint16_t i2Limit, i3;
|
||||
|
||||
/* remove irrelevant bits from start and limit */
|
||||
start&=STAGE_2_3_AREA-1;
|
||||
limit=((limit-1)&(STAGE_2_3_AREA-1))+1;
|
||||
|
||||
i2Limit=(uint16_t)(i2+(limit>>STAGE_3_BITS));
|
||||
i2+=(uint16_t)(start>>STAGE_3_BITS);
|
||||
|
||||
/* similar to repeatProps(), there may be 3 sub-areas */
|
||||
if((start&(STAGE_3_BLOCK-1))!=0) {
|
||||
/* incomplete stage 3 block at the beginning */
|
||||
i3=stage2[i2];
|
||||
if(i3==0) {
|
||||
stage2[i2]=i3=allocProps();
|
||||
}
|
||||
|
||||
if(i2<i2Limit) {
|
||||
/* the stage 3 block goes to the end */
|
||||
next=(i2+1)<<STAGE_3_BITS;
|
||||
repeatFromStage3(i3, start, next, x);
|
||||
start=next;
|
||||
++i2;
|
||||
} else {
|
||||
/* there is only one stage 3 block at all */
|
||||
repeatFromStage3(i3, start, limit, x);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
while(i2<i2Limit) {
|
||||
/* fill complete stage 3 blocks */
|
||||
next=start+STAGE_3_BLOCK;
|
||||
i3=stage2[i2];
|
||||
if(i3==0) {
|
||||
stage2[i2]=i3Repeat;
|
||||
} else {
|
||||
repeatFromStage3(i3, start, next, x);
|
||||
}
|
||||
start=next;
|
||||
++i2;
|
||||
}
|
||||
|
||||
if(start<limit) {
|
||||
i3=stage2[i2];
|
||||
if(i3==0) {
|
||||
stage2[i2]=i3=allocProps();
|
||||
}
|
||||
|
||||
repeatFromStage3(i3, start, limit, x);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
repeatFromStage3(uint16_t i3, uint32_t start, uint32_t limit, uint32_t x) {
|
||||
uint16_t i3End;
|
||||
|
||||
i3End=(uint16_t)(i3+((limit-1)&(STAGE_3_BLOCK-1)));
|
||||
i3+=(uint16_t)(start&(STAGE_3_BLOCK-1));
|
||||
|
||||
while(i3<=i3End) {
|
||||
/* some properties may be set in this stage 3 block */
|
||||
if(props[i3]==0) {
|
||||
props[i3]=x;
|
||||
}
|
||||
++i3;
|
||||
}
|
||||
utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, TRUE);
|
||||
}
|
||||
|
||||
/* compacting --------------------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
compactStage2(void) {
|
||||
uint16_t newTop=compactStage(stage2, stage2Top, STAGE_2_BLOCK, stage1, STAGE_1_BLOCK);
|
||||
|
||||
/* we saved some space */
|
||||
if(beVerbose) {
|
||||
printf("compactStage2() reduced stage2Top from %u to %u\n", stage2Top, newTop);
|
||||
}
|
||||
stage2Top=newTop;
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
uint16_t i1, i2, i3, i4;
|
||||
uint32_t c;
|
||||
for(c=0; c<0xffff; c+=307) {
|
||||
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
extern void
|
||||
compactStage3(void) {
|
||||
uint16_t newTop=compactStage(stage3, stage3Top, STAGE_3_BLOCK, stage2, stage2Top);
|
||||
|
||||
/* we saved some space */
|
||||
if(beVerbose) {
|
||||
printf("compactStage3() reduced stage3Top from %u to %u\n", stage3Top, newTop);
|
||||
}
|
||||
stage3Top=newTop;
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
uint16_t i1, i2, i3, i4;
|
||||
uint32_t c;
|
||||
for(c=0; c<0xffff; c+=307) {
|
||||
printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
compactStage(uint16_t *stage, uint16_t stageTop, uint16_t blockSize,
|
||||
uint16_t *parent, uint16_t parentTop) {
|
||||
/*
|
||||
* This function is the common implementation for compacting
|
||||
* a stage table.
|
||||
* There are stageTop entries (indexes) in stage[].
|
||||
* stageTop is a multiple of blockSize, and there are always blockSize stage[] entries
|
||||
* per parent stage entry which do not overlap - yet.
|
||||
* The first blockSize stage[] entries are always the empty ones.
|
||||
* We make the blocks overlap appropriately here and fill every blockSize-th entry in
|
||||
* map[] with the mapping from old to new properties indexes
|
||||
* in order to adjust the parent stage tables.
|
||||
* This simple algorithm does not find arbitrary overlaps, but only those
|
||||
* where the last i entries of the previous block and the first i of the
|
||||
* current one all have the same value.
|
||||
* This seems reasonable and yields linear performance.
|
||||
*/
|
||||
uint16_t i, start, prevEnd, newStart, x;
|
||||
|
||||
map[0]=0;
|
||||
newStart=blockSize;
|
||||
for(start=newStart; start<stageTop;) {
|
||||
prevEnd=(uint16_t)(newStart-1);
|
||||
x=stage[start];
|
||||
if(x==stage[prevEnd]) {
|
||||
/* overlap by at least one */
|
||||
for(i=1; i<blockSize && x==stage[start+i] && x==stage[prevEnd-i]; ++i) {}
|
||||
|
||||
/* overlap by i */
|
||||
map[start]=(uint16_t)(newStart-i);
|
||||
|
||||
/* move the non-overlapping indexes to their new positions */
|
||||
start+=i;
|
||||
for(i=(uint16_t)(blockSize-i); i>0; --i) {
|
||||
stage[newStart++]=stage[start++];
|
||||
}
|
||||
} else if(newStart<start) {
|
||||
/* move the indexes to their new positions */
|
||||
map[start]=newStart;
|
||||
for(i=blockSize; i>0; --i) {
|
||||
stage[newStart++]=stage[start++];
|
||||
}
|
||||
} else /* no overlap && newStart==start */ {
|
||||
map[start]=start;
|
||||
newStart+=blockSize;
|
||||
start=newStart;
|
||||
}
|
||||
}
|
||||
|
||||
/* now adjust the parent stage table */
|
||||
for(i=0; i<parentTop; ++i) {
|
||||
parent[i]=map[parent[i]];
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
return (uint16_t)(stageTop-(start-newStart));
|
||||
}
|
||||
|
||||
extern void
|
||||
static void
|
||||
compactProps(void) {
|
||||
/*
|
||||
* At this point, all the propsTop properties are in props[], but they
|
||||
@ -1003,8 +653,11 @@ compactProps(void) {
|
||||
* index table anyway and qsort() does not allow to sort two tables together
|
||||
* directly. This will thus also reduce the amount of data moved around.
|
||||
*/
|
||||
uint16_t i, oldIndex, newIndex;
|
||||
uint32_t x;
|
||||
int32_t i, oldIndex, newIndex;
|
||||
|
||||
static uint16_t map[MAX_PROPS_COUNT];
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
@ -1016,14 +669,16 @@ compactProps(void) {
|
||||
}
|
||||
#endif
|
||||
|
||||
props=utrie_getData(pTrie, &propsTop);
|
||||
|
||||
/* build the index table */
|
||||
for(i=propsTop; i>0;) {
|
||||
--i;
|
||||
map[i]=i;
|
||||
map[i]=(uint16_t)i;
|
||||
}
|
||||
|
||||
/* do not reorder the first, empty entries */
|
||||
qsort(map+STAGE_3_BLOCK, propsTop-STAGE_3_BLOCK, 2, compareProps);
|
||||
/* reorder */
|
||||
qsort(map, propsTop, 2, compareProps);
|
||||
|
||||
/*
|
||||
* Now invert the reordered table and compact it in the same step.
|
||||
@ -1035,22 +690,22 @@ compactProps(void) {
|
||||
/* set the first of a possible series of the same properties */
|
||||
oldIndex=map[i];
|
||||
props32[newIndex]=x=props[oldIndex];
|
||||
stage3[oldIndex]=newIndex;
|
||||
props[oldIndex]=newIndex;
|
||||
|
||||
/* set the following same properties only in stage3 */
|
||||
while(++i<propsTop && x==props[map[i]]) {
|
||||
stage3[map[i]]=newIndex;
|
||||
props[map[i]]=newIndex;
|
||||
}
|
||||
|
||||
++newIndex;
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
stage3Top=propsTop;
|
||||
propsTop=newIndex;
|
||||
if(beVerbose) {
|
||||
printf("compactProps() reduced propsTop from %u to %u\n", stage3Top, propsTop);
|
||||
printf("compactProps() reduced propsTop from %u to %u\n", propsTop, newIndex);
|
||||
}
|
||||
propsTop=newIndex;
|
||||
|
||||
#if DO_DEBUG_OUT
|
||||
{
|
||||
/* debug output */
|
||||
@ -1077,56 +732,77 @@ compareProps(const void *l, const void *r) {
|
||||
|
||||
/* generate output data ----------------------------------------------------- */
|
||||
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else if(value!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
++start;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir) {
|
||||
static uint16_t indexes[8]={
|
||||
STAGE_2_BITS, STAGE_3_BITS,
|
||||
0, 0,
|
||||
static int32_t indexes[16]={
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0,
|
||||
0, 0, 0, 0
|
||||
};
|
||||
static uint8_t trieBlock[40000];
|
||||
|
||||
UNewDataMemory *pData;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
uint32_t size;
|
||||
int32_t trieSize, offset;
|
||||
long dataLength;
|
||||
uint16_t i, offset;
|
||||
|
||||
/* fix up the indexes in the stage tables to include the table offsets in the data */
|
||||
offset=8+STAGE_1_BLOCK; /* uint16_t offset to stage2[] */
|
||||
for(i=0; i<STAGE_1_BLOCK; ++i) {
|
||||
stage1[i]+=offset;
|
||||
compactProps();
|
||||
|
||||
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), trieSize);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
offset+=stage2Top; /* uint16_t offset to stage3[] */
|
||||
indexes[3]=offset;
|
||||
for(i=0; i<stage2Top; ++i) {
|
||||
stage2[i]+=offset;
|
||||
}
|
||||
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
|
||||
|
||||
offset=(uint16_t)((offset+stage3Top+1)/2); /* uint32_t offset to props[], include padding */
|
||||
indexes[4]=offset; /* uint32_t offset to props[] */
|
||||
|
||||
for(i=0; i<stage3Top; ++i) {
|
||||
stage3[i]+=offset;
|
||||
}
|
||||
/* round up the trie size to a multiple of 4 bytes */
|
||||
trieSize=(trieSize+3)&~3;
|
||||
offset+=trieSize>>2;
|
||||
indexes[0]=offset; /* uint32_t offset to props[] */
|
||||
|
||||
offset+=propsTop;
|
||||
indexes[2]=offset; /* uint32_t offset to exceptions[] */
|
||||
indexes[1]=offset; /* uint32_t offset to exceptions[] */
|
||||
|
||||
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
|
||||
indexes[5]=offset;
|
||||
indexes[2]=offset;
|
||||
|
||||
/* round up the UChar count to an even number so that uchars[] ends 4-aligned */
|
||||
ucharsTop=(ucharsTop+1)&~1;
|
||||
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
|
||||
indexes[6]=offset;
|
||||
indexes[3]=offset;
|
||||
|
||||
size=4*offset; /* total size of data */
|
||||
|
||||
if(beVerbose) {
|
||||
printf("number of stage 2 entries: %5u\n", stage2Top);
|
||||
printf("number of stage 3 entries: %5u\n", stage3Top);
|
||||
printf("trie size in bytes: %5u\n", trieSize);
|
||||
printf("number of unique properties values: %5u\n", propsTop);
|
||||
printf("number of code points with exceptions: %5u\n", exceptionsCount);
|
||||
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
|
||||
printf("number of UChars for special mappings: %5u\n", ucharsTop);
|
||||
printf("data size: %6lu\n", (unsigned long)size);
|
||||
}
|
||||
|
||||
@ -1134,15 +810,12 @@ generateData(const char *dataDir) {
|
||||
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
|
||||
fprintf(stderr, "genprops: unable to create data memory, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, stage1, sizeof(stage1));
|
||||
udata_writeBlock(pData, stage2, 2*stage2Top);
|
||||
udata_writeBlock(pData, stage3, 2*stage3Top);
|
||||
udata_writePadding(pData, 2*((stage2Top+stage3Top)&1));
|
||||
udata_writeBlock(pData, trieBlock, trieSize);
|
||||
udata_writeBlock(pData, props32, 4*propsTop);
|
||||
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
|
||||
udata_writeBlock(pData, uchars, 2*ucharsTop);
|
||||
@ -1163,75 +836,6 @@ generateData(const char *dataDir) {
|
||||
|
||||
/* helpers ------------------------------------------------------------------ */
|
||||
|
||||
/* get properties after compacting them */
|
||||
#if DO_DEBUG_OUT
|
||||
static uint32_t
|
||||
getProps2(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3, uint16_t *pI4) {
|
||||
uint16_t i1, i2, i3, i4;
|
||||
|
||||
*pI1=i1=(uint16_t)(c>>STAGE_1_SHIFT);
|
||||
*pI2=i2=(uint16_t)(stage1[i1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
|
||||
*pI3=i3=(uint16_t)(stage2[i2]+(c&(STAGE_3_BLOCK-1)));
|
||||
*pI4=i4=stage3[i3];
|
||||
return props32[i4];
|
||||
}
|
||||
|
||||
/* get properties before compacting them */
|
||||
static uint32_t
|
||||
getProps(uint32_t c, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
|
||||
uint16_t i1, i2, i3;
|
||||
|
||||
*pI1=i1=(uint16_t)(c>>STAGE_1_SHIFT);
|
||||
*pI2=i2=(uint16_t)(stage1[i1]+((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1)));
|
||||
*pI3=i3=(uint16_t)(stage2[i2]+(c&(STAGE_3_BLOCK-1)));
|
||||
return props[i3];
|
||||
}
|
||||
#endif
|
||||
|
||||
/* set properties before compacting them */
|
||||
static void
|
||||
setProps(uint32_t c, uint32_t x, uint16_t *pI1, uint16_t *pI2, uint16_t *pI3) {
|
||||
uint16_t i1, i2, i3;
|
||||
|
||||
*pI1=i1=(uint16_t)(c>>STAGE_1_SHIFT);
|
||||
|
||||
i2=stage1[i1];
|
||||
if(i2==0) {
|
||||
stage1[i1]=i2=allocStage2();
|
||||
}
|
||||
*pI2=i2+=(uint16_t)((c>>STAGE_2_SHIFT)&(STAGE_2_BLOCK-1));
|
||||
|
||||
i3=stage2[i2];
|
||||
if(i3==0) {
|
||||
stage2[i2]=i3=allocProps();
|
||||
}
|
||||
*pI3=i3+=(uint16_t)(c&(STAGE_3_BLOCK-1));
|
||||
|
||||
props[i3]=x;
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
allocStage2(void) {
|
||||
uint16_t i=stage2Top;
|
||||
stage2Top+=STAGE_2_BLOCK;
|
||||
if(stage2Top>=MAX_STAGE_2_COUNT) {
|
||||
fprintf(stderr, "genprops: stage 2 overflow\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
allocProps(void) {
|
||||
uint16_t i=propsTop;
|
||||
propsTop+=STAGE_3_BLOCK;
|
||||
if(propsTop>=MAX_PROPS_COUNT) {
|
||||
fprintf(stderr, "genprops: properties overflow\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
addUChars(const UChar *s, uint32_t length) {
|
||||
uint32_t top=(uint16_t)(ucharsTop+length);
|
||||
|
Loading…
Reference in New Issue
Block a user