diff --git a/icu4c/source/common/uchar.c b/icu4c/source/common/uchar.c index f56f35fdca..00c1b338cf 100644 --- a/icu4c/source/common/uchar.c +++ b/icu4c/source/common/uchar.c @@ -1,6 +1,6 @@ /* ******************************************************************************** -* Copyright (C) 1996-2004, International Business Machines +* Copyright (C) 1996-2005, International Business Machines * Corporation and others. All Rights Reserved. ******************************************************************************** * @@ -49,8 +49,7 @@ static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; static UVersionInfo dataVersion={ 0, 0, 0, 0 }; static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 }; -static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL, *propsVectors=NULL; -static const UChar *ucharsTable=NULL; +static const uint32_t *pData32=NULL, *propsVectors=NULL; static int32_t countPropsVectors=0, propsVectorsColumns=0; static int8_t havePropsData=0; /* == 0 -> Data has not been loaded. @@ -61,16 +60,6 @@ static int8_t havePropsData=0; /* == 0 -> Data has not been loaded. /* index values loaded from uprops.dat */ static int32_t indexes[UPROPS_INDEX_COUNT]; -/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */ -static int32_t U_CALLCONV -getFoldingPropsOffset(uint32_t data) { - if(data&0x8000) { - return (int32_t)(data&0x7fff); - } else { - return 0; - } -} - static UBool U_CALLCONV isAcceptable(void *context, const char *type, const char *name, @@ -83,7 +72,7 @@ isAcceptable(void *context, pInfo->dataFormat[1]==0x50 && pInfo->dataFormat[2]==0x72 && pInfo->dataFormat[3]==0x6f && - pInfo->formatVersion[0]==3 && + pInfo->formatVersion[0]==4 && pInfo->formatVersion[2]==UTRIE_SHIFT && pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT ) { @@ -102,9 +91,6 @@ static UBool U_CALLCONV uchar_cleanup(void) propsData=NULL; } pData32=NULL; - props32Table=NULL; - exceptionsTable=NULL; - ucharsTable=NULL; propsVectors=NULL; countPropsVectors=0; dataErrorCode=U_ZERO_ERROR; @@ -139,19 +125,12 @@ _openProps(UCharProps *ucp, UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { return; } - ucp->propsTrie.getFoldingOffset=getFoldingPropsOffset; - /* unserialize the properties vectors trie, if any */ - if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 && - p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0 - ) { - length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; - length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); - } else { - ucp->propsVectorsTrie.getFoldingOffset=getFoldingPropsOffset; - } + /* unserialize the properties vectors trie */ + length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4; + length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie)); } } @@ -190,9 +169,6 @@ uprv_loadPropsData(UErrorCode *pErrorCode) { /* initialize some variables */ uprv_memcpy(indexes, pData32, sizeof(indexes)); - props32Table=pData32+indexes[UPROPS_PROPS32_INDEX]; - exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX]; - ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]); /* additional properties */ if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) { @@ -250,7 +226,7 @@ uprops_swap(const UDataSwapper *ds, pInfo->dataFormat[1]==0x50 && pInfo->dataFormat[2]==0x72 && pInfo->dataFormat[3]==0x6f && - pInfo->formatVersion[0]==3 && + (pInfo->formatVersion[0]==3 || pInfo->formatVersion[0]==4) && pInfo->formatVersion[2]==UTRIE_SHIFT && pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT )) { @@ -360,10 +336,8 @@ uprops_swap(const UDataSwapper *ds, /* getting a uint32_t properties word from the data */ #define HAVE_DATA (havePropsData>0 || loadPropsData()>0) -#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA) #define GET_PROPS_UNSAFE(c, result) \ - UTRIE_GET16(&propsTrie, c, result); \ - (result)=props32Table[(result)] + UTRIE_GET16(&propsTrie, c, result); #define GET_PROPS(c, result) \ if(HAVE_DATA) { \ GET_PROPS_UNSAFE(c, result); \ @@ -371,39 +345,6 @@ uprops_swap(const UDataSwapper *ds, (result)=0; \ } -/* finding an exception value */ -#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index))) - -/* number of bits in an 8-bit integer value */ -#define EXC_GROUP 8 -static const uint8_t flagsOffset[256]={ - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 -}; - -#define ADD_EXCEPTION_OFFSET(flags, index, offset) { \ - if((index)>=EXC_GROUP) { \ - (offset)+=flagsOffset[(flags)&((1<>=EXC_GROUP; \ - (index)-=EXC_GROUP; \ - } \ - (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \ -} - U_CFUNC UBool uprv_haveProperties(UErrorCode *pErrorCode) { if(U_FAILURE(*pErrorCode)) { @@ -437,8 +378,7 @@ struct _EnumTypeCallback { static uint32_t U_CALLCONV _enumTypeValue(const void *context, uint32_t value) { - /* access the general category from the 32-bit properties, and those from the 16-bit trie value */ - return GET_CATEGORY(props32Table[value]); + return GET_CATEGORY(value); } static UBool U_CALLCONV @@ -695,114 +635,82 @@ u_isJavaIDPart(UChar32 c) { U_CAPI int32_t U_EXPORT2 u_charDigitValue(UChar32 c) { - uint32_t props, numericType; + uint32_t props; GET_PROPS(c, props); - numericType=GET_NUMERIC_TYPE(props); - if(numericType==1) { - if(!PROPS_VALUE_IS_EXCEPTION(props)) { - return GET_SIGNED_VALUE(props); - } else { - const uint32_t *pe=GET_EXCEPTIONS(props); - uint32_t firstExceptionValue=*pe; - if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) { - int i=EXC_NUMERIC_VALUE; - ++pe; - ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe); - return (int32_t)*pe; - } - } + if(GET_NUMERIC_TYPE(props)==1) { + return GET_NUMERIC_VALUE(props); + } else { + return -1; } - - return -1; } U_CAPI double U_EXPORT2 u_getNumericValue(UChar32 c) { - uint32_t props, numericType; + uint32_t props, numericType, numericValue; GET_PROPS(c, props); numericType=GET_NUMERIC_TYPE(props); - if(numericType==0 || numericType>=(int32_t)U_NT_COUNT) { + if(numericType==0 || numericType>=UPROPS_NT_COUNT) { return U_NO_NUMERIC_VALUE; - } else { - if(!PROPS_VALUE_IS_EXCEPTION(props)) { - return GET_SIGNED_VALUE(props); - } else { - const uint32_t *pe; - uint32_t firstExceptionValue; + } - double numValue; - uint32_t denominator; + numericValue=GET_NUMERIC_VALUE(props); - pe=GET_EXCEPTIONS(props); - firstExceptionValue=*pe++; + if(numericType>UPROPS_FRACTION_NUM_SHIFT; + denominator=(numericValue&UPROPS_FRACTION_DEN_MASK)+UPROPS_FRACTION_DEN_OFFSET; - ADD_EXCEPTION_OFFSET(flags, i, p); - numerator=(int32_t)*p; - - /* - * There are special values for huge numbers that are powers of ten. - * genprops/store.c documents: - * if numericValue=0x7fffff00+x then numericValue=10^x - */ - if(numerator<0x7fffff00) { - numValue=(double)numerator; - } else { - numerator&=0xff; - - /* 10^x without math.h */ - numValue=1.; - while(numerator>=4) { - numValue*=10000.; - numerator-=4; - } - switch(numerator) { - case 3: - numValue*=1000.; - break; - case 2: - numValue*=100.; - break; - case 1: - numValue*=10.; - break; - case 0: - default: - break; - } - } - } else { - numValue=0.; - } - if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DENOMINATOR_VALUE)) { - uint32_t flags=firstExceptionValue; - int i=EXC_DENOMINATOR_VALUE; - const uint32_t *p=pe; - ADD_EXCEPTION_OFFSET(flags, i, p); - denominator=*p; - } else { - denominator=0; - } - - switch(firstExceptionValue&((1UL<0) */ - default: - return U_NO_NUMERIC_VALUE; - } + if(numerator==0) { + numerator=-1; } + return (double)numerator/(double)denominator; + } else /* numericType==UPROPS_NT_LARGE */ { + /* large value with exponent */ + double numValue; + int32_t mant, exp; + + mant=(int32_t)numericValue>>UPROPS_LARGE_MANT_SHIFT; + exp=(int32_t)numericValue&UPROPS_LARGE_EXP_MASK; + if(mant==0) { + mant=1; + exp+=UPROPS_LARGE_EXP_OFFSET_EXTRA; + } else if(mant>9) { + return U_NO_NUMERIC_VALUE; /* reserved mantissa value */ + } else { + exp+=UPROPS_LARGE_EXP_OFFSET; + } + + numValue=mant; + + /* multiply by 10^exp without math.h */ + while(exp>=4) { + numValue*=10000.; + exp-=4; + } + switch(exp) { + case 3: + numValue*=1000.; + break; + case 2: + numValue*=100.; + break; + case 1: + numValue*=10.; + break; + case 0: + default: + break; + } + + return numValue; } } @@ -866,7 +774,6 @@ u_getUnicodeProperties(UChar32 c, int32_t column) { GET_PROPS(c, props); return props; } else if( !HAVE_DATA || countPropsVectors==0 || - (uint32_t)c>0x10ffff || column<0 || column>=propsVectorsColumns ) { return 0; @@ -1069,18 +976,6 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { USET_ADD_CP_AND_NEXT(sa, FIGURESP); USET_ADD_CP_AND_NEXT(sa, NNBSP); - /* add for u_charDigitValue() */ - USET_ADD_CP_AND_NEXT(sa, 0x3007); - USET_ADD_CP_AND_NEXT(sa, 0x4e00); - USET_ADD_CP_AND_NEXT(sa, 0x4e8c); - USET_ADD_CP_AND_NEXT(sa, 0x4e09); - USET_ADD_CP_AND_NEXT(sa, 0x56db); - USET_ADD_CP_AND_NEXT(sa, 0x4e94); - USET_ADD_CP_AND_NEXT(sa, 0x516d); - USET_ADD_CP_AND_NEXT(sa, 0x4e03); - USET_ADD_CP_AND_NEXT(sa, 0x516b); - USET_ADD_CP_AND_NEXT(sa, 0x4e5d); - /* add for u_digit() */ sa->add(sa->set, U_a); sa->add(sa->set, U_z+1); @@ -1096,8 +991,4 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { /* add for UCHAR_GRAPHEME_BASE and others */ USET_ADD_CP_AND_NEXT(sa, CGJ); - - /* add for UCHAR_JOINING_TYPE */ - sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */ - sa->add(sa->set, ZWJ+1); } diff --git a/icu4c/source/common/uprops.h b/icu4c/source/common/uprops.h index 40ab0b4f8b..d96c3a6ca3 100644 --- a/icu4c/source/common/uprops.h +++ b/icu4c/source/common/uprops.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2002-2004, International Business Machines +* Copyright (C) 2002-2005, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -14,7 +14,7 @@ * created by: Markus W. Scherer * * Constants for mostly non-core Unicode character properties -* stored in uprops.dat. +* stored in uprops.icu. */ #ifndef __UPROPS_H__ @@ -23,7 +23,6 @@ #include "unicode/utypes.h" #include "unicode/uset.h" #include "uset_imp.h" -#include "ucase.h" #include "udataswp.h" /* indexes[] entries */ @@ -49,42 +48,44 @@ enum { /* definitions for the main properties words */ enum { /* general category shift==0 0 (5 bits) */ - UPROPS_EXCEPTION_SHIFT=5, /* 5 (1 bit) */ - UPROPS_BIDI_SHIFT, /* 6 (5 bits) */ - UPROPS_MIRROR_SHIFT=UPROPS_BIDI_SHIFT+5, /* 11 (1 bit) */ - UPROPS_NUMERIC_TYPE_SHIFT, /* 12 (3 bits) */ - UPROPS_CASE_SENSITIVE_SHIFT=UPROPS_NUMERIC_TYPE_SHIFT+3,/* 15 (1 bit) format version 3.2 */ - UPROPS_RESERVED_SHIFT, /* 16 (4 bits) */ - UPROPS_VALUE_SHIFT=20, /* 20 */ - - UPROPS_EXCEPTION_BIT=1UL<>UPROPS_BIDI_SHIFT)&0x1f) -#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7) -#define GET_UNSIGNED_VALUE(props) ((props)>>UPROPS_VALUE_SHIFT) -#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>UPROPS_VALUE_SHIFT) -#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props)) - #define CAT_MASK(props) U_MASK(GET_CATEGORY(props)) +#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7) +#define GET_NUMERIC_VALUE(props) (((props)>>UPROPS_NUMERIC_VALUE_SHIFT)&0xff) + +/* internal numeric pseudo-types for special encodings of numeric values */ enum { - EXC_UPPERCASE, - EXC_LOWERCASE, - EXC_TITLECASE, - EXC_UNUSED, - EXC_NUMERIC_VALUE, - EXC_DENOMINATOR_VALUE, - EXC_MIRROR_MAPPING, - EXC_SPECIAL_CASING, - EXC_CASE_FOLDING + UPROPS_NT_FRACTION=4, /* ==U_NT_COUNT, must not change unless binary format version changes */ + UPROPS_NT_LARGE, + UPROPS_NT_COUNT +}; + +/* encoding of fractional and large numbers */ +enum { + UPROPS_MAX_SMALL_NUMBER=0xff, + + UPROPS_FRACTION_NUM_SHIFT=3, /* numerator: bits 7..3 */ + UPROPS_FRACTION_DEN_MASK=7, /* denominator: bits 2..0 */ + + UPROPS_FRACTION_MAX_NUM=31, + UPROPS_FRACTION_DEN_OFFSET=2, /* denominator values are 2..9 */ + + UPROPS_FRACTION_MIN_DEN=UPROPS_FRACTION_DEN_OFFSET, + UPROPS_FRACTION_MAX_DEN=UPROPS_FRACTION_MIN_DEN+UPROPS_FRACTION_DEN_MASK, + + UPROPS_LARGE_MANT_SHIFT=4, /* mantissa: bits 7..4 */ + UPROPS_LARGE_EXP_MASK=0xf, /* exponent: bits 3..0 */ + UPROPS_LARGE_EXP_OFFSET=2, /* regular exponents 2..17 */ + UPROPS_LARGE_EXP_OFFSET_EXTRA=18, /* extra large exponents 18..33 */ + + UPROPS_LARGE_MIN_EXP=UPROPS_LARGE_EXP_OFFSET, + UPROPS_LARGE_MAX_EXP=UPROPS_LARGE_MIN_EXP+UPROPS_LARGE_EXP_MASK, + UPROPS_LARGE_MAX_EXP_EXTRA=UPROPS_LARGE_EXP_OFFSET_EXTRA+UPROPS_LARGE_EXP_MASK }; /* number of properties vector words */ @@ -129,8 +130,8 @@ enum { */ enum { UPROPS_WHITE_SPACE, - UPROPS_BIDI_CONTROL, - UPROPS_JOIN_CONTROL, + UPROPS_WAS_BIDI_CONTROL, /* reserved, was used in format version 3 */ + UPROPS_WAS_JOIN_CONTROL, UPROPS_DASH, UPROPS_HYPHEN, UPROPS_QUOTATION_MARK, @@ -142,8 +143,8 @@ enum { UPROPS_IDEOGRAPHIC, UPROPS_DIACRITIC, UPROPS_EXTENDER, - UPROPS_LOWERCASE, - UPROPS_UPPERCASE, + UPROPS_WAS_LOWERCASE, /* reserved, was used in format version 3 */ + UPROPS_WAS_UPPERCASE, UPROPS_NONCHARACTER_CODE_POINT, UPROPS_GRAPHEME_EXTEND, UPROPS_GRAPHEME_LINK, @@ -153,7 +154,7 @@ enum { UPROPS_UNIFIED_IDEOGRAPH, UPROPS_DEFAULT_IGNORABLE_CODE_POINT, UPROPS_DEPRECATED, - UPROPS_SOFT_DOTTED, + UPROPS_WAS_SOFT_DOTTED, /* reserved, was used in format version 3 */ UPROPS_LOGICAL_ORDER_EXCEPTION, UPROPS_XID_START, UPROPS_XID_CONTINUE, @@ -167,15 +168,15 @@ enum { * Properties in vector word 2 * Bits * 31..24 More binary properties - * 13..11 Joining Type - * 10.. 5 Joining Group + * 13..11 reserved, was Joining Type in format version 3 + * 10.. 5 reserved, was Joining Group in format version 3 * 4.. 0 Decomposition Type */ -#define UPROPS_JT_MASK 0x00003800 -#define UPROPS_JT_SHIFT 11 +#define UPROPS_WAS_JT_MASK 0x00003800 +#define UPROPS_WAS_JT_SHIFT 11 -#define UPROPS_JG_MASK 0x000007e0 -#define UPROPS_JG_SHIFT 5 +#define UPROPS_WAS_JG_MASK 0x000007e0 +#define UPROPS_WAS_JG_SHIFT 5 #define UPROPS_DT_MASK 0x0000001f diff --git a/icu4c/source/tools/genprops/genprops.c b/icu4c/source/tools/genprops/genprops.c index 3c09e197c6..3cc95246b7 100644 --- a/icu4c/source/tools/genprops/genprops.c +++ b/icu4c/source/tools/genprops/genprops.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2003, International Business Machines +* Copyright (C) 1999-2005, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -24,7 +24,6 @@ #include #include "unicode/utypes.h" #include "unicode/uchar.h" -#include "unicode/uset.h" #include "unicode/putil.h" #include "unicode/uclean.h" #include "cmemory.h" @@ -43,31 +42,13 @@ U_CDECL_END UBool beVerbose=FALSE, haveCopyright=TRUE; -/* - * Unicode set collecting the case-sensitive characters; - * see uchar.h UCHAR_CASE_SENSITIVE. - * Add code points from case mappings/foldings in - * the root locale and with default options. - */ -static USet *caseSensitive; - /* prototypes --------------------------------------------------------------- */ -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode); - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode); - static void parseDB(const char *filename, UErrorCode *pErrorCode); /* -------------------------------------------------------------------------- */ - enum { HELP_H, @@ -174,19 +155,6 @@ main(int argc, char* argv[]) { /* initialize */ initStore(); - caseSensitive=uset_open(1, 0); /* empty set (start>end) */ - - /* process BidiMirroring.txt */ - writeUCDFilename(basename, "BidiMirroring", suffix); - parseBidiMirroring(filename, &errorCode); - - /* process SpecialCasing.txt */ - writeUCDFilename(basename, "SpecialCasing", suffix); - parseSpecialCasing(filename, &errorCode); - - /* process CaseFolding.txt */ - writeUCDFilename(basename, "CaseFolding", suffix); - parseCaseFolding(filename, &errorCode); /* process UnicodeData.txt */ writeUCDFilename(basename, "UnicodeData", suffix); @@ -202,6 +170,7 @@ main(int argc, char* argv[]) { generateData(destDir); } + exitStore(); u_cleanup(); return errorCode; } @@ -270,301 +239,6 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { return -1; } -static void -_set_addAll(USet *set, const UChar *s, int32_t length) { - UChar32 c; - int32_t i; - - /* needs length>=0 */ - for(i=0; i0) { - fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)mirrorMappings[mirrorCount][0], - (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=mirrorMappings[mirrorCount][0]; - - if(++mirrorCount==MAX_MIRROR_COUNT) { - fprintf(stderr, "genprops: too many mirror mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static void -parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode); -} - -/* parser for SpecialCasing.txt --------------------------------------------- */ - -#define MAX_SPECIAL_CASING_COUNT 500 - -static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT]; -static int32_t specialCasingCount=0; - -static void U_CALLCONV -specialCasingLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - - /* get code point */ - specialCasings[specialCasingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* is this a complex mapping? */ - if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { - /* there is some condition text in the fifth field */ - specialCasings[specialCasingCount].isComplex=TRUE; - - /* do not store any actual mappings for this */ - specialCasings[specialCasingCount].lowerCase[0]=0; - specialCasings[specialCasingCount].upperCase[0]=0; - specialCasings[specialCasingCount].titleCase[0]=0; - } else { - /* just set the "complex" flag and get the case mappings */ - specialCasings[specialCasingCount].isComplex=FALSE; - specialCasings[specialCasingCount].lowerCase[0]= - (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].upperCase[0]= - (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].titleCase[0]= - (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); - } - - if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { - fprintf(stderr, "genprops: too many special casing mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static int -compareSpecialCasings(const void *left, const void *right) { - return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; -} - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { - char *fields[5][2]; - int32_t i, j; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); - - /* sort the special casing entries by code point */ - if(specialCasingCount>0) { - qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings); - } - - /* replace multiple entries for any code point by one "complex" one */ - j=0; - for(i=1; i0) { - qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings); - specialCasingCount-=j; - } - - /* - * Add one complex mapping to caseSensitive that was filtered out above: - * Greek final Sigma has a conditional mapping but not locale-sensitive, - * and it is taken when lowercasing just U+03A3 alone. - * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA - */ - uset_add(caseSensitive, 0x3c2); -} - -/* parser for CaseFolding.txt ----------------------------------------------- */ - -#define MAX_CASE_FOLDING_COUNT 2000 - -static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; -static int32_t caseFoldingCount=0; - -static void U_CALLCONV -caseFoldingLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - static uint32_t prevCode=0; - int32_t count; - char status; - - /* get code point */ - caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get the status of this mapping */ - caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); - if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { - fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ - if(status=='L') { - return; - } - - /* get the mapping */ - count=caseFoldings[caseFoldingCount].full[0]= - (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ - if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { - caseFoldings[caseFoldingCount].simple=0; - } - - /* update the case-sensitive set */ - if(status!='T') { - uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); - _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); - } - - /* check the status */ - if(status=='S') { - /* check if there was a full mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='F' - ) { - /* merge the two entries */ - caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; - return; - } - } else if(status=='F') { - /* check if there was a simple mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='S' - ) { - /* merge the two entries */ - uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); - return; - } - } else if(status=='I' || status=='T') { - /* check if there was a default mapping for this code point before (remove it) */ - while(caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code - ) { - prevCode=0; - --caseFoldingCount; - } - /* store only a marker for special handling for cases like dotless i */ - caseFoldings[caseFoldingCount].simple=0; - caseFoldings[caseFoldingCount].full[0]=0; - } - - /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ - if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { - fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)caseFoldings[caseFoldingCount].code, - (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=caseFoldings[caseFoldingCount].code; - - if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { - fprintf(stderr, "genprops: too many case folding mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { - char *fields[3][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); -} - /* parser for UnicodeData.txt ----------------------------------------------- */ /* general categories */ @@ -580,12 +254,6 @@ genCategoryNames[U_CHAR_CATEGORY_COUNT]={ "Pi", "Pf" }; -const char *const -bidiNames[U_CHAR_DIRECTION_COUNT]={ - "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S", - "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN" -}; - const char *const decompositionTypeNames[U_DT_COUNT]={ NULL, @@ -613,7 +281,7 @@ static struct { char name[80]; } unicodeAreas[32]; -static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0; +static int32_t unicodeAreaIndex=0; static void U_CALLCONV unicodeDataLineFn(void *context, @@ -647,17 +315,6 @@ unicodeDataLineFn(void *context, exit(U_PARSE_ERROR); } - /* get BiDi category, field 4 */ - i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]); - if(i>=0) { - p.bidi=(uint8_t)i; - } else { - fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", - fields[4][0], (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - /* get decomposition type, field 5 */ if(fields[5][0]status=='C' && - p.caseFolding->simple==p.lowerCase - ) { - p.caseFolding=NULL; - } - } else { - p.caseFolding=NULL; - } - value=makeProps(&p); if(*fields[1][0]=='<') { @@ -966,41 +549,12 @@ repeatAreaProps() { static void parseDB(const char *filename, UErrorCode *pErrorCode) { - /* default Bidi classes for unassigned code points */ - static const uint32_t defaultBidi[][2]={ /* { limit, class } */ - { 0x0590, U_LEFT_TO_RIGHT }, - { 0x0600, U_RIGHT_TO_LEFT }, - { 0x07C0, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFB1D, U_LEFT_TO_RIGHT }, - { 0xFB50, U_RIGHT_TO_LEFT }, - { 0xFE00, U_RIGHT_TO_LEFT_ARABIC }, - { 0xFE70, U_LEFT_TO_RIGHT }, - { 0xFF00, U_RIGHT_TO_LEFT_ARABIC }, - { 0x110000, U_LEFT_TO_RIGHT } - }; - char *fields[15][2]; - UChar32 start, end; - uint32_t prev; - int32_t i; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } - /* - * Set default Bidi classes for unassigned code points. - * See table 3-7 "Bidirectional Character Types" in UAX #9. - * http://www.unicode.org/reports/tr9/ - */ - prev=0; - for(i=0; ibinariesCount) { /* ignore unrecognized properties */ - addIgnoredProp(s, fields[1][1]); + if(beVerbose) { + addIgnoredProp(s, fields[1][1]); + } return; } if(isToken(bin->binaries[i].propName, s)) { @@ -382,8 +362,10 @@ parseBinariesFile(char *filename, char *basename, const char *suffix, fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); } - for(i=0; iucdFile); + if(beVerbose) { + for(i=0; iucdFile); + } } } @@ -394,6 +376,12 @@ initAdditionalProperties() { pv=upvec_open(UPROPS_VECTOR_WORDS, 20000); } +U_CFUNC void +exitAdditionalProperties() { + utrie_close(trie); + upvec_close(pv); +} + U_CFUNC void generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) { char *basename; @@ -405,9 +393,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr /* add Han numeric types & values */ parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode); - /* set proper bidi class for unassigned code points (Cn) */ - parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode); - parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode); /* @@ -441,10 +426,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr */ parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode); - parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode); - - parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode); - /* * Preset East Asian Width defaults: * @@ -481,7 +462,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr return; } - pvCount=upvec_toTrie(pv, trie, pErrorCode); + pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode); if(U_FAILURE(*pErrorCode)) { fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode)); exit(*pErrorCode); @@ -538,7 +519,7 @@ static void U_CALLCONV numericLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { - Props newProps; + Props newProps={ 0 }; char *s, *end; uint32_t start, limit, value, oldProps32; int32_t oldType; @@ -575,11 +556,14 @@ numericLineFn(void *context, /* try large powers of 10 first, may otherwise overflow strtoul() */ if(0==uprv_strncmp(s, "10000000000", 11)) { /* large powers of 10 are encoded in a special way, see store.c */ - value=0x7fffff00; + uint8_t exp=0; + end=s; while(*(++end)=='0') { - ++value; + ++exp; } + value=1; + newProps.exponent=exp; } else { /* normal number parsing */ value=(uint32_t)uprv_strtoul(s, &end, 10); @@ -599,108 +583,51 @@ numericLineFn(void *context, * specific properties for single characters. */ + /* set the new numeric type and value */ + newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */ + newProps.numericValue=(int32_t)value; /* newly parsed numeric value */ + /* the exponent may have been set above */ + value=makeProps(&newProps); + for(; start -#include #include "unicode/utypes.h" #include "unicode/uchar.h" #include "cmemory.h" #include "cstring.h" -#include "filestrm.h" #include "utrie.h" #include "unicode/udata.h" #include "unewdata.h" @@ -42,7 +40,15 @@ the udata API for loading ICU data. Especially, a UDataInfo structure precedes the actual data. It contains platform properties values and the file format version. -The following is a description of format version 3 . +The following is a description of format version 4 . + +The format changes between version 3 and 4 because the properties related to +case mappings and bidi/shaping are pulled out into separate files +for modularization. +In order to reduce the need for code changes, some of the previous data +structures are omitted, rather than rearranging everything. + +For details see "Changes in format version 4" below. Data contents: @@ -63,6 +69,10 @@ Formally, the file contains the following structures: const int32_t indexes[16] with values i0..i15: + i0 indicates the length of the main trie. + i0..i3 all have the same value in format version 4.0; + the related props32[] and exceptions[] and uchars[] were used in format version 3 + i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings @@ -74,12 +84,14 @@ Formally, the file contains the following structures: i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table i7..i9 reservedIndexes; -- reserved values; 0 for now - i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+) - i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2) + i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+) + i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2) i12..i15 reservedIndexes; -- reserved values; 0 for now PT serialized properties trie, see utrie.h (byte size: 4*(i0-16)) + P, E, and U are not used (empty) in format version 4 + P const uint32_t props32[i1-i0]; E const uint32_t exceptions[i2-i1]; U const UChar uchars[2*(i3-i2)]; @@ -99,14 +111,7 @@ the Unicode code assignment are exploited: The lookup of properties for a given code point is done with a trie lookup, using the UTrie implementation. -The trie lookup result is a 16-bit index in the props32[] table where the -actual 32-bit properties word is stored. This is done to save space. - -(There are thousands of 16-bit entries in the trie data table, but -only a few hundred unique 32-bit properties words. -If the trie data table contained 32-bit words directly, then that would be -larger because the length of the table would be the same as now but the -width would be 32 bits instead of 16. This saves more than 10kB.) +The trie lookup result is a 16-bit properties word. With a given Unicode code point @@ -114,141 +119,51 @@ With a given Unicode code point and 0<=c<0x110000, the lookup is done like this: - uint16_t i; - UTRIE_GET16(c, i); - uint32_t props=p32[i]; + uint16_t props; + UTRIE_GET16(trie, c, props); -For some characters, not all of the properties can be efficiently encoded -using 32 bits. For them, the 32-bit word contains an index into the exceptions[] -array: - - if(props&EXCEPTION_BIT)) { - uint16_t e=(uint16_t)(props>>VALUE_SHIFT); - ... - } - -The exception values are a variable number of uint32_t starting at - - const uint32_t *pe=p32+exceptionsIndex+e; - -The first uint32_t there contains flags about what values actually follow it. -Some of the exception values are UChar32 code points for the case mappings, -others are numeric values etc. - -32-bit properties sets: - -Each 32-bit properties word contains: +Each 16-bit properties word contains: 0.. 4 general category - 5 has exception values - 6..10 BiDi category -11 is mirrored -12..14 numericType: - 0 no numeric value - 1 decimal digit value - 2 digit value - 3 numeric value - ### TODO: type 4 for Han digits & numbers?! -15..19 reserved -20..31 value according to bits 0..5: - if(has exception) { - exception index; - } else switch(general category) { - case Ll: delta to uppercase; -- same as titlecase - case Lu: -delta to lowercase; -- titlecase is same as c - case Lt: -delta to lowercase; -- uppercase is same as c - default: - if(is mirrored) { - delta to mirror; - } else if(numericType!=0) { - numericValue; - } else { - 0; - }; - } + 5.. 7 numeric type + non-digit numbers are stored with multiple types and pseudo-types + in order to facilitate compact encoding: + 0 no numeric value (0) + 1 decimal digit value (0..9) + 2 digit value (0..9) + 3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff + 4 (internal type UPROPS_NT_FRACTION) fraction + 5 (internal type UPROPS_NT_LARGE) large number >0xff + 6..7 reserved -Exception values: + when returning the numeric type from a public API, + internal types must be turned into U_NT_NUMERIC -In the first uint32_t exception word for a code point, -bits -31..16 reserved -15..0 flags that indicate which values follow: + 8..15 numeric value + encoding of fractions and large numbers see below -bit - 0 has uppercase mapping - 1 has lowercase mapping - 2 has titlecase mapping - 3 unused - 4 has numeric value (numerator) - if numericValue=0x7fffff00+x then numericValue=10^x - 5 has denominator value - 6 has a mirror-image Unicode code point - 7 has SpecialCasing.txt entries - 8 has CaseFolding.txt entries +Fractions: + // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) + int32_t num, den; + num=n>>3; // num=0..31 + den=(n&7)+2; // den=2..9 + if(num==0) { + num=-1; // num=-1 or 1..31 + } + double result=(double)num/(double)den; -According to the flags in this word, one or more uint32_t words follow it -in the sequence of the bit flags in the flags word; if a flag is not set, -then the value is missing or 0: - -For the case mappings and the mirror-image Unicode code point, -one uint32_t or UChar32 each is the code point. -If the titlecase mapping is missing, then it is the same as the uppercase mapping. - -For the digit values, bits 31..16 contain the decimal digit value, and -bits 15..0 contain the digit value. A value of -1 indicates that -this value is missing. - -For the numeric/numerator value, an int32_t word contains the value directly, -except for when there is no numerator but a denominator, then the numerator -is implicitly 1. This means: - numerator denominator result - none none none - x none x - none y 1/y - x y x/y - -If the numerator value is 0x7fffff00+x then it is replaced with 10^x. - -For the denominator value, a uint32_t word contains the value directly. - -For special casing mappings, the 32-bit exception word contains: -31 if set, this character has complex, conditional mappings - that are not stored; - otherwise, the mappings are stored according to the following bits -30..24 number of UChars used for mappings -23..16 reserved -15.. 0 UChar offset from the beginning of the UChars array where the - UChars for the special case mappings are stored in the following format: - -Format of special casing UChars: -One UChar value with lengths as follows: -14..10 number of UChars for titlecase mapping - 9.. 5 number of UChars for uppercase mapping - 4.. 0 number of UChars for lowercase mapping - -Followed by the UChars for lowercase, uppercase, titlecase mappings in this order. - -For case folding mappings, the 32-bit exception word contains: -31..24 number of UChars used for the full mapping -23..16 reserved -15.. 0 UChar offset from the beginning of the UChars array where the - UChars for the special case mappings are stored in the following format: - -Format of case folding UChars: -Two UChars contain the simple mapping as follows: - 0, 0 no simple mapping - BMP,0 a simple mapping to a BMP code point - s1, s2 a simple mapping to a supplementary code point stored as two surrogates -This is followed by the UChars for the full case folding mappings. - -Example: -U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase -mapping and a numeric value. -Its exception values would be stored as 3 uint32_t words: - -- flags=0x0a (see above) with combining class 0 -- lowercase mapping 0x2170 -- numeric value=1 +Large numbers: + // n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down) + int32_t m, e; + m=n>>4; // m=0..15 + e=(n&0xf); + if(m==0) { + m=1; // for large powers of 10 + e+=18; // e=18..33 + } else { + e+=2; // e=2..17 + } // m==10..15 are reserved + double result=(double)m*10^e; --- Additional properties (new in format version 2.1) --- @@ -277,6 +192,32 @@ See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT. - i10 also contains U_LB_COUNT and U_EA_COUNT. - i11 contains maxValues2 for vector word 2. +--- Changes in format version 4 --- + +The format changes between version 3 and 4 because the properties related to +case mappings and bidi/shaping are pulled out into separate files +for modularization. +In order to reduce the need for code changes, some of the previous data +structures are omitted, rather than rearranging everything. + +(The change to format version 4 is for ICU 3.4. The last CVS revision of +genprops/store.c for format version 3.2 is 1.48.) + +The main trie's data is significantly simplified: +- The trie's 16-bit data word is used directly instead of as an index + into props32[]. +- The trie uses the default trie folding functions instead of custom ones. +- Numeric values are stored directly in the trie data word, with special + encodings. +- No more exception data (the data that needed it was pulled out, or, in the + case of numeric values, encoded differently). +- No more string data (pulled out - was for case mappings). + +Also, some of the previously used properties vector bits are reserved again. + +The indexes[] values for the omitted structures are still filled in +(indicating zero-length arrays) so that the swapper code remains unchanged. + ----------------------------------------------------------------------------- */ /* UDataInfo cf. udata.h */ @@ -290,46 +231,12 @@ static UDataInfo dataInfo={ 0, { 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */ - { 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ + { 4, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ { 4, 0, 1, 0 } /* dataVersion */ }; -/* definitions of expected data size limits */ -enum { - MAX_PROPS_COUNT=25000, - MAX_UCHAR_COUNT=10000 -}; - static UNewTrie *pTrie=NULL; -/* props32[] contains unique properties words after compacting the array of properties */ -static uint32_t props32[MAX_PROPS_COUNT]; - -/* context pointer for compareProps() - temporarily holds a pointer to the trie data */ -static uint32_t *props; - -/* length of props32[] after compaction */ -static int32_t propsTop; - -/* exceptions values */ -static uint32_t exceptions[UPROPS_MAX_EXCEPTIONS_COUNT+20]; -static uint16_t exceptionsTop=0; - -/* Unicode characters, e.g. for special casing or decomposition */ -static UChar uchars[MAX_UCHAR_COUNT+20]; -static uint32_t ucharsTop=0; - -/* statistics */ -static uint16_t exceptionsCount=0; - -/* prototypes --------------------------------------------------------------- */ - -static int -compareProps(const void *l, const void *r); - -static uint32_t -addUChars(const UChar *s, uint32_t length); - /* -------------------------------------------------------------------------- */ extern void @@ -341,266 +248,106 @@ setUnicodeVersion(const char *v) { extern void initStore() { - pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, 0, 0, TRUE); + pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE); if(pTrie==NULL) { fprintf(stderr, "error: unable to create a UNewTrie\n"); exit(U_MEMORY_ALLOCATION_ERROR); } - uprv_memset(props32, 0, sizeof(props32)); initAdditionalProperties(); } +extern void +exitStore() { + utrie_close(pTrie); + exitAdditionalProperties(); +} + /* store a character's properties ------------------------------------------- */ extern uint32_t makeProps(Props *p) { - uint32_t x; - int32_t value; - uint16_t count; - UBool isNumber; + uint32_t den; + int32_t type, value, exp; - /* - * Simple ideas for reducing the number of bits for one character's - * properties: - * - * Some fields are only used for characters of certain - * general categories: - * - casing fields for letters and others, not for - * numbers & Mn - * + uppercase not for uppercase letters - * + lowercase not for lowercase letters - * + titlecase not for titlecase letters - * - * * most of the time, uppercase=titlecase - * - numeric fields for various digit & other types - * - canonical combining classes for non-spacing marks (Mn) - * * the above is not always true, for all three cases - * - * Using the same bits for alternate fields saves some space. - * - * For the canonical categories, there are only few actually used - * most of the time. - * They can be stored using 5 bits. - * - * In the BiDi categories, the 5 explicit codes are only ever - * assigned 1:1 to 5 well-known code points. Storing only one - * value for all "explicit codes" gets this down to 4 bits. - * Client code then needs to check for this special value - * and replace it by the real one using a 5-element table. - * - * The general categories Mn & Me, non-spacing & enclosing marks, - * are always NSM, and NSM are always of those categories. - * - * Digit values can often be derived from the code point value - * itself in a simple way. - * - */ - - /* count the case mappings and other values competing for the value bit field */ - x=0; - value=0; - count=0; - isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N'); - - if(p->upperCase!=0) { - /* verify that no numbers and no Mn have case mappings */ - if(p->generalCategory==U_LOWERCASE_LETTER) { - value=(int32_t)p->code-(int32_t)p->upperCase; - } else { - x=UPROPS_EXCEPTION_BIT; - } - ++count; - } - if(p->lowerCase!=0) { - /* verify that no numbers and no Mn have case mappings */ - if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) { - value=(int32_t)p->lowerCase-(int32_t)p->code; - } else { - x=UPROPS_EXCEPTION_BIT; - } - ++count; - } - if(p->upperCase!=p->titleCase) { - x=UPROPS_EXCEPTION_BIT; - ++count; - } - if(p->numericType!=0) { + do { /* pseudo-loop to allow break instead of goto */ + /* encode numeric type & value */ + type=p->numericType; value=p->numericValue; - ++count; - } - if(p->denominator!=0) { - x=UPROPS_EXCEPTION_BIT; - ++count; - } - if(p->isMirrored) { - if(p->mirrorMapping!=0) { - value=(int32_t)p->mirrorMapping-(int32_t)p->code; - } - ++count; - } - if(p->specialCasing!=NULL) { - x=UPROPS_EXCEPTION_BIT; - ++count; - } - if(p->caseFolding!=NULL) { - x=UPROPS_EXCEPTION_BIT; - ++count; - } + den=p->denominator; + exp=p->exponent; - /* handle exceptions */ - if(count>1 || x!=0 || valuecode); - */ - } else if(valuecode, (long)value, (long)UPROPS_MIN_VALUE, (long)UPROPS_MAX_VALUE); + if(den!=0) { + /* fraction */ + if( type!=U_NT_NUMERIC || + value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM || + dencode, count); + /* 1 * 10^(18..33) */ + if(value!=1) { + break; + } + value=0; + exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA; } + value=(value<UPROPS_MAX_SMALL_NUMBER) { + /* large value */ + if(type!=U_NT_NUMERIC) { + break; + } + type=UPROPS_NT_LARGE; + + /* split the value into mantissa and exponent, base 10 */ + while((value%10)==0) { + value/=10; + ++exp; + } + if(value>9) { + break; + } + + exp-=UPROPS_LARGE_EXP_OFFSET; + value=(value<generalCategory | + ((uint32_t)type<=UPROPS_MAX_EXCEPTIONS_COUNT) { - fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n", - (int)p->code, (int)value); - exit(U_MEMORY_ALLOCATION_ERROR); - } else { - uint32_t first=0; - uint16_t length=1; - - if(p->upperCase!=0) { - first|=1; - exceptions[value+length++]=p->upperCase; - } - if(p->lowerCase!=0) { - first|=2; - exceptions[value+length++]=p->lowerCase; - } - if(p->upperCase!=p->titleCase) { - first|=4; - if(p->titleCase!=0) { - exceptions[value+length++]=p->titleCase; - } else { - exceptions[value+length++]=p->code; - } - } - if(p->numericType!=0) { - if(p->denominator==0) { - first|=0x10; - exceptions[value+length++]=(uint32_t)p->numericValue; - } else { - if(p->numericValue!=1) { - first|=0x10; - exceptions[value+length++]=(uint32_t)p->numericValue; - } - first|=0x20; - exceptions[value+length++]=p->denominator; - } - } - if(p->isMirrored) { - first|=0x40; - exceptions[value+length++]=p->mirrorMapping; - } - if(p->specialCasing!=NULL) { - first|=0x80; - if(p->specialCasing->isComplex) { - /* complex special casing */ - exceptions[value+length++]=0x80000000; - } else { - /* unconditional special casing */ - UChar u[128]; - uint32_t i; - uint16_t j, entry; - - i=1; - entry=0; - j=p->specialCasing->lowerCase[0]; - if(j>0) { - uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j); - i+=j; - entry=j; - } - j=p->specialCasing->upperCase[0]; - if(j>0) { - uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j); - i+=j; - entry|=j<<5; - } - j=p->specialCasing->titleCase[0]; - if(j>0) { - uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j); - i+=j; - entry|=j<<10; - } - u[0]=entry; - - exceptions[value+length++]=(i<<24)|addUChars(u, i); - } - } - if(p->caseFolding!=NULL) { - first|=0x100; - if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { - /* special case folding, store only a marker */ - exceptions[value+length++]=0; - } else { - /* normal case folding with a simple and a full mapping */ - UChar u[128]; - uint16_t i; - - /* store the simple mapping into the first two UChars */ - i=0; - u[1]=0; - UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple); - - /* store the full mapping after that */ - i=p->caseFolding->full[0]; - if(i>0) { - uprv_memcpy(u+2, p->caseFolding->full+1, 2*i); - } - - exceptions[value+length++]=(i<<24)|addUChars(u, 2+i); - } - } - exceptions[value]=first; - exceptionsTop+=length; - } - } - - /* put together the 32-bit word of encoded properties */ - x|= - (uint32_t)p->generalCategory | - (uint32_t)p->bidi<isMirrored<numericType<numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent); + exit(U_ILLEGAL_ARGUMENT_ERROR); + return 0; } extern void @@ -611,21 +358,6 @@ addProps(uint32_t c, uint32_t x) { } } -extern void -addCaseSensitive(UChar32 first, UChar32 last) { - uint32_t x, cs; - - cs=U_MASK(UPROPS_CASE_SENSITIVE_SHIFT); - while(first<=last) { - x=utrie_get32(pTrie, first, NULL); - if(!utrie_set32(pTrie, first, x|cs)) { - fprintf(stderr, "error: too many entries for the properties trie\n"); - exit(U_BUFFER_OVERFLOW_ERROR); - } - ++first; - } -} - extern uint32_t getProps(uint32_t c) { return utrie_get32(pTrie, (UChar32)c, NULL); @@ -641,125 +373,8 @@ repeatProps(uint32_t first, uint32_t last, uint32_t x) { } } -/* compacting --------------------------------------------------------------- */ - -static void -compactProps(void) { - /* - * At this point, all the propsTop properties are in props[], but they - * are not all unique. - * Now we sort them, reduce them to unique ones in props32[], and - * build an index in stage3[] from the old to the new indexes. - * (The quick sort averages at N*log(N) with N=propsTop. The inverting - * yields linear performance.) - */ - - /* - * We are going to sort only an index table in map[] because we need this - * index table anyway and qsort() does not allow to sort two tables together - * directly. This will thus also reduce the amount of data moved around. - */ - uint32_t x; - int32_t i, oldIndex, newIndex; - - static uint16_t map[MAX_PROPS_COUNT]; - -#if DO_DEBUG_OUT - { - /* debug output */ - uint16_t i1, i2, i3; - uint32_t c; - for(c=0; c<0xffff; c+=307) { - printf("properties(0x%06x)=0x%06x\n", c, getProps(c, &i1, &i2, &i3)); - } - } -#endif - - props=utrie_getData(pTrie, &propsTop); - - /* build the index table */ - for(i=propsTop; i>0;) { - --i; - map[i]=(uint16_t)i; - } - - /* reorder */ - qsort(map, propsTop, 2, compareProps); - - /* - * Now invert the reordered table and compact it in the same step. - * The result will be props32[] having only unique properties words - * and stage3[] having indexes to them. - */ - newIndex=0; - for(i=0; i>2; - indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */ - - offset+=propsTop; - indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */ - - offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */ - indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset; - - /* round up UChar count to 4-alignement */ - ucharsTop=(ucharsTop+1)&~1; - offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */ + indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */ + indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */ + indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */ indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset; if(beVerbose) { printf("trie size in bytes: %5u\n", (int)trieSize); - printf("number of unique properties values: %5u\n", (int)propsTop); - printf("number of code points with exceptions: %5u\n", exceptionsCount); - printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop); - printf("number of UChars for special mappings: %5u\n", (int)ucharsTop); } additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes); @@ -828,9 +429,6 @@ generateData(const char *dataDir) { udata_writeBlock(pData, indexes, sizeof(indexes)); udata_writeBlock(pData, trieBlock, trieSize); - udata_writeBlock(pData, props32, 4*propsTop); - udata_writeBlock(pData, exceptions, 4*exceptionsTop); - udata_writeBlock(pData, uchars, 2*ucharsTop); udata_writeBlock(pData, additionalProps, additionalPropsSize); /* finish up */ @@ -845,25 +443,6 @@ generateData(const char *dataDir) { dataLength, (unsigned long)size); exit(U_INTERNAL_PROGRAM_ERROR); } - - utrie_close(pTrie); -} - -/* helpers ------------------------------------------------------------------ */ - -static uint32_t -addUChars(const UChar *s, uint32_t length) { - uint32_t top=(uint16_t)(ucharsTop+length); - UChar *p; - - if(top>=MAX_UCHAR_COUNT) { - fprintf(stderr, "genprops: out of UChars memory\n"); - exit(U_MEMORY_ALLOCATION_ERROR); - } - p=uchars+ucharsTop; - uprv_memcpy(p, s, 2*length); - ucharsTop=top; - return (uint32_t)(p-uchars); } /*