ICU-3858 remove data for properties that were moved to ucase.icu and ubidi.icu, and simplify remaining structure

X-SVN-Rev: 17074
This commit is contained in:
Markus Scherer 2005-01-02 00:22:48 +00:00
parent 1f69d77027
commit aa6cd66256
6 changed files with 351 additions and 1454 deletions

View File

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 1996-2004, International Business Machines
* Copyright (C) 1996-2005, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
@ -49,8 +49,7 @@ static uint8_t formatVersion[4]={ 0, 0, 0, 0 };
static UVersionInfo dataVersion={ 0, 0, 0, 0 };
static UTrie propsTrie={ 0 }, propsVectorsTrie={ 0 };
static const uint32_t *pData32=NULL, *props32Table=NULL, *exceptionsTable=NULL, *propsVectors=NULL;
static const UChar *ucharsTable=NULL;
static const uint32_t *pData32=NULL, *propsVectors=NULL;
static int32_t countPropsVectors=0, propsVectorsColumns=0;
static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
@ -61,16 +60,6 @@ static int8_t havePropsData=0; /* == 0 -> Data has not been loaded.
/* index values loaded from uprops.dat */
static int32_t indexes[UPROPS_INDEX_COUNT];
/* if bit 15 is set, then the folding offset is in bits 14..0 of the 16-bit trie result */
/*
 * Folding-offset callback installed on the properties tries
 * (assigned to the tries' getFoldingOffset member).
 *
 * @param data a trie result word
 * @return bits 14..0 of data when bit 15 is set, otherwise 0
 */
static int32_t U_CALLCONV
getFoldingPropsOffset(uint32_t data) {
    /* bit 15 is the "has folding offset" marker; the offset itself is the low 15 bits */
    return (data&0x8000)!=0 ? (int32_t)(data&0x7fff) : 0;
}
static UBool U_CALLCONV
isAcceptable(void *context,
const char *type, const char *name,
@ -83,7 +72,7 @@ isAcceptable(void *context,
pInfo->dataFormat[1]==0x50 &&
pInfo->dataFormat[2]==0x72 &&
pInfo->dataFormat[3]==0x6f &&
pInfo->formatVersion[0]==3 &&
pInfo->formatVersion[0]==4 &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
) {
@ -102,9 +91,6 @@ static UBool U_CALLCONV uchar_cleanup(void)
propsData=NULL;
}
pData32=NULL;
props32Table=NULL;
exceptionsTable=NULL;
ucharsTable=NULL;
propsVectors=NULL;
countPropsVectors=0;
dataErrorCode=U_ZERO_ERROR;
@ -139,19 +125,12 @@ _openProps(UCharProps *ucp, UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return;
}
ucp->propsTrie.getFoldingOffset=getFoldingPropsOffset;
/* unserialize the properties vectors trie, if any */
if( p[UPROPS_ADDITIONAL_TRIE_INDEX]!=0 &&
p[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0
) {
/* unserialize the properties vectors trie */
length=(int32_t)(p[UPROPS_ADDITIONAL_VECTORS_INDEX]-p[UPROPS_ADDITIONAL_TRIE_INDEX])*4;
length=utrie_unserialize(&ucp->propsVectorsTrie, (const uint8_t *)(p+p[UPROPS_ADDITIONAL_TRIE_INDEX]), length, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
uprv_memset(&ucp->propsVectorsTrie, 0, sizeof(ucp->propsVectorsTrie));
} else {
ucp->propsVectorsTrie.getFoldingOffset=getFoldingPropsOffset;
}
}
}
@ -190,9 +169,6 @@ uprv_loadPropsData(UErrorCode *pErrorCode) {
/* initialize some variables */
uprv_memcpy(indexes, pData32, sizeof(indexes));
props32Table=pData32+indexes[UPROPS_PROPS32_INDEX];
exceptionsTable=pData32+indexes[UPROPS_EXCEPTIONS_INDEX];
ucharsTable=(const UChar *)(pData32+indexes[UPROPS_EXCEPTIONS_TOP_INDEX]);
/* additional properties */
if(indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]!=0) {
@ -250,7 +226,7 @@ uprops_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==0x50 &&
pInfo->dataFormat[2]==0x72 &&
pInfo->dataFormat[3]==0x6f &&
pInfo->formatVersion[0]==3 &&
(pInfo->formatVersion[0]==3 || pInfo->formatVersion[0]==4) &&
pInfo->formatVersion[2]==UTRIE_SHIFT &&
pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
)) {
@ -360,10 +336,8 @@ uprops_swap(const UDataSwapper *ds,
/* getting a uint32_t properties word from the data */
#define HAVE_DATA (havePropsData>0 || loadPropsData()>0)
#define VALIDATE(c) (((uint32_t)(c))<=0x10ffff && HAVE_DATA)
#define GET_PROPS_UNSAFE(c, result) \
UTRIE_GET16(&propsTrie, c, result); \
(result)=props32Table[(result)]
UTRIE_GET16(&propsTrie, c, result);
#define GET_PROPS(c, result) \
if(HAVE_DATA) { \
GET_PROPS_UNSAFE(c, result); \
@ -371,39 +345,6 @@ uprops_swap(const UDataSwapper *ds,
(result)=0; \
}
/* finding an exception value */
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1UL<<(index)))
/* number of bits in an 8-bit integer value */
#define EXC_GROUP 8
/*
 * Per-byte bit-count lookup table: flagsOffset[b] is the number of 1 bits
 * in the 8-bit value b, for example flagsOffset[0]==0, flagsOffset[3]==2
 * and flagsOffset[255]==8.
 * ADD_EXCEPTION_OFFSET indexes it with a masked group of flag bits
 * to count how many flags are set below a given bit position.
 */
static const uint8_t flagsOffset[256]={
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};
/*
 * Adds to offset the number of 1 bits in flags that lie below bit position
 * index, using the flagsOffset[] per-byte bit-count table.
 * In the callers visible in this file, offset is a pointer to the first
 * exception value, so the result points at the value that belongs to
 * flag bit index.
 *
 * When index>=EXC_GROUP, the low EXC_GROUP bits are counted first and
 * flags/index are then reduced by one group; this reduction is done at most
 * once (there is no loop).
 *
 * Caution:
 * - flags and index are modified, so pass scratch copies if their values
 *   are still needed afterwards.
 * - Every argument is evaluated more than once; do not pass expressions
 *   with side effects.
 * - The expansion is a braced block, not a do { } while(0) statement.
 */
#define ADD_EXCEPTION_OFFSET(flags, index, offset) { \
if((index)>=EXC_GROUP) { \
(offset)+=flagsOffset[(flags)&((1<<EXC_GROUP)-1)]; \
(flags)>>=EXC_GROUP; \
(index)-=EXC_GROUP; \
} \
(offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
}
U_CFUNC UBool
uprv_haveProperties(UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
@ -437,8 +378,7 @@ struct _EnumTypeCallback {
static uint32_t U_CALLCONV
_enumTypeValue(const void *context, uint32_t value) {
/* access the general category from the 32-bit properties, and those from the 16-bit trie value */
return GET_CATEGORY(props32Table[value]);
return GET_CATEGORY(value);
}
static UBool U_CALLCONV
@ -695,75 +635,67 @@ u_isJavaIDPart(UChar32 c) {
U_CAPI int32_t U_EXPORT2
u_charDigitValue(UChar32 c) {
uint32_t props, numericType;
uint32_t props;
GET_PROPS(c, props);
numericType=GET_NUMERIC_TYPE(props);
if(numericType==1) {
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
return GET_SIGNED_VALUE(props);
if(GET_NUMERIC_TYPE(props)==1) {
return GET_NUMERIC_VALUE(props);
} else {
const uint32_t *pe=GET_EXCEPTIONS(props);
uint32_t firstExceptionValue=*pe;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
int i=EXC_NUMERIC_VALUE;
++pe;
ADD_EXCEPTION_OFFSET(firstExceptionValue, i, pe);
return (int32_t)*pe;
}
}
}
return -1;
}
}
U_CAPI double U_EXPORT2
u_getNumericValue(UChar32 c) {
uint32_t props, numericType;
uint32_t props, numericType, numericValue;
GET_PROPS(c, props);
numericType=GET_NUMERIC_TYPE(props);
if(numericType==0 || numericType>=(int32_t)U_NT_COUNT) {
if(numericType==0 || numericType>=UPROPS_NT_COUNT) {
return U_NO_NUMERIC_VALUE;
} else {
if(!PROPS_VALUE_IS_EXCEPTION(props)) {
return GET_SIGNED_VALUE(props);
} else {
const uint32_t *pe;
uint32_t firstExceptionValue;
}
double numValue;
numericValue=GET_NUMERIC_VALUE(props);
if(numericType<U_NT_COUNT) {
/* normal type, the value is stored directly */
return numericValue;
} else if(numericType==UPROPS_NT_FRACTION) {
/* fraction value */
int32_t numerator;
uint32_t denominator;
pe=GET_EXCEPTIONS(props);
firstExceptionValue=*pe++;
numerator=(int32_t)numericValue>>UPROPS_FRACTION_NUM_SHIFT;
denominator=(numericValue&UPROPS_FRACTION_DEN_MASK)+UPROPS_FRACTION_DEN_OFFSET;
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_NUMERIC_VALUE)) {
uint32_t flags=firstExceptionValue;
int i=EXC_NUMERIC_VALUE;
const uint32_t *p=pe;
int32_t numerator;
ADD_EXCEPTION_OFFSET(flags, i, p);
numerator=(int32_t)*p;
/*
* There are special values for huge numbers that are powers of ten.
* genprops/store.c documents:
* if numericValue=0x7fffff00+x then numericValue=10^x
*/
if(numerator<0x7fffff00) {
numValue=(double)numerator;
} else {
numerator&=0xff;
/* 10^x without math.h */
numValue=1.;
while(numerator>=4) {
numValue*=10000.;
numerator-=4;
if(numerator==0) {
numerator=-1;
}
switch(numerator) {
return (double)numerator/(double)denominator;
} else /* numericType==UPROPS_NT_LARGE */ {
/* large value with exponent */
double numValue;
int32_t mant, exp;
mant=(int32_t)numericValue>>UPROPS_LARGE_MANT_SHIFT;
exp=(int32_t)numericValue&UPROPS_LARGE_EXP_MASK;
if(mant==0) {
mant=1;
exp+=UPROPS_LARGE_EXP_OFFSET_EXTRA;
} else if(mant>9) {
return U_NO_NUMERIC_VALUE; /* reserved mantissa value */
} else {
exp+=UPROPS_LARGE_EXP_OFFSET;
}
numValue=mant;
/* multiply by 10^exp without math.h */
while(exp>=4) {
numValue*=10000.;
exp-=4;
}
switch(exp) {
case 3:
numValue*=1000.;
break;
@ -777,32 +709,8 @@ u_getNumericValue(UChar32 c) {
default:
break;
}
}
} else {
numValue=0.;
}
if(HAVE_EXCEPTION_VALUE(firstExceptionValue, EXC_DENOMINATOR_VALUE)) {
uint32_t flags=firstExceptionValue;
int i=EXC_DENOMINATOR_VALUE;
const uint32_t *p=pe;
ADD_EXCEPTION_OFFSET(flags, i, p);
denominator=*p;
} else {
denominator=0;
}
switch(firstExceptionValue&((1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE))) {
case 1UL<<EXC_NUMERIC_VALUE:
return numValue;
case 1UL<<EXC_DENOMINATOR_VALUE:
return (double)1./(double)denominator;
case (1UL<<EXC_NUMERIC_VALUE)|(1UL<<EXC_DENOMINATOR_VALUE):
return numValue/(double)denominator;
case 0: /* none (should not occur with numericType>0) */
default:
return U_NO_NUMERIC_VALUE;
}
}
}
}
@ -866,7 +774,6 @@ u_getUnicodeProperties(UChar32 c, int32_t column) {
GET_PROPS(c, props);
return props;
} else if( !HAVE_DATA || countPropsVectors==0 ||
(uint32_t)c>0x10ffff ||
column<0 || column>=propsVectorsColumns
) {
return 0;
@ -1069,18 +976,6 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
USET_ADD_CP_AND_NEXT(sa, FIGURESP);
USET_ADD_CP_AND_NEXT(sa, NNBSP);
/* add for u_charDigitValue() */
USET_ADD_CP_AND_NEXT(sa, 0x3007);
USET_ADD_CP_AND_NEXT(sa, 0x4e00);
USET_ADD_CP_AND_NEXT(sa, 0x4e8c);
USET_ADD_CP_AND_NEXT(sa, 0x4e09);
USET_ADD_CP_AND_NEXT(sa, 0x56db);
USET_ADD_CP_AND_NEXT(sa, 0x4e94);
USET_ADD_CP_AND_NEXT(sa, 0x516d);
USET_ADD_CP_AND_NEXT(sa, 0x4e03);
USET_ADD_CP_AND_NEXT(sa, 0x516b);
USET_ADD_CP_AND_NEXT(sa, 0x4e5d);
/* add for u_digit() */
sa->add(sa->set, U_a);
sa->add(sa->set, U_z+1);
@ -1096,8 +991,4 @@ uchar_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
/* add for UCHAR_GRAPHEME_BASE and others */
USET_ADD_CP_AND_NEXT(sa, CGJ);
/* add for UCHAR_JOINING_TYPE */
sa->add(sa->set, ZWNJ); /* range ZWNJ..ZWJ */
sa->add(sa->set, ZWJ+1);
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2004, International Business Machines
* Copyright (C) 2002-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -14,7 +14,7 @@
* created by: Markus W. Scherer
*
* Constants for mostly non-core Unicode character properties
* stored in uprops.dat.
* stored in uprops.icu.
*/
#ifndef __UPROPS_H__
@ -23,7 +23,6 @@
#include "unicode/utypes.h"
#include "unicode/uset.h"
#include "uset_imp.h"
#include "ucase.h"
#include "udataswp.h"
/* indexes[] entries */
@ -49,42 +48,44 @@ enum {
/* definitions for the main properties words */
enum {
/* general category shift==0 0 (5 bits) */
UPROPS_EXCEPTION_SHIFT=5, /* 5 (1 bit) */
UPROPS_BIDI_SHIFT, /* 6 (5 bits) */
UPROPS_MIRROR_SHIFT=UPROPS_BIDI_SHIFT+5, /* 11 (1 bit) */
UPROPS_NUMERIC_TYPE_SHIFT, /* 12 (3 bits) */
UPROPS_CASE_SENSITIVE_SHIFT=UPROPS_NUMERIC_TYPE_SHIFT+3,/* 15 (1 bit) format version 3.2 */
UPROPS_RESERVED_SHIFT, /* 16 (4 bits) */
UPROPS_VALUE_SHIFT=20, /* 20 */
UPROPS_EXCEPTION_BIT=1UL<<UPROPS_EXCEPTION_SHIFT,
UPROPS_VALUE_BITS=32-UPROPS_VALUE_SHIFT,
UPROPS_MIN_VALUE=-(1L<<(UPROPS_VALUE_BITS-1)),
UPROPS_MAX_VALUE=(1L<<(UPROPS_VALUE_BITS-1))-1,
UPROPS_MAX_EXCEPTIONS_COUNT=1L<<UPROPS_VALUE_BITS
UPROPS_NUMERIC_TYPE_SHIFT=5, /* 5 (3 bits) */
UPROPS_NUMERIC_VALUE_SHIFT=8 /* 8 (8 bits) */
};
#define PROPS_VALUE_IS_EXCEPTION(props) ((props)&UPROPS_EXCEPTION_BIT)
#define GET_CATEGORY(props) ((props)&0x1f)
#define GET_BIDI_CLASS(props) ((props>>UPROPS_BIDI_SHIFT)&0x1f)
#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7)
#define GET_UNSIGNED_VALUE(props) ((props)>>UPROPS_VALUE_SHIFT)
#define GET_SIGNED_VALUE(props) ((int32_t)(props)>>UPROPS_VALUE_SHIFT)
#define GET_EXCEPTIONS(props) (exceptionsTable+GET_UNSIGNED_VALUE(props))
#define CAT_MASK(props) U_MASK(GET_CATEGORY(props))
#define GET_NUMERIC_TYPE(props) (((props)>>UPROPS_NUMERIC_TYPE_SHIFT)&7)
#define GET_NUMERIC_VALUE(props) (((props)>>UPROPS_NUMERIC_VALUE_SHIFT)&0xff)
/* internal numeric pseudo-types for special encodings of numeric values */
enum {
EXC_UPPERCASE,
EXC_LOWERCASE,
EXC_TITLECASE,
EXC_UNUSED,
EXC_NUMERIC_VALUE,
EXC_DENOMINATOR_VALUE,
EXC_MIRROR_MAPPING,
EXC_SPECIAL_CASING,
EXC_CASE_FOLDING
UPROPS_NT_FRACTION=4, /* ==U_NT_COUNT, must not change unless binary format version changes */
UPROPS_NT_LARGE,
UPROPS_NT_COUNT
};
/* encoding of fractional and large numbers */
/*
 * Bit layout constants for the 8-bit numeric value field
 * (the field extracted by GET_NUMERIC_VALUE) when the numeric type is one of
 * the pseudo-types UPROPS_NT_FRACTION or UPROPS_NT_LARGE.
 *
 * Fraction: numerator in bits 7..3, (denominator - offset) in bits 2..0.
 * Large:    mantissa in bits 7..4, (exponent - offset) in bits 3..0.
 */
enum {
/* largest value that fits into the 8-bit numeric value field */
UPROPS_MAX_SMALL_NUMBER=0xff,
UPROPS_FRACTION_NUM_SHIFT=3, /* numerator: bits 7..3 */
UPROPS_FRACTION_DEN_MASK=7, /* denominator: bits 2..0 */
/* largest numerator that fits into the 5 bits 7..3 */
UPROPS_FRACTION_MAX_NUM=31,
UPROPS_FRACTION_DEN_OFFSET=2, /* denominator values are 2..9 */
/* smallest/largest denominator: stored field value 0..7 plus the offset */
UPROPS_FRACTION_MIN_DEN=UPROPS_FRACTION_DEN_OFFSET,
UPROPS_FRACTION_MAX_DEN=UPROPS_FRACTION_MIN_DEN+UPROPS_FRACTION_DEN_MASK,
UPROPS_LARGE_MANT_SHIFT=4, /* mantissa: bits 7..4 */
UPROPS_LARGE_EXP_MASK=0xf, /* exponent: bits 3..0 */
UPROPS_LARGE_EXP_OFFSET=2, /* regular exponents 2..17 */
UPROPS_LARGE_EXP_OFFSET_EXTRA=18, /* extra large exponents 18..33 */
/* exponent ranges: stored field value 0..15 plus the respective offset */
UPROPS_LARGE_MIN_EXP=UPROPS_LARGE_EXP_OFFSET,
UPROPS_LARGE_MAX_EXP=UPROPS_LARGE_MIN_EXP+UPROPS_LARGE_EXP_MASK,
UPROPS_LARGE_MAX_EXP_EXTRA=UPROPS_LARGE_EXP_OFFSET_EXTRA+UPROPS_LARGE_EXP_MASK
};
/* number of properties vector words */
@ -129,8 +130,8 @@ enum {
*/
enum {
UPROPS_WHITE_SPACE,
UPROPS_BIDI_CONTROL,
UPROPS_JOIN_CONTROL,
UPROPS_WAS_BIDI_CONTROL, /* reserved, was used in format version 3 */
UPROPS_WAS_JOIN_CONTROL,
UPROPS_DASH,
UPROPS_HYPHEN,
UPROPS_QUOTATION_MARK,
@ -142,8 +143,8 @@ enum {
UPROPS_IDEOGRAPHIC,
UPROPS_DIACRITIC,
UPROPS_EXTENDER,
UPROPS_LOWERCASE,
UPROPS_UPPERCASE,
UPROPS_WAS_LOWERCASE, /* reserved, was used in format version 3 */
UPROPS_WAS_UPPERCASE,
UPROPS_NONCHARACTER_CODE_POINT,
UPROPS_GRAPHEME_EXTEND,
UPROPS_GRAPHEME_LINK,
@ -153,7 +154,7 @@ enum {
UPROPS_UNIFIED_IDEOGRAPH,
UPROPS_DEFAULT_IGNORABLE_CODE_POINT,
UPROPS_DEPRECATED,
UPROPS_SOFT_DOTTED,
UPROPS_WAS_SOFT_DOTTED, /* reserved, was used in format version 3 */
UPROPS_LOGICAL_ORDER_EXCEPTION,
UPROPS_XID_START,
UPROPS_XID_CONTINUE,
@ -167,15 +168,15 @@ enum {
* Properties in vector word 2
* Bits
* 31..24 More binary properties
* 13..11 Joining Type
* 10.. 5 Joining Group
* 13..11 reserved, was Joining Type in format version 3
* 10.. 5 reserved, was Joining Group in format version 3
* 4.. 0 Decomposition Type
*/
#define UPROPS_JT_MASK 0x00003800
#define UPROPS_JT_SHIFT 11
#define UPROPS_WAS_JT_MASK 0x00003800
#define UPROPS_WAS_JT_SHIFT 11
#define UPROPS_JG_MASK 0x000007e0
#define UPROPS_JG_SHIFT 5
#define UPROPS_WAS_JG_MASK 0x000007e0
#define UPROPS_WAS_JG_SHIFT 5
#define UPROPS_DT_MASK 0x0000001f

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2003, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -24,7 +24,6 @@
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uset.h"
#include "unicode/putil.h"
#include "unicode/uclean.h"
#include "cmemory.h"
@ -43,31 +42,13 @@ U_CDECL_END
UBool beVerbose=FALSE, haveCopyright=TRUE;
/*
* Unicode set collecting the case-sensitive characters;
* see uchar.h UCHAR_CASE_SENSITIVE.
* Add code points from case mappings/foldings in
* the root locale and with default options.
*/
static USet *caseSensitive;
/* prototypes --------------------------------------------------------------- */
static void
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode);
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode);
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode);
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
enum
{
HELP_H,
@ -174,19 +155,6 @@ main(int argc, char* argv[]) {
/* initialize */
initStore();
caseSensitive=uset_open(1, 0); /* empty set (start>end) */
/* process BidiMirroring.txt */
writeUCDFilename(basename, "BidiMirroring", suffix);
parseBidiMirroring(filename, &errorCode);
/* process SpecialCasing.txt */
writeUCDFilename(basename, "SpecialCasing", suffix);
parseSpecialCasing(filename, &errorCode);
/* process CaseFolding.txt */
writeUCDFilename(basename, "CaseFolding", suffix);
parseCaseFolding(filename, &errorCode);
/* process UnicodeData.txt */
writeUCDFilename(basename, "UnicodeData", suffix);
@ -202,6 +170,7 @@ main(int argc, char* argv[]) {
generateData(destDir);
}
exitStore();
u_cleanup();
return errorCode;
}
@ -270,301 +239,6 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
return -1;
}
/*
 * Adds every code point of the UTF-16 text s[0..length[ to set.
 * length must be >=0 (a negative length adds nothing).
 */
static void
_set_addAll(USet *set, const UChar *s, int32_t length) {
    int32_t pos=0;
    UChar32 cp;

    while(pos<length) {
        /* U16_NEXT reads one code point and moves pos past it */
        U16_NEXT(s, pos, length, cp);
        uset_add(set, cp);
    }
}
/* parser for BidiMirroring.txt --------------------------------------------- */
#define MAX_MIRROR_COUNT 2000
static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
static int32_t mirrorCount=0;
/*
 * Line callback passed to u_parseDelimitedFile() for BidiMirroring.txt.
 *
 * Parses the two hexadecimal fields of one line into the next free pair of
 * mirrorMappings[] ([0]=code point from field 0, [1]=value from field 1) and
 * increments mirrorCount.
 * Nonzero code points must be strictly ascending from line to line.
 * Any syntax error, ordering error or table overflow prints a message,
 * sets *pErrorCode and terminates the program via exit().
 * context and fieldCount are not used.
 */
static void U_CALLCONV
mirrorLineFn(void *context,
             char *fields[][2], int32_t fieldCount,
             UErrorCode *pErrorCode) {
    /* code point of the previously accepted line, for the ordering check */
    static uint32_t prevCode=0;

    uint32_t *mapping=mirrorMappings[mirrorCount];
    char *end;

    /* field 0 must consist of exactly one hexadecimal number */
    mapping[0]=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(!(end>fields[0][0] && end==fields[0][1])) {
        fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* same for field 1 */
    mapping[1]=(uint32_t)uprv_strtoul(fields[1][0], &end, 16);
    if(!(end>fields[1][0] && end==fields[1][1])) {
        fprintf(stderr, "genprops: syntax error in BidiMirroring.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* a code point of 0 is exempt from the ascending-order requirement */
    if(mapping[0]!=0 && mapping[0]<=prevCode) {
        fprintf(stderr, "genprops: error - BidiMirroring entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)mapping[0],
                (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=mapping[0];

    /* stop as soon as the table is full so that the next line cannot overflow it */
    ++mirrorCount;
    if(mirrorCount==MAX_MIRROR_COUNT) {
        fprintf(stderr, "genprops: too many mirror mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * Parses the semicolon-delimited file filename with 2 fields per line,
 * handing each line to mirrorLineFn.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseBidiMirroring(const char *filename, UErrorCode *pErrorCode) {
    char *fields[2][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 2, mirrorLineFn, NULL, pErrorCode);
    }
}
/* parser for SpecialCasing.txt --------------------------------------------- */
#define MAX_SPECIAL_CASING_COUNT 500
static SpecialCasing specialCasings[MAX_SPECIAL_CASING_COUNT];
static int32_t specialCasingCount=0;
/*
 * Line callback passed to u_parseDelimitedFile() for SpecialCasing.txt.
 *
 * Fills the next free specialCasings[] entry and increments
 * specialCasingCount:
 * - field 0 is the code point (hexadecimal, white space allowed around it)
 * - if field 4 contains condition text, the entry is marked complex and
 *   no mappings are stored
 * - otherwise fields 1, 3 and 2 are parsed into lowerCase, upperCase and
 *   titleCase (element [0] receives the parsed length, the text starts at
 *   [1]), and the code point and all mapping code points are added to the
 *   caseSensitive set
 * Errors print a message and terminate the program via exit().
 * context and fieldCount are not used.
 */
static void U_CALLCONV
specialCasingLineFn(void *context,
                    char *fields[][2], int32_t fieldCount,
                    UErrorCode *pErrorCode) {
    SpecialCasing *entry=specialCasings+specialCasingCount;
    const char *condition;
    char *end;

    /* get code point */
    entry->code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
    end=(char *)u_skipWhitespace(end);
    if(!(end>fields[0][0] && end==fields[0][1])) {
        fprintf(stderr, "genprops: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* the fifth field is empty if it holds nothing, or only a delimiter or comment start */
    condition=u_skipWhitespace(fields[4][0]);
    if(*condition==0 || *condition==';' || *condition=='#') {
        /* unconditional mapping: clear the "complex" flag and get the case mappings */
        entry->isComplex=FALSE;

        entry->lowerCase[0]=
            (UChar)u_parseString(fields[1][0], entry->lowerCase+1, 31, NULL, pErrorCode);
        entry->upperCase[0]=
            (UChar)u_parseString(fields[3][0], entry->upperCase+1, 31, NULL, pErrorCode);
        entry->titleCase[0]=
            (UChar)u_parseString(fields[2][0], entry->titleCase+1, 31, NULL, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
            exit(*pErrorCode);
        }

        /* the code point and everything it maps to are case-sensitive */
        uset_add(caseSensitive, (UChar32)entry->code);
        _set_addAll(caseSensitive, entry->lowerCase+1, entry->lowerCase[0]);
        _set_addAll(caseSensitive, entry->upperCase+1, entry->upperCase[0]);
        _set_addAll(caseSensitive, entry->titleCase+1, entry->titleCase[0]);
    } else {
        /* there is some condition text: mark as complex and do not store any actual mappings */
        entry->isComplex=TRUE;
        entry->lowerCase[0]=0;
        entry->upperCase[0]=0;
        entry->titleCase[0]=0;
    }

    /* stop as soon as the table is full so that the next line cannot overflow it */
    ++specialCasingCount;
    if(specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
        fprintf(stderr, "genprops: too many special casing mappings\n");
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        exit(U_INDEX_OUTOFBOUNDS_ERROR);
    }
}
/*
 * qsort() comparison function: orders SpecialCasing entries by ascending code.
 *
 * The codes are compared explicitly instead of being subtracted:
 * the difference of two uint32_t values does not generally fit into an int
 * (for example when one side is the 0x7fffffff "removed" marker), and its
 * conversion to int is implementation-defined.
 *
 * @return -1, 0 or +1 if left's code is less than, equal to or greater than right's
 */
static int
compareSpecialCasings(const void *left, const void *right) {
    uint32_t leftCode=((const SpecialCasing *)left)->code;
    uint32_t rightCode=((const SpecialCasing *)right)->code;

    return (leftCode>rightCode)-(leftCode<rightCode);
}
/*
 * Parses the semicolon-delimited file filename with 5 fields per line via
 * specialCasingLineFn, then sorts specialCasings[] by code point and
 * collapses multiple entries for one code point into a single "complex"
 * entry without mappings.
 * Finally adds U+03C2 to the caseSensitive set.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
    char *fields[5][2];
    int32_t i, removedCount;

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode);

    /* sort the special casing entries by code point */
    if(specialCasingCount>0) {
        qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
    }

    /* replace multiple entries for any code point by one "complex" one */
    removedCount=0;
    for(i=1; i<specialCasingCount; ++i) {
        SpecialCasing *previous=&specialCasings[i-1];
        SpecialCasing *current=&specialCasings[i];

        if(previous->code!=current->code) {
            continue;
        }

        /* duplicate code point: give the earlier entry a code that sorts to the end */
        previous->code=0x7fffffff;

        /* and turn the later one into a complex entry without mappings */
        current->isComplex=TRUE;
        current->lowerCase[0]=0;
        current->upperCase[0]=0;
        current->titleCase[0]=0;
        ++removedCount;
    }

    /* if some entries were marked for removal, re-sort so that they are at the end, and drop them */
    if(removedCount>0) {
        qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
        specialCasingCount-=removedCount;
    }

    /*
     * Add one complex mapping to caseSensitive that was filtered out above:
     * Greek final Sigma has a conditional mapping but not locale-sensitive,
     * and it is taken when lowercasing just U+03A3 alone.
     * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
     */
    uset_add(caseSensitive, 0x3c2);
}
/* parser for CaseFolding.txt ----------------------------------------------- */
#define MAX_CASE_FOLDING_COUNT 2000
static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT];
static int32_t caseFoldingCount=0;
/*
 * Line callback passed to u_parseDelimitedFile() for CaseFolding.txt.
 *
 * Parses one line (field 0: code point, field 1: status letter,
 * field 2: mapping) into caseFoldings[caseFoldingCount] and normally
 * increments caseFoldingCount.
 * Lines with status 'L' are ignored, an 'S' line directly after an 'F' line
 * (or vice versa) for the same code point is merged into the previous entry,
 * and 'I'/'T' lines replace preceding entries for the same code point with
 * an empty marker entry.
 * Syntax errors, out-of-order code points and table overflow print a message
 * and terminate the program via exit().
 * context and fieldCount are not used.
 */
static void U_CALLCONV
caseFoldingLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
char *end;
/* code point of the previously stored entry, for the ordering check; persists across calls */
static uint32_t prevCode=0;
int32_t count;
char status;
/* get code point */
caseFoldings[caseFoldingCount].code=(uint32_t)uprv_strtoul(u_skipWhitespace(fields[0][0]), &end, 16);
end=(char *)u_skipWhitespace(end);
/* the number, with optional surrounding white space, must fill the whole field */
if(end<=fields[0][0] || end!=fields[0][1]) {
fprintf(stderr, "genprops: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get the status of this mapping */
caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]);
if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') {
fprintf(stderr, "genprops: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */
if(status=='L') {
return;
}
/* get the mapping */
/*
 * full[0] receives the parsed length in UChars, the mapping text starts at full[1];
 * presumably u_parseString() stores the first code point in simple - TODO confirm
 */
count=caseFoldings[caseFoldingCount].full[0]=
(UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, &caseFoldings[caseFoldingCount].simple, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]);
exit(*pErrorCode);
}
/* there is a simple mapping only if there is exactly one code point (count is in UChars) */
if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) {
caseFoldings[caseFoldingCount].simple=0;
}
/* update the case-sensitive set */
/* mappings with status 'T' do not contribute to the set */
if(status!='T') {
uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
_set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
}
/* check the status */
if(status=='S') {
/* check if there was a full mapping for this code point before */
if( caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
caseFoldings[caseFoldingCount-1].status=='F'
) {
/* merge the two entries */
/* the previous entry keeps its full mapping and gains this simple one; no new entry is stored */
caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple;
return;
}
} else if(status=='F') {
/* check if there was a simple mapping for this code point before */
if( caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code &&
caseFoldings[caseFoldingCount-1].status=='S'
) {
/* merge the two entries */
/* the previous entry keeps its simple mapping and gains this full one; no new entry is stored */
uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR);
return;
}
} else if(status=='I' || status=='T') {
/* check if there was a default mapping for this code point before (remove it) */
while(caseFoldingCount>0 &&
caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code
) {
/* reset prevCode so that the ordering check below accepts the re-used code point */
prevCode=0;
--caseFoldingCount;
}
/* store only a marker for special handling for cases like dotless i */
/* NOTE(review): if entries were removed above, the re-used slot keeps the removed entry's status - confirm this is intended */
caseFoldings[caseFoldingCount].simple=0;
caseFoldings[caseFoldingCount].full[0]=0;
}
/* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */
if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) {
fprintf(stderr, "genprops: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n",
(unsigned long)caseFoldings[caseFoldingCount].code,
(unsigned long)prevCode);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
prevCode=caseFoldings[caseFoldingCount].code;
/* stop as soon as the table is full so that the next line cannot overflow it */
if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) {
fprintf(stderr, "genprops: too many case folding mappings\n");
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
exit(U_INDEX_OUTOFBOUNDS_ERROR);
}
}
/*
 * Parses the semicolon-delimited file filename with 3 fields per line,
 * handing each line to caseFoldingLineFn.
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 */
static void
parseCaseFolding(const char *filename, UErrorCode *pErrorCode) {
    char *fields[3][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode);
    }
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/* general categories */
@ -580,12 +254,6 @@ genCategoryNames[U_CHAR_CATEGORY_COUNT]={
"Pi", "Pf"
};
const char *const
bidiNames[U_CHAR_DIRECTION_COUNT]={
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
};
const char *const
decompositionTypeNames[U_DT_COUNT]={
NULL,
@ -613,7 +281,7 @@ static struct {
char name[80];
} unicodeAreas[32];
static int32_t unicodeAreaIndex=0, mirrorIndex=0, specialCasingIndex=0, caseFoldingIndex=0;
static int32_t unicodeAreaIndex=0;
static void U_CALLCONV
unicodeDataLineFn(void *context,
@ -647,17 +315,6 @@ unicodeDataLineFn(void *context,
exit(U_PARSE_ERROR);
}
/* get BiDi category, field 4 */
i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
if(i>=0) {
p.bidi=(uint8_t)i;
} else {
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
fields[4][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get decomposition type, field 5 */
if(fields[5][0]<fields[5][1]) {
/* there is some decomposition */
@ -771,80 +428,6 @@ unicodeDataLineFn(void *context,
}
}
/* get Mirrored flag, field 9 */
if(*fields[9][0]=='Y') {
p.isMirrored=1;
} else if(fields[9][1]-fields[9][0]!=1 || *fields[9][0]!='N') {
fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get uppercase mapping, field 12 */
value=(uint32_t)uprv_strtoul(fields[12][0], &end, 16);
if(end!=fields[12][1]) {
fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value!=0 && value!=p.code) {
p.upperCase=value;
uset_add(caseSensitive, (UChar32)p.code);
uset_add(caseSensitive, (UChar32)value);
}
/* get lowercase value, field 13 */
value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
if(end!=fields[13][1]) {
fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value!=0 && value!=p.code) {
p.lowerCase=value;
uset_add(caseSensitive, (UChar32)p.code);
uset_add(caseSensitive, (UChar32)value);
}
/* get titlecase value, field 14 */
value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
if(end!=fields[14][1]) {
fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n",
(unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(value!=0 && value!=p.code) {
p.titleCase=value;
uset_add(caseSensitive, (UChar32)p.code);
uset_add(caseSensitive, (UChar32)value);
}
/* set additional properties from previously parsed files */
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
p.mirrorMapping=mirrorMappings[mirrorIndex++][1];
}
if(specialCasingIndex<specialCasingCount && p.code==specialCasings[specialCasingIndex].code) {
p.specialCasing=specialCasings+specialCasingIndex++;
} else {
p.specialCasing=NULL;
}
if(caseFoldingIndex<caseFoldingCount && p.code==caseFoldings[caseFoldingIndex].code) {
p.caseFolding=caseFoldings+caseFoldingIndex++;
/* ignore "Common" mappings (simple==full) that map to the same code point as the regular lowercase mapping */
if( p.caseFolding->status=='C' &&
p.caseFolding->simple==p.lowerCase
) {
p.caseFolding=NULL;
}
} else {
p.caseFolding=NULL;
}
value=makeProps(&p);
if(*fields[1][0]=='<') {
@ -966,41 +549,12 @@ repeatAreaProps() {
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
/* default Bidi classes for unassigned code points */
static const uint32_t defaultBidi[][2]={ /* { limit, class } */
{ 0x0590, U_LEFT_TO_RIGHT },
{ 0x0600, U_RIGHT_TO_LEFT },
{ 0x07C0, U_RIGHT_TO_LEFT_ARABIC },
{ 0xFB1D, U_LEFT_TO_RIGHT },
{ 0xFB50, U_RIGHT_TO_LEFT },
{ 0xFE00, U_RIGHT_TO_LEFT_ARABIC },
{ 0xFE70, U_LEFT_TO_RIGHT },
{ 0xFF00, U_RIGHT_TO_LEFT_ARABIC },
{ 0x110000, U_LEFT_TO_RIGHT }
};
char *fields[15][2];
UChar32 start, end;
uint32_t prev;
int32_t i;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
/*
* Set default Bidi classes for unassigned code points.
* See table 3-7 "Bidirectional Character Types" in UAX #9.
* http://www.unicode.org/reports/tr9/
*/
prev=0;
for(i=0; i<LENGTHOF(defaultBidi); ++i) {
if(defaultBidi[i][1]!=0) {
repeatProps(prev, defaultBidi[i][0]-1, defaultBidi[i][1]<<UPROPS_BIDI_SHIFT);
}
prev=defaultBidi[i][0];
}
/* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
unicodeAreas[0].first=0xffffffff;
@ -1016,36 +570,9 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
repeatAreaProps();
/* are all sub-properties consumed? */
if(mirrorIndex<mirrorCount) {
fprintf(stderr, "genprops: error - some code points in BidiMirroring.txt are missing from UnicodeData.txt\n");
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(specialCasingIndex<specialCasingCount) {
fprintf(stderr, "genprops: error - some code points in SpecialCasing.txt are missing from UnicodeData.txt\n");
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(caseFoldingIndex<caseFoldingCount) {
fprintf(stderr, "genprops: error - some code points in CaseFolding.txt are missing from UnicodeData.txt\n");
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
if(U_FAILURE(*pErrorCode)) {
return;
}
for(i=0;
0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
++i
) {
addCaseSensitive(start, end);
}
if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
*pErrorCode=U_ZERO_ERROR;
}
}
/*
@ -1056,4 +583,3 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
* End:
*
*/

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -24,37 +24,17 @@
#define DATA_NAME "uprops"
#define DATA_TYPE "icu"
/*
 * Special casing data for one code point.
 * makeProps() reads each mapping array as: element [0] = number of UChars,
 * followed by that many UChars of the mapping.
 */
typedef struct {
uint32_t code; /* code point that this entry belongs to */
UBool isComplex; /* if set, makeProps() stores only a "complex" marker instead of the mapping strings */
UChar lowerCase[32], upperCase[32], titleCase[32];
} SpecialCasing;
/*
 * Case folding data for one code point.
 */
typedef struct {
uint32_t code, simple; /* code point and its simple case folding mapping */
char status; /* status letter of the mapping; 'C' is treated as a "Common" mapping (simple==full) */
UChar full[32]; /* full mapping: makeProps() reads [0] as the number of UChars, followed by the UChars */
} CaseFolding;
/* character properties */
typedef struct {
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
uint32_t code;
int32_t numericValue; /* see numericType */
uint32_t denominator; /* 0: no value */
uint8_t generalCategory, bidi, isMirrored, numericType;
SpecialCasing *specialCasing;
CaseFolding *caseFolding;
uint8_t generalCategory, numericType, exponent;
} Props;
/* global flags */
extern UBool beVerbose, haveCopyright;
/* name tables */
extern const char *const
bidiNames[];
extern const char *const
genCategoryNames[];
@ -77,6 +57,9 @@ setUnicodeVersion(const char *v);
extern void
initStore(void);
/*
 * Counterpart to initStore(): closes the main properties trie and
 * releases the additional-properties structures.
 * Declared with (void) so that this is a real prototype, like initStore(void).
 */
extern void
exitStore(void);
extern uint32_t
makeProps(Props *p);
@ -89,12 +72,6 @@ getProps(uint32_t c);
extern void
repeatProps(uint32_t first, uint32_t last, uint32_t props);
U_CFUNC uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
extern void
addCaseSensitive(UChar32 first, UChar32 last);
extern void
generateData(const char *dataDir);
@ -102,6 +79,9 @@ generateData(const char *dataDir);
U_CFUNC void
initAdditionalProperties(void);
/*
 * Counterpart to initAdditionalProperties(): closes the trie and the
 * properties vectors used for the additional properties.
 * Declared with (void) so that this is a real prototype, like initAdditionalProperties(void).
 */
U_CFUNC void
exitAdditionalProperties(void);
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);
@ -109,4 +89,3 @@ U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[16]);
#endif

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2004, International Business Machines
* Copyright (C) 2002-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -102,11 +102,6 @@ numericLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
static void U_CALLCONV
bidiClassLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
/* parse files with single enumerated properties ---------------------------- */
struct SingleEnum {
@ -146,18 +141,6 @@ static const SingleEnum eawSingleEnum={
0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
};
/*
 * Parsing description for the joining type property (UCHAR_JOINING_TYPE),
 * read from the UCD file named "DerivedJoiningType".
 * NOTE(review): the SingleEnum layout is not visible here; the last row looks like
 * { properties vector word index, shift, mask } - confirm against the struct definition.
 */
static const SingleEnum jtSingleEnum={
"DerivedJoiningType", "joining type",
UCHAR_JOINING_TYPE,
2, UPROPS_JT_SHIFT, UPROPS_JT_MASK
};
/*
 * Parsing description for the joining group property (UCHAR_JOINING_GROUP),
 * read from the UCD file named "DerivedJoiningGroup".
 * NOTE(review): the SingleEnum layout is not visible here; the last row looks like
 * { properties vector word index, shift, mask } - confirm against the struct definition.
 */
static const SingleEnum jgSingleEnum={
"DerivedJoiningGroup", "joining group",
UCHAR_JOINING_GROUP,
2, UPROPS_JG_SHIFT, UPROPS_JG_MASK
};
static void U_CALLCONV
singleEnumLineFn(void *context,
char *fields[][2], int32_t fieldCount,
@ -246,8 +229,6 @@ typedef struct Binaries Binaries;
static const Binary
propListNames[]={
{ "White_Space", 1, UPROPS_WHITE_SPACE },
{ "Bidi_Control", 1, UPROPS_BIDI_CONTROL },
{ "Join_Control", 1, UPROPS_JOIN_CONTROL },
{ "Dash", 1, UPROPS_DASH },
{ "Hyphen", 1, UPROPS_HYPHEN },
{ "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
@ -264,7 +245,6 @@ propListNames[]={
{ "Radical", 1, UPROPS_RADICAL },
{ "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
{ "Deprecated", 1, UPROPS_DEPRECATED },
{ "Soft_Dotted", 1, UPROPS_SOFT_DOTTED },
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
/* new properties in Unicode 4.0.1 */
@ -285,8 +265,6 @@ derCorePropsNames[]={
/* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
{ "Math", 1, UPROPS_MATH },
{ "Alphabetic", 1, UPROPS_ALPHABETIC },
{ "Lowercase", 1, UPROPS_LOWERCASE },
{ "Uppercase", 1, UPROPS_UPPERCASE },
{ "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
{ "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
@ -340,7 +318,9 @@ binariesLineFn(void *context,
for(i=0;; ++i) {
if(i==bin->binariesCount) {
/* ignore unrecognized properties */
if(beVerbose) {
addIgnoredProp(s, fields[1][1]);
}
return;
}
if(isToken(bin->binaries[i].propName, s)) {
@ -382,9 +362,11 @@ parseBinariesFile(char *filename, char *basename, const char *suffix,
fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
}
if(beVerbose) {
for(i=0; i<ignoredPropsCount; ++i) {
printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
}
}
}
/* -------------------------------------------------------------------------- */
@ -394,6 +376,12 @@ initAdditionalProperties() {
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
}
/*
 * Release the build-time structures for the additional properties:
 * the trie and the properties vectors (pv, opened in initAdditionalProperties()).
 * Takes no arguments; (void) makes the definition a proper prototype.
 */
U_CFUNC void
exitAdditionalProperties(void) {
    utrie_close(trie);
    upvec_close(pv);
}
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
char *basename;
@ -405,9 +393,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
/* add Han numeric types & values */
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
/* set proper bidi class for unassigned code points (Cn) */
parseTwoFieldFile(filename, basename, "DerivedBidiClass", suffix, bidiClassLineFn, pErrorCode);
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
/*
@ -441,10 +426,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
*/
parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
parseSingleEnumFile(filename, basename, suffix, &jtSingleEnum, pErrorCode);
parseSingleEnumFile(filename, basename, suffix, &jgSingleEnum, pErrorCode);
/*
* Preset East Asian Width defaults:
*
@ -481,7 +462,7 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
return;
}
pvCount=upvec_toTrie(pv, trie, pErrorCode);
pvCount=upvec_compact(pv, upvec_compactToTrieHandler, trie, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
exit(*pErrorCode);
@ -538,7 +519,7 @@ static void U_CALLCONV
numericLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
Props newProps;
Props newProps={ 0 };
char *s, *end;
uint32_t start, limit, value, oldProps32;
int32_t oldType;
@ -575,11 +556,14 @@ numericLineFn(void *context,
/* try large powers of 10 first, may otherwise overflow strtoul() */
if(0==uprv_strncmp(s, "10000000000", 11)) {
/* large powers of 10 are encoded in a special way, see store.c */
value=0x7fffff00;
uint8_t exp=0;
end=s;
while(*(++end)=='0') {
++value;
++exp;
}
value=1;
newProps.exponent=exp;
} else {
/* normal number parsing */
value=(uint32_t)uprv_strtoul(s, &end, 10);
@ -599,108 +583,51 @@ numericLineFn(void *context,
* specific properties for single characters.
*/
/* set the new numeric type and value */
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
/* the exponent may have been set above */
value=makeProps(&newProps);
for(; start<limit; ++start) {
oldProps32=getProps(start);
oldType=(int32_t)GET_NUMERIC_TYPE(oldProps32);
if(isFraction) {
if(oldType!=0) {
/* this code point was already listed with its numeric value in UnicodeData.txt */
continue;
}
/*
* Do not set a numeric value for code points that have other
* values or exceptions because the code below is not prepared
* to maintain such values and exceptions.
*
* Check store.c (e.g., file format description and makeProps())
* for details of what code points get their value field interpreted.
* For example, case mappings for Ll/Lt/Lu and mirror mappings for mirrored characters.
*
* For simplicity, and because we only expect to set numeric values for Han characters,
* for now we only allow to set these values for Lo characters.
*/
if(GET_UNSIGNED_VALUE(oldProps32)!=0 || PROPS_VALUE_IS_EXCEPTION(oldProps32) || GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
fprintf(stderr, "genprops error: new numeric value for a character with some other value in DerivedNumericValues.txt at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
if(isFraction) {
} else {
fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
exit(U_PARSE_ERROR);
}
}
/*
* For simplicity, and because we only expect to set numeric values for Han characters,
* for now we only allow to set these values for Lo characters.
*/
if(oldType==0 && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
/* verify that we do not change an existing value (fractions were excluded above) */
if(oldType!=0) {
/* the code point already has a value stored */
if((oldProps32&0xff00)!=(value&0xff00)) {
fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
exit(U_PARSE_ERROR);
}
/* same value, continue */
} else {
/* the code point is getting a new numeric value */
if(beVerbose) {
printf("adding U+%04x numeric type %d value %u\n", (int)start, U_NT_NUMERIC, (int)value);
printf("adding U+%04x numeric type %d value 0x%04x from %s\n", (int)start, U_NT_NUMERIC, (int)value, fields[0][0]);
}
/* reconstruct the properties and set the new numeric type and value */
uprv_memset(&newProps, 0, sizeof(newProps));
newProps.code=start;
newProps.generalCategory=(uint8_t)GET_CATEGORY(oldProps32);
newProps.bidi=(uint8_t)GET_BIDI_CLASS(oldProps32);
newProps.isMirrored=(uint8_t)(oldProps32&(1UL<<UPROPS_MIRROR_SHIFT) ? TRUE : FALSE);
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
addProps(start, makeProps(&newProps));
addProps(start, value|GET_CATEGORY(oldProps32));
}
}
/* DerivedBidiClass.txt ----------------------------------------------------- */
/*
 * Line handler for DerivedBidiClass.txt.
 * Field 0 is a code point or code point range, field 1 is a bidi class name.
 * Code points whose stored bidi class differs from the parsed one get the new class,
 * but only if their general category is 0; any other category is a fatal error.
 * All errors terminate the program via exit().
 */
static void U_CALLCONV
bidiClassLineFn(void *context,
                char *fields[][2], int32_t fieldCount,
                UErrorCode *pErrorCode) {
    char *className;
    uint32_t first, limit, c, bidiClass, word;
    UBool changed=FALSE;

    /* field 0: code point range; turn the inclusive end into an exclusive limit */
    u_parseCodePointRange(fields[0][0], &first, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in DerivedBidiClass.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    ++limit;

    /* field 1: bidi class name */
    className=trimTerminateField(fields[1][0], fields[1][1]);
    bidiClass=u_getPropertyValueEnum(UCHAR_BIDI_CLASS, className);
    if((int32_t)bidiClass<0) {
        fprintf(stderr, "genprops error: unknown bidi class in DerivedBidiClass.txt field 1 at %s\n", className);
        exit(U_PARSE_ERROR);
    }

    for(c=first; c<limit; ++c) {
        word=getProps(c);

        if(GET_BIDI_CLASS(word)!=bidiClass) {
            /* only code points with general category 0 may get a different bidi class */
            if(GET_CATEGORY(word)!=0) {
                /* error if this one contradicts what we parsed from UnicodeData.txt */
                fprintf(stderr, "genprops error: different bidi class in DerivedBidiClass.txt field 1 at %s\n", className);
                exit(U_PARSE_ERROR);
            }

            /* replace the previous bidi class bits with the one from DerivedBidiClass.txt */
            word&=~(0x1f<<UPROPS_BIDI_SHIFT);
            word|=bidiClass<<UPROPS_BIDI_SHIFT;
            addProps(c, word);
            changed=TRUE;
        }
    }

    if(changed && beVerbose) {
        printf("setting U+%04x..U+%04x bidi class %d\n", (int)first, (int)limit-1, (int)bidiClass);
    }
}
@ -712,7 +639,7 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
UErrorCode errorCode;
errorCode=U_ZERO_ERROR;
length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
length=utrie_serialize(trie, p, capacity, NULL, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
exit(errorCode);
@ -737,8 +664,6 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
((int32_t)USCRIPT_CODE_LIMIT-1);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
(((int32_t)U_JT_COUNT-1)<<UPROPS_JT_SHIFT)|
(((int32_t)U_JG_COUNT-1)<<UPROPS_JG_SHIFT)|
((int32_t)U_DT_COUNT-1);
}
@ -751,9 +676,5 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
}
length+=pvCount*4;
if(p!=NULL) {
utrie_close(trie);
upvec_close(pv);
}
return length;
}

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2004, International Business Machines
* Copyright (C) 1999-2005, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -18,12 +18,10 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "utrie.h"
#include "unicode/udata.h"
#include "unewdata.h"
@ -42,7 +40,15 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 3.
The following is a description of format version 4.
The format changes between version 3 and 4 because the properties related to
case mappings and bidi/shaping are pulled out into separate files
for modularization.
In order to reduce the need for code changes, some of the previous data
structures are omitted, rather than rearranging everything.
For details see "Changes in format version 4" below.
Data contents:
@ -63,6 +69,10 @@ Formally, the file contains the following structures:
const int32_t indexes[16] with values i0..i15:
i0 indicates the length of the main trie.
i0..i3 all have the same value in format version 4.0;
the related props32[] and exceptions[] and uchars[] were used in format version 3
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
@ -74,12 +84,14 @@ Formally, the file contains the following structures:
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
i7..i9 reservedIndexes; -- reserved values; 0 for now
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (format version 3.2)
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
i12..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
P, E, and U are not used (empty) in format version 4
P const uint32_t props32[i1-i0];
E const uint32_t exceptions[i2-i1];
U const UChar uchars[2*(i3-i2)];
@ -99,14 +111,7 @@ the Unicode code assignment are exploited:
The lookup of properties for a given code point is done with a trie lookup,
using the UTrie implementation.
The trie lookup result is a 16-bit index in the props32[] table where the
actual 32-bit properties word is stored. This is done to save space.
(There are thousands of 16-bit entries in the trie data table, but
only a few hundred unique 32-bit properties words.
If the trie data table contained 32-bit words directly, then that would be
larger because the length of the table would be the same as now but the
width would be 32 bits instead of 16. This saves more than 10kB.)
The trie lookup result is a 16-bit properties word.
With a given Unicode code point
@ -114,141 +119,51 @@ With a given Unicode code point
and 0<=c<0x110000, the lookup is done like this:
uint16_t i;
UTRIE_GET16(c, i);
uint32_t props=p32[i];
uint16_t props;
UTRIE_GET16(trie, c, props);
For some characters, not all of the properties can be efficiently encoded
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
array:
if(props&EXCEPTION_BIT) {
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
...
}
The exception values are a variable number of uint32_t starting at
const uint32_t *pe=p32+exceptionsIndex+e;
The first uint32_t there contains flags about what values actually follow it.
Some of the exception values are UChar32 code points for the case mappings,
others are numeric values etc.
32-bit properties sets:
Each 32-bit properties word contains:
Each 16-bit properties word contains:
0.. 4 general category
5 has exception values
6..10 BiDi category
11 is mirrored
12..14 numericType:
0 no numeric value
1 decimal digit value
2 digit value
3 numeric value
### TODO: type 4 for Han digits & numbers?!
15..19 reserved
20..31 value according to bits 0..5:
if(has exception) {
exception index;
} else switch(general category) {
case Ll: delta to uppercase; -- same as titlecase
case Lu: -delta to lowercase; -- titlecase is same as c
case Lt: -delta to lowercase; -- uppercase is same as c
default:
if(is mirrored) {
delta to mirror;
} else if(numericType!=0) {
numericValue;
} else {
0;
};
5.. 7 numeric type
non-digit numbers are stored with multiple types and pseudo-types
in order to facilitate compact encoding:
0 no numeric value (0)
1 decimal digit value (0..9)
2 digit value (0..9)
3 (U_NT_NUMERIC) normal non-digit numeric value 0..0xff
4 (internal type UPROPS_NT_FRACTION) fraction
5 (internal type UPROPS_NT_LARGE) large number >0xff
6..7 reserved
when returning the numeric type from a public API,
internal types must be turned into U_NT_NUMERIC
8..15 numeric value
encoding of fractions and large numbers see below
Fractions:
// n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
int32_t num, den;
num=n>>3; // num=0..31
den=(n&7)+2; // den=2..9
if(num==0) {
num=-1; // num=-1 or 1..31
}
double result=(double)num/(double)den;
Exception values:
In the first uint32_t exception word for a code point,
bits
31..16 reserved
15..0 flags that indicate which values follow:
bit
0 has uppercase mapping
1 has lowercase mapping
2 has titlecase mapping
3 unused
4 has numeric value (numerator)
if numericValue=0x7fffff00+x then numericValue=10^x
5 has denominator value
6 has a mirror-image Unicode code point
7 has SpecialCasing.txt entries
8 has CaseFolding.txt entries
According to the flags in this word, one or more uint32_t words follow it
in the sequence of the bit flags in the flags word; if a flag is not set,
then the value is missing or 0:
For the case mappings and the mirror-image Unicode code point,
one uint32_t or UChar32 each is the code point.
If the titlecase mapping is missing, then it is the same as the uppercase mapping.
For the digit values, bits 31..16 contain the decimal digit value, and
bits 15..0 contain the digit value. A value of -1 indicates that
this value is missing.
For the numeric/numerator value, an int32_t word contains the value directly,
except for when there is no numerator but a denominator, then the numerator
is implicitly 1. This means:
numerator denominator result
none none none
x none x
none y 1/y
x y x/y
If the numerator value is 0x7fffff00+x then it is replaced with 10^x.
For the denominator value, a uint32_t word contains the value directly.
For special casing mappings, the 32-bit exception word contains:
31 if set, this character has complex, conditional mappings
that are not stored;
otherwise, the mappings are stored according to the following bits
30..24 number of UChars used for mappings
23..16 reserved
15.. 0 UChar offset from the beginning of the UChars array where the
UChars for the special case mappings are stored in the following format:
Format of special casing UChars:
One UChar value with lengths as follows:
14..10 number of UChars for titlecase mapping
9.. 5 number of UChars for uppercase mapping
4.. 0 number of UChars for lowercase mapping
Followed by the UChars for lowercase, uppercase, titlecase mappings in this order.
For case folding mappings, the 32-bit exception word contains:
31..24 number of UChars used for the full mapping
23..16 reserved
15.. 0 UChar offset from the beginning of the UChars array where the
UChars for the special case mappings are stored in the following format:
Format of case folding UChars:
Two UChars contain the simple mapping as follows:
0, 0 no simple mapping
BMP,0 a simple mapping to a BMP code point
s1, s2 a simple mapping to a supplementary code point stored as two surrogates
This is followed by the UChars for the full case folding mappings.
Example:
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
mapping and a numeric value.
Its exception values would be stored as 3 uint32_t words:
- flags=0x12 (see above: bit 1 for the lowercase mapping, bit 4 for the numeric value) with combining class 0
- lowercase mapping 0x2170
- numeric value=1
Large numbers:
// n is the 8-bit numeric value from bits 8..15 of the trie word (shifted down)
int32_t m, e;
m=n>>4; // m=0..15
e=(n&0xf);
if(m==0) {
m=1; // for large powers of 10
e+=18; // e=18..33
} else {
e+=2; // e=2..17
} // m==10..15 are reserved
double result=(double)m*10^e;
--- Additional properties (new in format version 2.1) ---
@ -277,6 +192,32 @@ See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
- i10 also contains U_LB_COUNT and U_EA_COUNT.
- i11 contains maxValues2 for vector word 2.
--- Changes in format version 4 ---
The format changes between version 3 and 4 because the properties related to
case mappings and bidi/shaping are pulled out into separate files
for modularization.
In order to reduce the need for code changes, some of the previous data
structures are omitted, rather than rearranging everything.
(The change to format version 4 is for ICU 3.4. The last CVS revision of
genprops/store.c for format version 3.2 is 1.48.)
The main trie's data is significantly simplified:
- The trie's 16-bit data word is used directly instead of as an index
into props32[].
- The trie uses the default trie folding functions instead of custom ones.
- Numeric values are stored directly in the trie data word, with special
encodings.
- No more exception data (the data that needed it was pulled out, or, in the
case of numeric values, encoded differently).
- No more string data (pulled out - was for case mappings).
Also, some of the previously used properties vector bits are reserved again.
The indexes[] values for the omitted structures are still filled in
(indicating zero-length arrays) so that the swapper code remains unchanged.
----------------------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
@ -290,46 +231,12 @@ static UDataInfo dataInfo={
0,
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
{ 3, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 4, 0, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 4, 0, 1, 0 } /* dataVersion */
};
/* definitions of expected data size limits */
enum {
MAX_PROPS_COUNT=25000,
MAX_UCHAR_COUNT=10000
};
static UNewTrie *pTrie=NULL;
/* props32[] contains unique properties words after compacting the array of properties */
static uint32_t props32[MAX_PROPS_COUNT];
/* context pointer for compareProps() - temporarily holds a pointer to the trie data */
static uint32_t *props;
/* length of props32[] after compaction */
static int32_t propsTop;
/* exceptions values */
static uint32_t exceptions[UPROPS_MAX_EXCEPTIONS_COUNT+20];
static uint16_t exceptionsTop=0;
/* Unicode characters, e.g. for special casing or decomposition */
static UChar uchars[MAX_UCHAR_COUNT+20];
static uint32_t ucharsTop=0;
/* statistics */
static uint16_t exceptionsCount=0;
/* prototypes --------------------------------------------------------------- */
static int
compareProps(const void *l, const void *r);
static uint32_t
addUChars(const UChar *s, uint32_t length);
/* -------------------------------------------------------------------------- */
extern void
@ -341,266 +248,106 @@ setUnicodeVersion(const char *v) {
extern void
initStore() {
pTrie=utrie_open(NULL, NULL, MAX_PROPS_COUNT, 0, 0, TRUE);
pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
if(pTrie==NULL) {
fprintf(stderr, "error: unable to create a UNewTrie\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
uprv_memset(props32, 0, sizeof(props32));
initAdditionalProperties();
}
/*
 * Counterpart to initStore(): close the main properties trie and
 * release the additional-properties structures.
 * Takes no arguments; (void) makes the definition a proper prototype.
 */
extern void
exitStore(void) {
    utrie_close(pTrie);
    exitAdditionalProperties();
}
/* store a character's properties ------------------------------------------- */
extern uint32_t
makeProps(Props *p) {
uint32_t x;
int32_t value;
uint16_t count;
UBool isNumber;
uint32_t den;
int32_t type, value, exp;
/*
* Simple ideas for reducing the number of bits for one character's
* properties:
*
* Some fields are only used for characters of certain
* general categories:
* - casing fields for letters and others, not for
* numbers & Mn
* + uppercase not for uppercase letters
* + lowercase not for lowercase letters
* + titlecase not for titlecase letters
*
* * most of the time, uppercase=titlecase
* - numeric fields for various digit & other types
* - canonical combining classes for non-spacing marks (Mn)
* * the above is not always true, for all three cases
*
* Using the same bits for alternate fields saves some space.
*
* For the canonical categories, there are only few actually used
* most of the time.
* They can be stored using 5 bits.
*
* In the BiDi categories, the 5 explicit codes are only ever
* assigned 1:1 to 5 well-known code points. Storing only one
* value for all "explicit codes" gets this down to 4 bits.
* Client code then needs to check for this special value
* and replace it by the real one using a 5-element table.
*
* The general categories Mn & Me, non-spacing & enclosing marks,
* are always NSM, and NSM are always of those categories.
*
* Digit values can often be derived from the code point value
* itself in a simple way.
*
*/
/* count the case mappings and other values competing for the value bit field */
x=0;
value=0;
count=0;
isNumber= (UBool)(genCategoryNames[p->generalCategory][0]=='N');
if(p->upperCase!=0) {
/* verify that no numbers and no Mn have case mappings */
if(p->generalCategory==U_LOWERCASE_LETTER) {
value=(int32_t)p->code-(int32_t)p->upperCase;
} else {
x=UPROPS_EXCEPTION_BIT;
}
++count;
}
if(p->lowerCase!=0) {
/* verify that no numbers and no Mn have case mappings */
if(p->generalCategory==U_UPPERCASE_LETTER || p->generalCategory==U_TITLECASE_LETTER) {
value=(int32_t)p->lowerCase-(int32_t)p->code;
} else {
x=UPROPS_EXCEPTION_BIT;
}
++count;
}
if(p->upperCase!=p->titleCase) {
x=UPROPS_EXCEPTION_BIT;
++count;
}
if(p->numericType!=0) {
do { /* pseudo-loop to allow break instead of goto */
/* encode numeric type & value */
type=p->numericType;
value=p->numericValue;
++count;
}
if(p->denominator!=0) {
x=UPROPS_EXCEPTION_BIT;
++count;
}
if(p->isMirrored) {
if(p->mirrorMapping!=0) {
value=(int32_t)p->mirrorMapping-(int32_t)p->code;
}
++count;
}
if(p->specialCasing!=NULL) {
x=UPROPS_EXCEPTION_BIT;
++count;
}
if(p->caseFolding!=NULL) {
x=UPROPS_EXCEPTION_BIT;
++count;
}
den=p->denominator;
exp=p->exponent;
/* handle exceptions */
if(count>1 || x!=0 || value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
/* this code point needs exception values */
if(beVerbose) {
if(x!=0) {
/* do not print - many code points because of SpecialCasing & CaseFolding
printf("*** code 0x%06x needs an exception because it is irregular\n", p->code);
*/
} else if(value<UPROPS_MIN_VALUE || UPROPS_MAX_VALUE<value) {
printf("*** U+%04x needs an exception because its value is out-of-bounds at %ld (not [%ld..%ld]\n",
(int)p->code, (long)value, (long)UPROPS_MIN_VALUE, (long)UPROPS_MAX_VALUE);
if(den!=0) {
/* fraction */
if( type!=U_NT_NUMERIC ||
value<-1 || value==0 || value>UPROPS_FRACTION_MAX_NUM ||
den<UPROPS_FRACTION_MIN_DEN || UPROPS_FRACTION_MAX_DEN<den ||
exp!=0
) {
break;
}
type=UPROPS_NT_FRACTION;
if(value==-1) {
value=0;
}
den-=UPROPS_FRACTION_DEN_OFFSET;
value=(value<<UPROPS_FRACTION_NUM_SHIFT)|den;
} else if(exp!=0) {
/* very large value */
if( type!=U_NT_NUMERIC ||
value<1 || 9<value ||
exp<UPROPS_LARGE_MIN_EXP || UPROPS_LARGE_MAX_EXP_EXTRA<exp
) {
break;
}
type=UPROPS_NT_LARGE;
if(exp<=UPROPS_LARGE_MAX_EXP) {
/* 1..9 * 10^(2..17) */
exp-=UPROPS_LARGE_EXP_OFFSET;
} else {
printf("*** U+%04x needs an exception because it has %u values\n",
(int)p->code, count);
/* 1 * 10^(18..33) */
if(value!=1) {
break;
}
value=0;
exp-=UPROPS_LARGE_EXP_OFFSET_EXTRA;
}
value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
} else if(value>UPROPS_MAX_SMALL_NUMBER) {
/* large value */
if(type!=U_NT_NUMERIC) {
break;
}
type=UPROPS_NT_LARGE;
/* split the value into mantissa and exponent, base 10 */
while((value%10)==0) {
value/=10;
++exp;
}
if(value>9) {
break;
}
++exceptionsCount;
x=UPROPS_EXCEPTION_BIT;
exp-=UPROPS_LARGE_EXP_OFFSET;
value=(value<<UPROPS_LARGE_MANT_SHIFT)|exp;
/* allocate and create exception values */
value=exceptionsTop;
if(value>=UPROPS_MAX_EXCEPTIONS_COUNT) {
fprintf(stderr, "genprops: out of exceptions memory at U+%06x. (%d exceeds allocated space)\n",
(int)p->code, (int)value);
exit(U_MEMORY_ALLOCATION_ERROR);
} else {
uint32_t first=0;
uint16_t length=1;
if(p->upperCase!=0) {
first|=1;
exceptions[value+length++]=p->upperCase;
}
if(p->lowerCase!=0) {
first|=2;
exceptions[value+length++]=p->lowerCase;
}
if(p->upperCase!=p->titleCase) {
first|=4;
if(p->titleCase!=0) {
exceptions[value+length++]=p->titleCase;
} else {
exceptions[value+length++]=p->code;
}
}
if(p->numericType!=0) {
if(p->denominator==0) {
first|=0x10;
exceptions[value+length++]=(uint32_t)p->numericValue;
} else {
if(p->numericValue!=1) {
first|=0x10;
exceptions[value+length++]=(uint32_t)p->numericValue;
}
first|=0x20;
exceptions[value+length++]=p->denominator;
}
}
if(p->isMirrored) {
first|=0x40;
exceptions[value+length++]=p->mirrorMapping;
}
if(p->specialCasing!=NULL) {
first|=0x80;
if(p->specialCasing->isComplex) {
/* complex special casing */
exceptions[value+length++]=0x80000000;
} else {
/* unconditional special casing */
UChar u[128];
uint32_t i;
uint16_t j, entry;
i=1;
entry=0;
j=p->specialCasing->lowerCase[0];
if(j>0) {
uprv_memcpy(u+1, p->specialCasing->lowerCase+1, 2*j);
i+=j;
entry=j;
}
j=p->specialCasing->upperCase[0];
if(j>0) {
uprv_memcpy(u+i, p->specialCasing->upperCase+1, 2*j);
i+=j;
entry|=j<<5;
}
j=p->specialCasing->titleCase[0];
if(j>0) {
uprv_memcpy(u+i, p->specialCasing->titleCase+1, 2*j);
i+=j;
entry|=j<<10;
}
u[0]=entry;
exceptions[value+length++]=(i<<24)|addUChars(u, i);
}
}
if(p->caseFolding!=NULL) {
first|=0x100;
if(p->caseFolding->simple==0 && p->caseFolding->full[0]==0) {
/* special case folding, store only a marker */
exceptions[value+length++]=0;
} else {
/* normal case folding with a simple and a full mapping */
UChar u[128];
uint16_t i;
/* store the simple mapping into the first two UChars */
i=0;
u[1]=0;
UTF_APPEND_CHAR_UNSAFE(u, i, p->caseFolding->simple);
/* store the full mapping after that */
i=p->caseFolding->full[0];
if(i>0) {
uprv_memcpy(u+2, p->caseFolding->full+1, 2*i);
/* } else normal value=0..0xff { */
}
exceptions[value+length++]=(i<<24)|addUChars(u, 2+i);
}
}
exceptions[value]=first;
exceptionsTop+=length;
}
}
/* put together the 32-bit word of encoded properties */
x|=
/* encode the properties */
return
(uint32_t)p->generalCategory |
(uint32_t)p->bidi<<UPROPS_BIDI_SHIFT |
(uint32_t)p->isMirrored<<UPROPS_MIRROR_SHIFT |
(uint32_t)p->numericType<<UPROPS_NUMERIC_TYPE_SHIFT |
(uint32_t)value<<UPROPS_VALUE_SHIFT;
((uint32_t)type<<UPROPS_NUMERIC_TYPE_SHIFT) |
((uint32_t)value<<UPROPS_NUMERIC_VALUE_SHIFT);
} while(0);
return x;
/*
* "Higher-hanging fruit" (not implemented):
*
* For some sets of fields, there are fewer sets of values
* than the product of the numbers of values per field.
* This means that storing one single value for more than
* one field and later looking up both field values in a table
* saves space.
* Examples:
* - general category & BiDi
*
* There are only few common displacements between a code point
* and its case mappings. Store deltas. Store codes for few
* occurring deltas.
*/
fprintf(stderr, "genprops error: unable to encode numeric type & value %d %ld/%lu E%d\n",
(int)p->numericType, (long)p->numericValue, (unsigned long)p->denominator, p->exponent);
exit(U_ILLEGAL_ARGUMENT_ERROR);
return 0;
}
extern void
@ -611,21 +358,6 @@ addProps(uint32_t c, uint32_t x) {
}
}
/*
 * OR the mask U_MASK(UPROPS_CASE_SENSITIVE_SHIFT) into the properties word
 * of every code point in the inclusive range first..last.
 * All other bits of each word are preserved.
 * Prints an error and exits with U_BUFFER_OVERFLOW_ERROR if the trie
 * refuses to store an updated word.
 */
extern void
addCaseSensitive(UChar32 first, UChar32 last) {
    const uint32_t caseSensitiveMask=U_MASK(UPROPS_CASE_SENSITIVE_SHIFT);
    UChar32 c;

    for(c=first; c<=last; ++c) {
        /* read-modify-write of the current word for this code point */
        const uint32_t oldWord=utrie_get32(pTrie, c, NULL);

        if(utrie_set32(pTrie, c, oldWord|caseSensitiveMask)) {
            continue;
        }

        fprintf(stderr, "error: too many entries for the properties trie\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }
}
extern uint32_t
getProps(uint32_t c) {
return utrie_get32(pTrie, (UChar32)c, NULL);
@ -641,125 +373,8 @@ repeatProps(uint32_t first, uint32_t last, uint32_t x) {
}
}
/* compacting --------------------------------------------------------------- */
/*
 * Reduce the properties words in props[] to a table of unique values in
 * props32[] and overwrite every props[] entry with the index of its value
 * in props32[].
 *
 * Reads and updates the file-level props, propsTop and props32[];
 * on return propsTop is the number of unique values.
 */
static void
compactProps(void) {
    /*
     * At this point, all the propsTop properties are in props[], but they
     * are not all unique.
     * Now we sort them, reduce them to unique ones in props32[], and
     * replace each props[] entry with its new index into props32[].
     * (The quick sort averages at N*log(N) with N=propsTop. The inverting
     * yields linear performance.)
     */

    /*
     * We are going to sort only an index table in map[] because we need this
     * index table anyway and qsort() does not allow to sort two tables together
     * directly. This will thus also reduce the amount of data moved around.
     */
    uint32_t x;
    int32_t i, oldIndex, newIndex;
    static uint16_t map[MAX_PROPS_COUNT];

#if DO_DEBUG_OUT
    {
        /* debug output: sample the properties words before compaction */
        uint32_t c;
        for(c=0; c<0xffff; c+=307) {
            /* getProps() takes only the code point */
            printf("properties(0x%06x)=0x%06x\n", (unsigned int)c, (unsigned int)getProps(c));
        }
    }
#endif

    props=utrie_getData(pTrie, &propsTop);

    /* build the index table: map[i]=i for all old indexes */
    for(i=propsTop; i>0;) {
        --i;
        map[i]=(uint16_t)i;
    }

    /* reorder the indexes so that equal properties words become adjacent */
    qsort(map, propsTop, sizeof(map[0]), compareProps);

    /*
     * Now invert the reordered table and compact it in the same step.
     * The result will be props32[] having only unique properties words
     * and props[] having indexes to them.
     */
    newIndex=0;
    for(i=0; i<propsTop;) {
        /* set the first of a possible series of the same properties */
        oldIndex=map[i];
        props32[newIndex]=x=props[oldIndex];
        props[oldIndex]=newIndex;

        /*
         * set the following same properties only in props[];
         * entries at map[i] for not-yet-visited i still hold their
         * original words, so the comparison with x is valid
         */
        while(++i<propsTop && x==props[map[i]]) {
            props[map[i]]=newIndex;
        }

        ++newIndex;
    }

    /* we saved some space */
    if(beVerbose) {
        /* %u requires unsigned arguments */
        printf("compactProps() reduced propsTop from %u to %u\n",
                (unsigned int)propsTop, (unsigned int)newIndex);
    }
    propsTop=newIndex;

#if DO_DEBUG_OUT
    {
        /*
         * debug output
         * NOTE(review): getProps2() is not defined in the visible part of
         * this file; confirm that it still exists with this signature
         * before enabling DO_DEBUG_OUT.
         */
        uint16_t i1, i2, i3, i4;
        uint32_t c;
        for(c=0; c<0xffff; c+=307) {
            printf("properties(0x%06x)=0x%06x\n", c, getProps2(c, &i1, &i2, &i3, &i4));
        }
    }
#endif
}
/*
 * qsort() comparator for compactProps().
 * l and r point to uint16_t indexes into props[]; the referenced properties
 * words are ordered by their low 5 bits first (the general category) and
 * then by the full 32-bit value.
 * Returns <0, 0 or >0 in the usual qsort() manner.
 */
static int
compareProps(const void *l, const void *r) {
    const uint32_t lhs=props[*(const uint16_t *)l];
    const uint32_t rhs=props[*(const uint16_t *)r];

    /* compare general categories first */
    const int categoryDiff=(int)(lhs&0x1f)-(int)(rhs&0x1f);
    if(categoryDiff!=0) {
        return categoryDiff;
    }

    /* same category: fall back to the whole word */
    if(lhs==rhs) {
        return 0;
    }
    return lhs<rhs ? -1 : 1;
}
/* generate output data ----------------------------------------------------- */
/*
 * Folding-value callback handed to utrie_serialize().
 * Scans the 0x400 code points beginning at start; as soon as one of them has
 * a non-zero trie value, returns offset with bit 15 set (offset|0x8000).
 * Returns 0 if every value in the range is 0.
 */
U_CFUNC uint32_t U_EXPORT2
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    const UChar32 rangeLimit=start+0x400;
    UChar32 c=start;
    UBool isZeroBlock;

    while(c<rangeLimit) {
        const uint32_t word=utrie_get32(trie, c, &isZeroBlock);

        if(isZeroBlock) {
            /* utrie_get32() reported block zero: skip one data block length */
            c+=UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }
        if(word!=0) {
            return (uint32_t)(offset|0x8000);
        }
        ++c;
    }

    return 0;
}
extern void
generateData(const char *dataDir) {
static int32_t indexes[UPROPS_INDEX_COUNT]={
@ -777,9 +392,7 @@ generateData(const char *dataDir) {
int32_t trieSize, additionalPropsSize, offset;
long dataLength;
compactProps();
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), getFoldedPropsValue, TRUE, &errorCode);
trieSize=utrie_serialize(pTrie, trieBlock, sizeof(trieBlock), NULL, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize failed: %s (length %ld)\n", u_errorName(errorCode), (long)trieSize);
exit(errorCode);
@ -787,28 +400,16 @@ generateData(const char *dataDir) {
offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
/* round up trie size to 4-alignement */
/* round up trie size to 4-alignment */
trieSize=(trieSize+3)&~3;
offset+=trieSize>>2;
indexes[UPROPS_PROPS32_INDEX]=offset; /* uint32_t offset to props[] */
offset+=propsTop;
indexes[UPROPS_EXCEPTIONS_INDEX]=offset;/* uint32_t offset to exceptions[] */
offset+=exceptionsTop; /* uint32_t offset to the first unit after exceptions[] */
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]=offset;
/* round up the UChar count to an even number (4-byte alignment) */
ucharsTop=(ucharsTop+1)&~1;
offset+=(uint16_t)(ucharsTop/2); /* uint32_t offset to the first unit after uchars[] */
indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
if(beVerbose) {
printf("trie size in bytes: %5u\n", (int)trieSize);
printf("number of unique properties values: %5u\n", (int)propsTop);
printf("number of code points with exceptions: %5u\n", exceptionsCount);
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
printf("number of UChars for special mappings: %5u\n", (int)ucharsTop);
}
additionalPropsSize=writeAdditionalData(additionalProps, sizeof(additionalProps), indexes);
@ -828,9 +429,6 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, trieBlock, trieSize);
udata_writeBlock(pData, props32, 4*propsTop);
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
udata_writeBlock(pData, uchars, 2*ucharsTop);
udata_writeBlock(pData, additionalProps, additionalPropsSize);
/* finish up */
@ -845,25 +443,6 @@ generateData(const char *dataDir) {
dataLength, (unsigned long)size);
exit(U_INTERNAL_PROGRAM_ERROR);
}
utrie_close(pTrie);
}
/* helpers ------------------------------------------------------------------ */
/*
 * Append length UChars from s to the uchars[] pool.
 *
 * Returns the index in uchars[] at which the copied string starts.
 * Prints an error and exits with U_MEMORY_ALLOCATION_ERROR if the new top
 * of the pool would reach MAX_UCHAR_COUNT.
 */
static uint32_t
addUChars(const UChar *s, uint32_t length) {
    /*
     * Compute the new top with full 32-bit precision.
     * Truncating the sum to 16 bits before the capacity check would let a
     * sum >=0x10000 wrap around to a small value, pass the check, and
     * make the copy below write past the end of the pool.
     */
    const uint32_t start=(uint32_t)ucharsTop;
    const uint32_t top=start+length;

    /* top<start catches wrap-around of the 32-bit addition itself */
    if(top<start || top>=MAX_UCHAR_COUNT) {
        fprintf(stderr, "genprops: out of UChars memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    uprv_memcpy(uchars+start, s, 2*length);
    ucharsTop=top;
    return start;
}
/*