From 34e9e8fc9fce3d0a1c06a81eba4a56e0dfa0df7f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 18 Apr 2000 16:56:02 +0000 Subject: [PATCH] ICU-130 32-bit exception values, add mirror mappings, overhaul... X-SVN-Rev: 1141 --- icu4c/source/tools/genprops/genprops.c | 698 ++++++++++++++----------- icu4c/source/tools/genprops/genprops.h | 7 +- icu4c/source/tools/genprops/store.c | 244 ++++++--- 3 files changed, 565 insertions(+), 384 deletions(-) diff --git a/icu4c/source/tools/genprops/genprops.c b/icu4c/source/tools/genprops/genprops.c index a2932171b3..41e262e1d6 100644 --- a/icu4c/source/tools/genprops/genprops.c +++ b/icu4c/source/tools/genprops/genprops.c @@ -24,18 +24,190 @@ #include #include "unicode/utypes.h" #include "unicode/uchar.h" +#include "unicode/putil.h" #include "cmemory.h" #include "cstring.h" -#include "filestrm.h" #include "unicode/udata.h" #include "unewdata.h" +#include "uoptions.h" +#include "uparse.h" #include "genprops.h" -#include "unicode/putil.h" extern bool_t beVerbose=FALSE, haveCopyright=TRUE; -/* general categories */ +/* prototypes --------------------------------------------------------------- */ +static void +init(void); + +static void +parseMirror(const char *filename, UErrorCode *pErrorCode); + +static void +parseDB(const char *filename, UErrorCode *pErrorCode); + +/* -------------------------------------------------------------------------- */ + +static UOption options[]={ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_VERBOSE, + UOPTION_COPYRIGHT, + UOPTION_DESTDIR, + UOPTION_SOURCEDIR, + { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } +}; + +extern int +main(int argc, const char *argv[]) { + char filename[300]; + const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; + char *basename=NULL; + UErrorCode errorCode=U_ZERO_ERROR; + + /* preset then read command line options */ + options[4].value=u_getDataDirectory(); + options[5].value=""; + options[6].value="3.0.0"; + argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); + + /* error handling, printing usage message */ + if(argc<0) { + fprintf(stderr, + "error in command line argument \"%s\"\n", + argv[-argc]); + } + if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + fprintf(stderr, + "usage: %s [-options] [suffix]\n" + "\tread the UnicodeData.txt file and other Unicode properties files and\n" + "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" + "\toptions:\n" + "\t\t-h or -? or --help this usage text\n" + "\t\t-v or --verbose verbose output\n" + "\t\t-c or --copyright include a copyright notice\n" + "\t\t-d or --destdir destination directory, followed by the path\n" + "\t\t-s or --sourcedir source directory, followed by the path\n" + "\t\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" + "\t\tsuffix suffix that is to be appended with a '-'\n" + "\t\t to the source file basenames before opening;\n" + "\t\t 'genprops new' will read UnicodeData-new.txt etc.\n", + argv[0]); + return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; + } + + /* get the options values */ + beVerbose=options[2].doesOccur; + haveCopyright=options[3].doesOccur; + srcDir=options[5].value; + destDir=options[4].value; + + if(argc>=2) { + suffix=argv[1]; + } else { + suffix=NULL; + } + + setUnicodeVersion(options[6].value); + + /* prepare the filename beginning with the source dir */ + uprv_strcpy(filename, srcDir); + basename=filename+uprv_strlen(filename); + if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { + *basename=U_FILE_SEP_CHAR; + } + + /* initialize */ + init(); + initStore(); + + /* process Mirror.txt */ + if(suffix==NULL) { + uprv_strcpy(basename, "Mirror.txt"); + } else { + uprv_strcpy(basename, "Mirror"); + basename[6]='-'; + uprv_strcpy(basename+7, suffix); + uprv_strcat(basename+7, ".txt"); + } + parseMirror(filename, &errorCode); + + /* process UnicodeData.txt */ + if(suffix==NULL) { + uprv_strcpy(basename, "UnicodeData.txt"); + } else { + uprv_strcpy(basename, "UnicodeData"); + basename[11]='-'; + uprv_strcpy(basename+12, suffix); + uprv_strcat(basename+12, ".txt"); + } + parseDB(filename, &errorCode); + + /* process parsed data */ + if(U_SUCCESS(errorCode)) { + repeatProps(); + compactProps(); + compactStage3(); + compactStage2(); + + /* write the properties data file */ + generateData(destDir); + } + + return errorCode; +} + +static void +init(void) { +} + +/* parser for Mirror.txt ---------------------------------------------------- */ + +#define MAX_MIRROR_COUNT 2000 + +static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2]; +static int32_t mirrorCount=0; + +static void +MirrorCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + char *end; + + mirrorMappings[mirrorCount][fieldNr]=uprv_strtoul(start, &end, 16); + if((end-start)<1 || end!=limit) { + fprintf(stderr, "genprops: syntax error in Mirror.txt field %d at %s\n", fieldNr, start); + exit(U_PARSE_ERROR); + } +} + +static void +MirrorFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + if(++mirrorCount==MAX_MIRROR_COUNT) { + fprintf(stderr, "genprops: too many mirror mappings\n"); + exit(U_INDEX_OUTOFBOUNDS_ERROR); + } +} + +static UParseFieldFn *mirrorFields[4]={ + NULL, + MirrorCode, + MirrorCode, + MirrorFinish +}; + +static void +parseMirror(const char *filename, UErrorCode *pErrorCode) { + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', mirrorFields, 2, NULL, pErrorCode); +} + +/* parser for UnicodeData.txt ----------------------------------------------- */ + +#define NO_NUMERIC_VALUE ((uint32_t)15821005) + +/* general categories */ extern const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={ NULL, @@ -55,327 +227,229 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={ "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN" }; -/* prototypes --------------------------------------------------------------- */ +/* control code properties */ +static const struct { + uint32_t code; + uint8_t generalCategory; +} controlProps[]={ + /* TAB */ 0x9, U_SPACE_SEPARATOR, + /* VT */ 0xb, U_SPACE_SEPARATOR, + /* LF */ 0xa, U_PARAGRAPH_SEPARATOR, + /* FF */ 0xc, U_LINE_SEPARATOR, + /* CR */ 0xd, U_PARAGRAPH_SEPARATOR, + /* FS */ 0x1c, U_PARAGRAPH_SEPARATOR, + /* GS */ 0x1d, U_PARAGRAPH_SEPARATOR, + /* RS */ 0x1e, U_PARAGRAPH_SEPARATOR, + /* US */ 0x1f, U_SPACE_SEPARATOR, + /* NL */ 0x85, U_PARAGRAPH_SEPARATOR +}; static void -init(void); +UnicodeDataInit(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; -static void -parseDB(FileStream *in); - -static int16_t -getField(char *line, int16_t start, int16_t limit); - -static void -checkLineIndex(uint32_t code, int16_t limit, int16_t length); - -/* -------------------------------------------------------------------------- */ - -extern int -main(int argc, char *argv[]) { - FileStream *in; - const char *destdir = 0; - char *arg, *filename=NULL; - int i; - - if(argc<=1) { - fprintf(stderr, - "usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n" - "\tread the UnicodeData.txt file and \n" - "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n" - "\toptions:\n" - "\t\t-v[+|-] verbose output\n" - "\t\t-c[+|-] do (not) include a copyright notice\n" - "\t\tfilename absolute path/filename for the\n" - "\t\t\tUnicode database text file (default: standard input)\n", - argv[0]); - } - - for(i=1; inumericValue=NO_NUMERIC_VALUE; } static void -init(void) { -} - -/* parsing ------------------------------------------------------------------ */ - -static void -parseDB(FileStream *in) { - char line[300]; +UnicodeDataCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; char *end; - Props p; - uint32_t value; - int16_t start, limit, length, i; - bool_t hasNumericValue; - while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) { - length=uprv_strlen(line); - - /* remove trailing newline characters */ - while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) { - line[--length]=0; - } - - /* reset the properties */ - uprv_memset(&p, 0, sizeof(p)); - hasNumericValue=FALSE; - - /* get the character code, field 0 */ - p.code=uprv_strtoul(line, &end, 16); - limit=end-line; - if(limit<1 || *end!=';') { - fprintf(stderr, "genprops: syntax error in field 0 at code 0x%lx\n", p.code); - exit(U_PARSE_ERROR); - } - - /* skip character name, field 1 */ - checkLineIndex(p.code, ++limit, length); - limit=getField(line, limit, length); - - /* get general category, field 2 */ - start=limit+1; - checkLineIndex(p.code, start, length); - limit=getField(line, start, length); - line[limit]=0; - for(i=1;;) { - if(uprv_strcmp(line+start, genCategoryNames[i])==0) { - p.generalCategory=(uint8_t)i; - break; - } - if(++i==U_CHAR_CATEGORY_COUNT) { - fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", line+start, p.code); - exit(U_PARSE_ERROR); - } - } - - /* get canonical combining class, field 3 */ - start=limit+1; - checkLineIndex(p.code, start, length); - p.canonicalCombining=(uint8_t)uprv_strtoul(line+start, &end, 10); - limit=end-line; - if(start>=limit || *end!=';') { - fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p.code); - exit(U_PARSE_ERROR); - } - - /* get BiDi category, field 4 */ - start=limit+1; - checkLineIndex(p.code, start, length); - limit=getField(line, start, length); - line[limit]=0; - for(i=0;;) { - if(uprv_strcmp(line+start, bidiNames[i])==0) { - p.bidi=(uint8_t)i; - break; - } - if(++i==U_CHAR_DIRECTION_COUNT) { - fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", line+start, p.code); - exit(U_PARSE_ERROR); - } - } - - /* character decomposition mapping, field 5 */ - /* ### skip for now */ - checkLineIndex(p.code, ++limit, length); - limit=getField(line, limit, length); - - /* decimal digit value, field 6 */ - start=limit+1; - checkLineIndex(p.code, start, length); - value=uprv_strtoul(line+start, &end, 10); - if(*end!=';') { - fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", p.code); - exit(U_PARSE_ERROR); - } - limit=end-line; - if(start0 && *end=='/') { - p.denominator=uprv_strtoul(end+1, &end, 10); - } - if(*end!=';') { - fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", p.code); - exit(U_PARSE_ERROR); - } - limit=end-line; - if(start=length) { - fprintf(stderr, "genprops: too few fields at code 0x%lx\n", code); + /* get the character code, field 0 */ + p->code=uprv_strtoul(start, &end, 16); + if((end-start)<1 || end!=limit) { + fprintf(stderr, "genprops: syntax error in field 0 at %s\n", start); exit(U_PARSE_ERROR); } } +static void +UnicodeDataCategory(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + int i; + char c; + + /* get general category, field 2 */ + c=*limit; + *limit=0; + for(i=1;;) { + if(uprv_strcmp(start, genCategoryNames[i])==0) { + p->generalCategory=(uint8_t)i; + break; + } + if(++i==U_CHAR_CATEGORY_COUNT) { + fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", start, p->code); + exit(U_PARSE_ERROR); + } + } + *limit=c; +} + +static void +UnicodeDataCombining(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + char *end; + + /* get canonical combining class, field 3 */ + p->canonicalCombining=(uint8_t)uprv_strtoul(start, &end, 10); + if(start>=end || end!=limit) { + fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p->code); + exit(U_PARSE_ERROR); + } +} + +static void +UnicodeDataBiDi(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + int i; + char c; + + /* get BiDi category, field 4 */ + c=*limit; + *limit=0; + for(i=0;;) { + if(uprv_strcmp(start, bidiNames[i])==0) { + p->bidi=(uint8_t)i; + break; + } + if(++i==U_CHAR_DIRECTION_COUNT) { + fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", start, p->code); + exit(U_PARSE_ERROR); + } + } + *limit=c; +} + +static void +UnicodeDataNumeric(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + uint32_t value; + char *end; + + /* decimal digit value, field 6 */ + /* digit value, field 7 */ + /* numeric value, field 8 */ + value=uprv_strtoul(start, &end, 10); + if(fieldNr==8 && value>0 && *end=='/') { + /* field 8 may contain a fractional value, get the denominator */ + p->denominator=uprv_strtoul(end+1, &end, 10); + } + if(end!=limit) { + fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", p->code); + exit(U_PARSE_ERROR); + } + if(startnumericValue!=NO_NUMERIC_VALUE && p->numericValue!=value) { + fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p->code); + exit(U_PARSE_ERROR); + } + p->numericValue=value; + } +} + +static void +UnicodeDataMirrored(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + + /* get Mirrored flag, field 9 */ + if(*start=='Y') { + p->isMirrored=1; + } else if(limit-start!=1 || *start!='N') { + fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n", p->code); + exit(U_PARSE_ERROR); + } +} + +static void +UnicodeDataCase(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + Props *p=(Props *)context; + char *end; + uint32_t mapping; + + /* get uppercase mapping, field 12 */ + /* get lowercase mapping, field 13 */ + /* get titlecase mapping, field 14 */ + mapping=uprv_strtoul(start, &end, 16); + if(end!=limit) { + fprintf(stderr, "genprops: syntax error in field %d at code 0x%lx\n", fieldNr, p->code); + exit(U_PARSE_ERROR); + } + switch(fieldNr) { + case 12: + p->upperCase=mapping; + break; + case 13: + p->lowerCase=mapping; + break; + case 14: + p->titleCase=mapping; + break; + } +} + +static void +UnicodeDataFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) { + static int32_t mirrorIndex=0; + Props *p=(Props *)context; + int16_t i; + + if(p->numericValue==NO_NUMERIC_VALUE) { + p->numericValue=0; + } + + /* override properties for some common control characters */ + if(p->generalCategory==U_CONTROL_CHAR) { + for(i=0; icode) { + p->generalCategory=controlProps[i].generalCategory; + } + } + } + + /* set additional properties from previously parsed files */ + if(mirrorIndexcode==mirrorMappings[mirrorIndex][0]) { + p->mirrorMapping=mirrorMappings[mirrorIndex++][1]; + } + + addProps(p); +} + +static UParseFieldFn *unicodeDBFields[17]={ + UnicodeDataInit, + + UnicodeDataCode, + NULL, /* 1: character name */ + UnicodeDataCategory, + UnicodeDataCombining, + UnicodeDataBiDi, + NULL, /* 5: character decomposition mapping */ + UnicodeDataNumeric, + UnicodeDataNumeric, + UnicodeDataNumeric, + UnicodeDataMirrored, + NULL, /* 10: Unicode 1.0 character name */ + NULL, /* 11: comment */ + UnicodeDataCase, + UnicodeDataCase, + UnicodeDataCase, + + UnicodeDataFinish +}; + +static void +parseDB(const char *filename, UErrorCode *pErrorCode) { + Props p; + + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { + return; + } + + u_parseDelimitedFile(filename, ';', unicodeDBFields, 15, &p, pErrorCode); +} + /* * Hey, Emacs, please set the following: * diff --git a/icu4c/source/tools/genprops/genprops.h b/icu4c/source/tools/genprops/genprops.h index b6c3ad2f80..416b67094d 100644 --- a/icu4c/source/tools/genprops/genprops.h +++ b/icu4c/source/tools/genprops/genprops.h @@ -25,8 +25,8 @@ /* character properties */ typedef struct { - uint32_t code, lowerCase, upperCase, titleCase; - uint32_t decomp[16]; + uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping; + /* ### uint32_t decomp[16]; */ uint32_t numericValue, denominator; uint8_t generalCategory, canonicalCombining, bidi, isMirrored; } Props; @@ -42,6 +42,9 @@ extern const char *const genCategoryNames[]; /* prototypes */ +extern void +setUnicodeVersion(const char *v); + extern void initStore(void); diff --git a/icu4c/source/tools/genprops/store.c b/icu4c/source/tools/genprops/store.c index b30d8e0202..57a19be0d1 100644 --- a/icu4c/source/tools/genprops/store.c +++ b/icu4c/source/tools/genprops/store.c @@ -47,7 +47,7 @@ The following is a description of format version 1.0 . Data contents: The contents is a parsed, binary form of several Unicode character -database files, mose prominently UnicodeData.txt. +database files, most prominently UnicodeData.txt. Any Unicode code point from 0 to 0x10ffff can be looked up to get the properties, if any, for that code point. This means that the input @@ -72,7 +72,7 @@ Formally, the file contains the following structures: A1 const uint16_t STAGE_3_BITS(=4); (STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS)) A2 const uint16_t exceptionsIndex; -- 32-bit unit index - A3 const uint16_t ucharsIndex; -- 32-bit unit index + A3 const uint16_t reservedIndex; A4 const uint16_t reservedIndex; A5 const uint16_t reservedIndex; A6 const uint16_t reservedIndex; @@ -84,10 +84,7 @@ Formally, the file contains the following structures: (possible 1*uint16_t for padding to 4-alignment) P const uint32_t props32[variable size]; - E const uint16_t exceptions[variable size]; - (possible 1*uint16_t for padding to 4-alignment) - - U const UChar uchars[variable size]; + E const uint32_t exceptions[variable size]; 3-stage lookup and properties: @@ -124,8 +121,7 @@ arrive at an index into the props32[] table containing the character properties for c. For some characters, not all of the properties can be efficiently encoded using 32 bits. For them, the 32-bit word contains an index into the exceptions[] -array. Some exception entries, in turn, may contain indexes into the uchars[] -array of Unicode strings, especially for non-1:1 case mappings. +array. The first stage consumes the 11 most significant bits of the 21-bit code point and results in an index into the second stage: @@ -142,28 +138,27 @@ specific value, which itself is only an index into the props32[] table: uint16_t i=p16[i3+(c&0xf)]; +Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS +in p16[0..1]. + There is finally the 32-bit encoded set of properties for c: uint32_t props=p32[i]; For some characters, this contains an index into the exceptions array: - if(props&0x20) { - uint16_t e=(uint16_t)(props>>20); + if(props&EXCEPTION_BIT)) { + uint16_t e=(uint16_t)(props>>VALUE_SHIFT); ... } -The exception values are a variable number of uint16_t starting at +The exception values are a variable number of uint32_t starting at - const uint16_t *pe=p16+2*exceptionsIndex+e; + const uint32_t *pe=p32+exceptionsIndex+e; -The first uint16_t there contains flags about what values actually follow it. -Some of those may be indexes for case mappings or similar and point to strings -(zero-terminated) in the uchars[] array: - - ... - uint16_t u=pe[index depends on pe[0]]; - const UChar *pu=(const UChar *)(p32+ucharsIndex)+u; +The first uint32_t there contains flags about what values actually follow it. +Some of the exception values are UChar32 code points for the case mappings, +others are numeric values etc. 32-bit properties sets: @@ -171,9 +166,9 @@ Each 32-bit properties word contains: 0.. 4 general category 5 has exception values - 6.. 9 BiDi category (the 5 explicit codes stored as one) -10 is mirrored -11..19 reserved + 6..10 BiDi category +11 is mirrored +12..19 reserved 20..31 value according to bits 0..5: if(has exception) { exception index; @@ -181,52 +176,82 @@ Each 32-bit properties word contains: case Ll: delta to uppercase; -- same as titlecase case Lu: delta to lowercase; -- titlecase is same as c case Lt: delta to lowercase; -- uppercase is same as c - case Mn: canonical category; + case Mn: combining class; case N*: numeric value; - default: *; + default: + if(is mirrored) { + delta to mirror + } else { + 0 + }; } Exception values: -The first uint16_t word of exception values for a code point contains flags -that indicate which values follow: +In the first uint32_t exception word for a code point, +bits +31..24 reserved +23..16 combining class +15..0 flags that indicate which values follow: +bit 0 has uppercase mapping 1 has lowercase mapping 2 has titlecase mapping - 3 has canonical category - 4 has numeric value (numerator) - 5 has denominator value + 3 has numeric value (numerator) + 4 has denominator value + 5 has a mirror-image Unicode code point -According to the flags in this word, one or more uint16_t words follow it +According to the flags in this word, one or more uint32_t words follow it in the sequence of the bit flags in the flags word; if a flag is not set, then the value is missing or 0: -For the case mappings, one uint16_t word each is an index into uchars[], -pointing to a zero-terminated UChar string for the case mapping. +For the case mappings and the mirror-image Unicode code point, +one uint32_t or UChar32 each is the code point. -For the canonical category, the lower 8 bits of a uint16_t word give the -category value directly. The upper 8 bits are currently reserved. - -For the numeric/numerator value, a uint16_t word contains the value directly, +For the numeric/numerator value, an int32_t word contains the value directly, except for when there is no numerator but a denominator, then the numerator is 1. -For the denominator value, a uint16_t word contains the value directly. +For the denominator value, a uint32_t word contains the value directly. Example: U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase mapping and a numeric value. -Its exception values would be stored as 3 uint16_t words: +Its exception values would be stored as 3 uint32_t words: -- flags=0x12 (see above) -- lowercase index into uchars[] +- flags=0x0a (see above) with combining class 0 +- lowercase mapping 0x2170 - numeric value=1 ----------------------------------------------------------------------------- */ +/* ### finding an exception value */ +#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1<<(index))) + +/* number of bits in an integer value 0..31 */ +static uint8_t flagsOffset[32]={ + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 5 +}; + +#define GET_EXCEPTION_OFFSET(flags, index, offset) { \ + if((index)>=5) { \ + (offset)+=flagsOffset[(flags)&0x1f]; \ + (flags)>>=5; \ + (index)-=5; \ + } \ + (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \ +} + + + + + /* UDataInfo cf. udata.h */ -static const UDataInfo dataInfo={ +static UDataInfo dataInfo={ sizeof(UDataInfo), 0, @@ -262,6 +287,19 @@ enum { MAX_STAGE_2_COUNT=MAX_PROPS_COUNT }; +/* definitions for the properties words */ +enum { + EXCEPTION_SHIFT=5, + BIDI_SHIFT, + MIRROR_SHIFT=BIDI_SHIFT+5, + VALUE_SHIFT=20, + + EXCEPTION_BIT=1UL<code-(int32_t)p->upperCase; } else { - x=1<<5; + x=EXCEPTION_BIT; } ++count; } @@ -403,7 +465,7 @@ addProps(Props *p) { if(!(isMn || isNumber)) { value=(int32_t)p->lowerCase-(int32_t)p->code; } else { - x=1<<5; + x=EXCEPTION_BIT; } ++count; } @@ -412,7 +474,7 @@ addProps(Props *p) { if(!(isMn || isNumber)) { value=(int32_t)p->code-(int32_t)p->titleCase; } else { - x=1<<5; + x=EXCEPTION_BIT; } ++count; } @@ -421,7 +483,7 @@ addProps(Props *p) { if(isMn) { value=p->canonicalCombining; } else { - x=1<<5; + x=EXCEPTION_BIT; } ++count; } @@ -430,7 +492,7 @@ addProps(Props *p) { if(isNumber) { value=p->numericValue; } else { - x=1<<5; + x=EXCEPTION_BIT; } ++count; } @@ -439,9 +501,15 @@ addProps(Props *p) { value=p->denominator; ++count; } + if(p->isMirrored) { + if(p->mirrorMapping!=0) { + value=(int32_t)p->mirrorMapping-(int32_t)p->code; + } + ++count; + } /* handle exceptions */ - if(count>1 || x!=0 || value<-2048 || 20471 || x!=0 || value=4096) { + fprintf(stderr, "genprops: out of exceptions memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } else { + uint32_t first=(uint32_t)p->canonicalCombining<<16; + uint16_t length=1; + + if(p->upperCase!=0) { + first|=1; + exceptions[value+length++]=p->upperCase; + } + if(p->lowerCase!=0) { + first|=2; + exceptions[value+length++]=p->lowerCase; + } + if(p->upperCase!=p->titleCase) { + first|=4; + exceptions[value+length++]=p->titleCase; + } + if(p->denominator==0) { + if(p->numericValue!=0) { + first|=8; + exceptions[value+length++]=p->numericValue; + } + } else { + if(p->numericValue!=1) { + first|=8; + exceptions[value+length++]=p->numericValue; + } + first|=0x10; + exceptions[value+length++]=p->denominator; + } + if(p->isMirrored) { + first|=0x20; + exceptions[value+length++]=p->mirrorMapping; + } + + exceptions[value]=first; + exceptionsTop+=length; + } } /* put together the 32-bit word of encoded properties */ x|= - p->generalCategory | - bidiMap[p->bidi]<<6UL | - p->isMirrored<<10UL | - (uint32_t)value<<20; + (uint32_t)p->generalCategory | + (uint32_t)p->bidi<isMirrored<code, x, &count, &count, &count); @@ -911,17 +1019,15 @@ generateData(const char *dataDir) { } indexes[2]=offset+=propsTop; /* uint32_t offset to exceptions[] */ - indexes[3]=offset+=(exceptionsTop+1)/2; /* uint32_t offset to uchars[], include padding */ - size=4*offset+ucharsTop*U_SIZEOF_UCHAR; /* total size of data */ + size=4*(offset+exceptionsTop); /* total size of data */ if(beVerbose) { printf("number of stage 2 entries: %5u\n", stage2Top); printf("number of stage 3 entries: %5u\n", stage3Top); printf("number of unique properties values: %5u\n", propsTop); printf("number of code points with exceptions: %5u\n", exceptionsCount); - printf("size in bytes of exceptions: %5u\n", 2*exceptionsTop); - printf("size in bytes of Uchars: %5u\n", ucharsTop*U_SIZEOF_UCHAR); + printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop); printf("data size: %6lu\n", size); } @@ -939,9 +1045,7 @@ generateData(const char *dataDir) { udata_writeBlock(pData, stage3, 2*stage3Top); udata_writePadding(pData, (stage2Top+stage3Top)&1); udata_writeBlock(pData, props32, 4*propsTop); - udata_writeBlock(pData, exceptions, 2*exceptionsTop); - udata_writePadding(pData, exceptionsTop&1); - udata_writeBlock(pData, uchars, ucharsTop*U_SIZEOF_UCHAR); + udata_writeBlock(pData, exceptions, 4*exceptionsTop); /* finish up */ dataLength=udata_finish(pData, &errorCode);