ICU-130 32-bit exception values, add mirror mappings, overhaul...

X-SVN-Rev: 1141
This commit is contained in:
Markus Scherer 2000-04-18 16:56:02 +00:00
parent 7af09219eb
commit 34e9e8fc9f
3 changed files with 565 additions and 384 deletions

View File

@ -24,18 +24,190 @@
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "genprops.h"
#include "unicode/putil.h"
/* tool-wide flags; main() overwrites both from the --verbose and --copyright command line options */
extern bool_t beVerbose=FALSE, haveCopyright=TRUE;
/* general categories */
/* prototypes --------------------------------------------------------------- */
static void
init(void);
static void
parseMirror(const char *filename, UErrorCode *pErrorCode);
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
/*
 * Command line options of the tool.
 * main() addresses the entries by their numeric index, so the order here
 * must stay in sync with the options[N] accesses there.
 */
static UOption options[]={
UOPTION_HELP_H, /* 0 */
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
UOPTION_COPYRIGHT, /* 3 */
UOPTION_DESTDIR, /* 4 */
UOPTION_SOURCEDIR, /* 5 */
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 } /* 6: Unicode version */
};
/*
 * Entry point of genprops.
 *
 * Parses the command line, builds the source file names from the source
 * directory and the optional suffix, parses Mirror.txt and UnicodeData.txt,
 * and, if both parsed without error, compacts the collected properties and
 * writes the binary data file into the destination directory.
 *
 * Returns the final UErrorCode as the process exit status
 * (U_ILLEGAL_ARGUMENT_ERROR for command line problems).
 */
extern int
main(int argc, const char *argv[]) {
    char filename[300];
    const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
    char *basename=NULL;
    UErrorCode errorCode=U_ZERO_ERROR;

    /* preset then read command line options */
    options[4].value=u_getDataDirectory();
    options[5].value="";
    options[6].value="3.0.0";
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    }
    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
        fprintf(stderr,
            "usage: %s [-options] [suffix]\n"
            "\tread the UnicodeData.txt file and other Unicode properties files and\n"
            "\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
            "\toptions:\n"
            "\t\t-h or -? or --help this usage text\n"
            "\t\t-v or --verbose verbose output\n"
            "\t\t-c or --copyright include a copyright notice\n"
            "\t\t-d or --destdir destination directory, followed by the path\n"
            "\t\t-s or --sourcedir source directory, followed by the path\n"
            "\t\t-u or --unicode Unicode version, followed by the version like 3.0.0\n"
            "\t\tsuffix suffix that is to be appended with a '-'\n"
            "\t\t to the source file basenames before opening;\n"
            "\t\t 'genprops new' will read UnicodeData-new.txt etc.\n",
            argv[0]);
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    beVerbose=options[2].doesOccur;
    haveCopyright=options[3].doesOccur;
    srcDir=options[5].value;
    destDir=options[4].value;
    if(argc>=2) {
        suffix=argv[1];
    }

    setUnicodeVersion(options[6].value);

    /*
     * The longest name built below is
     * srcDir + separator + "UnicodeData-" + suffix + ".txt" + NUL,
     * i.e. 1+12+4+1=18 bytes besides srcDir and suffix.
     * Refuse input that would overflow the fixed-size buffer.
     */
    if(uprv_strlen(srcDir)+(suffix!=NULL ? uprv_strlen(suffix) : 0)+18>sizeof(filename)) {
        fprintf(stderr, "genprops: source directory and suffix are too long\n");
        return U_ILLEGAL_ARGUMENT_ERROR;
    }

    /* prepare the filename beginning with the source dir */
    uprv_strcpy(filename, srcDir);
    basename=filename+uprv_strlen(filename);
    if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
        /*
         * Append the separator and step past it; without the increment the
         * basename copied below would overwrite the separator again.
         */
        *basename++=U_FILE_SEP_CHAR;
    }

    /* initialize */
    init();
    initStore();

    /* process Mirror.txt */
    if(suffix==NULL) {
        uprv_strcpy(basename, "Mirror.txt");
    } else {
        uprv_strcpy(basename, "Mirror-");
        uprv_strcat(basename, suffix);
        uprv_strcat(basename, ".txt");
    }
    parseMirror(filename, &errorCode);

    /* process UnicodeData.txt */
    if(suffix==NULL) {
        uprv_strcpy(basename, "UnicodeData.txt");
    } else {
        uprv_strcpy(basename, "UnicodeData-");
        uprv_strcat(basename, suffix);
        uprv_strcat(basename, ".txt");
    }
    parseDB(filename, &errorCode);

    /* process parsed data */
    if(U_SUCCESS(errorCode)) {
        repeatProps();
        compactProps();
        compactStage3();
        compactStage2();

        /* write the properties data file */
        generateData(destDir);
    }

    return errorCode;
}
/* Hook for tool-wide initialization; main() calls it before initStore(). It currently does nothing. */
static void
init(void) {
}
/* parser for Mirror.txt ---------------------------------------------------- */
/* capacity of mirrorMappings[], in rows */
#define MAX_MIRROR_COUNT 2000
/* one row per parsed Mirror.txt line: [0] is the code point, [1] the code point it mirrors to */
static uint32_t mirrorMappings[MAX_MIRROR_COUNT][2];
/* number of rows of mirrorMappings[] that have been completed so far */
static int32_t mirrorCount=0;
/*
 * Field callback for Mirror.txt.
 * Converts the hexadecimal text in [start, limit) and stores it in column
 * fieldNr of the mirrorMappings[] row that is currently being filled.
 * Exits with U_PARSE_ERROR if nothing could be converted or if the
 * conversion did not stop exactly at limit.
 */
static void
MirrorCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    char *parseEnd;
    uint32_t code=uprv_strtoul(start, &parseEnd, 16);

    mirrorMappings[mirrorCount][fieldNr]=code;

    if(parseEnd<=start || parseEnd!=limit) {
        fprintf(stderr, "genprops: syntax error in Mirror.txt field %d at %s\n", fieldNr, start);
        exit(U_PARSE_ERROR);
    }
}
/*
 * End-of-line callback for Mirror.txt.
 * Commits the row that the field callbacks just filled by advancing
 * mirrorCount, and exits with U_INDEX_OUTOFBOUNDS_ERROR once the count
 * reaches the table capacity.
 */
static void
MirrorFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    mirrorCount+=1;
    if(mirrorCount!=MAX_MIRROR_COUNT) {
        return;
    }

    fprintf(stderr, "genprops: too many mirror mappings\n");
    exit(U_INDEX_OUTOFBOUNDS_ERROR);
}
/*
 * Callback table handed to u_parseDelimitedFile() for Mirror.txt.
 * NOTE(review): by analogy with unicodeDBFields[], slot 0 looks like the
 * per-line init hook (unused here) and the last slot the per-line finish
 * hook, with the two field handlers in between - confirm against uparse.h.
 */
static UParseFieldFn *mirrorFields[4]={
NULL,
MirrorCode,
MirrorCode,
MirrorFinish
};
/*
 * Parses the semicolon-delimited mirror mappings file into mirrorMappings[].
 * Does nothing if pErrorCode is NULL or already holds a failure code;
 * otherwise errors from the file parser are reported through *pErrorCode.
 */
static void
parseMirror(const char *filename, UErrorCode *pErrorCode) {
    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', mirrorFields, 2, NULL, pErrorCode);
    }
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/*
 * Sentinel kept in Props.numericValue while no numeric field has been parsed
 * for the current line; UnicodeDataFinish() replaces it with 0.
 * NOTE(review): the specific number looks arbitrary - it only must not
 * collide with a numeric value that occurs in the data.
 */
#define NO_NUMERIC_VALUE ((uint32_t)15821005)
/* general categories */
extern const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
NULL,
@ -55,327 +227,229 @@ bidiNames[U_CHAR_DIRECTION_COUNT]={
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
};
/* prototypes --------------------------------------------------------------- */
/* control code properties */
/*
 * General category overrides for some control codes.
 * UnicodeDataFinish() looks a code point up here when its parsed general
 * category is U_CONTROL_CHAR and replaces the category with the one listed.
 */
static const struct {
    uint32_t code;
    uint8_t generalCategory;
} controlProps[]={
    { 0x9,  U_SPACE_SEPARATOR },        /* TAB */
    { 0xb,  U_SPACE_SEPARATOR },        /* VT */
    { 0xa,  U_PARAGRAPH_SEPARATOR },    /* LF */
    { 0xc,  U_LINE_SEPARATOR },         /* FF */
    { 0xd,  U_PARAGRAPH_SEPARATOR },    /* CR */
    { 0x1c, U_PARAGRAPH_SEPARATOR },    /* FS */
    { 0x1d, U_PARAGRAPH_SEPARATOR },    /* GS */
    { 0x1e, U_PARAGRAPH_SEPARATOR },    /* RS */
    { 0x1f, U_SPACE_SEPARATOR },        /* US */
    { 0x85, U_PARAGRAPH_SEPARATOR }     /* NL */
};
static void
init(void);
UnicodeDataInit(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
Props *p=(Props *)context;
static void
parseDB(FileStream *in);
static int16_t
getField(char *line, int16_t start, int16_t limit);
static void
checkLineIndex(uint32_t code, int16_t limit, int16_t length);
/* -------------------------------------------------------------------------- */
extern int
main(int argc, char *argv[]) {
FileStream *in;
const char *destdir = 0;
char *arg, *filename=NULL;
int i;
if(argc<=1) {
fprintf(stderr,
"usage: %s [-1[+|-]] [-v[+|-]] [-c[+|-]] filename\n"
"\tread the UnicodeData.txt file and \n"
"\tcreate a binary file " DATA_NAME "." DATA_TYPE " with the character properties\n"
"\toptions:\n"
"\t\t-v[+|-] verbose output\n"
"\t\t-c[+|-] do (not) include a copyright notice\n"
"\t\tfilename absolute path/filename for the\n"
"\t\t\tUnicode database text file (default: standard input)\n",
argv[0]);
}
for(i=1; i<argc; ++i) {
arg=argv[i];
if(arg[0]=='-') {
switch(arg[1]) {
case 'v':
beVerbose= arg[2]=='+';
break;
case 'c':
haveCopyright= arg[2]=='+';
break;
default:
break;
}
} else {
filename=arg;
}
}
if(filename==NULL) {
in=T_FileStream_stdin();
} else {
in=T_FileStream_open(filename, "r");
if(in==NULL) {
fprintf(stderr, "genprops: unable to open input file %s\n", filename);
exit(U_FILE_ACCESS_ERROR);
}
}
if (!destdir) {
destdir = u_getDataDirectory();
}
init();
initStore();
parseDB(in);
repeatProps();
compactProps();
compactStage3();
compactStage2();
generateData(destdir);
if(in!=T_FileStream_stdin()) {
T_FileStream_close(in);
}
return 0;
/* reset the properties */
uprv_memset(p, 0, sizeof(Props));
p->numericValue=NO_NUMERIC_VALUE;
}
static void
init(void) {
}
/* parsing ------------------------------------------------------------------ */
static void
parseDB(FileStream *in) {
char line[300];
UnicodeDataCode(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
Props *p=(Props *)context;
char *end;
Props p;
uint32_t value;
int16_t start, limit, length, i;
bool_t hasNumericValue;
while(T_FileStream_readLine(in, line, sizeof(line))!=NULL) {
length=uprv_strlen(line);
/* remove trailing newline characters */
while(length>0 && (line[length-1]=='\r' || line[length-1]=='\n')) {
line[--length]=0;
}
/* reset the properties */
uprv_memset(&p, 0, sizeof(p));
hasNumericValue=FALSE;
/* get the character code, field 0 */
p.code=uprv_strtoul(line, &end, 16);
limit=end-line;
if(limit<1 || *end!=';') {
fprintf(stderr, "genprops: syntax error in field 0 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
/* skip character name, field 1 */
checkLineIndex(p.code, ++limit, length);
limit=getField(line, limit, length);
/* get general category, field 2 */
start=limit+1;
checkLineIndex(p.code, start, length);
limit=getField(line, start, length);
line[limit]=0;
for(i=1;;) {
if(uprv_strcmp(line+start, genCategoryNames[i])==0) {
p.generalCategory=(uint8_t)i;
break;
}
if(++i==U_CHAR_CATEGORY_COUNT) {
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", line+start, p.code);
exit(U_PARSE_ERROR);
}
}
/* get canonical combining class, field 3 */
start=limit+1;
checkLineIndex(p.code, start, length);
p.canonicalCombining=(uint8_t)uprv_strtoul(line+start, &end, 10);
limit=end-line;
if(start>=limit || *end!=';') {
fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
/* get BiDi category, field 4 */
start=limit+1;
checkLineIndex(p.code, start, length);
limit=getField(line, start, length);
line[limit]=0;
for(i=0;;) {
if(uprv_strcmp(line+start, bidiNames[i])==0) {
p.bidi=(uint8_t)i;
break;
}
if(++i==U_CHAR_DIRECTION_COUNT) {
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", line+start, p.code);
exit(U_PARSE_ERROR);
}
}
/* character decomposition mapping, field 5 */
/* ### skip for now */
checkLineIndex(p.code, ++limit, length);
limit=getField(line, limit, length);
/* decimal digit value, field 6 */
start=limit+1;
checkLineIndex(p.code, start, length);
value=uprv_strtoul(line+start, &end, 10);
if(*end!=';') {
fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
limit=end-line;
if(start<limit) {
p.numericValue=value;
hasNumericValue=TRUE;
}
/* digit value, field 7 */
start=limit+1;
checkLineIndex(p.code, start, length);
value=uprv_strtoul(line+start, &end, 10);
if(*end!=';') {
fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
limit=end-line;
if(start<limit) {
if(hasNumericValue) {
if(p.numericValue!=value) {
fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
} else {
p.numericValue=value;
hasNumericValue=TRUE;
}
}
/* numeric value, field 8 */
start=limit+1;
checkLineIndex(p.code, start, length);
value=uprv_strtoul(line+start, &end, 10);
if(value>0 && *end=='/') {
p.denominator=uprv_strtoul(end+1, &end, 10);
}
if(*end!=';') {
fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
limit=end-line;
if(start<limit) {
if(hasNumericValue) {
if(p.numericValue!=value) {
fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
} else {
p.numericValue=value;
hasNumericValue=TRUE;
}
}
/* get Mirrored flag, field 9 */
start=limit+1;
checkLineIndex(p.code, start, length);
limit=getField(line, start, length);
if(line[start]=='Y') {
p.isMirrored=1;
} else if(limit-start!=1 || line[start]!='N') {
fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
/* skip Unicode 1.0 character name, field 10 */
checkLineIndex(p.code, ++limit, length);
limit=getField(line, limit, length);
/* skip comment, field 11 */
checkLineIndex(p.code, ++limit, length);
limit=getField(line, limit, length);
/* get uppercase mapping, field 12 */
start=limit+1;
checkLineIndex(p.code, start, length);
p.upperCase=uprv_strtoul(line+start, &end, 16);
limit=end-line;
if(*end!=';') {
fprintf(stderr, "genprops: syntax error in field 12 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
/* get lowercase mapping, field 13 */
start=limit+1;
checkLineIndex(p.code, start, length);
p.lowerCase=uprv_strtoul(line+start, &end, 16);
limit=end-line;
if(*end!=';') {
fprintf(stderr, "genprops: syntax error in field 13 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
/* get titlecase mapping, field 14 */
start=limit+1;
if(start<length) {
/* this is the last field */
p.titleCase=uprv_strtoul(line+start, &end, 16);
if(*end!=';' && *end!=0) {
fprintf(stderr, "genprops: syntax error in field 14 at code 0x%lx\n", p.code);
exit(U_PARSE_ERROR);
}
}
#if 0
/* debug output */
if(beVerbose) {
printf(
"0x%06lx "
"%s(%2d) "
"comb=%3d "
"bidi=%3s(%2d) "
"num=%7d/%7d "
"mirr=%d "
"u%06lx l%06lx t%06lx"
"\n",
p.code,
genCategoryNames[p.generalCategory], p.generalCategory,
p.canonicalCombining,
bidiNames[p.bidi], p.bidi,
p.numericValue, p.denominator,
p.isMirrored,
p.upperCase, p.lowerCase, p.titleCase);
}
#endif
addProps(&p);
}
}
static int16_t
getField(char *line, int16_t start, int16_t limit) {
while(start<limit && line[start]!=';') {
++start;
}
return start;
}
static void
checkLineIndex(uint32_t code, int16_t index, int16_t length) {
if(index>=length) {
fprintf(stderr, "genprops: too few fields at code 0x%lx\n", code);
/* get the character code, field 0 */
p->code=uprv_strtoul(start, &end, 16);
if((end-start)<1 || end!=limit) {
fprintf(stderr, "genprops: syntax error in field 0 at %s\n", start);
exit(U_PARSE_ERROR);
}
}
/*
 * Field callback for the general category (UnicodeData.txt field 2).
 * Looks the field text up in genCategoryNames[], starting at index 1, and
 * stores the matching index in the Props passed via context.
 * Exits with U_PARSE_ERROR if the name is unknown.
 */
static void
UnicodeDataCategory(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *props=(Props *)context;
    char saved;
    int cat;

    /* NUL-terminate the field in place so that it can be compared as a C string */
    saved=*limit;
    *limit=0;

    for(cat=1; cat<U_CHAR_CATEGORY_COUNT; ++cat) {
        if(uprv_strcmp(start, genCategoryNames[cat])==0) {
            break;
        }
    }
    if(cat==U_CHAR_CATEGORY_COUNT) {
        fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n", start, props->code);
        exit(U_PARSE_ERROR);
    }
    props->generalCategory=(uint8_t)cat;

    /* put the delimiter back */
    *limit=saved;
}
/*
 * Field callback for the canonical combining class (UnicodeData.txt field 3).
 * Converts the decimal field and stores it, truncated to 8 bits, in the Props
 * passed via context. Exits with U_PARSE_ERROR if the field is empty or the
 * conversion does not end exactly at limit.
 */
static void
UnicodeDataCombining(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *props=(Props *)context;
    char *parseEnd;

    props->canonicalCombining=(uint8_t)uprv_strtoul(start, &parseEnd, 10);

    if(!(start<parseEnd && parseEnd==limit)) {
        fprintf(stderr, "genprops: syntax error in field 3 at code 0x%lx\n", props->code);
        exit(U_PARSE_ERROR);
    }
}
/*
 * Field callback for the BiDi category (UnicodeData.txt field 4).
 * Looks the field text up in bidiNames[], starting at index 0, and stores
 * the matching index in the Props passed via context.
 * Exits with U_PARSE_ERROR if the name is unknown.
 */
static void
UnicodeDataBiDi(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *props=(Props *)context;
    char saved;
    int dir;

    /* NUL-terminate the field in place so that it can be compared as a C string */
    saved=*limit;
    *limit=0;

    for(dir=0; dir<U_CHAR_DIRECTION_COUNT; ++dir) {
        if(uprv_strcmp(start, bidiNames[dir])==0) {
            break;
        }
    }
    if(dir==U_CHAR_DIRECTION_COUNT) {
        fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n", start, props->code);
        exit(U_PARSE_ERROR);
    }
    props->bidi=(uint8_t)dir;

    /* put the delimiter back */
    *limit=saved;
}
/*
 * Field callback shared by the three numeric fields of UnicodeData.txt:
 * decimal digit value (field 6), digit value (field 7) and numeric value
 * (field 8).
 *
 * An empty field is accepted and leaves the Props untouched. A non-empty
 * field sets Props.numericValue; if an earlier field of the same line already
 * set a different value, the tool exits with U_PARSE_ERROR. Field 8 may have
 * the form "numerator/denominator", in which case Props.denominator is set
 * as well.
 *
 * Exits with U_PARSE_ERROR, naming the actual field number, if the
 * conversion does not end exactly at limit.
 */
static void
UnicodeDataNumeric(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *p=(Props *)context;
    uint32_t value;
    char *end;

    value=uprv_strtoul(start, &end, 10);
    if(fieldNr==8 && value>0 && *end=='/') {
        /* field 8 may contain a fractional value, get the denominator */
        p->denominator=uprv_strtoul(end+1, &end, 10);
    }
    if(end!=limit) {
        /* report the field that is really being parsed, not always field 6 */
        fprintf(stderr, "genprops: syntax error in field %d at code 0x%lx\n",
                (int)fieldNr, (unsigned long)p->code);
        exit(U_PARSE_ERROR);
    }

    /* start==end means that the field is empty: nothing to record */
    if(start<end) {
        if(p->numericValue!=NO_NUMERIC_VALUE && p->numericValue!=value) {
            fprintf(stderr, "genprops: more than one numeric value at code 0x%lx\n",
                    (unsigned long)p->code);
            exit(U_PARSE_ERROR);
        }
        p->numericValue=value;
    }
}
/*
 * Field callback for the Mirrored flag (UnicodeData.txt field 9).
 * The field must be exactly "Y" or "N"; "Y" sets Props.isMirrored.
 * Anything else, including a longer string that merely starts with 'Y',
 * terminates the tool with U_PARSE_ERROR.
 */
static void
UnicodeDataMirrored(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *p=(Props *)context;

    /* check the length first so that both letters are validated equally strictly */
    if(limit-start==1 && *start=='Y') {
        p->isMirrored=1;
    } else if(limit-start!=1 || *start!='N') {
        fprintf(stderr, "genprops: syntax error in field 9 at code 0x%lx\n",
                (unsigned long)p->code);
        exit(U_PARSE_ERROR);
    }
}
/*
 * Field callback shared by the simple case mappings of UnicodeData.txt:
 * uppercase (field 12), lowercase (field 13) and titlecase (field 14).
 * Converts the hexadecimal field and stores it in the Props member selected
 * by fieldNr; other field numbers are parsed but not stored.
 * Exits with U_PARSE_ERROR if the conversion does not end exactly at limit.
 */
static void
UnicodeDataCase(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    Props *props=(Props *)context;
    char *parseEnd;
    uint32_t codePoint=uprv_strtoul(start, &parseEnd, 16);

    if(parseEnd!=limit) {
        fprintf(stderr, "genprops: syntax error in field %d at code 0x%lx\n", fieldNr, props->code);
        exit(U_PARSE_ERROR);
    }

    if(fieldNr==12) {
        props->upperCase=codePoint;
    } else if(fieldNr==13) {
        props->lowerCase=codePoint;
    } else if(fieldNr==14) {
        props->titleCase=codePoint;
    }
}
/*
 * End-of-line callback for UnicodeData.txt.
 * Completes the Props collected by the field callbacks and hands them to
 * addProps():
 * - a numeric value that was never set becomes 0,
 * - control codes listed in controlProps[] get their category overridden,
 * - the mirror mapping parsed from Mirror.txt is attached.
 *
 * The mirror lookup walks mirrorMappings[] with a cursor that persists
 * across calls. NOTE(review): this relies on both input files listing code
 * points in ascending order - confirm for the data files in use.
 */
static void
UnicodeDataFinish(void *context, char *start, char *limit, int32_t fieldNr, UErrorCode *pErrorCode) {
    static int32_t mirrorIndex=0;
    Props *p=(Props *)context;
    int32_t i;

    if(p->numericValue==NO_NUMERIC_VALUE) {
        p->numericValue=0;
    }

    /* override properties for some common control characters */
    if(p->generalCategory==U_CONTROL_CHAR) {
        for(i=0; i<(int32_t)(sizeof(controlProps)/sizeof(controlProps[0])); ++i) {
            if(controlProps[i].code==p->code) {
                p->generalCategory=controlProps[i].generalCategory;
                break;
            }
        }
    }

    /* set additional properties from previously parsed files */

    /*
     * Skip mirror entries for code points that were already passed; without
     * this, one entry that has no line in this file would stall the cursor
     * and silently drop every later mapping.
     */
    while(mirrorIndex<mirrorCount && mirrorMappings[mirrorIndex][0]<p->code) {
        ++mirrorIndex;
    }
    if(mirrorIndex<mirrorCount && p->code==mirrorMappings[mirrorIndex][0]) {
        p->mirrorMapping=mirrorMappings[mirrorIndex++][1];
    }

    addProps(p);
}
/*
 * Callback table handed to u_parseDelimitedFile() for UnicodeData.txt:
 * a per-line init hook, one slot for each of the 15 fields (NULL for fields
 * that are skipped), and a per-line finish hook.
 * Slot n+1 handles field n, as the numbered comments below show.
 */
static UParseFieldFn *unicodeDBFields[17]={
UnicodeDataInit,
UnicodeDataCode,
NULL, /* 1: character name */
UnicodeDataCategory,
UnicodeDataCombining,
UnicodeDataBiDi,
NULL, /* 5: character decomposition mapping */
UnicodeDataNumeric,
UnicodeDataNumeric,
UnicodeDataNumeric,
UnicodeDataMirrored,
NULL, /* 10: Unicode 1.0 character name */
NULL, /* 11: comment */
UnicodeDataCase,
UnicodeDataCase,
UnicodeDataCase,
UnicodeDataFinish
};
/*
 * Parses the semicolon-delimited Unicode character database file.
 * One local Props object is passed to the field callbacks as their context
 * and reused for every line. Does nothing if pErrorCode is NULL or already
 * holds a failure code; otherwise errors from the file parser are reported
 * through *pErrorCode.
 */
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
    Props lineProps;

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        u_parseDelimitedFile(filename, ';', unicodeDBFields, 15, &lineProps, pErrorCode);
    }
}
/*
* Hey, Emacs, please set the following:
*

View File

@ -25,8 +25,8 @@
/* character properties */
typedef struct {
uint32_t code, lowerCase, upperCase, titleCase;
uint32_t decomp[16];
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
/* ### uint32_t decomp[16]; */
uint32_t numericValue, denominator;
uint8_t generalCategory, canonicalCombining, bidi, isMirrored;
} Props;
@ -42,6 +42,9 @@ extern const char *const
genCategoryNames[];
/* prototypes */
extern void
setUnicodeVersion(const char *v);
extern void
initStore(void);

View File

@ -47,7 +47,7 @@ The following is a description of format version 1.0 .
Data contents:
The contents is a parsed, binary form of several Unicode character
database files, mose prominently UnicodeData.txt.
database files, most prominently UnicodeData.txt.
Any Unicode code point from 0 to 0x10ffff can be looked up to get
the properties, if any, for that code point. This means that the input
@ -72,7 +72,7 @@ Formally, the file contains the following structures:
A1 const uint16_t STAGE_3_BITS(=4);
(STAGE_1_BITS(=11) not stored, implicitly=21-(STAGE_2_BITS+STAGE_3_BITS))
A2 const uint16_t exceptionsIndex; -- 32-bit unit index
A3 const uint16_t ucharsIndex; -- 32-bit unit index
A3 const uint16_t reservedIndex;
A4 const uint16_t reservedIndex;
A5 const uint16_t reservedIndex;
A6 const uint16_t reservedIndex;
@ -84,10 +84,7 @@ Formally, the file contains the following structures:
(possible 1*uint16_t for padding to 4-alignment)
P const uint32_t props32[variable size];
E const uint16_t exceptions[variable size];
(possible 1*uint16_t for padding to 4-alignment)
U const UChar uchars[variable size];
E const uint32_t exceptions[variable size];
3-stage lookup and properties:
@ -124,8 +121,7 @@ arrive at an index into the props32[] table containing the character
properties for c.
For some characters, not all of the properties can be efficiently encoded
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
array. Some exception entries, in turn, may contain indexes into the uchars[]
array of Unicode strings, especially for non-1:1 case mappings.
array.
The first stage consumes the 11 most significant bits of the 21-bit code point
and results in an index into the second stage:
@ -142,28 +138,27 @@ specific value, which itself is only an index into the props32[] table:
uint16_t i=p16[i3+(c&0xf)];
Note that the bit numbers and shifts actually depend on the STAGE_2/3_BITS
in p16[0..1].
There is finally the 32-bit encoded set of properties for c:
uint32_t props=p32[i];
For some characters, this contains an index into the exceptions array:
if(props&0x20) {
uint16_t e=(uint16_t)(props>>20);
if(props&EXCEPTION_BIT) {
uint16_t e=(uint16_t)(props>>VALUE_SHIFT);
...
}
The exception values are a variable number of uint16_t starting at
The exception values are a variable number of uint32_t starting at
const uint16_t *pe=p16+2*exceptionsIndex+e;
const uint32_t *pe=p32+exceptionsIndex+e;
The first uint16_t there contains flags about what values actually follow it.
Some of those may be indexes for case mappings or similar and point to strings
(zero-terminated) in the uchars[] array:
...
uint16_t u=pe[index depends on pe[0]];
const UChar *pu=(const UChar *)(p32+ucharsIndex)+u;
The first uint32_t there contains flags about what values actually follow it.
Some of the exception values are UChar32 code points for the case mappings,
others are numeric values etc.
32-bit properties sets:
@ -171,9 +166,9 @@ Each 32-bit properties word contains:
0.. 4 general category
5 has exception values
6.. 9 BiDi category (the 5 explicit codes stored as one)
10 is mirrored
11..19 reserved
6..10 BiDi category
11 is mirrored
12..19 reserved
20..31 value according to bits 0..5:
if(has exception) {
exception index;
@ -181,52 +176,82 @@ Each 32-bit properties word contains:
case Ll: delta to uppercase; -- same as titlecase
case Lu: delta to lowercase; -- titlecase is same as c
case Lt: delta to lowercase; -- uppercase is same as c
case Mn: canonical category;
case Mn: combining class;
case N*: numeric value;
default: *;
default:
if(is mirrored) {
delta to mirror
} else {
0
};
}
Exception values:
The first uint16_t word of exception values for a code point contains flags
that indicate which values follow:
In the first uint32_t exception word for a code point,
bits
31..24 reserved
23..16 combining class
15..0 flags that indicate which values follow:
bit
0 has uppercase mapping
1 has lowercase mapping
2 has titlecase mapping
3 has canonical category
4 has numeric value (numerator)
5 has denominator value
3 has numeric value (numerator)
4 has denominator value
5 has a mirror-image Unicode code point
According to the flags in this word, one or more uint16_t words follow it
According to the flags in this word, one or more uint32_t words follow it
in the sequence of the bit flags in the flags word; if a flag is not set,
then the value is missing or 0:
For the case mappings, one uint16_t word each is an index into uchars[],
pointing to a zero-terminated UChar string for the case mapping.
For the case mappings and the mirror-image Unicode code point,
one uint32_t or UChar32 each is the code point.
For the canonical category, the lower 8 bits of a uint16_t word give the
category value directly. The upper 8 bits are currently reserved.
For the numeric/numerator value, a uint16_t word contains the value directly,
For the numeric/numerator value, an int32_t word contains the value directly,
except for when there is no numerator but a denominator, then the numerator
is 1.
For the denominator value, a uint16_t word contains the value directly.
For the denominator value, a uint32_t word contains the value directly.
Example:
U+2160, ROMAN NUMERAL ONE, needs an exception because it has a lowercase
mapping and a numeric value.
Its exception values would be stored as 3 uint16_t words:
Its exception values would be stored as 3 uint32_t words:
- flags=0x12 (see above)
- lowercase index into uchars[]
- flags=0x0a (see above) with combining class 0
- lowercase mapping 0x2170
- numeric value=1
----------------------------------------------------------------------------- */
/* ### finding an exception value */

/* Non-zero if bit number index is set in the exception flags. */
#define HAVE_EXCEPTION_VALUE(flags, index) ((flags)&(1<<(index)))

/* number of bits in an integer value 0..31 */
static const uint8_t flagsOffset[32]={
    0, 1, 1, 2, 1, 2, 2, 3,
    1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5
};

/*
 * Adds to offset the number of flags that are set below bit number index,
 * i.e. the number of exception values stored in front of the one for index.
 * The flags are counted in chunks of 5 bits via flagsOffset[].
 *
 * flags, index and offset must be modifiable lvalues: the macro shifts flags
 * and reduces index when index>=5. Only index values 0..9 are handled.
 * The expansion is a brace block, so it must not be used as the unbraced
 * body of an if that has an else.
 */
#define GET_EXCEPTION_OFFSET(flags, index, offset) { \
    if((index)>=5) { \
        (offset)+=flagsOffset[(flags)&0x1f]; \
        (flags)>>=5; \
        (index)-=5; \
    } \
    (offset)+=flagsOffset[(flags)&((1<<(index))-1)]; \
}
/* UDataInfo cf. udata.h */
static const UDataInfo dataInfo={
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
@ -262,6 +287,19 @@ enum {
MAX_STAGE_2_COUNT=MAX_PROPS_COUNT
};
/* bit layout of the 32-bit properties words */
enum {
    /* bit positions of the individual fields */
    EXCEPTION_SHIFT=5,
    BIDI_SHIFT=EXCEPTION_SHIFT+1,
    MIRROR_SHIFT=BIDI_SHIFT+5,
    VALUE_SHIFT=20,

    /* mask of the "has exception values" flag */
    EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT,

    /* width of the value field and the range of a signed value that fits into it */
    VALUE_BITS=32-VALUE_SHIFT,
    MAX_VALUE=(1UL<<(VALUE_BITS-1))-1,
    MIN_VALUE=-MAX_VALUE-1
};
static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
@ -273,16 +311,14 @@ static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
/* exceptions values */
static uint16_t exceptions[MAX_EXCEPTIONS_COUNT+20];
static uint32_t exceptions[MAX_EXCEPTIONS_COUNT+20];
static uint16_t exceptionsTop=0;
/* Unicode characters, e.g. for special casing or decomposition */
static UChar uchars[MAX_UCHAR_COUNT+20];
static uint16_t ucharsTop=0;
/* statistics */
static uint16_t exceptionsCount=0;
/* prototypes --------------------------------------------------------------- */
@ -320,6 +356,38 @@ addUChars(const UChar *s, uint16_t length);
/* -------------------------------------------------------------------------- */
/* ### this must become public in putil.c */
/*
 * Parses a version string of decimal numbers separated by
 * U_VERSION_DELIMITER into versionArray.
 *
 * At most U_MAX_VERSION_LENGTH components are read; parsing stops at the
 * first component that has no digits or is not followed by the delimiter.
 * All components that were not parsed are set to 0, so a NULL or empty
 * string yields an all-zero version. Each component is truncated to 8 bits.
 * Does nothing if versionArray is NULL.
 *
 * NOTE(review): identifiers beginning with "__" are reserved for the C
 * implementation; the name is kept only because callers use it.
 */
static void
__versionFromString(UVersionInfo versionArray, const char *versionString) {
    char *end;
    uint16_t part=0;

    if(versionArray==NULL) {
        return;
    }

    if(versionString!=NULL) {
        for(;;) {
            versionArray[part]=(uint8_t)uprv_strtoul(versionString, &end, 10);
            /*
             * Count the component before testing for the delimiter:
             * otherwise the last parsed component is not counted and the
             * zero-fill below overwrites it ("3.1.2" would become 3.1.0.0).
             * A component without digits is not counted.
             */
            if(end==versionString || ++part==U_MAX_VERSION_LENGTH || *end!=U_VERSION_DELIMITER) {
                break;
            }
            versionString=end+1;
        }
    }

    /* zero all components that were not present in the string */
    while(part<U_MAX_VERSION_LENGTH) {
        versionArray[part++]=0;
    }
}
/*
 * Records the Unicode version of the input data.
 * The version string v is parsed into its numeric components, which are
 * then copied into the dataVersion field of the data file header.
 */
extern void
setUnicodeVersion(const char *v) {
    UVersionInfo parsed;

    __versionFromString(parsed, v);
    uprv_memcpy(dataInfo.dataVersion, parsed, 4);
}
extern void
initStore() {
uprv_memset(stage1, 0, sizeof(stage1));
@ -334,12 +402,6 @@ initStore() {
extern void
addProps(Props *p) {
/* map the explicit BiDi codes to one single value */
static const uint8_t bidiMap[U_CHAR_DIRECTION_COUNT]={
0, 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 15, 15, 11, 15, 15, 15, 12, 13
};
uint32_t x;
int32_t value;
uint16_t count;
@ -394,7 +456,7 @@ addProps(Props *p) {
if(!(isMn || isNumber)) {
value=(int32_t)p->code-(int32_t)p->upperCase;
} else {
x=1<<5;
x=EXCEPTION_BIT;
}
++count;
}
@ -403,7 +465,7 @@ addProps(Props *p) {
if(!(isMn || isNumber)) {
value=(int32_t)p->lowerCase-(int32_t)p->code;
} else {
x=1<<5;
x=EXCEPTION_BIT;
}
++count;
}
@ -412,7 +474,7 @@ addProps(Props *p) {
if(!(isMn || isNumber)) {
value=(int32_t)p->code-(int32_t)p->titleCase;
} else {
x=1<<5;
x=EXCEPTION_BIT;
}
++count;
}
@ -421,7 +483,7 @@ addProps(Props *p) {
if(isMn) {
value=p->canonicalCombining;
} else {
x=1<<5;
x=EXCEPTION_BIT;
}
++count;
}
@ -430,7 +492,7 @@ addProps(Props *p) {
if(isNumber) {
value=p->numericValue;
} else {
x=1<<5;
x=EXCEPTION_BIT;
}
++count;
}
@ -439,9 +501,15 @@ addProps(Props *p) {
value=p->denominator;
++count;
}
if(p->isMirrored) {
if(p->mirrorMapping!=0) {
value=(int32_t)p->mirrorMapping-(int32_t)p->code;
}
++count;
}
/* handle exceptions */
if(count>1 || x!=0 || value<-2048 || 2047<value) {
if(count>1 || x!=0 || value<MIN_VALUE || MAX_VALUE<value) {
/* this code point needs exception values */
if(DO_DEBUG_OUT /* ### beVerbose */) {
if(x!=0) {
@ -454,18 +522,58 @@ addProps(Props *p) {
}
++exceptionsCount;
x=1<<5;
x=EXCEPTION_BIT;
/* ### allocate and create exception values */
value=-exceptionsCount;
/* allocate and create exception values */
value=exceptionsTop;
if(value>=4096) {
fprintf(stderr, "genprops: out of exceptions memory\n");
exit(U_MEMORY_ALLOCATION_ERROR);
} else {
uint32_t first=(uint32_t)p->canonicalCombining<<16;
uint16_t length=1;
if(p->upperCase!=0) {
first|=1;
exceptions[value+length++]=p->upperCase;
}
if(p->lowerCase!=0) {
first|=2;
exceptions[value+length++]=p->lowerCase;
}
if(p->upperCase!=p->titleCase) {
first|=4;
exceptions[value+length++]=p->titleCase;
}
if(p->denominator==0) {
if(p->numericValue!=0) {
first|=8;
exceptions[value+length++]=p->numericValue;
}
} else {
if(p->numericValue!=1) {
first|=8;
exceptions[value+length++]=p->numericValue;
}
first|=0x10;
exceptions[value+length++]=p->denominator;
}
if(p->isMirrored) {
first|=0x20;
exceptions[value+length++]=p->mirrorMapping;
}
exceptions[value]=first;
exceptionsTop+=length;
}
}
/* put together the 32-bit word of encoded properties */
x|=
p->generalCategory |
bidiMap[p->bidi]<<6UL |
p->isMirrored<<10UL |
(uint32_t)value<<20;
(uint32_t)p->generalCategory |
(uint32_t)p->bidi<<BIDI_SHIFT |
(uint32_t)p->isMirrored<<MIRROR_SHIFT |
(uint32_t)value<<VALUE_SHIFT;
setProps(p->code, x, &count, &count, &count);
@ -911,17 +1019,15 @@ generateData(const char *dataDir) {
}
indexes[2]=offset+=propsTop; /* uint32_t offset to exceptions[] */
indexes[3]=offset+=(exceptionsTop+1)/2; /* uint32_t offset to uchars[], include padding */
size=4*offset+ucharsTop*U_SIZEOF_UCHAR; /* total size of data */
size=4*(offset+exceptionsTop); /* total size of data */
if(beVerbose) {
printf("number of stage 2 entries: %5u\n", stage2Top);
printf("number of stage 3 entries: %5u\n", stage3Top);
printf("number of unique properties values: %5u\n", propsTop);
printf("number of code points with exceptions: %5u\n", exceptionsCount);
printf("size in bytes of exceptions: %5u\n", 2*exceptionsTop);
printf("size in bytes of Uchars: %5u\n", ucharsTop*U_SIZEOF_UCHAR);
printf("size in bytes of exceptions: %5u\n", 4*exceptionsTop);
printf("data size: %6lu\n", size);
}
@ -939,9 +1045,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, stage3, 2*stage3Top);
udata_writePadding(pData, (stage2Top+stage3Top)&1);
udata_writeBlock(pData, props32, 4*propsTop);
udata_writeBlock(pData, exceptions, 2*exceptionsTop);
udata_writePadding(pData, exceptionsTop&1);
udata_writeBlock(pData, uchars, ucharsTop*U_SIZEOF_UCHAR);
udata_writeBlock(pData, exceptions, 4*exceptionsTop);
/* finish up */
dataLength=udata_finish(pData, &errorCode);