ICU-1721 store numericType, parse Scripts.txt, some cleanup and more common code

X-SVN-Rev: 7826
This commit is contained in:
Markus Scherer 2002-03-01 01:58:49 +00:00
parent b04d6bd082
commit faa1bf56cb
4 changed files with 192 additions and 56 deletions

View File

@ -175,6 +175,34 @@ writeUCDFilename(char *basename, const char *filename, const char *suffix) {
uprv_strcpy(basename+length, ".txt");
}
/*
 * Find which entry of a token table matches the start of a string.
 *
 * s is first advanced with u_skipWhitespace(). An entry matches when
 * s begins with every character of the entry and the text that follows,
 * after another u_skipWhitespace(), is either a ';' or the end of the string.
 * NULL entries in the table are ignored.
 *
 * @param tokens       table of NUL-terminated token strings; entries may be NULL
 * @param countTokens  number of entries in tokens
 * @param s            NUL-terminated text to examine
 * @return the index of the first matching entry, or -1 if none matches
 */
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *candidate, *rest;
    int32_t index, length;

    s=u_skipWhitespace(s);
    for(index=0; index<countTokens; ++index) {
        candidate=tokens[index];
        if(candidate==NULL) {
            continue;
        }

        /*
         * Walk the common prefix; s[length] is only read while the candidate
         * still has characters, so s is never read past its terminator.
         */
        length=0;
        while(candidate[length]!=0 && s[length]==candidate[length]) {
            ++length;
        }
        if(candidate[length]!=0) {
            /* mismatch before the candidate was exhausted */
            continue;
        }

        /* the whole candidate matched; it must be followed by ';' or the end */
        rest=u_skipWhitespace(s+length);
        if(*rest==0 || *rest==';') {
            return index;
        }
    }
    return -1;
}
/* parser for BidiMirroring.txt --------------------------------------------- */
#define MAX_MIRROR_COUNT 2000
@ -468,7 +496,7 @@ unicodeDataLineFn(void *context,
char *end;
static uint32_t prevCode=0;
uint32_t value;
int i;
int32_t i;
/* reset the properties */
uprv_memset(&p, 0, sizeof(Props));
@ -484,18 +512,14 @@ unicodeDataLineFn(void *context,
}
/* get general category, field 2 */
*fields[2][1]=0;
for(i=0;;) {
if(uprv_strcmp(fields[2][0], genCategoryNames[i])==0) {
p.generalCategory=(uint8_t)i;
break;
}
if(++i==U_CHAR_CATEGORY_COUNT) {
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
fields[2][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
if(i>=0) {
p.generalCategory=(uint8_t)i;
} else {
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
fields[2][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* get canonical combining class, field 3 */
@ -508,18 +532,14 @@ unicodeDataLineFn(void *context,
}
/* get BiDi category, field 4 */
*fields[4][1]=0;
for(i=0;;) {
if(uprv_strcmp(fields[4][0], bidiNames[i])==0) {
p.bidi=(uint8_t)i;
break;
}
if(++i==U_CHAR_DIRECTION_COUNT) {
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
fields[4][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
if(i>=0) {
p.bidi=(uint8_t)i;
} else {
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
fields[4][0], (unsigned long)p.code);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
/* decimal digit value, field 6 */
@ -532,6 +552,7 @@ unicodeDataLineFn(void *context,
exit(U_PARSE_ERROR);
}
p.decimalDigitValue=(int16_t)value;
p.numericType=1;
}
/* digit value, field 7 */
@ -544,6 +565,9 @@ unicodeDataLineFn(void *context,
exit(U_PARSE_ERROR);
}
p.digitValue=(int16_t)value;
if(p.numericType==0) {
p.numericType=2;
}
}
/* numeric value, field 8 */
@ -582,7 +606,9 @@ unicodeDataLineFn(void *context,
} else {
p.numericValue=(int32_t)value;
}
p.hasNumericValue=TRUE;
if(p.numericType==0) {
p.numericType=3;
}
}
/* get Mirrored flag, field 9 */

View File

@ -42,9 +42,9 @@ typedef struct {
typedef struct {
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
int16_t decimalDigitValue, digitValue; /* -1: no value */
int32_t numericValue; /* see hasNumericValue */
int32_t numericValue; /* see numericType */
uint32_t denominator; /* 0: no value */
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, hasNumericValue;
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, numericType;
SpecialCasing *specialCasing;
CaseFolding *caseFolding;
} Props;
@ -63,6 +63,9 @@ genCategoryNames[];
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix);
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
extern void
setUnicodeVersion(const char *v);

View File

@ -19,6 +19,7 @@
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie.h"
@ -38,6 +39,9 @@ static int32_t pvCount;
static void
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
static void
parseScripts(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
U_CFUNC void
@ -48,10 +52,13 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
/* process DerivedAge.txt */
/* process various UCD .txt files */
writeUCDFilename(basename, "DerivedAge", suffix);
parseAge(filename, pv, pErrorCode);
writeUCDFilename(basename, "Scripts", suffix);
parseScripts(filename, pv, pErrorCode);
trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
if(trie==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
@ -66,6 +73,8 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
}
}
/* DerivedAge.txt ----------------------------------------------------------- */
static void
ageLineFn(void *context,
char *fields[][2], int32_t fieldCount,
@ -120,6 +129,80 @@ parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
u_parseDelimitedFile(filename, ';', fields, 2, ageLineFn, pv, pErrorCode);
}
/* Scripts.txt -------------------------------------------------------------- */
/*
 * Per-line callback for parsing Scripts.txt.
 *
 * Field 0 holds a code point or code point range, field 1 a script name.
 * The script name is resolved with uscript_getCode() and the resulting code
 * is stored for the whole range via upvec_setValue() under UPROPS_SCRIPT_MASK.
 * Any syntax or storage error is reported on stderr and terminates the
 * program with the error code as exit status.
 *
 * @param context     the properties vectors table (uint32_t *)
 * @param fields      start/limit pointer pairs for each field of the line
 * @param fieldCount  number of fields (not used here)
 * @param pErrorCode  in/out error code
 */
static void
scriptsLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t *vectors=(uint32_t *)context;
    char *name, *nameLimit;
    uint32_t first, last;
    UScriptCode code;
    int32_t codeCount;

    /* field 0: code point range */
    u_parseCodePointRange(fields[0][0], &first, &last, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in Scripts.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* field 1: script name, with surrounding whitespace removed */
    name=(char *)u_skipWhitespace(fields[1][0]);
    for(nameLimit=fields[1][1];
        name<nameLimit && (nameLimit[-1]==' ' || nameLimit[-1]=='\t');
        --nameLimit
    ) {
        /* back up over trailing spaces and tabs */
    }
    *nameLimit=0;

    /* exactly one valid script code must come back */
    codeCount=uscript_getCode(name, &code, 1, pErrorCode);
    if(codeCount!=1 || U_FAILURE(*pErrorCode) || code<=USCRIPT_INVALID_CODE) {
        fprintf(stderr, "genprops: syntax error in Scripts.txt field 1 at %s\n", fields[1][0]);
        if(U_SUCCESS(*pErrorCode)) {
            *pErrorCode=U_PARSE_ERROR;
        }
        exit(*pErrorCode);
    }

    /* the parsed range end is bumped by one to form the limit passed on */
    if(!upvec_setValue(vectors, first, last+1, 0, (uint32_t)code, UPROPS_SCRIPT_MASK, pErrorCode)) {
        fprintf(stderr, "genprops: unable to set script code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
/*
 * Parse Scripts.txt and record a script code for every code point.
 *
 * Does nothing if pErrorCode is NULL or already indicates a failure.
 * Otherwise the range 0..0x110000 is first set to USCRIPT_COMMON, and then
 * each line of the file is handed to scriptsLineFn(), which overrides the
 * default for the ranges listed there.
 *
 * @param filename    path of the Scripts.txt file to read
 * @param pv          the properties vectors table to fill in
 * @param pErrorCode  in/out error code
 */
static void
parseScripts(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
    char *columns[2][2];

    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        /*
         * Default value, per UTR 24:
         *
         * Section 2:
         *   "Common - For characters that may be used
         *             within multiple scripts,
         *             or any unassigned code points."
         *
         * Section 4:
         *   "The value COMMON is the default value,
         *    given to all code points that are not
         *    explicitly mentioned in the data file."
         */
        if(!upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)USCRIPT_COMMON, UPROPS_SCRIPT_MASK, pErrorCode)) {
            fprintf(stderr, "genprops: unable to set script code: %s\n", u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }

        /* two ';'-separated fields per line: code point range; script name */
        u_parseDelimitedFile(filename, ';', columns, 2, scriptsLineFn, pv, pErrorCode);
    }
}
/* data serialization ------------------------------------------------------- */
U_CFUNC int32_t
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
int32_t length;
@ -141,9 +224,9 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
/* set indexes */
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
indexes[UPROPS_RESERVED_INDEX]=
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
}
if(p!=NULL && (pvCount*4)<=capacity) {

View File

@ -32,13 +32,6 @@
#define DO_DEBUG_OUT 0
/*
* ### TODO
* document structure with additional properties
* use index enums in uchar.c
* improve UTrie compaction: remove identical data blocks before folding! - need to remember which ones are skipped?!
*/
/* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data
@ -49,7 +42,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 2.0 .
The following is a description of format version 2.1 .
Data contents:
@ -68,13 +61,18 @@ to the beginning of the data:
Formally, the file contains the following structures:
indexes[16] with values i0..i15:
const int32_t indexes[16] with values i0..i15:
i0 const int32_t propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 const int32_t exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 const int32_t exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i3 const int32_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special mappings
i4..i15 const int32_t[] reservedIndex; -- reserved values; 0 for now
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
i7..i15 reservedIndexes; -- reserved values; 0 for now
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
@ -82,6 +80,9 @@ Formally, the file contains the following structures:
E const uint32_t exceptions[i2-i1];
U const UChar uchars[2*(i3-i2)];
AT serialized trie for additional properties (byte size: 4*(i4-i3))
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
@ -138,7 +139,12 @@ Each 32-bit properties word contains:
5 has exception values
6..10 BiDi category
11 is mirrored
12..19 reserved
12..13 numericType (new in format version 2.1):
0 no numeric value
1 decimal digit value
2 digit value
3 numeric value
14..19 reserved
20..31 value according to bits 0..5:
if(has exception) {
exception index;
@ -239,6 +245,22 @@ Its exception values would be stored as 3 uint32_t words:
- lowercase mapping 0x2170
- numeric value=1
--- Additional properties (new in format version 2.1) ---
The second trie for additional properties (AT) is also a UTrie with 16-bit data.
The data words consist of 32-bit unit indexes (not row indexes!) into the
table of unique properties vectors (PV).
Each vector contains a set of properties. The width of a vector may change
with the formatVersion; it is stored in i5.
Current properties: (see also icu/source/common/uprops.h)
Word/Bits
0 31..24 age of the code point designation/assignment, a Unicode version
bits 31..28 major version number
bits 27..24 minor version number
0 6.. 0 UScriptCode
----------------------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
@ -265,10 +287,13 @@ enum {
/* definitions for the properties words */
enum {
EXCEPTION_SHIFT=5,
BIDI_SHIFT,
MIRROR_SHIFT=BIDI_SHIFT+5,
VALUE_SHIFT=20,
/* general category shift==0 0 (5 bits) */
EXCEPTION_SHIFT=5, /* 5 (1 bit) */
BIDI_SHIFT, /* 6 (5 bits) */
MIRROR_SHIFT=BIDI_SHIFT+5, /* 11 (1 bit) */
NUMERIC_TYPE_SHIFT, /* 12 (2 bits) */
RESERVED_SHIFT=NUMERIC_TYPE_SHIFT+2, /* 14 (6 bits) */
VALUE_SHIFT=20, /* 20 */
EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT,
VALUE_BITS=32-VALUE_SHIFT
@ -413,7 +438,7 @@ makeProps(Props *p) {
if(p->generalCategory==U_DECIMAL_DIGIT_NUMBER) {
/* verify that all numeric fields contain the same value */
if(p->decimalDigitValue!=-1 && p->digitValue==p->decimalDigitValue &&
p->hasNumericValue && p->numericValue==p->decimalDigitValue &&
p->numericType==1 && p->numericValue==p->decimalDigitValue &&
p->denominator==0
) {
value=p->decimalDigitValue;
@ -422,15 +447,13 @@ makeProps(Props *p) {
}
++count;
} else if(p->generalCategory==U_LETTER_NUMBER || p->generalCategory==U_OTHER_NUMBER) {
/* verify that only the numeric value field itself contains a value */
if(p->decimalDigitValue==-1 && p->digitValue==-1 && p->hasNumericValue) {
if(p->numericType==3) {
value=p->numericValue;
} else {
x=EXCEPTION_BIT;
}
++count;
} else if(p->decimalDigitValue!=-1 || p->digitValue!=-1 || p->hasNumericValue) {
/* verify that only numeric categories have numeric values */
} else if(p->numericType!=0) {
x=EXCEPTION_BIT;
++count;
}
@ -506,7 +529,7 @@ makeProps(Props *p) {
(uint32_t)p->decimalDigitValue<<16|
(uint16_t)p->digitValue;
}
if(p->hasNumericValue) {
if(p->numericType==3) {
if(p->denominator==0) {
first|=0x10;
exceptions[value+length++]=(uint32_t)p->numericValue;
@ -593,6 +616,7 @@ makeProps(Props *p) {
(uint32_t)p->generalCategory |
(uint32_t)p->bidi<<BIDI_SHIFT |
(uint32_t)p->isMirrored<<MIRROR_SHIFT |
(uint32_t)p->numericType<<NUMERIC_TYPE_SHIFT |
(uint32_t)value<<VALUE_SHIFT;
if(beVerbose && p->code<=0x9f) {