ICU-1721 store numericType, parse Scripts.txt, some cleanup and more common code
X-SVN-Rev: 7826
This commit is contained in:
parent
b04d6bd082
commit
faa1bf56cb
@ -175,6 +175,34 @@ writeUCDFilename(char *basename, const char *filename, const char *suffix) {
|
||||
uprv_strcpy(basename+length, ".txt");
|
||||
}
|
||||
|
||||
U_CFUNC int32_t
|
||||
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
|
||||
const char *t, *z;
|
||||
int32_t i, j;
|
||||
|
||||
s=u_skipWhitespace(s);
|
||||
for(i=0; i<countTokens; ++i) {
|
||||
t=tokens[i];
|
||||
if(t!=NULL) {
|
||||
for(j=0;; ++j) {
|
||||
if(t[j]!=0) {
|
||||
if(s[j]!=t[j]) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
z=u_skipWhitespace(s+j);
|
||||
if(*z==';' || *z==0) {
|
||||
return i;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* parser for BidiMirroring.txt --------------------------------------------- */
|
||||
|
||||
#define MAX_MIRROR_COUNT 2000
|
||||
@ -468,7 +496,7 @@ unicodeDataLineFn(void *context,
|
||||
char *end;
|
||||
static uint32_t prevCode=0;
|
||||
uint32_t value;
|
||||
int i;
|
||||
int32_t i;
|
||||
|
||||
/* reset the properties */
|
||||
uprv_memset(&p, 0, sizeof(Props));
|
||||
@ -484,18 +512,14 @@ unicodeDataLineFn(void *context,
|
||||
}
|
||||
|
||||
/* get general category, field 2 */
|
||||
*fields[2][1]=0;
|
||||
for(i=0;;) {
|
||||
if(uprv_strcmp(fields[2][0], genCategoryNames[i])==0) {
|
||||
p.generalCategory=(uint8_t)i;
|
||||
break;
|
||||
}
|
||||
if(++i==U_CHAR_CATEGORY_COUNT) {
|
||||
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
|
||||
fields[2][0], (unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
|
||||
if(i>=0) {
|
||||
p.generalCategory=(uint8_t)i;
|
||||
} else {
|
||||
fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
|
||||
fields[2][0], (unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get canonical combining class, field 3 */
|
||||
@ -508,18 +532,14 @@ unicodeDataLineFn(void *context,
|
||||
}
|
||||
|
||||
/* get BiDi category, field 4 */
|
||||
*fields[4][1]=0;
|
||||
for(i=0;;) {
|
||||
if(uprv_strcmp(fields[4][0], bidiNames[i])==0) {
|
||||
p.bidi=(uint8_t)i;
|
||||
break;
|
||||
}
|
||||
if(++i==U_CHAR_DIRECTION_COUNT) {
|
||||
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
|
||||
fields[4][0], (unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
i=getTokenIndex(bidiNames, U_CHAR_DIRECTION_COUNT, fields[4][0]);
|
||||
if(i>=0) {
|
||||
p.bidi=(uint8_t)i;
|
||||
} else {
|
||||
fprintf(stderr, "genprops: unknown BiDi category \"%s\" at code 0x%lx\n",
|
||||
fields[4][0], (unsigned long)p.code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* decimal digit value, field 6 */
|
||||
@ -532,6 +552,7 @@ unicodeDataLineFn(void *context,
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
p.decimalDigitValue=(int16_t)value;
|
||||
p.numericType=1;
|
||||
}
|
||||
|
||||
/* digit value, field 7 */
|
||||
@ -544,6 +565,9 @@ unicodeDataLineFn(void *context,
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
p.digitValue=(int16_t)value;
|
||||
if(p.numericType==0) {
|
||||
p.numericType=2;
|
||||
}
|
||||
}
|
||||
|
||||
/* numeric value, field 8 */
|
||||
@ -582,7 +606,9 @@ unicodeDataLineFn(void *context,
|
||||
} else {
|
||||
p.numericValue=(int32_t)value;
|
||||
}
|
||||
p.hasNumericValue=TRUE;
|
||||
if(p.numericType==0) {
|
||||
p.numericType=3;
|
||||
}
|
||||
}
|
||||
|
||||
/* get Mirrored flag, field 9 */
|
||||
|
@ -42,9 +42,9 @@ typedef struct {
|
||||
/*
 * Properties collected for one code point.
 * unicodeDataLineFn() zeroes a Props with uprv_memset() and fills in fields
 * from the UnicodeData.txt line; makeProps() takes a Props pointer.
 * NOTE(review): this span shows both the old and the new member lines of the
 * change (hasNumericValue was replaced by numericType) -- confirm which
 * lines belong to the final header before compiling.
 */
typedef struct {
|
||||
uint32_t code, lowerCase, upperCase, titleCase, mirrorMapping;
|
||||
int16_t decimalDigitValue, digitValue; /* -1: no value */
|
||||
int32_t numericValue; /* see hasNumericValue */
|
||||
int32_t numericValue; /* see numericType */
|
||||
uint32_t denominator; /* 0: no value */
|
||||
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, hasNumericValue;
|
||||
/*
 * numericType values as set by the parser and stored in the properties word:
 * 0 no numeric value, 1 decimal digit value, 2 digit value, 3 numeric value
 */
uint8_t generalCategory, canonicalCombining, bidi, isMirrored, numericType;
|
||||
SpecialCasing *specialCasing;
|
||||
CaseFolding *caseFolding;
|
||||
} Props;
|
||||
@ -63,6 +63,9 @@ genCategoryNames[];
|
||||
U_CFUNC void
|
||||
writeUCDFilename(char *basename, const char *filename, const char *suffix);
|
||||
|
||||
U_CFUNC int32_t
|
||||
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
|
||||
|
||||
extern void
|
||||
setUnicodeVersion(const char *v);
|
||||
|
||||
|
@ -19,6 +19,7 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "utrie.h"
|
||||
@ -38,6 +39,9 @@ static int32_t pvCount;
|
||||
static void
|
||||
parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseScripts(const char *filename, uint32_t *pv, UErrorCode *pErrorCode);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
U_CFUNC void
|
||||
@ -48,10 +52,13 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
||||
|
||||
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
|
||||
|
||||
/* process DerivedAge.txt */
|
||||
/* process various UCD .txt files */
|
||||
writeUCDFilename(basename, "DerivedAge", suffix);
|
||||
parseAge(filename, pv, pErrorCode);
|
||||
|
||||
writeUCDFilename(basename, "Scripts", suffix);
|
||||
parseScripts(filename, pv, pErrorCode);
|
||||
|
||||
trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
|
||||
if(trie==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
@ -66,6 +73,8 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
||||
}
|
||||
}
|
||||
|
||||
/* DerivedAge.txt ----------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
ageLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
@ -120,6 +129,80 @@ parseAge(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, ageLineFn, pv, pErrorCode);
|
||||
}
|
||||
|
||||
/* Scripts.txt -------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
scriptsLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t *pv=(uint32_t *)context;
|
||||
char *s, *end;
|
||||
uint32_t start, limit;
|
||||
UScriptCode script;
|
||||
|
||||
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genprops: syntax error in Scripts.txt field 0 at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
++limit;
|
||||
|
||||
/* parse script name */
|
||||
s=(char *)u_skipWhitespace(fields[1][0]);
|
||||
|
||||
/* trim trailing whitespace */
|
||||
end=fields[1][1];
|
||||
while(s<end && (*(end-1)==' ' || *(end-1)=='\t')) {
|
||||
--end;
|
||||
}
|
||||
*end=0;
|
||||
if( 1!=uscript_getCode(s, &script, 1, pErrorCode) ||
|
||||
U_FAILURE(*pErrorCode) ||
|
||||
script<=USCRIPT_INVALID_CODE
|
||||
) {
|
||||
fprintf(stderr, "genprops: syntax error in Scripts.txt field 1 at %s\n", fields[1][0]);
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
}
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
if(!upvec_setValue(pv, start, limit, 0, (uint32_t)script, UPROPS_SCRIPT_MASK, pErrorCode)) {
|
||||
fprintf(stderr, "genprops: unable to set script code: %s\n", u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseScripts(const char *filename, uint32_t *pv, UErrorCode *pErrorCode) {
|
||||
char *fields[2][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* UTR 24 says:
|
||||
* Section 2:
|
||||
* "Common - For characters that may be used
|
||||
* within multiple scripts,
|
||||
* or any unassigned code points."
|
||||
*
|
||||
* Section 4:
|
||||
* "The value COMMON is the default value,
|
||||
* given to all code points that are not
|
||||
* explicitly mentioned in the data file."
|
||||
*/
|
||||
if(!upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)USCRIPT_COMMON, UPROPS_SCRIPT_MASK, pErrorCode)) {
|
||||
fprintf(stderr, "genprops: unable to set script code: %s\n", u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, scriptsLineFn, pv, pErrorCode);
|
||||
}
|
||||
|
||||
/* data serialization ------------------------------------------------------- */
|
||||
|
||||
U_CFUNC int32_t
|
||||
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
|
||||
int32_t length;
|
||||
@ -141,9 +224,9 @@ writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_C
|
||||
/* set indexes */
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
|
||||
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
|
||||
indexes[UPROPS_RESERVED_INDEX]=
|
||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
||||
}
|
||||
|
||||
if(p!=NULL && (pvCount*4)<=capacity) {
|
||||
|
@ -32,13 +32,6 @@
|
||||
|
||||
#define DO_DEBUG_OUT 0
|
||||
|
||||
/*
|
||||
* ### TODO
|
||||
* document structure with additional properties
|
||||
* use index enums in uchar.c
|
||||
* improve UTrie compaction: remove identical data blocks before folding! - need to remember which ones are skipped?!
|
||||
*/
|
||||
|
||||
/* Unicode character properties file format ------------------------------------
|
||||
|
||||
The file format prepared and written here contains several data
|
||||
@ -49,7 +42,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
||||
precedes the actual data. It contains platform properties values and the
|
||||
file format version.
|
||||
|
||||
The following is a description of format version 2.0 .
|
||||
The following is a description of format version 2.1 .
|
||||
|
||||
Data contents:
|
||||
|
||||
@ -68,13 +61,18 @@ to the beginning of the data:
|
||||
|
||||
Formally, the file contains the following structures:
|
||||
|
||||
indexes[16] with values i0..i15:
|
||||
const int32_t indexes[16] with values i0..i15:
|
||||
|
||||
i0 const int32_t propsIndex; -- 32-bit unit index to the table of 32-bit properties words
|
||||
i1 const int32_t exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
|
||||
i2 const int32_t exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
|
||||
i3 const int32_t ucharsTopIndex; -- 32-bit unit index to the first unit after the array of UChars for special mappings
|
||||
i4..i15 const int32_t[] reservedIndex; -- reserved values; 0 for now
|
||||
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
|
||||
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
|
||||
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
|
||||
|
||||
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
|
||||
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
|
||||
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
|
||||
|
||||
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
|
||||
i7..i15 reservedIndexes; -- reserved values; 0 for now
|
||||
|
||||
PT serialized properties trie, see utrie.h (byte size: 4*(i0-16))
|
||||
|
||||
@ -82,6 +80,9 @@ Formally, the file contains the following structures:
|
||||
E const uint32_t exceptions[i2-i1];
|
||||
U const UChar uchars[2*(i3-i2)];
|
||||
|
||||
AT serialized trie for additional properties (byte size: 4*(i4-i3))
|
||||
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
|
||||
|
||||
Trie lookup and properties:
|
||||
|
||||
In order to condense the data for the 21-bit code space, several properties of
|
||||
@ -138,7 +139,12 @@ Each 32-bit properties word contains:
|
||||
5 has exception values
|
||||
6..10 BiDi category
|
||||
11 is mirrored
|
||||
12..19 reserved
|
||||
12..13 numericType (new in format version 2.1):
|
||||
0 no numeric value
|
||||
1 decimal digit value
|
||||
2 digit value
|
||||
3 numeric value
|
||||
14..19 reserved
|
||||
20..31 value according to bits 0..5:
|
||||
if(has exception) {
|
||||
exception index;
|
||||
@ -239,6 +245,22 @@ Its exception values would be stored as 3 uint32_t words:
|
||||
- lowercase mapping 0x2170
|
||||
- numeric value=1
|
||||
|
||||
--- Additional properties (new in format version 2.1) ---
|
||||
|
||||
The second trie for additional properties (AT) is also a UTrie with 16-bit data.
|
||||
The data words consist of 32-bit unit indexes (not row indexes!) into the
|
||||
table of unique properties vectors (PV).
|
||||
Each vector contains a set of properties. The width of a vector may change
|
||||
with the formatVersion, it is stored in i5.
|
||||
|
||||
Current properties: (see also icu/source/common/uprops.h)
|
||||
|
||||
Word/Bits
|
||||
0 31..24 age of the code point designation/assignment, a Unicode version
|
||||
bits 31..28 major version number
|
||||
bits 27..24 minor version number
|
||||
0 6.. 0 UScriptCode
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
@ -265,10 +287,13 @@ enum {
|
||||
|
||||
/* definitions for the properties words */
|
||||
enum {
|
||||
EXCEPTION_SHIFT=5,
|
||||
BIDI_SHIFT,
|
||||
MIRROR_SHIFT=BIDI_SHIFT+5,
|
||||
VALUE_SHIFT=20,
|
||||
/* general category shift==0 0 (5 bits) */
|
||||
EXCEPTION_SHIFT=5, /* 5 (1 bit) */
|
||||
BIDI_SHIFT, /* 6 (5 bits) */
|
||||
MIRROR_SHIFT=BIDI_SHIFT+5, /* 11 (1 bit) */
|
||||
NUMERIC_TYPE_SHIFT, /* 12 (2 bits) */
|
||||
RESERVED_SHIFT=NUMERIC_TYPE_SHIFT+2, /* 14 (6 bits) */
|
||||
VALUE_SHIFT=20, /* 20 */
|
||||
|
||||
EXCEPTION_BIT=1UL<<EXCEPTION_SHIFT,
|
||||
VALUE_BITS=32-VALUE_SHIFT
|
||||
@ -413,7 +438,7 @@ makeProps(Props *p) {
|
||||
if(p->generalCategory==U_DECIMAL_DIGIT_NUMBER) {
|
||||
/* verify that all numeric fields contain the same value */
|
||||
if(p->decimalDigitValue!=-1 && p->digitValue==p->decimalDigitValue &&
|
||||
p->hasNumericValue && p->numericValue==p->decimalDigitValue &&
|
||||
p->numericType==1 && p->numericValue==p->decimalDigitValue &&
|
||||
p->denominator==0
|
||||
) {
|
||||
value=p->decimalDigitValue;
|
||||
@ -422,15 +447,13 @@ makeProps(Props *p) {
|
||||
}
|
||||
++count;
|
||||
} else if(p->generalCategory==U_LETTER_NUMBER || p->generalCategory==U_OTHER_NUMBER) {
|
||||
/* verify that only the numeric value field itself contains a value */
|
||||
if(p->decimalDigitValue==-1 && p->digitValue==-1 && p->hasNumericValue) {
|
||||
if(p->numericType==3) {
|
||||
value=p->numericValue;
|
||||
} else {
|
||||
x=EXCEPTION_BIT;
|
||||
}
|
||||
++count;
|
||||
} else if(p->decimalDigitValue!=-1 || p->digitValue!=-1 || p->hasNumericValue) {
|
||||
/* verify that only numeric categories have numeric values */
|
||||
} else if(p->numericType!=0) {
|
||||
x=EXCEPTION_BIT;
|
||||
++count;
|
||||
}
|
||||
@ -506,7 +529,7 @@ makeProps(Props *p) {
|
||||
(uint32_t)p->decimalDigitValue<<16|
|
||||
(uint16_t)p->digitValue;
|
||||
}
|
||||
if(p->hasNumericValue) {
|
||||
if(p->numericType==3) {
|
||||
if(p->denominator==0) {
|
||||
first|=0x10;
|
||||
exceptions[value+length++]=(uint32_t)p->numericValue;
|
||||
@ -593,6 +616,7 @@ makeProps(Props *p) {
|
||||
(uint32_t)p->generalCategory |
|
||||
(uint32_t)p->bidi<<BIDI_SHIFT |
|
||||
(uint32_t)p->isMirrored<<MIRROR_SHIFT |
|
||||
(uint32_t)p->numericType<<NUMERIC_TYPE_SHIFT |
|
||||
(uint32_t)value<<VALUE_SHIFT;
|
||||
|
||||
if(beVerbose && p->code<=0x9f) {
|
||||
|
Loading…
Reference in New Issue
Block a user