ICU-7264 parse ScriptExtensions.txt, write uprops.icu formatVersion 7

X-SVN-Rev: 28377
This commit is contained in:
Markus Scherer 2010-07-27 23:32:04 +00:00
parent 1360486f9e
commit 3bf87d9766
5 changed files with 242 additions and 74 deletions

View File

@ -5,5 +5,5 @@
# created by: Markus W. Scherer
# edited on: 2010jul20
# edited by: Stuart G. Gill
add_executable(genprops genprops.c props2.c store.c)
add_executable(genprops genprops.cpp props2.cpp store.c)
target_link_libraries(genprops icuuc icutu)

View File

@ -1,11 +1,11 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2008, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: genprops.c
* file name: genprops.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@ -40,6 +40,8 @@ U_CDECL_END
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
U_NAMESPACE_USE
UBool beVerbose=FALSE, haveCopyright=TRUE;
/* prototypes --------------------------------------------------------------- */

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2008, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -34,13 +34,13 @@ typedef struct {
} Props;
/* global flags */
extern UBool beVerbose, haveCopyright;
U_CFUNC UBool beVerbose, haveCopyright;
extern const char *const
U_CFUNC const char *const
genCategoryNames[];
/* properties vectors in props2.c */
extern UPropsVectors *pv;
/* properties vectors in props2.cpp */
U_CFUNC UPropsVectors *pv;
/* prototypes */
U_CFUNC void
@ -52,28 +52,28 @@ isToken(const char *token, const char *s);
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
extern void
U_CFUNC void
setUnicodeVersion(const char *v);
extern void
U_CFUNC void
initStore(void);
extern void
U_CFUNC void
exitStore(void);
extern uint32_t
U_CFUNC uint32_t
makeProps(Props *p);
extern void
U_CFUNC void
addProps(uint32_t c, uint32_t props);
extern uint32_t
U_CFUNC uint32_t
getProps(uint32_t c);
extern void
U_CFUNC void
repeatProps(uint32_t first, uint32_t last, uint32_t props);
extern void
U_CFUNC void
generateData(const char *dataDir, UBool csource);
/* props2.c */

View File

@ -1,11 +1,11 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: props2.c
* file name: props2.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@ -20,6 +20,7 @@
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "cmemory.h"
@ -32,11 +33,15 @@
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
U_NAMESPACE_USE
/* data --------------------------------------------------------------------- */
static UNewTrie *newTrie;
UPropsVectors *pv;
static UnicodeString *scriptExtensions;
/* miscellaneous ------------------------------------------------------------ */
static char *
@ -45,7 +50,7 @@ trimTerminateField(char *s, char *limit) {
s=(char *)u_skipWhitespace(s);
/* trim trailing whitespace */
while(s<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
while(s<limit && U_IS_INV_WHITESPACE(*(limit-1))) {
--limit;
}
*limit=0;
@ -77,6 +82,11 @@ ageLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
static void U_CALLCONV
scriptExtensionsLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
static void
parseMultiFieldFile(char *filename, char *basename,
const char *ucdFile, const char *suffix,
@ -415,12 +425,14 @@ initAdditionalProperties() {
fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
exit(errorCode);
}
scriptExtensions=new UnicodeString;
}
/*
 * Tears down the file-level builder state used for the additional properties:
 * closes the newTrie build-time trie, closes the pv properties vectors, and
 * deletes the scriptExtensions string that initAdditionalProperties()
 * allocated with new.
 * NOTE(review): the globals are not reset to NULL here, so they must not be
 * used after this call - confirm that no caller does.
 */
U_CFUNC void
exitAdditionalProperties() {
utrie_close(newTrie);
upvec_close(pv);
delete scriptExtensions;
}
U_CFUNC void
@ -436,22 +448,10 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
/*
* UTR 24 says:
* Section 2:
* "Common - For characters that may be used
* within multiple scripts,
* or any unassigned code points."
*
* Section 4:
* "The value COMMON is the default value,
* given to all code points that are not
* explicitly mentioned in the data file."
*
* COMMON==USCRIPT_COMMON==0 - nothing to do
*/
parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);
parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
@ -567,6 +567,135 @@ ageLineFn(void *context,
}
}
/* ScriptExtensions.txt ----------------------------------------------------- */
/*
 * Line callback for ScriptExtensions.txt.
 *
 * fields[0][0] holds a code point or code point range, fields[1][0] holds a
 * whitespace-separated list of Script property value aliases, terminated by
 * NUL or ';'. The field text is modified in place (tokens get NUL-terminated).
 *
 * The aliases are converted to script codes, sorted in ascending order
 * (duplicates are an error), and the last code gets bit 15 set as a terminator.
 * The resulting list is looked up in, or appended to, the file-level
 * scriptExtensions string. Then, for each sub-range of start..end that has a
 * uniform UPROPS_SCRIPT_X_MASK value in properties vector column 0, that value
 * is replaced with an encoded (Script, Script_Extensions index) reference:
 *   - Script is Common or Inherited: a flag plus the index of the list;
 *   - any other Script: a flag plus the index of a (Script code, list index)
 *     pair which is itself stored in scriptExtensions.
 *
 * context and fieldCount are not used.
 * Any error is reported on stderr and terminates the program via exit().
 */
static void U_CALLCONV
scriptExtensionsLineFn(void *context,
                       char *fields[][2], int32_t fieldCount,
                       UErrorCode *pErrorCode) {
    uint32_t start, end;
    u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }

    /* parse list of script codes */
    UnicodeString codes;  // vector of 16-bit UScriptCode values
    char *s=fields[1][0];
    for(;;) {
        // skip whitespace before each token
        s=(char *)u_skipWhitespace(s);
        if(*s==0 || *s==';') {
            break;
        }

        // skip non-whitespace, non-terminator characters to find the token limit
        char *limit=s;
        char c;
        do {
            c=*++limit;
        } while(!U_IS_INV_WHITESPACE(c) && c!=0 && c!=';');
        // NUL-terminate this token; c remembers the character that ended it
        *limit=0;

        // convert the token (script property value alias) into a UScriptCode value
        int32_t value=u_getPropertyValueEnum(UCHAR_SCRIPT, s);
        if(value<0) {
            fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 1 at %s\n", s);
            exit(U_INVALID_FORMAT_ERROR);
        }

        // Insertion sort into the list of script codes.
        for(int32_t i=0;; ++i) {
            if(i<codes.length()) {
                if(value<codes[i]) {
                    codes.insert(i, (UChar)value);
                    break;
                } else if(value==codes[i]) {
                    fprintf(stderr,
                            "genprops: duplicate script code in ScriptExtensions.txt field 1 at %s "
                            "for U+%04lx..U+%04lx\n",
                            s, (long)start, (long)end);
                    exit(U_INVALID_FORMAT_ERROR);
                }
                // continue while value>codes[i]
            } else {
                codes.append((UChar)value);
                break;
            }
        }

        if(c==0 || c==';') {
            // the token ended at a terminator
            break;
        } else {
            // the token ended at U_IS_INV_WHITESPACE(c), continue after c
            s=limit+1;
        }
    }

    int32_t length=codes.length();
    if(length==0) {
        fprintf(stderr,
                "genprops: missing values in ScriptExtensions.txt field 1 "
                "for U+%04lx..U+%04lx\n",
                (long)start, (long)end);
        exit(U_INVALID_FORMAT_ERROR);
    }
    // Set bit 15 on the last script code, for termination.
    codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));

    // Find this list of codes in the Script_Extensions data so far, or add this list.
    // listIndex stays fixed for the whole start..end range.
    int32_t listIndex=scriptExtensions->indexOf(codes);
    if(listIndex<0) {
        listIndex=scriptExtensions->length();
        scriptExtensions->append(codes);
    }

    // Modify the Script data for each of the start..end code points
    // to include the Script_Extensions index.
    do {
        uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK;
        // Find the next code point that has a different script value.
        // We want to add the Script_Extensions index to the code point range start..next-1.
        UChar32 next;
        for(next=(UChar32)start+1;
            next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK);
            ++next) {}
        if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) {
            // This sub-range already carries a Script_Extensions reference.
            fprintf(stderr,
                    "genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx "
                    "which overlaps with a range including U+%04lx..U+%04lx\n",
                    (long)start, (long)end, (long)start, (long)(next-1));
            exit(U_INVALID_FORMAT_ERROR);
        }

        // Encode the (Script, Script_Extensions index) pair.
        // Use a per-sub-range index so that the pair index computed for one
        // sub-range does not replace the list index used by the following ones.
        int32_t index=listIndex;
        if(scriptX==USCRIPT_COMMON) {
            scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index;
        } else if(scriptX==USCRIPT_INHERITED) {
            scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index;
        } else {
            // Store an additional pair of 16-bit units for an unusual main Script code
            // together with the Script_Extensions index.
            UnicodeString codeIndexPair;
            codeIndexPair.append((UChar)scriptX).append((UChar)listIndex);
            index=scriptExtensions->indexOf(codeIndexPair);
            if(index<0) {
                index=scriptExtensions->length();
                scriptExtensions->append(codeIndexPair);
            }
            scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index;
        }
        if(index>UPROPS_SCRIPT_MASK) {
            fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }

        // Write the (Script, Script_Extensions index) pair into
        // the properties vector for start..next-1.
        upvec_setValue(pv, (UChar32)start, (UChar32)(next-1),
                       0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode);
        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }
        start=next;
    } while(start<=end);
}
/* DerivedNumericValues.txt ------------------------------------------------- */
static void U_CALLCONV
@ -719,7 +848,36 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
exit(errorCode);
}
if(p!=NULL) {
/* round up scriptExtensions to multiple of 4 bytes */
if(scriptExtensions->length()&1) {
scriptExtensions->append((UChar)0);
}
/* set indexes */
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
indexes[UPROPS_RESERVED_INDEX_7]=
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]+scriptExtensions->length()/2;
indexes[UPROPS_RESERVED_INDEX_8]=indexes[UPROPS_RESERVED_INDEX_7];
indexes[UPROPS_DATA_TOP_INDEX]=indexes[UPROPS_RESERVED_INDEX_8];
indexes[UPROPS_MAX_VALUES_INDEX]=
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
(((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
((int32_t)U_DT_COUNT-1);
int32_t additionalPropsSize=4*(indexes[UPROPS_DATA_TOP_INDEX]-indexes[UPROPS_ADDITIONAL_TRIE_INDEX]);
if(p!=NULL && additionalPropsSize<=capacity) {
if(beVerbose) {
printf("size in bytes of additional props trie:%5u\n", (int)length);
}
@ -756,7 +914,7 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
if(U_FAILURE(errorCode)) {
fprintf(
stderr,
"genbidi error: deleting lead surrogate code unit values failed - %s\n",
"genprops error: deleting lead surrogate code unit values failed - %s\n",
u_errorName(errorCode));
exit(errorCode);
}
@ -772,47 +930,33 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
"};\n\n");
utrie2_close(trie2);
}
p+=length;
capacity-=length;
/* set indexes */
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
indexes[UPROPS_RESERVED_INDEX]=
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
indexes[UPROPS_MAX_VALUES_INDEX]=
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
(((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
indexes[UPROPS_MAX_VALUES_2_INDEX]=
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
((int32_t)U_DT_COUNT-1);
}
if(p!=NULL && (pvCount*4)<=capacity) {
if(f!=NULL) {
usrc_writeArray(f,
"static const uint32_t propsVectors[%ld]={\n",
pvArray, 32, pvCount,
"};\n\n");
fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
usrc_writeArray(f,
"static const uint16_t scriptExtensions[%ld]={\n",
scriptExtensions->getBuffer(), 16, scriptExtensions->length(),
"};\n\n");
} else {
uprv_memcpy(p, pvArray, pvCount*4);
p+=length;
length=pvCount*4;
uprv_memcpy(p, pvArray, length);
p+=length;
length=scriptExtensions->length()*2;
uprv_memcpy(p, scriptExtensions->getBuffer(), length);
}
if(beVerbose) {
printf("number of additional props vectors: %5u\n", (int)pvRows);
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions->length());
}
}
length+=pvCount*4;
return length;
return additionalPropsSize;
}

View File

@ -41,7 +41,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
precedes the actual data. It contains platform properties values and the
file format version.
The following is a description of format version 6 .
The following is a description of format version 7.
Data contents:
@ -74,8 +74,10 @@ Formally, the file contains the following structures:
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
i7..i9 reservedIndexes; -- reserved values; 0 for now
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
@ -92,6 +94,20 @@ Formally, the file contains the following structures:
AT serialized trie for additional properties (byte size: 4*(i4-i3))
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
SCX const uint16_t scriptExtensions[2*(i7-i6)];
SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
A Script_Extensions list is a sequence of UScriptCode values in ascending order,
with the last code having bit 15 set for termination.
A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
followed by the index of the Script_Extensions list.
If the propsVectors[] column 0 value indicates that there are Script_Extensions,
then the UPROPS_SCRIPT_MASK bit field is an index to either a list or a pair in SCX,
rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
indicate whether the main Script value is Common or Inherited (and the index is to a list)
vs. another value (and the index is to a pair).
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
Trie lookup and properties:
In order to condense the data for the 21-bit code space, several properties of
@ -206,6 +222,12 @@ Format version 6 became necessary because Unicode 5.2 adds fractions with
denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
types and values rather than add another variant to the previous format.
--- Changes in format version 7 ---
Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
the script code bits are an index into the new Script_Extensions array rather
than a script code.
----------------------------------------------------------------------------- */
/* UDataInfo cf. udata.h */
@ -227,14 +249,14 @@ static UNewTrie *pTrie=NULL;
/* -------------------------------------------------------------------------- */
/*
 * Records the Unicode version of the data being built:
 * parses the version string v with u_versionFromString() and copies
 * 4 bytes of the resulting UVersionInfo into dataInfo.dataVersion.
 */
extern void
U_CFUNC void
setUnicodeVersion(const char *v) {
UVersionInfo version;
u_versionFromString(version, v);
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
extern void
U_CFUNC void
initStore() {
pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
if(pTrie==NULL) {
@ -245,7 +267,7 @@ initStore() {
initAdditionalProperties();
}
extern void
U_CFUNC void
exitStore() {
utrie_close(pTrie);
exitAdditionalProperties();
@ -253,7 +275,7 @@ exitStore() {
/* store a character's properties ------------------------------------------- */
extern uint32_t
U_CFUNC uint32_t
makeProps(Props *p) {
uint32_t den;
int32_t type, value, exp, ntv;
@ -327,7 +349,7 @@ makeProps(Props *p) {
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
}
extern void
U_CFUNC void
addProps(uint32_t c, uint32_t x) {
if(!utrie_set32(pTrie, (UChar32)c, x)) {
fprintf(stderr, "error: too many entries for the properties trie\n");
@ -335,14 +357,14 @@ addProps(uint32_t c, uint32_t x) {
}
}
/*
 * Returns the 32-bit value currently stored for code point c in the
 * build-time properties trie pTrie. NULL is passed for utrie_get32()'s
 * last (output) argument, so that extra result is not requested.
 */
extern uint32_t
U_CFUNC uint32_t
getProps(uint32_t c) {
return utrie_get32(pTrie, (UChar32)c, NULL);
}
/* areas of same properties ------------------------------------------------- */
extern void
U_CFUNC void
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
fprintf(stderr, "error: too many entries for the properties trie\n");
@ -352,7 +374,7 @@ repeatProps(uint32_t first, uint32_t last, uint32_t x) {
/* generate output data ----------------------------------------------------- */
extern void
U_CFUNC void
generateData(const char *dataDir, UBool csource) {
static int32_t indexes[UPROPS_INDEX_COUNT]={
0, 0, 0, 0,