ICU-7264 parse ScriptExtensions.txt, write uprops.icu formatVersion 7
X-SVN-Rev: 28377
This commit is contained in:
parent
1360486f9e
commit
3bf87d9766
@ -5,5 +5,5 @@
|
|||||||
# created by: Markus W. Scherer
|
# created by: Markus W. Scherer
|
||||||
# edited on: 2010jul20
|
# edited on: 2010jul20
|
||||||
# edited by: Stuart G. Gill
|
# edited by: Stuart G. Gill
|
||||||
add_executable(genprops genprops.c props2.c store.c)
|
add_executable(genprops genprops.cpp props2.cpp store.c)
|
||||||
target_link_libraries(genprops icuuc icutu)
|
target_link_libraries(genprops icuuc icutu)
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
/*
|
/*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* Copyright (C) 1999-2008, International Business Machines
|
* Copyright (C) 1999-2010, International Business Machines
|
||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
*
|
*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* file name: genprops.c
|
* file name: genprops.cpp
|
||||||
* encoding: US-ASCII
|
* encoding: US-ASCII
|
||||||
* tab size: 8 (not used)
|
* tab size: 8 (not used)
|
||||||
* indentation:4
|
* indentation:4
|
||||||
@ -40,6 +40,8 @@ U_CDECL_END
|
|||||||
|
|
||||||
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
|
||||||
|
|
||||||
|
U_NAMESPACE_USE
|
||||||
|
|
||||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||||
|
|
||||||
/* prototypes --------------------------------------------------------------- */
|
/* prototypes --------------------------------------------------------------- */
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* Copyright (C) 1999-2008, International Business Machines
|
* Copyright (C) 1999-2010, International Business Machines
|
||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
*
|
*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
@ -34,13 +34,13 @@ typedef struct {
|
|||||||
} Props;
|
} Props;
|
||||||
|
|
||||||
/* global flags */
|
/* global flags */
|
||||||
extern UBool beVerbose, haveCopyright;
|
U_CFUNC UBool beVerbose, haveCopyright;
|
||||||
|
|
||||||
extern const char *const
|
U_CFUNC const char *const
|
||||||
genCategoryNames[];
|
genCategoryNames[];
|
||||||
|
|
||||||
/* properties vectors in props2.c */
|
/* properties vectors in props2.cpp */
|
||||||
extern UPropsVectors *pv;
|
U_CFUNC UPropsVectors *pv;
|
||||||
|
|
||||||
/* prototypes */
|
/* prototypes */
|
||||||
U_CFUNC void
|
U_CFUNC void
|
||||||
@ -52,28 +52,28 @@ isToken(const char *token, const char *s);
|
|||||||
U_CFUNC int32_t
|
U_CFUNC int32_t
|
||||||
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
|
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
setUnicodeVersion(const char *v);
|
setUnicodeVersion(const char *v);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
initStore(void);
|
initStore(void);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
exitStore(void);
|
exitStore(void);
|
||||||
|
|
||||||
extern uint32_t
|
U_CFUNC uint32_t
|
||||||
makeProps(Props *p);
|
makeProps(Props *p);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
addProps(uint32_t c, uint32_t props);
|
addProps(uint32_t c, uint32_t props);
|
||||||
|
|
||||||
extern uint32_t
|
U_CFUNC uint32_t
|
||||||
getProps(uint32_t c);
|
getProps(uint32_t c);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
generateData(const char *dataDir, UBool csource);
|
generateData(const char *dataDir, UBool csource);
|
||||||
|
|
||||||
/* props2.c */
|
/* props2.cpp */
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
/*
|
/*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* Copyright (C) 2002-2009, International Business Machines
|
* Copyright (C) 2002-2010, International Business Machines
|
||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
*
|
*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* file name: props2.c
|
* file name: props2.cpp
|
||||||
* encoding: US-ASCII
|
* encoding: US-ASCII
|
||||||
* tab size: 8 (not used)
|
* tab size: 8 (not used)
|
||||||
* indentation:4
|
* indentation:4
|
||||||
@ -20,6 +20,7 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "unicode/utypes.h"
|
#include "unicode/utypes.h"
|
||||||
#include "unicode/uchar.h"
|
#include "unicode/uchar.h"
|
||||||
|
#include "unicode/unistr.h"
|
||||||
#include "unicode/uscript.h"
|
#include "unicode/uscript.h"
|
||||||
#include "cstring.h"
|
#include "cstring.h"
|
||||||
#include "cmemory.h"
|
#include "cmemory.h"
|
||||||
@ -32,11 +33,15 @@
|
|||||||
|
|
||||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||||
|
|
||||||
|
U_NAMESPACE_USE
|
||||||
|
|
||||||
/* data --------------------------------------------------------------------- */
|
/* data --------------------------------------------------------------------- */
|
||||||
|
|
||||||
static UNewTrie *newTrie;
|
static UNewTrie *newTrie;
|
||||||
UPropsVectors *pv;
|
UPropsVectors *pv;
|
||||||
|
|
||||||
|
static UnicodeString *scriptExtensions;
|
||||||
|
|
||||||
/* miscellaneous ------------------------------------------------------------ */
|
/* miscellaneous ------------------------------------------------------------ */
|
||||||
|
|
||||||
static char *
|
static char *
|
||||||
@ -45,7 +50,7 @@ trimTerminateField(char *s, char *limit) {
|
|||||||
s=(char *)u_skipWhitespace(s);
|
s=(char *)u_skipWhitespace(s);
|
||||||
|
|
||||||
/* trim trailing whitespace */
|
/* trim trailing whitespace */
|
||||||
while(s<limit && (*(limit-1)==' ' || *(limit-1)=='\t')) {
|
while(s<limit && U_IS_INV_WHITESPACE(*(limit-1))) {
|
||||||
--limit;
|
--limit;
|
||||||
}
|
}
|
||||||
*limit=0;
|
*limit=0;
|
||||||
@ -77,6 +82,11 @@ ageLineFn(void *context,
|
|||||||
char *fields[][2], int32_t fieldCount,
|
char *fields[][2], int32_t fieldCount,
|
||||||
UErrorCode *pErrorCode);
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
|
static void U_CALLCONV
|
||||||
|
scriptExtensionsLineFn(void *context,
|
||||||
|
char *fields[][2], int32_t fieldCount,
|
||||||
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
static void
|
static void
|
||||||
parseMultiFieldFile(char *filename, char *basename,
|
parseMultiFieldFile(char *filename, char *basename,
|
||||||
const char *ucdFile, const char *suffix,
|
const char *ucdFile, const char *suffix,
|
||||||
@ -415,12 +425,14 @@ initAdditionalProperties() {
|
|||||||
fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
|
fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
|
||||||
exit(errorCode);
|
exit(errorCode);
|
||||||
}
|
}
|
||||||
|
scriptExtensions=new UnicodeString;
|
||||||
}
|
}
|
||||||
|
|
||||||
U_CFUNC void
|
U_CFUNC void
|
||||||
exitAdditionalProperties() {
|
exitAdditionalProperties() {
|
||||||
utrie_close(newTrie);
|
utrie_close(newTrie);
|
||||||
upvec_close(pv);
|
upvec_close(pv);
|
||||||
|
delete scriptExtensions;
|
||||||
}
|
}
|
||||||
|
|
||||||
U_CFUNC void
|
U_CFUNC void
|
||||||
@ -436,22 +448,10 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
|||||||
|
|
||||||
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
|
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
|
||||||
|
|
||||||
/*
|
|
||||||
* UTR 24 says:
|
|
||||||
* Section 2:
|
|
||||||
* "Common - For characters that may be used
|
|
||||||
* within multiple scripts,
|
|
||||||
* or any unassigned code points."
|
|
||||||
*
|
|
||||||
* Section 4:
|
|
||||||
* "The value COMMON is the default value,
|
|
||||||
* given to all code points that are not
|
|
||||||
* explicitly mentioned in the data file."
|
|
||||||
*
|
|
||||||
* COMMON==USCRIPT_COMMON==0 - nothing to do
|
|
||||||
*/
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
|
parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
|
||||||
|
|
||||||
|
parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
|
parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
|
||||||
|
|
||||||
parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
|
parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);
|
||||||
@ -567,6 +567,135 @@ ageLineFn(void *context,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ScriptExtensions.txt ----------------------------------------------------- */
|
||||||
|
|
||||||
|
static void U_CALLCONV
|
||||||
|
scriptExtensionsLineFn(void *context,
|
||||||
|
char *fields[][2], int32_t fieldCount,
|
||||||
|
UErrorCode *pErrorCode) {
|
||||||
|
uint32_t start, end;
|
||||||
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
||||||
|
if(U_FAILURE(*pErrorCode)) {
|
||||||
|
fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 0 at %s\n", fields[0][0]);
|
||||||
|
exit(*pErrorCode);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* parse list of script codes */
|
||||||
|
UnicodeString codes; // vector of 16-bit UScriptCode values
|
||||||
|
char *s=fields[1][0];
|
||||||
|
for(;;) {
|
||||||
|
// skip whitespace before each token
|
||||||
|
s=(char *)u_skipWhitespace(s);
|
||||||
|
if(*s==0 || *s==';') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
// skip non-whitespace, non-terminator characters to find the token limit
|
||||||
|
char *limit=s;
|
||||||
|
char c;
|
||||||
|
do {
|
||||||
|
c=*++limit;
|
||||||
|
} while(!U_IS_INV_WHITESPACE(c) && c!=0 && c!=';');
|
||||||
|
// NUL-terminate this token
|
||||||
|
*limit=0;
|
||||||
|
// convert the token (script property value alias) into a UScriptCode value
|
||||||
|
int32_t value=u_getPropertyValueEnum(UCHAR_SCRIPT, s);
|
||||||
|
if(value<0) {
|
||||||
|
fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 1 at %s\n", s);
|
||||||
|
exit(U_INVALID_FORMAT_ERROR);
|
||||||
|
}
|
||||||
|
// Insertion sort into the list of script codes.
|
||||||
|
for(int32_t i=0;; ++i) {
|
||||||
|
if(i<codes.length()) {
|
||||||
|
if(value<codes[i]) {
|
||||||
|
codes.insert(i, (UChar)value);
|
||||||
|
break;
|
||||||
|
} else if(value==codes[i]) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"genprops: duplicate script code in ScriptExtensions.txt field 1 at %s "
|
||||||
|
"for U+%04lx..U+%04lx\n",
|
||||||
|
s, (long)start, (long)end);
|
||||||
|
exit(U_INVALID_FORMAT_ERROR);
|
||||||
|
}
|
||||||
|
// continue while value>codes[i]
|
||||||
|
} else {
|
||||||
|
codes.append((UChar)value);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(c==0 || c==';') {
|
||||||
|
// the token ended at a terminator
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// the token ended at U_IS_INV_WHITESPACE(c), continue after c
|
||||||
|
s=limit+1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
int32_t length=codes.length();
|
||||||
|
if(length==0) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"genprops: missing values in ScriptExtensions.txt field 1 "
|
||||||
|
"for U+%04lx..U+%04lx\n",
|
||||||
|
(long)start, (long)end);
|
||||||
|
exit(U_INVALID_FORMAT_ERROR);
|
||||||
|
}
|
||||||
|
// Set bit 15 on the last script code, for termination.
|
||||||
|
codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
|
||||||
|
// Find this list of codes in the Script_Extensions data so far, or add this list.
|
||||||
|
int32_t index=scriptExtensions->indexOf(codes);
|
||||||
|
if(index<0) {
|
||||||
|
index=scriptExtensions->length();
|
||||||
|
scriptExtensions->append(codes);
|
||||||
|
}
|
||||||
|
// Modify the Script data for each of the start..end code points
|
||||||
|
// to include the Script_Extensions index.
|
||||||
|
do {
|
||||||
|
uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK;
|
||||||
|
// Find the next code point that has a different script value.
|
||||||
|
// We want to add the Script_Extensions index to the code point range start..next-1.
|
||||||
|
UChar32 next;
|
||||||
|
for(next=(UChar32)start+1;
|
||||||
|
next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK);
|
||||||
|
++next) {}
|
||||||
|
if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) {
|
||||||
|
fprintf(stderr,
|
||||||
|
"genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx "
|
||||||
|
"which overlaps with a range including U+%04lx..U+%04lx\n",
|
||||||
|
(long)start, (long)end, (long)start, (long)(next-1));
|
||||||
|
exit(U_INVALID_FORMAT_ERROR);
|
||||||
|
}
|
||||||
|
// Encode the (Script, Script_Extensions index) pair.
|
||||||
|
if(scriptX==USCRIPT_COMMON) {
|
||||||
|
scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index;
|
||||||
|
} else if(scriptX==USCRIPT_INHERITED) {
|
||||||
|
scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index;
|
||||||
|
} else {
|
||||||
|
// Store an additional pair of 16-bit units for an unusual main Script code
|
||||||
|
// together with the Script_Extensions index.
|
||||||
|
UnicodeString codeIndexPair;
|
||||||
|
codeIndexPair.append((UChar)scriptX).append((UChar)index);
|
||||||
|
index=scriptExtensions->indexOf(codeIndexPair);
|
||||||
|
if(index<0) {
|
||||||
|
index=scriptExtensions->length();
|
||||||
|
scriptExtensions->append(codeIndexPair);
|
||||||
|
}
|
||||||
|
scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index;
|
||||||
|
}
|
||||||
|
if(index>UPROPS_SCRIPT_MASK) {
|
||||||
|
fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n");
|
||||||
|
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||||
|
}
|
||||||
|
// Write the (Script, Script_Extensions index) pair into
|
||||||
|
// the properties vector for start..next-1.
|
||||||
|
upvec_setValue(pv, (UChar32)start, (UChar32)(next-1),
|
||||||
|
0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode);
|
||||||
|
if(U_FAILURE(*pErrorCode)) {
|
||||||
|
fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode));
|
||||||
|
exit(*pErrorCode);
|
||||||
|
}
|
||||||
|
start=next;
|
||||||
|
} while(start<=end);
|
||||||
|
}
|
||||||
|
|
||||||
/* DerivedNumericValues.txt ------------------------------------------------- */
|
/* DerivedNumericValues.txt ------------------------------------------------- */
|
||||||
|
|
||||||
static void U_CALLCONV
|
static void U_CALLCONV
|
||||||
@ -719,7 +848,36 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
|
|||||||
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
|
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
|
||||||
exit(errorCode);
|
exit(errorCode);
|
||||||
}
|
}
|
||||||
if(p!=NULL) {
|
|
||||||
|
/* round up scriptExtensions to multiple of 4 bytes */
|
||||||
|
if(scriptExtensions->length()&1) {
|
||||||
|
scriptExtensions->append((UChar)0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* set indexes */
|
||||||
|
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
|
||||||
|
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
|
||||||
|
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
|
||||||
|
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=
|
||||||
|
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
||||||
|
indexes[UPROPS_RESERVED_INDEX_7]=
|
||||||
|
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]+scriptExtensions->length()/2;
|
||||||
|
indexes[UPROPS_RESERVED_INDEX_8]=indexes[UPROPS_RESERVED_INDEX_7];
|
||||||
|
indexes[UPROPS_DATA_TOP_INDEX]=indexes[UPROPS_RESERVED_INDEX_8];
|
||||||
|
|
||||||
|
indexes[UPROPS_MAX_VALUES_INDEX]=
|
||||||
|
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
|
||||||
|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
||||||
|
(((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
|
||||||
|
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
||||||
|
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
|
||||||
|
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
|
||||||
|
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
|
||||||
|
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
|
||||||
|
((int32_t)U_DT_COUNT-1);
|
||||||
|
|
||||||
|
int32_t additionalPropsSize=4*(indexes[UPROPS_DATA_TOP_INDEX]-indexes[UPROPS_ADDITIONAL_TRIE_INDEX]);
|
||||||
|
if(p!=NULL && additionalPropsSize<=capacity) {
|
||||||
if(beVerbose) {
|
if(beVerbose) {
|
||||||
printf("size in bytes of additional props trie:%5u\n", (int)length);
|
printf("size in bytes of additional props trie:%5u\n", (int)length);
|
||||||
}
|
}
|
||||||
@ -756,7 +914,7 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
|
|||||||
if(U_FAILURE(errorCode)) {
|
if(U_FAILURE(errorCode)) {
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr,
|
stderr,
|
||||||
"genbidi error: deleting lead surrogate code unit values failed - %s\n",
|
"genprops error: deleting lead surrogate code unit values failed - %s\n",
|
||||||
u_errorName(errorCode));
|
u_errorName(errorCode));
|
||||||
exit(errorCode);
|
exit(errorCode);
|
||||||
}
|
}
|
||||||
@ -772,47 +930,33 @@ writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROP
|
|||||||
"};\n\n");
|
"};\n\n");
|
||||||
|
|
||||||
utrie2_close(trie2);
|
utrie2_close(trie2);
|
||||||
}
|
|
||||||
|
|
||||||
p+=length;
|
|
||||||
capacity-=length;
|
|
||||||
|
|
||||||
/* set indexes */
|
|
||||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
|
|
||||||
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
|
|
||||||
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
|
|
||||||
indexes[UPROPS_RESERVED_INDEX]=
|
|
||||||
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
|
||||||
|
|
||||||
indexes[UPROPS_MAX_VALUES_INDEX]=
|
|
||||||
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
|
|
||||||
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
|
||||||
(((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
|
|
||||||
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
|
||||||
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
|
|
||||||
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
|
|
||||||
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
|
|
||||||
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
|
|
||||||
((int32_t)U_DT_COUNT-1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(p!=NULL && (pvCount*4)<=capacity) {
|
|
||||||
if(f!=NULL) {
|
|
||||||
usrc_writeArray(f,
|
usrc_writeArray(f,
|
||||||
"static const uint32_t propsVectors[%ld]={\n",
|
"static const uint32_t propsVectors[%ld]={\n",
|
||||||
pvArray, 32, pvCount,
|
pvArray, 32, pvCount,
|
||||||
"};\n\n");
|
"};\n\n");
|
||||||
fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
|
fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
|
||||||
fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
|
fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);
|
||||||
|
|
||||||
|
usrc_writeArray(f,
|
||||||
|
"static const uint16_t scriptExtensions[%ld]={\n",
|
||||||
|
scriptExtensions->getBuffer(), 16, scriptExtensions->length(),
|
||||||
|
"};\n\n");
|
||||||
} else {
|
} else {
|
||||||
uprv_memcpy(p, pvArray, pvCount*4);
|
p+=length;
|
||||||
|
length=pvCount*4;
|
||||||
|
uprv_memcpy(p, pvArray, length);
|
||||||
|
|
||||||
|
p+=length;
|
||||||
|
length=scriptExtensions->length()*2;
|
||||||
|
uprv_memcpy(p, scriptExtensions->getBuffer(), length);
|
||||||
}
|
}
|
||||||
if(beVerbose) {
|
if(beVerbose) {
|
||||||
printf("number of additional props vectors: %5u\n", (int)pvRows);
|
printf("number of additional props vectors: %5u\n", (int)pvRows);
|
||||||
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
|
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
|
||||||
|
printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions->length());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
length+=pvCount*4;
|
|
||||||
|
|
||||||
return length;
|
return additionalPropsSize;
|
||||||
}
|
}
|
@ -41,7 +41,7 @@ the udata API for loading ICU data. Especially, a UDataInfo structure
|
|||||||
precedes the actual data. It contains platform properties values and the
|
precedes the actual data. It contains platform properties values and the
|
||||||
file format version.
|
file format version.
|
||||||
|
|
||||||
The following is a description of format version 6 .
|
The following is a description of format version 7.
|
||||||
|
|
||||||
Data contents:
|
Data contents:
|
||||||
|
|
||||||
@ -74,8 +74,10 @@ Formally, the file contains the following structures:
|
|||||||
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
|
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
|
||||||
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
|
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
|
||||||
|
|
||||||
i6 reservedItemIndex; -- 32-bit unit index to the top of the properties vectors table
|
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
|
||||||
i7..i9 reservedIndexes; -- reserved values; 0 for now
|
i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
|
||||||
|
i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
|
||||||
|
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
|
||||||
|
|
||||||
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
|
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
|
||||||
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
|
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
|
||||||
@ -92,6 +94,20 @@ Formally, the file contains the following structures:
|
|||||||
AT serialized trie for additional properties (byte size: 4*(i4-i3))
|
AT serialized trie for additional properties (byte size: 4*(i4-i3))
|
||||||
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
|
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
|
||||||
|
|
||||||
|
SCX const uint16_t scriptExtensions[2*(i7-i6)];
|
||||||
|
|
||||||
|
SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
|
||||||
|
A Script_Extensions list is a sequence of UScriptCode values in ascending order,
|
||||||
|
with the last code having bit 15 set for termination.
|
||||||
|
A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
|
||||||
|
followed by the index of the Script_Extensions list.
|
||||||
|
If the propsVectors[] column 0 value indicates that there are Script_Extensions,
|
||||||
|
then the UPROPS_SCRIPT_MASK bit field is an index to either a list or a pair in SCX,
|
||||||
|
rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK field
|
||||||
|
indicate whether the main Script value is Common or Inherited (and the index is to a list)
|
||||||
|
vs. another value (and the index is to a pair).
|
||||||
|
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
|
||||||
|
|
||||||
Trie lookup and properties:
|
Trie lookup and properties:
|
||||||
|
|
||||||
In order to condense the data for the 21-bit code space, several properties of
|
In order to condense the data for the 21-bit code space, several properties of
|
||||||
@ -206,6 +222,12 @@ Format version 6 became necessary because Unicode 5.2 adds fractions with
|
|||||||
denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
|
denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
|
||||||
types and values rather than add another variant to the previous format.
|
types and values rather than add another variant to the previous format.
|
||||||
|
|
||||||
|
--- Changes in format version 7 ---
|
||||||
|
|
||||||
|
Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
|
||||||
|
the script code bits are an index into the new Script_Extensions array rather
|
||||||
|
than a script code.
|
||||||
|
|
||||||
----------------------------------------------------------------------------- */
|
----------------------------------------------------------------------------- */
|
||||||
|
|
||||||
/* UDataInfo cf. udata.h */
|
/* UDataInfo cf. udata.h */
|
||||||
@ -227,14 +249,14 @@ static UNewTrie *pTrie=NULL;
|
|||||||
|
|
||||||
/* -------------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------------- */
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
setUnicodeVersion(const char *v) {
|
setUnicodeVersion(const char *v) {
|
||||||
UVersionInfo version;
|
UVersionInfo version;
|
||||||
u_versionFromString(version, v);
|
u_versionFromString(version, v);
|
||||||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
initStore() {
|
initStore() {
|
||||||
pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
|
pTrie=utrie_open(NULL, NULL, 40000, 0, 0, TRUE);
|
||||||
if(pTrie==NULL) {
|
if(pTrie==NULL) {
|
||||||
@ -245,7 +267,7 @@ initStore() {
|
|||||||
initAdditionalProperties();
|
initAdditionalProperties();
|
||||||
}
|
}
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
exitStore() {
|
exitStore() {
|
||||||
utrie_close(pTrie);
|
utrie_close(pTrie);
|
||||||
exitAdditionalProperties();
|
exitAdditionalProperties();
|
||||||
@ -253,7 +275,7 @@ exitStore() {
|
|||||||
|
|
||||||
/* store a character's properties ------------------------------------------- */
|
/* store a character's properties ------------------------------------------- */
|
||||||
|
|
||||||
extern uint32_t
|
U_CFUNC uint32_t
|
||||||
makeProps(Props *p) {
|
makeProps(Props *p) {
|
||||||
uint32_t den;
|
uint32_t den;
|
||||||
int32_t type, value, exp, ntv;
|
int32_t type, value, exp, ntv;
|
||||||
@ -327,7 +349,7 @@ makeProps(Props *p) {
|
|||||||
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
|
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
|
||||||
}
|
}
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
addProps(uint32_t c, uint32_t x) {
|
addProps(uint32_t c, uint32_t x) {
|
||||||
if(!utrie_set32(pTrie, (UChar32)c, x)) {
|
if(!utrie_set32(pTrie, (UChar32)c, x)) {
|
||||||
fprintf(stderr, "error: too many entries for the properties trie\n");
|
fprintf(stderr, "error: too many entries for the properties trie\n");
|
||||||
@ -335,14 +357,14 @@ addProps(uint32_t c, uint32_t x) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extern uint32_t
|
U_CFUNC uint32_t
|
||||||
getProps(uint32_t c) {
|
getProps(uint32_t c) {
|
||||||
return utrie_get32(pTrie, (UChar32)c, NULL);
|
return utrie_get32(pTrie, (UChar32)c, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* areas of same properties ------------------------------------------------- */
|
/* areas of same properties ------------------------------------------------- */
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
||||||
if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
|
if(!utrie_setRange32(pTrie, (UChar32)first, (UChar32)(last+1), x, FALSE)) {
|
||||||
fprintf(stderr, "error: too many entries for the properties trie\n");
|
fprintf(stderr, "error: too many entries for the properties trie\n");
|
||||||
@ -352,7 +374,7 @@ repeatProps(uint32_t first, uint32_t last, uint32_t x) {
|
|||||||
|
|
||||||
/* generate output data ----------------------------------------------------- */
|
/* generate output data ----------------------------------------------------- */
|
||||||
|
|
||||||
extern void
|
U_CFUNC void
|
||||||
generateData(const char *dataDir, UBool csource) {
|
generateData(const char *dataDir, UBool csource) {
|
||||||
static int32_t indexes[UPROPS_INDEX_COUNT]={
|
static int32_t indexes[UPROPS_INDEX_COUNT]={
|
||||||
0, 0, 0, 0,
|
0, 0, 0, 0,
|
||||||
|
Loading…
Reference in New Issue
Block a user