ICU-8972 genprops: use ppucd.txt for gc, nt, nv

X-SVN-Rev: 31145
This commit is contained in:
Markus Scherer 2011-12-18 03:21:54 +00:00
parent 764caf4347
commit 7270f8f618
4 changed files with 159 additions and 658 deletions

View File

@ -29,8 +29,6 @@
#include "uprops.h"
#include "genprops.h"
#define DO_DEBUG_OUT 0
/* Unicode character properties file format ------------------------------------
The file format prepared and written here contains several data
@ -249,111 +247,6 @@ static UDataInfo dataInfo={
static UTrie2 *pTrie=NULL;
/* store a character's properties ------------------------------------------- */
/*
 * Build the 32-bit properties word for one character from *p.
 *
 * The numeric type, value, denominator and exponent are combined into a
 * single "ntv" code; the word returned is the general category OR-ed with
 * that code shifted by UPROPS_NUMERIC_TYPE_VALUE_SHIFT.
 * If the combination cannot be encoded, an error is printed to stderr
 * and the tool exits with U_ILLEGAL_ARGUMENT_ERROR.
 */
U_CFUNC uint32_t
makeProps(Props *p) {
    const int32_t numType=p->numericType;
    const int32_t numValue=p->numericValue;
    const uint32_t denom=p->denominator;
    int32_t power=p->exponent;  /* may be adjusted while splitting a large integer */
    int32_t encoded=-1;         /* stays negative if no encoding applies */

    if(numType==U_NT_NONE) {
        /* no numeric value: all parts must be zero */
        if(numValue==0 && denom==0 && power==0) {
            encoded=UPROPS_NTV_NONE;
        }
    } else if(numType==U_NT_DECIMAL) {
        if(0<=numValue && numValue<=9 && denom==0 && power==0) {
            encoded=UPROPS_NTV_DECIMAL_START+numValue;
        }
    } else if(numType==U_NT_DIGIT) {
        if(0<=numValue && numValue<=9 && denom==0 && power==0) {
            encoded=UPROPS_NTV_DIGIT_START+numValue;
        }
    } else if(numType==U_NT_NUMERIC) {
        if(denom!=0) {
            /* fraction: only without an exponent and within the small ranges */
            if(power==0 && -1<=numValue && numValue<=17 && 1<=denom && denom<=16) {
                encoded=((numValue+12)<<4)+(denom-1);
            }
        } else if(power==2 && (numValue*100)<=UPROPS_NTV_MAX_SMALL_INT) {
            /* small integer that was parsed like a large one */
            encoded=UPROPS_NTV_NUMERIC_START+numValue*100;
        } else if(power==0 && numValue>=0) {
            if(numValue<=UPROPS_NTV_MAX_SMALL_INT) {
                /* small integer, stored directly */
                encoded=UPROPS_NTV_NUMERIC_START+numValue;
            } else {
                /* large integer parsed like a small one: strip trailing decimal zeros */
                int32_t mantissa=numValue;
                while((mantissa%10)==0) {
                    mantissa/=10;
                    ++power;
                }
                if(mantissa<=9) {
                    encoded=((mantissa+14)<<5)+(power-2);
                }
            }
        } else if(2<=power && power<=33 && 1<=numValue && numValue<=9) {
            /* large integer with a single significant digit */
            encoded=((numValue+14)<<5)+(power-2);
        }
    }

    if(encoded<0) {
        fprintf(stderr, "genprops error: unable to encode numeric type %d & value %ld/%lu E%d\n",
                (int)numType, (long)numValue, (unsigned long)denom, power);
        exit(U_ILLEGAL_ARGUMENT_ERROR);
    }

    /* general category in the low bits, numeric type/value code above it */
    return
        (uint32_t)p->generalCategory |
        (encoded<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
}
/*
 * Store the properties word x for the single code point c in the
 * properties trie. Prints an error and exits if the trie rejects the value.
 */
U_CFUNC void
addProps(uint32_t c, uint32_t x) {
    UErrorCode status=U_ZERO_ERROR;

    utrie2_set32(pTrie, (UChar32)c, x, &status);
    if(!U_FAILURE(status)) {
        return;
    }

    fprintf(stderr, "error: utrie2_set32(properties trie) failed - %s\n",
            u_errorName(status));
    exit(status);
}
/* Read back the properties word currently stored in the trie for code point c. */
U_CFUNC uint32_t
getProps(uint32_t c) {
    const UChar32 codePoint=(UChar32)c;
    return utrie2_get32(pTrie, codePoint);
}
/* areas of same properties ------------------------------------------------- */
/*
 * Store the properties word x for every code point in first..last
 * (inclusive) in the properties trie, without overwriting values that
 * were already set (the overwrite flag passed to the trie is FALSE).
 * Prints an error and exits if the trie rejects the range.
 */
U_CFUNC void
repeatProps(uint32_t first, uint32_t last, uint32_t x) {
    UErrorCode errorCode=U_ZERO_ERROR;
    utrie2_setRange32(pTrie, (UChar32)first, (UChar32)last, x, FALSE, &errorCode);
    if(U_FAILURE(errorCode)) {
        /* name the function that actually failed (was reported as utrie2_set32) */
        fprintf(stderr, "error: utrie2_setRange32(properties trie) failed - %s\n",
                u_errorName(errorCode));
        exit(errorCode);
    }
}
class CorePropsWriter : public PropsWriter {
public:
CorePropsWriter(UErrorCode &errorCode);
@ -383,8 +276,154 @@ CorePropsWriter::setUnicodeVersion(const UVersionInfo version) {
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
/*
 * For nt=U_NT_NUMERIC: encode the Numeric_Value string s into the
 * "numeric type & value" (ntv) code.
 *
 * Accepted forms are an optionally negative decimal integer, a fraction
 * "n/d", or a single non-zero digit followed by two or more zeros.
 *
 * @param start      first code point of the line being processed (currently unused)
 * @param s          NUL-terminated numeric value string
 * @param errorCode  set to U_ILLEGAL_ARGUMENT_ERROR if s cannot be parsed
 *                   completely or its value has no encoding
 * @return the ntv code, or -1 when errorCode has been set to a failure
 */
static int32_t
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
    (void)start;  /* kept for the existing call signature */
    const char *original=s;  /* for the error message, including any minus sign */

    /* get a possible minus sign */
    UBool isNegative;
    if(*s=='-') {
        isNegative=TRUE;
        ++s;
    } else {
        isNegative=FALSE;
    }

    int32_t value=0, den=0, exp=0;
    UBool parsed=TRUE;
    char *numberLimit;
    /* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
    if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
        value=s[0]-'0';
        numberLimit=const_cast<char *>(s);
        while(*(++numberLimit)=='0') {
            ++exp;
        }
    } else {
        /* normal number parsing */
        unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
        if(numberLimit==s || ul>0x7fffffff) {
            /* no digits at all, or too large for int32_t */
            parsed=FALSE;
        } else {
            value=(int32_t)ul;
        }
        if(s<numberLimit && *numberLimit=='/') {
            /* fractional value, get the denominator */
            ul=uprv_strtoul(numberLimit+1, &numberLimit, 10);
            if(ul==0 || ul>0x7fffffff) {
                parsed=FALSE;
            } else {
                den=(int32_t)ul;
            }
        }
    }
    if(isNegative) {
        value=-(int32_t)value;
    }

    /*
     * Start from "not encodable" so that a value which matches none of the
     * cases below is reported as an error instead of silently becoming 0.
     */
    int32_t ntv=-1;
    if(!parsed) {
        /* keep ntv=-1 */
    } else if(den==0) {
        if(exp==2 && value>=0 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
            /* small integer parsed like a large one */
            ntv=UPROPS_NTV_NUMERIC_START+value*100;
        } else if(exp==0 && value>=0) {
            if(value<=UPROPS_NTV_MAX_SMALL_INT) {
                /* small integer */
                ntv=UPROPS_NTV_NUMERIC_START+value;
            } else {
                /* large integer parsed like a small one */
                /* split the value into mantissa and exponent, base 10 */
                int32_t mant=value;
                while((mant%10)==0) {
                    mant/=10;
                    ++exp;
                }
                if(mant<=9) {
                    ntv=((mant+14)<<5)+(exp-2);
                }
            }
        } else if(2<=exp && exp<=33 && 1<=value && value<=9) {
            /* large, single-significant-digit integer */
            ntv=((value+14)<<5)+(exp-2);
        }
    } else if(exp==0) {
        if(-1<=value && value<=17 && 1<=den && den<=16) {
            /* fraction */
            ntv=((value+12)<<4)+(den-1);
        }
    }

    /* fail if nothing matched or if there is trailing text after the number */
    if(ntv<0 || *numberLimit!=0) {
        fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return -1;
    }
    return ntv;
}
void
CorePropsWriter::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
UChar32 start=props.start;
UChar32 end=props.end;
int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
const char *nvString=props.numericValue;
if(type!=U_NT_NONE && nvString==NULL && start==end) {
fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
return;
}
int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
if(nvString!=NULL) {
int32_t digitValue=props.digitValue;
if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
) {
fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
(int)type, nvString==NULL ? "NULL" : nvString);
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return;
}
switch(type) {
case U_NT_NONE:
ntv=UPROPS_NTV_NONE;
break;
case U_NT_DECIMAL:
ntv=UPROPS_NTV_DECIMAL_START+digitValue;
break;
case U_NT_DIGIT:
ntv=UPROPS_NTV_DIGIT_START+digitValue;
break;
case U_NT_NUMERIC:
if(digitValue>=0) {
ntv=UPROPS_NTV_NUMERIC_START+digitValue;
} else {
ntv=encodeNumericValue(start, nvString, errorCode);
if(U_FAILURE(errorCode)) {
return;
}
}
default:
break; // unreachable
}
}
uint32_t value=
(uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
if(start==end) {
utrie2_set32(pTrie, start, value, &errorCode);
} else {
utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
}
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
(long)start, (long)end, u_errorName(errorCode));
}
}
static int32_t indexes[UPROPS_INDEX_COUNT]={
@ -409,7 +448,13 @@ CorePropsWriter::finalizeData(UErrorCode &errorCode) {
u_errorName(errorCode), (long)trieSize);
return;
}
// TODO: remove
#if 0
for(int32_t c=0; c<=0x10ffff; ++c) {
uint32_t v=utrie2_get32(pTrie, c);
printf("%04x ntv=%3d gc=%2d\n", c, v>>6, v&0x1f);
}
#endif
int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
offset+=trieSize>>2;
indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */

View File

@ -25,27 +25,17 @@
#include "unicode/utypes.h"
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/uclean.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "genprops.h"
#include "propsvec.h"
#include "ppucd.h"
#include "toolutil.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "uprops.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
// TODO: remove
#define USE_NEW 1
U_NAMESPACE_USE
UBool beVerbose=FALSE;
@ -57,15 +47,7 @@ void PropsWriter::finalizeData(UErrorCode &) {}
void PropsWriter::writeCSourceFile(const char *, UErrorCode &) {}
void PropsWriter::writeBinaryData(const char *, UBool, UErrorCode &) {}
/* prototypes --------------------------------------------------------------- */
static void
parseDB(const char *filename, UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
enum
{
enum {
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
@ -165,6 +147,7 @@ main(int argc, char* argv[]) {
while((lineType=ppucd.readLine(errorCode))!=PreparsedUCD::NO_LINE) {
if(ppucd.lineHasPropertyValues()) {
const UniProps *props=ppucd.getProps(newValues, errorCode);
corePropsWriter->setProps(*props, newValues, errorCode);
props2Writer->setProps(*props, newValues, errorCode);
} else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) {
const UVersionInfo &version=ppucd.getUnicodeVersion();
@ -189,10 +172,6 @@ main(int argc, char* argv[]) {
*basename++=U_FILE_SEP_CHAR;
}
/* process UnicodeData.txt */
writeUCDFilename(basename, "UnicodeData", NULL);
parseDB(filename, errorCode);
/* process additional properties files */
*basename=0;
generateAdditionalProperties(filename, NULL, errorCode);
@ -231,338 +210,6 @@ writeUCDFilename(char *basename, const char *filename, const char *suffix) {
uprv_strcpy(basename+length, ".txt");
}
/*
 * Test whether s, after leading whitespace, starts with token and the
 * token is followed only by optional whitespace and then ';' or the end
 * of the string.
 */
U_CFUNC UBool
isToken(const char *token, const char *s) {
    const char *text=u_skipWhitespace(s);

    /* every character of the token must match */
    int32_t matched=0;
    while(token[matched]!=0) {
        if(text[matched]!=token[matched]) {
            return FALSE;
        }
        ++matched;
    }

    /* the token must be delimited */
    const char *rest=u_skipWhitespace(text+matched);
    if(*rest==';' || *rest==0) {
        return TRUE;
    }
    return FALSE;
}
/*
 * Find which entry of tokens[] the string s starts with (after leading
 * whitespace). A match requires the token to be followed by optional
 * whitespace and then one of ';', '#', CR, LF or the end of the string.
 * NULL entries in tokens[] are skipped.
 *
 * @return the index of the first matching token, or -1 if none matches
 */
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
    const char *text=u_skipWhitespace(s);

    for(int32_t idx=0; idx<countTokens; ++idx) {
        const char *candidate=tokens[idx];
        if(candidate==NULL) {
            continue;
        }

        /* advance over the common prefix */
        int32_t pos=0;
        while(candidate[pos]!=0 && text[pos]==candidate[pos]) {
            ++pos;
        }
        if(candidate[pos]!=0) {
            continue;  /* mismatch before the end of the token */
        }

        /* the token must be delimited */
        const char *after=u_skipWhitespace(text+pos);
        switch(*after) {
        case ';':
        case 0:
        case '#':
        case '\r':
        case '\n':
            return idx;
        default:
            break;
        }
    }
    return -1;
}
/* parser for UnicodeData.txt ----------------------------------------------- */
/* general categories */
/*
 * Two-letter general category abbreviations, U_CHAR_CATEGORY_COUNT entries.
 * The array index of a name is what unicodeDataLineFn() stores as the
 * character's generalCategory after looking the name up with getTokenIndex().
 */
const char *const
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
"Cn",
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
"Mc", "Nd", "Nl", "No",
"Zs", "Zl", "Zp",
"Cc", "Cf", "Co", "Cs",
"Pd", "Ps", "Pe", "Pc", "Po",
"Sm", "Sc", "Sk", "So",
"Pi", "Pf"
};
/*
 * Ranges announced in UnicodeData.txt by a pair of "<Name, First>" and
 * "<Name, Last>" lines: first and last code point, the properties word
 * shared by the whole range, and the range name.
 * The entry at unicodeAreaIndex is the one currently being filled;
 * first==0xffffffff marks it as unused (no First line seen yet).
 */
static struct {
uint32_t first, last, props;
char name[80];
} unicodeAreas[32];
/* number of completed areas; also the index of the entry being filled */
static int32_t unicodeAreaIndex=0;
/*
 * Line callback for UnicodeData.txt.
 *
 * Parses the code point (field 0), general category (field 2) and the
 * three numeric fields (6: decimal digit, 7: digit, 8: numeric), builds the
 * properties word with makeProps() and either
 * - records the start or end of a "<Name, First>".."<Name, Last>" area in
 *   unicodeAreas[], or
 * - stores the word for the single code point with addProps().
 *
 * Any syntax or consistency error prints a message to stderr and exits.
 * The context and fieldCount parameters are not used.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    Props p;
    char *end;
    static uint32_t prevCode=0;  /* code point of the previous single-character line */
    uint32_t value;
    int32_t i;

    /* reset the properties */
    uprv_memset(&p, 0, sizeof(Props));

    /* get the character code, field 0 */
    p.code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    if(end<=fields[0][0] || end!=fields[0][1]) {
        fprintf(stderr, "genprops: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* get general category, field 2 */
    i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]);
    if(i>=0) {
        p.generalCategory=(uint8_t)i;
    } else {
        fprintf(stderr, "genprops: unknown general category \"%s\" at code 0x%lx\n",
                fields[2][0], (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* decimal digit value, field 6 */
    if(fields[6][0]<fields[6][1]) {
        value=(uint32_t)uprv_strtoul(fields[6][0], &end, 10);
        if(end!=fields[6][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 6 at code 0x%lx\n",
                    (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        p.numericValue=(int32_t)value;
        p.numericType=1;
    }

    /* digit value, field 7 */
    if(fields[7][0]<fields[7][1]) {
        value=(uint32_t)uprv_strtoul(fields[7][0], &end, 10);
        if(end!=fields[7][1] || value>0x7fff) {
            fprintf(stderr, "genprops: syntax error in field 7 at code 0x%lx\n",
                    (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        if(p.numericType==0) {
            p.numericValue=(int32_t)value;
            p.numericType=2;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6 & 7 different at code 0x%lx\n",
                    (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    /* numeric value, field 8 */
    if(fields[8][0]<fields[8][1]) {
        char *s=fields[8][0];
        UBool isNegative;

        /* get a possible minus sign */
        if(*s=='-') {
            isNegative=TRUE;
            ++s;
        } else {
            isNegative=FALSE;
        }

        value=(uint32_t)uprv_strtoul(s, &end, 10);
        if(value>0 && *end=='/') {
            /* field 8 may contain a fractional value, get the denominator */
            if(p.numericType>0) {
                fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                        (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }

            p.denominator=(uint32_t)uprv_strtoul(end+1, &end, 10);
            if(p.denominator==0) {
                fprintf(stderr, "genprops: denominator is 0 in field 8 at code 0x%lx\n",
                        (unsigned long)p.code);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
        }
        if(end!=fields[8][1] || value>0x7fffffff) {
            fprintf(stderr, "genprops: syntax error in field 8 at code 0x%lx\n",
                    (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }

        if(p.numericType==0) {
            if(isNegative) {
                p.numericValue=-(int32_t)value;
            } else {
                p.numericValue=(int32_t)value;
            }
            p.numericType=3;
        } else if((int32_t)value!=p.numericValue) {
            fprintf(stderr, "genprops error: numeric values in fields 6..8 different at code 0x%lx\n",
                    (unsigned long)p.code);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
    }

    value=makeProps(&p);

    if(*fields[1][0]=='<') {
        /* first or last entry of a Unicode area */
        size_t length=fields[1][1]-fields[1][0];

        if(length<9) {
            /* name too short for an area name */
        } else if(0==uprv_memcmp(", First>", fields[1][1]-8, 8)) {
            /* set the current area */
            if(unicodeAreas[unicodeAreaIndex].first==0xffffffff) {
                /* strip the leading '<' and the trailing ", First>" */
                length-=9;
                if(length>=sizeof(unicodeAreas[unicodeAreaIndex].name)) {
                    /* the name plus its terminating NUL would not fit into the buffer */
                    fprintf(stderr, "genprops: error - area name too long at code 0x%lx\n",
                            (unsigned long)p.code);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                unicodeAreas[unicodeAreaIndex].first=p.code;
                unicodeAreas[unicodeAreaIndex].props=value;
                uprv_memcpy(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length);
                unicodeAreas[unicodeAreaIndex].name[length]=0;
            } else {
                /* error: a previous area is incomplete */
                fprintf(stderr, "genprops: error - area \"%s\" is incomplete\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else if(0==uprv_memcmp(", Last>", fields[1][1]-7, 7)) {
            /* check that the current area matches, and complete it with the last code point */
            /* strip the leading '<' and the trailing ", Last>" */
            length-=8;
            if( length<sizeof(unicodeAreas[unicodeAreaIndex].name) && /* never compare past the buffer */
                unicodeAreas[unicodeAreaIndex].props==value &&
                0==uprv_memcmp(unicodeAreas[unicodeAreaIndex].name, fields[1][0]+1, length) &&
                unicodeAreas[unicodeAreaIndex].name[length]==0 &&
                unicodeAreas[unicodeAreaIndex].first<p.code
            ) {
                unicodeAreas[unicodeAreaIndex].last=p.code;
                if(beVerbose) {
                    printf("Unicode area U+%04lx..U+%04lx \"%s\"\n",
                           (unsigned long)unicodeAreas[unicodeAreaIndex].first,
                           (unsigned long)unicodeAreas[unicodeAreaIndex].last,
                           unicodeAreas[unicodeAreaIndex].name);
                }
                /* the next slot becomes the "unused" sentinel, so it must exist */
                if((size_t)(unicodeAreaIndex+1)>=sizeof(unicodeAreas)/sizeof(unicodeAreas[0])) {
                    fprintf(stderr, "genprops: error - too many Unicode areas (maximum %d)\n",
                            (int)(sizeof(unicodeAreas)/sizeof(unicodeAreas[0]))-1);
                    *pErrorCode=U_PARSE_ERROR;
                    exit(U_PARSE_ERROR);
                }
                unicodeAreas[++unicodeAreaIndex].first=0xffffffff;
            } else {
                /* error: different properties between first & last, different area name, first>=last */
                fprintf(stderr, "genprops: error - Last of area \"%s\" is incorrect\n", unicodeAreas[unicodeAreaIndex].name);
                *pErrorCode=U_PARSE_ERROR;
                exit(U_PARSE_ERROR);
            }
            return;
        } else {
            /* not an area name */
        }
    }

    /* check for non-character code points */
    if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) {
        fprintf(stderr, "genprops: error - properties for non-character code point U+%04lx\n",
                (unsigned long)p.code);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* check that the code points (p.code) are in ascending order */
    if(p.code<=prevCode && p.code>0) {
        fprintf(stderr, "genprops: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n",
                (unsigned long)p.code, (unsigned long)prevCode);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    prevCode=p.code;

    /* properties for a single code point */
    addProps(p.code, value);
}
/*
 * Set repeated properties for the areas: apply the properties word of each
 * completed entry in unicodeAreas[] to its whole first..last range.
 */
static void
repeatAreaProps() {
    uint32_t puaProps;
    int32_t i;
    UBool hasPlane15PUA, hasPlane16PUA;

    /*
     * UnicodeData.txt before 3.0.1 did not contain the PUAs on
     * planes 15 and 16.
     * If that is the case, then we add them here, using the properties
     * from the BMP PUA.
     */
    puaProps=0;
    hasPlane15PUA=hasPlane16PUA=FALSE;

    for(i=0; i<unicodeAreaIndex; ++i) {
        repeatProps(unicodeAreas[i].first,
                    unicodeAreas[i].last,
                    unicodeAreas[i].props);
        /* remember which private use areas were listed */
        if(unicodeAreas[i].first==0xe000) {
            puaProps=unicodeAreas[i].props;
        } else if(unicodeAreas[i].first==0xf0000) {
            hasPlane15PUA=TRUE;
        } else if(unicodeAreas[i].first==0x100000) {
            hasPlane16PUA=TRUE;
        }
    }

    /* only when an area starting at U+E000 was seen with a non-zero properties word */
    if(puaProps!=0) {
        if(!hasPlane15PUA) {
            repeatProps(0xf0000, 0xffffd, puaProps);
        }
        if(!hasPlane16PUA) {
            repeatProps(0x100000, 0x10fffd, puaProps);
        }
    }
}
static void
parseDB(const char *filename, UErrorCode *pErrorCode) {
char *fields[15][2];
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
}
/* while unicodeAreas[unicodeAreaIndex] is unused, set its first to a bogus value */
unicodeAreas[0].first=0xffffffff;
u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
if(unicodeAreas[unicodeAreaIndex].first!=0xffffffff) {
fprintf(stderr, "genprops: error - the last area \"%s\" from U+%04lx is incomplete\n",
unicodeAreas[unicodeAreaIndex].name,
(unsigned long)unicodeAreas[unicodeAreaIndex].first);
*pErrorCode=U_PARSE_ERROR;
exit(U_PARSE_ERROR);
}
repeatAreaProps();
if(U_FAILURE(*pErrorCode)) {
return;
}
}
/*
* Hey, Emacs, please set the following:
*

View File

@ -23,9 +23,8 @@
#include "propsvec.h"
#include "unewdata.h"
/* file definitions */
#define DATA_NAME "uprops"
#define DATA_TYPE "icu"
// TODO: remove
#define USE_NEW 1
class PropsWriter {
public:
@ -51,31 +50,10 @@ typedef struct {
/* global flags */
U_CFUNC UBool beVerbose;
U_CFUNC const char *const
genCategoryNames[];
/* prototypes */
U_CFUNC void
writeUCDFilename(char *basename, const char *filename, const char *suffix);
U_CFUNC UBool
isToken(const char *token, const char *s);
U_CFUNC int32_t
getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s);
U_CFUNC uint32_t
makeProps(Props *p);
U_CFUNC void
addProps(uint32_t c, uint32_t props);
U_CFUNC uint32_t
getProps(uint32_t c);
U_CFUNC void
repeatProps(uint32_t first, uint32_t last, uint32_t props);
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode);

View File

@ -23,7 +23,6 @@
#include "unicode/unistr.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie2.h"
#include "uprops.h"
#include "propsvec.h"
@ -70,31 +69,6 @@ scriptExtensionsLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
/*
 * Parse one semicolon-delimited UCD file with fieldCount fields per line.
 *
 * The file name is assembled by writeUCDFilename() from ucdFile and suffix
 * at basename; each line is passed to lineFn.
 * Does nothing if pErrorCode is NULL or already indicates a failure;
 * prints a message if parsing fails.
 */
static void
parseMultiFieldFile(char *filename, char *basename,
                    const char *ucdFile, const char *suffix,
                    int32_t fieldCount,
                    UParseLineFn *lineFn,
                    UErrorCode *pErrorCode) {
    if(pErrorCode!=NULL && !U_FAILURE(*pErrorCode)) {
        char *columns[20][2];  /* start/limit pointers for each field of a line */

        writeUCDFilename(basename, ucdFile, suffix);
        u_parseDelimitedFile(filename, ';', columns, fieldCount, lineFn, NULL, pErrorCode);

        if(U_FAILURE(*pErrorCode)) {
            fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
        }
    }
}
static void U_CALLCONV
numericLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
/* -------------------------------------------------------------------------- */
U_CFUNC void
@ -103,11 +77,6 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
basename=filename+uprv_strlen(filename);
/* process various UCD .txt files */
/* add Han numeric types & values */
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);
}
@ -245,140 +214,6 @@ scriptExtensionsLineFn(void *context,
} while(start<=end);
}
/* DerivedNumericValues.txt ------------------------------------------------- */
/*
 * Line callback for DerivedNumericValues.txt.
 *
 * Field 0 is a code point or range, field 1 the numeric value.
 * For each code point in the range the function compares the parsed value
 * with what is already stored in the properties trie and, where no numeric
 * value is stored yet, adds the new one with numeric type U_NT_NUMERIC.
 * Any syntax error or inconsistency prints a message to stderr and exits.
 * The context and fieldCount parameters are not used.
 */
static void U_CALLCONV
numericLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode) {
Props newProps={ 0 };
char *s, *numberLimit;
uint32_t start, end, value, oldProps32;
char c;
UBool isFraction;
/* get the code point range */
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
exit(*pErrorCode);
}
/*
* Ignore the
* # @missing: 0000..10FFFF; NaN
* line from Unicode 5.1's DerivedNumericValues.txt:
* The following code cannot parse "NaN", and we don't want to overwrite
* the numeric values for all characters after reading most
* from UnicodeData.txt already.
*/
if(start==0 && end==0x10ffff) {
return;
}
/* check if the numeric value is a fraction (this code does not handle any) */
/* a value counts as a fraction if any digit after the '.' is not '0' */
isFraction=FALSE;
s=uprv_strchr(fields[1][0], '.');
if(s!=NULL) {
numberLimit=s+1;
while('0'<=(c=*numberLimit++) && c<='9') {
if(c!='0') {
isFraction=TRUE;
break;
}
}
}
if(isFraction) {
/* the value is not parsed; fractions are only checked against stored data below */
value=0;
} else {
/* parse numeric value */
s=(char *)u_skipWhitespace(fields[1][0]);
/* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
/* large integers are encoded in a special way, see store.c */
/* value is the leading digit, exp the number of zeros that follow it */
uint8_t exp=0;
value=s[0]-'0';
numberLimit=s;
while(*(++numberLimit)=='0') {
++exp;
}
newProps.exponent=exp;
} else {
/* normal number parsing */
value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
}
/* require at least one digit, then either a '.' or only whitespace up to the field limit */
if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
}
/*
* Unicode 4.0.1 removes the third column that used to list the numeric type.
* Assume that either the data is the same as in UnicodeData.txt,
* or else that the numeric type is "numeric".
* This should work because we only expect to add numeric values for
* Han characters; for those, UnicodeData.txt lists only ranges without
* specific properties for single characters.
*/
/* set the new numeric value */
newProps.code=start;
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
/* the exponent may have been set above */
/* process every code point of the range; start is used as the loop variable */
for(; start<=end; ++start) {
uint32_t newProps32;
int32_t oldNtv;
oldProps32=getProps(start);
oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);
if(isFraction) {
if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
/* this code point was already listed with its numeric value in UnicodeData.txt */
continue;
} else {
fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
exit(U_PARSE_ERROR);
}
}
/*
* For simplicity, and because we only expect to set numeric values for Han characters,
* for now we only allow to set these values for Lo characters.
*/
if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
exit(U_PARSE_ERROR);
}
/* verify that we do not change an existing value (fractions were excluded above) */
if(oldNtv!=UPROPS_NTV_NONE) {
/* the code point already has a value stored */
/* re-encode the parsed value with the stored numeric type and compare the codes */
newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
newProps32=makeProps(&newProps);
if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
exit(U_PARSE_ERROR);
}
/* same value, continue */
} else {
/* the code point is getting a new numeric value */
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
newProps32=makeProps(&newProps);
if(beVerbose) {
printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
(int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
}
/* newProps has generalCategory 0, so OR the stored category back in */
addProps(start, newProps32|GET_CATEGORY(oldProps32));
}
}
}
class Props2Writer : public PropsWriter {
public:
Props2Writer(UErrorCode &errorCode);
@ -463,6 +298,7 @@ propToEnums[]={
{ UCHAR_SCRIPT, 0, 0, UPROPS_SCRIPT_MASK },
{ UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
{ UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
@ -501,11 +337,6 @@ Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErro
}
}
}
if(newValues.contains(UCHAR_DECOMPOSITION_TYPE)) {
upvec_setValue(pv, start, end,
2, (uint32_t)props.getIntProp(UCHAR_DECOMPOSITION_TYPE), UPROPS_DT_MASK,
&errorCode);
}
if(newValues.contains(UCHAR_AGE)) {
if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
char buffer[U_MAX_VERSION_STRING_LENGTH];
@ -520,7 +351,7 @@ Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErro
&errorCode);
}
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops error: unable to set values for %04lX..%04lX: %s\n",
fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
(long)start, (long)end, u_errorName(errorCode));
}
}