03f9f2307e
X-SVN-Rev: 31137
970 lines
34 KiB
C++
970 lines
34 KiB
C++
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 2002-2011, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
* file name: props2writer.cpp (was props2.cpp)
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2002feb24
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* Parse more Unicode Character Database files and store
|
|
* additional Unicode character properties in bit set vectors.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/uscript.h"
|
|
#include "cstring.h"
|
|
#include "cmemory.h"
|
|
#include "utrie2.h"
|
|
#include "uprops.h"
|
|
#include "propsvec.h"
|
|
#include "uparse.h"
|
|
#include "writesrc.h"
|
|
#include "genprops.h"
|
|
|
|
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the cast cannot bind to
 * operators surrounding the macro invocation.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
/* data --------------------------------------------------------------------- */
|
|
|
|
static UTrie2 *newTrie;
|
|
UPropsVectors *pv;
|
|
|
|
static UnicodeString *scriptExtensions;
|
|
|
|
/* miscellaneous ------------------------------------------------------------ */
|
|
|
|
static char *
|
|
trimTerminateField(char *s, char *limit) {
|
|
/* trim leading whitespace */
|
|
s=(char *)u_skipWhitespace(s);
|
|
|
|
/* trim trailing whitespace */
|
|
while(s<limit && U_IS_INV_WHITESPACE(*(limit-1))) {
|
|
--limit;
|
|
}
|
|
*limit=0;
|
|
|
|
return s;
|
|
}
|
|
|
|
static void
|
|
parseTwoFieldFile(char *filename, char *basename,
|
|
const char *ucdFile, const char *suffix,
|
|
UParseLineFn *lineFn,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[2][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, ucdFile, suffix);
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
|
|
}
|
|
}
|
|
|
|
static void U_CALLCONV
|
|
ageLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
scriptExtensionsLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void
|
|
parseMultiFieldFile(char *filename, char *basename,
|
|
const char *ucdFile, const char *suffix,
|
|
int32_t fieldCount,
|
|
UParseLineFn *lineFn,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[20][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, ucdFile, suffix);
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, fieldCount, lineFn, NULL, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
|
|
}
|
|
}
|
|
|
|
static void U_CALLCONV
|
|
numericLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
/* parse files with single enumerated properties ---------------------------- */
|
|
|
|
struct SingleEnum {
|
|
const char *ucdFile, *propName;
|
|
UProperty prop;
|
|
int32_t vecWord, vecShift;
|
|
uint32_t vecMask;
|
|
};
|
|
typedef struct SingleEnum SingleEnum;
|
|
|
|
static void
|
|
parseSingleEnumFile(char *filename, char *basename, const char *suffix,
|
|
const SingleEnum *sen,
|
|
UErrorCode *pErrorCode);
|
|
|
|
/*
 * Descriptors for the single-enum files.
 * Each one lists: UCD file, property name for messages, UProperty,
 * then vector word, shift and mask of the bit field.
 */

/* Scripts.txt -> vector word 0, no shift */
static const SingleEnum scriptSingleEnum={
    "Scripts", "script", UCHAR_SCRIPT,
    0, 0, UPROPS_SCRIPT_MASK
};

/* Blocks.txt -> vector word 0 */
static const SingleEnum blockSingleEnum={
    "Blocks", "block", UCHAR_BLOCK,
    0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
};

/* GraphemeBreakProperty.txt -> vector word 2 */
static const SingleEnum graphemeClusterBreakSingleEnum={
    "GraphemeBreakProperty", "Grapheme_Cluster_Break", UCHAR_GRAPHEME_CLUSTER_BREAK,
    2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
};

/* WordBreakProperty.txt -> vector word 2 */
static const SingleEnum wordBreakSingleEnum={
    "WordBreakProperty", "Word_Break", UCHAR_WORD_BREAK,
    2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
};

/* SentenceBreakProperty.txt -> vector word 2 */
static const SingleEnum sentenceBreakSingleEnum={
    "SentenceBreakProperty", "Sentence_Break", UCHAR_SENTENCE_BREAK,
    2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
};

/* LineBreak.txt -> vector word 2 */
static const SingleEnum lineBreakSingleEnum={
    "LineBreak", "line break", UCHAR_LINE_BREAK,
    2, UPROPS_LB_SHIFT, UPROPS_LB_MASK
};

/* EastAsianWidth.txt -> vector word 0 */
static const SingleEnum eawSingleEnum={
    "EastAsianWidth", "east asian width", UCHAR_EAST_ASIAN_WIDTH,
    0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
};
|
|
|
|
static void U_CALLCONV
|
|
singleEnumLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
const SingleEnum *sen;
|
|
char *s;
|
|
uint32_t start, end, uv;
|
|
int32_t value;
|
|
|
|
sen=(const SingleEnum *)context;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/* parse property alias */
|
|
s=trimTerminateField(fields[1][0], fields[1][1]);
|
|
value=u_getPropertyValueEnum(sen->prop, s);
|
|
if(value<0) {
|
|
if(sen->prop==UCHAR_BLOCK) {
|
|
if(isToken("Greek", s)) {
|
|
value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
|
|
} else if(isToken("Combining Marks for Symbols", s)) {
|
|
value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
|
|
} else if(isToken("Private Use", s)) {
|
|
value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
|
|
}
|
|
}
|
|
}
|
|
if(value<0) {
|
|
fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
|
|
sen->propName, sen->ucdFile, s);
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
uv=(uint32_t)(value<<sen->vecShift);
|
|
if((uv&sen->vecMask)!=uv) {
|
|
fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
|
|
sen->propName, (int)uv, s);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
|
|
if(start==0 && end==0x10ffff) {
|
|
/* Also set bits for initialValue and errorValue. */
|
|
end=UPVEC_MAX_CP;
|
|
}
|
|
upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set %s code: %s\n",
|
|
sen->propName, u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseSingleEnumFile(char *filename, char *basename, const char *suffix,
|
|
const SingleEnum *sen,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[2][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, sen->ucdFile, suffix);
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
|
|
}
|
|
}
|
|
|
|
/* parse files with multiple binary properties ------------------------------ */
|
|
|
|
/* One binary property: its name in the UCD file and where its bit is stored. */
typedef struct Binary {
    const char *propName;   /* property name as matched against field 1 */
    int32_t vecWord;        /* properties vector column holding the bit */
    int32_t vecShift;       /* bit number within that column */
} Binary;
|
|
|
|
struct Binaries {
|
|
const char *ucdFile;
|
|
const Binary *binaries;
|
|
int32_t binariesCount;
|
|
};
|
|
typedef struct Binaries Binaries;
|
|
|
|
static const Binary
|
|
propListNames[]={
|
|
{ "White_Space", 1, UPROPS_WHITE_SPACE },
|
|
{ "Dash", 1, UPROPS_DASH },
|
|
{ "Hyphen", 1, UPROPS_HYPHEN },
|
|
{ "Quotation_Mark", 1, UPROPS_QUOTATION_MARK },
|
|
{ "Terminal_Punctuation", 1, UPROPS_TERMINAL_PUNCTUATION },
|
|
{ "Hex_Digit", 1, UPROPS_HEX_DIGIT },
|
|
{ "ASCII_Hex_Digit", 1, UPROPS_ASCII_HEX_DIGIT },
|
|
{ "Ideographic", 1, UPROPS_IDEOGRAPHIC },
|
|
{ "Diacritic", 1, UPROPS_DIACRITIC },
|
|
{ "Extender", 1, UPROPS_EXTENDER },
|
|
{ "Noncharacter_Code_Point", 1, UPROPS_NONCHARACTER_CODE_POINT },
|
|
{ "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK },
|
|
{ "IDS_Binary_Operator", 1, UPROPS_IDS_BINARY_OPERATOR },
|
|
{ "IDS_Trinary_Operator", 1, UPROPS_IDS_TRINARY_OPERATOR },
|
|
{ "Radical", 1, UPROPS_RADICAL },
|
|
{ "Unified_Ideograph", 1, UPROPS_UNIFIED_IDEOGRAPH },
|
|
{ "Deprecated", 1, UPROPS_DEPRECATED },
|
|
{ "Logical_Order_Exception", 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
|
|
|
|
/* new properties in Unicode 4.0.1 */
|
|
{ "STerm", 1, UPROPS_S_TERM },
|
|
{ "Variation_Selector", 1, UPROPS_VARIATION_SELECTOR },
|
|
|
|
/* new properties in Unicode 4.1 */
|
|
{ "Pattern_Syntax", 1, UPROPS_PATTERN_SYNTAX },
|
|
{ "Pattern_White_Space", 1, UPROPS_PATTERN_WHITE_SPACE }
|
|
};
|
|
|
|
static const Binaries
|
|
propListBinaries={
|
|
"PropList", propListNames, LENGTHOF(propListNames)
|
|
};
|
|
|
|
static const Binary
|
|
derCorePropsNames[]={
|
|
{ "XID_Start", 1, UPROPS_XID_START },
|
|
{ "XID_Continue", 1, UPROPS_XID_CONTINUE },
|
|
|
|
/* before Unicode 4/ICU 2.6/format version 3.2, these used to be Other_XYZ from PropList.txt */
|
|
{ "Math", 1, UPROPS_MATH },
|
|
{ "Alphabetic", 1, UPROPS_ALPHABETIC },
|
|
{ "Grapheme_Extend", 1, UPROPS_GRAPHEME_EXTEND },
|
|
{ "Default_Ignorable_Code_Point", 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
|
|
|
|
/* new properties bits in ICU 2.6/format version 3.2 */
|
|
{ "ID_Start", 1, UPROPS_ID_START },
|
|
{ "ID_Continue", 1, UPROPS_ID_CONTINUE },
|
|
{ "Grapheme_Base", 1, UPROPS_GRAPHEME_BASE },
|
|
|
|
/*
|
|
* Unicode 5/ICU 3.6 moves Grapheme_Link from PropList.txt
|
|
* to DerivedCoreProperties.txt and deprecates it.
|
|
*/
|
|
{ "Grapheme_Link", 1, UPROPS_GRAPHEME_LINK }
|
|
};
|
|
|
|
static const Binaries
|
|
derCorePropsBinaries={
|
|
"DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames)
|
|
};
|
|
|
|
static char ignoredProps[100][64];
|
|
static int32_t ignoredPropsCount;
|
|
|
|
static void
|
|
addIgnoredProp(char *s, char *limit) {
|
|
int32_t i;
|
|
|
|
s=trimTerminateField(s, limit);
|
|
for(i=0; i<ignoredPropsCount; ++i) {
|
|
if(0==uprv_strcmp(ignoredProps[i], s)) {
|
|
return;
|
|
}
|
|
}
|
|
uprv_strcpy(ignoredProps[ignoredPropsCount++], s);
|
|
}
|
|
|
|
static void U_CALLCONV
|
|
binariesLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
const Binaries *bin;
|
|
char *s;
|
|
uint32_t start, end, uv;
|
|
int32_t i;
|
|
|
|
bin=(const Binaries *)context;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/* parse binary property name */
|
|
s=(char *)u_skipWhitespace(fields[1][0]);
|
|
for(i=0;; ++i) {
|
|
if(i==bin->binariesCount) {
|
|
/* ignore unrecognized properties */
|
|
if(beVerbose) {
|
|
addIgnoredProp(s, fields[1][1]);
|
|
}
|
|
return;
|
|
}
|
|
if(isToken(bin->binaries[i].propName, s)) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(bin->binaries[i].vecShift>=32) {
|
|
fprintf(stderr, "genprops error: shift value %d>=32 for %s %s\n",
|
|
(int)bin->binaries[i].vecShift, bin->ucdFile, bin->binaries[i].propName);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
uv=U_MASK(bin->binaries[i].vecShift);
|
|
|
|
if(start==0 && end==0x10ffff) {
|
|
/* Also set bits for initialValue and errorValue. */
|
|
end=UPVEC_MAX_CP;
|
|
}
|
|
upvec_setValue(pv, start, end, bin->binaries[i].vecWord, uv, uv, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set %s code: %s\n",
|
|
bin->binaries[i].propName, u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseBinariesFile(char *filename, char *basename, const char *suffix,
|
|
const Binaries *bin,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[2][2];
|
|
int32_t i;
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, bin->ucdFile, suffix);
|
|
|
|
ignoredPropsCount=0;
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode));
|
|
}
|
|
|
|
if(beVerbose) {
|
|
for(i=0; i<ignoredPropsCount; ++i) {
|
|
printf("genprops: ignoring property %s in %s.txt\n", ignoredProps[i], bin->ucdFile);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
/*
 * Creates the properties vectors (pv) and the empty Script_Extensions
 * string used by the functions in this file.
 * Terminates the program if either one cannot be created.
 */
U_CFUNC void
initAdditionalProperties() {
    UErrorCode errorCode=U_ZERO_ERROR;
    pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "error: upvec_open() failed - %s\n", u_errorName(errorCode));
        exit(errorCode);
    }

    scriptExtensions=new UnicodeString;
    /* Check the allocation, like createProps2Writer() does, before other code dereferences it. */
    if(scriptExtensions==NULL) {
        fprintf(stderr, "error: unable to allocate the Script_Extensions string - %s\n",
                u_errorName(U_MEMORY_ALLOCATION_ERROR));
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
}
|
|
|
|
/*
 * Releases the trie, the properties vectors and the Script_Extensions
 * string, and resets the globals so that no dangling pointers remain.
 */
U_CFUNC void
exitAdditionalProperties() {
    utrie2_close(newTrie);
    newTrie=NULL;

    upvec_close(pv);
    pv=NULL;

    delete scriptExtensions;
    scriptExtensions=NULL;
}
|
|
|
|
/*
 * Parses the additional UCD files into the properties vectors (pv)
 * and compacts the vectors into newTrie.
 *
 * filename holds the path prefix; each UCD file name is written at its end.
 * suffix is passed through to the file name writer.
 * Does nothing if pErrorCode is NULL.
 * Terminates the program if the East Asian Width defaults cannot be set
 * or if the trie cannot be built.
 */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    char *basename;

    /* The error code is written unconditionally below, so it must not be NULL. */
    if(pErrorCode==NULL) {
        return;
    }

    basename=filename+uprv_strlen(filename);

    /* process various UCD .txt files */

    /* add Han numeric types & values */
    parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);

    parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);

    parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);

    parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);

    parseBinariesFile(filename, basename, suffix, &propListBinaries, pErrorCode);

    parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, pErrorCode);

    parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);

    parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);

    parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);

    /*
     * LineBreak-4.0.0.txt:
     *  - All code points, assigned and unassigned, that are not listed
     *    explicitly are given the value "XX".
     *
     * XX==U_LB_UNKNOWN==0 - nothing to do
     */
    parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);

    /*
     * Preset East Asian Width defaults:
     *
     * http://www.unicode.org/reports/tr11/#Unassigned
     * 7.1 Unassigned and Private Use characters
     *
     * All unassigned characters are by default classified as non-East Asian neutral,
     * except for the range U+20000 to U+2FFFD,
     * since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
     * All Private use characters are by default classified as ambiguous,
     * since their definition depends on context.
     *
     * N for all ==0 - nothing to do
     * A for Private Use
     * W for plane 2
     */

    /*
     * Note: this reset discards any failure left by the parsers above
     * (they report failures on stderr but do not terminate),
     * so processing continues from here regardless.
     * NOTE(review): presumably deliberate; confirm before changing.
     */
    *pErrorCode=U_ZERO_ERROR;
    upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    /* parse EastAsianWidth.txt */
    parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);

    /* compact the vectors and build the trie that maps code points to vector rows */
    newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
                u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
}
|
|
|
|
/* DerivedAge.txt ----------------------------------------------------------- */
|
|
|
|
static void U_CALLCONV
|
|
ageLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *s, *numberLimit;
|
|
uint32_t value, start, end, version;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/* ignore "unassigned" (the default is already set to 0.0) */
|
|
s=(char *)u_skipWhitespace(fields[1][0]);
|
|
if(0==uprv_strncmp(s, "unassigned", 10) || 0==uprv_strncmp(s, "Unassigned", 10)) {
|
|
return;
|
|
}
|
|
|
|
/* parse version number */
|
|
value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
|
|
if(s==numberLimit || value==0 || value>15 || (*numberLimit!='.' && *numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
version=value<<4;
|
|
|
|
/* parse minor version number */
|
|
if(*numberLimit=='.') {
|
|
s=(char *)u_skipWhitespace(numberLimit+1);
|
|
value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
|
|
if(s==numberLimit || value>15 || (*numberLimit!=' ' && *numberLimit!='\t' && *numberLimit!=0)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
version|=value;
|
|
}
|
|
|
|
if(start==0 && end==0x10ffff) {
|
|
/* Also set bits for initialValue and errorValue. */
|
|
end=UPVEC_MAX_CP;
|
|
}
|
|
upvec_setValue(pv, start, end, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* ScriptExtensions.txt ----------------------------------------------------- */
|
|
|
|
static void U_CALLCONV
|
|
scriptExtensionsLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, end;
|
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/* ignore "<script>" on the @missing line */
|
|
if(*u_skipWhitespace(fields[1][0])=='<') {
|
|
return;
|
|
}
|
|
|
|
/* parse list of script codes */
|
|
UnicodeString codes; // vector of 16-bit UScriptCode values
|
|
char *s=fields[1][0];
|
|
for(;;) {
|
|
// skip whitespace before each token
|
|
s=(char *)u_skipWhitespace(s);
|
|
if(*s==0 || *s==';') {
|
|
break;
|
|
}
|
|
// skip non-whitespace, non-terminator characters to find the token limit
|
|
char *limit=s;
|
|
char c;
|
|
do {
|
|
c=*++limit;
|
|
} while(!U_IS_INV_WHITESPACE(c) && c!=0 && c!=';');
|
|
// NUL-terminated this token
|
|
*limit=0;
|
|
// convert the token (script property value alias) into a UScriptCode value
|
|
int32_t value=u_getPropertyValueEnum(UCHAR_SCRIPT, s);
|
|
if(value<0) {
|
|
fprintf(stderr, "genprops: syntax error in ScriptExtensions.txt field 1 at %s\n", s);
|
|
exit(U_INVALID_FORMAT_ERROR);
|
|
}
|
|
// Insertion sort into the list of script codes.
|
|
for(int32_t i=0;; ++i) {
|
|
if(i<codes.length()) {
|
|
if(value<codes[i]) {
|
|
codes.insert(i, (UChar)value);
|
|
break;
|
|
} else if(value==codes[i]) {
|
|
fprintf(stderr,
|
|
"genprops: duplicate script code in ScriptExtensions.txt field 1 at %s "
|
|
"for U+%04lx..U+%04lx\n",
|
|
s, (long)start, (long)end);
|
|
exit(U_INVALID_FORMAT_ERROR);
|
|
}
|
|
// continue while value>codes[i]
|
|
} else {
|
|
codes.append((UChar)value);
|
|
break;
|
|
}
|
|
}
|
|
if(c==0 || c==';') {
|
|
// the token ended at a terminator
|
|
break;
|
|
} else {
|
|
// the token ended at U_IS_INV_WHITESPACE(c), continue after c
|
|
s=limit+1;
|
|
}
|
|
}
|
|
int32_t length=codes.length();
|
|
if(length==0) {
|
|
fprintf(stderr,
|
|
"genprops: missing values in ScriptExtensions.txt field 1 "
|
|
"for U+%04lx..U+%04lx\n",
|
|
(long)start, (long)end);
|
|
exit(U_INVALID_FORMAT_ERROR);
|
|
}
|
|
// Set bit 15 on the last script code, for termination.
|
|
codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
|
|
// Find this list of codes in the Script_Extensions data so far, or add this list.
|
|
int32_t index=scriptExtensions->indexOf(codes);
|
|
if(index<0) {
|
|
index=scriptExtensions->length();
|
|
scriptExtensions->append(codes);
|
|
}
|
|
// Modify the Script data for each of the start..end code points
|
|
// to include the Script_Extensions index.
|
|
do {
|
|
uint32_t scriptX=upvec_getValue(pv, (UChar32)start, 0)&UPROPS_SCRIPT_X_MASK;
|
|
// Find the next code point that has a different script value.
|
|
// We want to add the Script_Extensions index to the code point range start..next-1.
|
|
UChar32 next;
|
|
for(next=(UChar32)start+1;
|
|
next<=(UChar32)end && scriptX==(upvec_getValue(pv, next, 0)&UPROPS_SCRIPT_X_MASK);
|
|
++next) {}
|
|
if(scriptX>=UPROPS_SCRIPT_X_WITH_COMMON) {
|
|
fprintf(stderr,
|
|
"genprops: ScriptExtensions.txt has values for U+%04lx..U+%04lx "
|
|
"which overlaps with a range including U+%04lx..U+%04lx\n",
|
|
(long)start, (long)end, (long)start, (long)(next-1));
|
|
exit(U_INVALID_FORMAT_ERROR);
|
|
}
|
|
// Encode the (Script, Script_Extensions index) pair.
|
|
if(scriptX==USCRIPT_COMMON) {
|
|
scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index;
|
|
} else if(scriptX==USCRIPT_INHERITED) {
|
|
scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index;
|
|
} else {
|
|
// Store an additional pair of 16-bit units for an unusual main Script code
|
|
// together with the Script_Extensions index.
|
|
UnicodeString codeIndexPair;
|
|
codeIndexPair.append((UChar)scriptX).append((UChar)index);
|
|
index=scriptExtensions->indexOf(codeIndexPair);
|
|
if(index<0) {
|
|
index=scriptExtensions->length();
|
|
scriptExtensions->append(codeIndexPair);
|
|
}
|
|
scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index;
|
|
}
|
|
if(index>UPROPS_SCRIPT_MASK) {
|
|
fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n");
|
|
exit(U_BUFFER_OVERFLOW_ERROR);
|
|
}
|
|
// Write the (Script, Script_Extensions index) pair into
|
|
// the properties vector for start..next-1.
|
|
upvec_setValue(pv, (UChar32)start, (UChar32)(next-1),
|
|
0, scriptX, UPROPS_SCRIPT_X_MASK, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set Script_Extensions: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
start=next;
|
|
} while(start<=end);
|
|
}
|
|
|
|
/* DerivedNumericValues.txt ------------------------------------------------- */
|
|
|
|
static void U_CALLCONV
|
|
numericLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
Props newProps={ 0 };
|
|
char *s, *numberLimit;
|
|
uint32_t start, end, value, oldProps32;
|
|
char c;
|
|
UBool isFraction;
|
|
|
|
/* get the code point range */
|
|
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
/*
|
|
* Ignore the
|
|
* # @missing: 0000..10FFFF; NaN
|
|
* line from Unicode 5.1's DerivedNumericValues.txt:
|
|
* The following code cannot parse "NaN", and we don't want to overwrite
|
|
* the numeric values for all characters after reading most
|
|
* from UnicodeData.txt already.
|
|
*/
|
|
if(start==0 && end==0x10ffff) {
|
|
return;
|
|
}
|
|
|
|
/* check if the numeric value is a fraction (this code does not handle any) */
|
|
isFraction=FALSE;
|
|
s=uprv_strchr(fields[1][0], '.');
|
|
if(s!=NULL) {
|
|
numberLimit=s+1;
|
|
while('0'<=(c=*numberLimit++) && c<='9') {
|
|
if(c!='0') {
|
|
isFraction=TRUE;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(isFraction) {
|
|
value=0;
|
|
} else {
|
|
/* parse numeric value */
|
|
s=(char *)u_skipWhitespace(fields[1][0]);
|
|
|
|
/* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
|
|
if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
|
|
/* large integers are encoded in a special way, see store.c */
|
|
uint8_t exp=0;
|
|
|
|
value=s[0]-'0';
|
|
numberLimit=s;
|
|
while(*(++numberLimit)=='0') {
|
|
++exp;
|
|
}
|
|
newProps.exponent=exp;
|
|
} else {
|
|
/* normal number parsing */
|
|
value=(uint32_t)uprv_strtoul(s, &numberLimit, 10);
|
|
}
|
|
if(numberLimit<=s || (*numberLimit!='.' && u_skipWhitespace(numberLimit)!=fields[1][1]) || value>=0x80000000) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedNumericValues.txt field 1 at %s\n", fields[0][0]);
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Unicode 4.0.1 removes the third column that used to list the numeric type.
|
|
* Assume that either the data is the same as in UnicodeData.txt,
|
|
* or else that the numeric type is "numeric".
|
|
* This should work because we only expect to add numeric values for
|
|
* Han characters; for those, UnicodeData.txt lists only ranges without
|
|
* specific properties for single characters.
|
|
*/
|
|
|
|
/* set the new numeric value */
|
|
newProps.code=start;
|
|
newProps.numericValue=(int32_t)value; /* newly parsed numeric value */
|
|
/* the exponent may have been set above */
|
|
|
|
for(; start<=end; ++start) {
|
|
uint32_t newProps32;
|
|
int32_t oldNtv;
|
|
oldProps32=getProps(start);
|
|
oldNtv=(int32_t)GET_NUMERIC_TYPE_VALUE(oldProps32);
|
|
|
|
if(isFraction) {
|
|
if(UPROPS_NTV_FRACTION_START<=oldNtv && oldNtv<UPROPS_NTV_LARGE_START) {
|
|
/* this code point was already listed with its numeric value in UnicodeData.txt */
|
|
continue;
|
|
} else {
|
|
fprintf(stderr, "genprops: not prepared for new fractions in DerivedNumericValues.txt field 1 at %s\n", fields[1][0]);
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* For simplicity, and because we only expect to set numeric values for Han characters,
|
|
* for now we only allow to set these values for Lo characters.
|
|
*/
|
|
if(oldNtv==UPROPS_NTV_NONE && GET_CATEGORY(oldProps32)!=U_OTHER_LETTER) {
|
|
fprintf(stderr, "genprops error: new numeric value for a character other than Lo in DerivedNumericValues.txt at %s\n", fields[0][0]);
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* verify that we do not change an existing value (fractions were excluded above) */
|
|
if(oldNtv!=UPROPS_NTV_NONE) {
|
|
/* the code point already has a value stored */
|
|
newProps.numericType=UPROPS_NTV_GET_TYPE(oldNtv);
|
|
newProps32=makeProps(&newProps);
|
|
if(oldNtv!=GET_NUMERIC_TYPE_VALUE(newProps32)) {
|
|
fprintf(stderr, "genprops error: new numeric value differs from old one for U+%04lx\n", (long)start);
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
/* same value, continue */
|
|
} else {
|
|
/* the code point is getting a new numeric value */
|
|
newProps.numericType=(uint8_t)U_NT_NUMERIC; /* assumed numeric type, see Unicode 4.0.1 comment */
|
|
newProps32=makeProps(&newProps);
|
|
if(beVerbose) {
|
|
printf("adding U+%04x numeric type %d encoded-numeric-type-value 0x%03x from %s\n",
|
|
(int)start, U_NT_NUMERIC, (int)GET_NUMERIC_TYPE_VALUE(newProps32), fields[0][0]);
|
|
}
|
|
|
|
addProps(start, newProps32|GET_CATEGORY(oldProps32));
|
|
}
|
|
}
|
|
}
|
|
|
|
/* data serialization ------------------------------------------------------- */
|
|
|
|
/*
 * Serializes the additional properties: the trie, the properties vectors
 * and the Script_Extensions data, in this order.
 *
 * The trie is serialized into p (up to capacity bytes) and the
 * corresponding entries of indexes are set.
 * If p is not NULL and the total size fits into capacity, then either
 * source code is written to f (when f is not NULL), or the vectors and
 * the Script_Extensions data are copied into p after the trie.
 *
 * Returns the total size in bytes of the additional properties data.
 * Terminates the program if the trie cannot be serialized.
 */
U_CFUNC int32_t
writeAdditionalData(FILE *f, uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
    int32_t pvRows;
    const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
    const int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;

    UErrorCode errorCode=U_ZERO_ERROR;
    const int32_t trieLength=utrie2_serialize(newTrie, p, capacity, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr,
                "genprops error: utrie2_freeze(additional properties)+utrie2_serialize() failed: %s\n",
                u_errorName(errorCode));
        exit(errorCode);
    }

    /* round up scriptExtensions to multiple of 4 bytes */
    if(scriptExtensions->length()&1) {
        scriptExtensions->append((UChar)0);
    }

    /* set indexes: each part starts where the previous one ends, counted in 32-bit units */
    const int32_t trieStart=indexes[UPROPS_ADDITIONAL_TRIE_INDEX];
    const int32_t vectorsStart=trieStart+trieLength/4;
    const int32_t scxStart=vectorsStart+pvCount;
    const int32_t dataTop=scxStart+scriptExtensions->length()/2;

    indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=vectorsStart;
    indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
    indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=scxStart;
    indexes[UPROPS_RESERVED_INDEX_7]=dataTop;
    indexes[UPROPS_RESERVED_INDEX_8]=dataTop;
    indexes[UPROPS_DATA_TOP_INDEX]=dataTop;

    indexes[UPROPS_MAX_VALUES_INDEX]=
        (((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
        (((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
        (((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
    indexes[UPROPS_MAX_VALUES_2_INDEX]=
        (((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
        (((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
        (((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
        (((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
        ((int32_t)U_DT_COUNT-1);

    const int32_t additionalPropsSize=4*(dataTop-trieStart);

    /* without an output buffer, or if the data does not fit, only report the size */
    if(p==NULL || additionalPropsSize>capacity) {
        return additionalPropsSize;
    }

    if(beVerbose) {
        printf("size in bytes of additional props trie:%5u\n", (int)trieLength);
    }

    if(f!=NULL) {
        /* write the data as source code */
        usrc_writeUTrie2Arrays(f,
            "static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
            newTrie,
            "\n};\n\n");
        usrc_writeUTrie2Struct(f,
            "static const UTrie2 propsVectorsTrie={\n",
            newTrie, "propsVectorsTrie_index", NULL,
            "};\n\n");

        usrc_writeArray(f,
            "static const uint32_t propsVectors[%ld]={\n",
            pvArray, 32, pvCount,
            "};\n\n");
        fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
        fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]);

        usrc_writeArray(f,
            "static const uint16_t scriptExtensions[%ld]={\n",
            scriptExtensions->getBuffer(), 16, scriptExtensions->length(),
            "};\n\n");
    } else {
        /* append the vectors and the Script_Extensions data after the serialized trie */
        uint8_t *dest=p+trieLength;
        int32_t byteCount=pvCount*4;
        uprv_memcpy(dest, pvArray, byteCount);

        dest+=byteCount;
        byteCount=scriptExtensions->length()*2;
        uprv_memcpy(dest, scriptExtensions->getBuffer(), byteCount);
    }

    if(beVerbose) {
        printf("number of additional props vectors:    %5u\n", (int)pvRows);
        printf("number of 32-bit words per vector:     %5u\n", UPROPS_VECTOR_WORDS);
        printf("number of 16-bit scriptExtensions:     %5u\n", (int)scriptExtensions->length());
    }

    return additionalPropsSize;
}
|
|
|
|
class Props2Writer : public PropsWriter {
|
|
public:
|
|
virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
|
|
};
|
|
|
|
void
|
|
Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
if(newValues.contains(UCHAR_DECOMPOSITION_TYPE)) {
|
|
upvec_setValue(pv, props.start, props.end,
|
|
2, (uint32_t)props.getIntProp(UCHAR_DECOMPOSITION_TYPE), UPROPS_DT_MASK,
|
|
&errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set decomposition type: %s\n",
|
|
u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
 * Creates a new Props2Writer.
 * Returns NULL if errorCode already indicates a failure.
 * Sets U_MEMORY_ALLOCATION_ERROR (and returns NULL) if the allocation fails.
 */
PropsWriter *
createProps2Writer(UErrorCode &errorCode) {
    PropsWriter *writer=NULL;
    if(!U_FAILURE(errorCode)) {
        writer=new Props2Writer();
        if(writer==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return writer;
}
|