ICU-8972 genprops: use ppucd.txt for enumerated properties
X-SVN-Rev: 31143
This commit is contained in:
parent
008e89c02f
commit
2cac672e6e
@ -45,20 +45,6 @@ static UnicodeString *scriptExtensions;
|
|||||||
|
|
||||||
/* miscellaneous ------------------------------------------------------------ */
|
/* miscellaneous ------------------------------------------------------------ */
|
||||||
|
|
||||||
/*
 * Trims a field in place and NUL-terminates it.
 * s is the start of the field; limit is treated as its exclusive end.
 * Leading whitespace is skipped with u_skipWhitespace(), limit is moved back
 * over trailing characters for which U_IS_INV_WHITESPACE is true,
 * and a 0 byte is written at the resulting limit (the caller's buffer is modified).
 * Returns the pointer to the first character after the leading whitespace.
 */
static char *
|
|
||||||
trimTerminateField(char *s, char *limit) {
|
|
||||||
/* trim leading whitespace */
|
|
||||||
s=(char *)u_skipWhitespace(s);
|
|
||||||
|
|
||||||
/* trim trailing whitespace; the s<limit test stops the scan at the trimmed start */
|
|
||||||
while(s<limit && U_IS_INV_WHITESPACE(*(limit-1))) {
|
|
||||||
--limit;
|
|
||||||
}
|
|
||||||
/* terminate the field at the (possibly moved-back) limit */
*limit=0;
|
|
||||||
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
parseTwoFieldFile(char *filename, char *basename,
|
parseTwoFieldFile(char *filename, char *basename,
|
||||||
const char *ucdFile, const char *suffix,
|
const char *ucdFile, const char *suffix,
|
||||||
@ -108,137 +94,6 @@ numericLineFn(void *context,
|
|||||||
char *fields[][2], int32_t fieldCount,
|
char *fields[][2], int32_t fieldCount,
|
||||||
UErrorCode *pErrorCode);
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
/* parse files with single enumerated properties ---------------------------- */
|
|
||||||
|
|
||||||
/*
 * Describes one enumerated property that is parsed from its own UCD file
 * and stored as a bit field in the properties vectors (see singleEnumLineFn).
 */
struct SingleEnum {
|
|
||||||
/* ucdFile: UCD file name without ".txt"; propName: property name used in error messages */
const char *ucdFile, *propName;
|
|
||||||
/* property whose value names are looked up with u_getPropertyValueEnum() */
UProperty prop;
|
|
||||||
/* vecWord: column passed to upvec_setValue(); vecShift: left shift applied to the value */
int32_t vecWord, vecShift;
|
|
||||||
/* mask passed to upvec_setValue(); the shifted value must fit inside it */
uint32_t vecMask;
|
|
||||||
};
|
|
||||||
typedef struct SingleEnum SingleEnum;
|
|
||||||
|
|
||||||
/*
 * Forward declaration: parses the UCD file described by sen.
 * The definition follows singleEnumLineFn().
 */
static void
|
|
||||||
parseSingleEnumFile(char *filename, char *basename, const char *suffix,
|
|
||||||
const SingleEnum *sen,
|
|
||||||
UErrorCode *pErrorCode);
|
|
||||||
|
|
||||||
/* Script (UCHAR_SCRIPT): UCD file "Scripts"; vector word 0, no shift, UPROPS_SCRIPT_MASK. */
static const SingleEnum scriptSingleEnum={
|
|
||||||
"Scripts", "script",
|
|
||||||
UCHAR_SCRIPT,
|
|
||||||
0, 0, UPROPS_SCRIPT_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Block (UCHAR_BLOCK): UCD file "Blocks"; vector word 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK. */
static const SingleEnum blockSingleEnum={
|
|
||||||
"Blocks", "block",
|
|
||||||
UCHAR_BLOCK,
|
|
||||||
0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
 * Grapheme_Cluster_Break (UCHAR_GRAPHEME_CLUSTER_BREAK): UCD file "GraphemeBreakProperty";
 * vector word 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK.
 */
static const SingleEnum graphemeClusterBreakSingleEnum={
|
|
||||||
"GraphemeBreakProperty", "Grapheme_Cluster_Break",
|
|
||||||
UCHAR_GRAPHEME_CLUSTER_BREAK,
|
|
||||||
2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Word_Break (UCHAR_WORD_BREAK): UCD file "WordBreakProperty"; vector word 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK. */
static const SingleEnum wordBreakSingleEnum={
|
|
||||||
"WordBreakProperty", "Word_Break",
|
|
||||||
UCHAR_WORD_BREAK,
|
|
||||||
2, UPROPS_WB_SHIFT, UPROPS_WB_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
 * Sentence_Break (UCHAR_SENTENCE_BREAK): UCD file "SentenceBreakProperty";
 * vector word 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK.
 */
static const SingleEnum sentenceBreakSingleEnum={
|
|
||||||
"SentenceBreakProperty", "Sentence_Break",
|
|
||||||
UCHAR_SENTENCE_BREAK,
|
|
||||||
2, UPROPS_SB_SHIFT, UPROPS_SB_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Line_Break (UCHAR_LINE_BREAK): UCD file "LineBreak"; vector word 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK. */
static const SingleEnum lineBreakSingleEnum={
|
|
||||||
"LineBreak", "line break",
|
|
||||||
UCHAR_LINE_BREAK,
|
|
||||||
2, UPROPS_LB_SHIFT, UPROPS_LB_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/* East_Asian_Width (UCHAR_EAST_ASIAN_WIDTH): UCD file "EastAsianWidth"; vector word 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK. */
static const SingleEnum eawSingleEnum={
|
|
||||||
"EastAsianWidth", "east asian width",
|
|
||||||
UCHAR_EAST_ASIAN_WIDTH,
|
|
||||||
0, UPROPS_EA_SHIFT, UPROPS_EA_MASK
|
|
||||||
};
|
|
||||||
|
|
||||||
/*
 * Per-line callback that parseSingleEnumFile() hands to u_parseDelimitedFile().
 * context is the SingleEnum describing the property being parsed.
 * fields[0] holds a code point or code point range, fields[1] a property value name.
 * fieldCount is not used.
 * The value name is converted to its enum constant, shifted into position and
 * stored for the whole range with upvec_setValue() on the file-level pv.
 * Any syntax error, unknown name, mask overflow or upvec failure prints a
 * message to stderr and terminates the process with exit().
 */
static void U_CALLCONV
|
|
||||||
singleEnumLineFn(void *context,
|
|
||||||
char *fields[][2], int32_t fieldCount,
|
|
||||||
UErrorCode *pErrorCode) {
|
|
||||||
const SingleEnum *sen;
|
|
||||||
char *s;
|
|
||||||
uint32_t start, end, uv;
|
|
||||||
int32_t value;
|
|
||||||
|
|
||||||
sen=(const SingleEnum *)context;
|
|
||||||
|
|
||||||
/* field 0: code point or range */
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
|
||||||
if(U_FAILURE(*pErrorCode)) {
|
|
||||||
fprintf(stderr, "genprops: syntax error in %s.txt field 0 at %s\n", sen->ucdFile, fields[0][0]);
|
|
||||||
exit(*pErrorCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* parse property alias */
|
|
||||||
s=trimTerminateField(fields[1][0], fields[1][1]);
|
|
||||||
value=u_getPropertyValueEnum(sen->prop, s);
|
|
||||||
if(value<0) {
|
|
||||||
/* name not recognized: for blocks, also accept the older block names below */
if(sen->prop==UCHAR_BLOCK) {
|
|
||||||
if(isToken("Greek", s)) {
|
|
||||||
value=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
|
|
||||||
} else if(isToken("Combining Marks for Symbols", s)) {
|
|
||||||
value=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
|
|
||||||
} else if(isToken("Private Use", s)) {
|
|
||||||
value=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
/* still unknown after the fallback: fatal */
if(value<0) {
|
|
||||||
fprintf(stderr, "genprops error: unknown %s name in %s.txt field 1 at %s\n",
|
|
||||||
sen->propName, sen->ucdFile, s);
|
|
||||||
exit(U_PARSE_ERROR);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* move the value into its bit field and verify that no bits fall outside the mask */
uv=(uint32_t)(value<<sen->vecShift);
|
|
||||||
if((uv&sen->vecMask)!=uv) {
|
|
||||||
fprintf(stderr, "genprops error: %s value overflow (0x%x) at %s\n",
|
|
||||||
sen->propName, (int)uv, s);
|
|
||||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* a line covering all of U+0000..U+10FFFF is extended up to UPVEC_MAX_CP */
if(start==0 && end==0x10ffff) {
|
|
||||||
/* Also set bits for initialValue and errorValue. */
|
|
||||||
end=UPVEC_MAX_CP;
|
|
||||||
}
|
|
||||||
upvec_setValue(pv, start, end, sen->vecWord, uv, sen->vecMask, pErrorCode);
|
|
||||||
if(U_FAILURE(*pErrorCode)) {
|
|
||||||
fprintf(stderr, "genprops error: unable to set %s code: %s\n",
|
|
||||||
sen->propName, u_errorName(*pErrorCode));
|
|
||||||
exit(*pErrorCode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
 * Parses the UCD file described by sen as a ';'-delimited file with two fields,
 * calling singleEnumLineFn() for each line with sen as the context.
 * Returns immediately if pErrorCode is NULL or already holds a failure.
 * If parsing fails, a message is printed to stderr and the function returns
 * with the failure left in *pErrorCode (it does not exit here).
 */
static void
|
|
||||||
parseSingleEnumFile(char *filename, char *basename, const char *suffix,
|
|
||||||
const SingleEnum *sen,
|
|
||||||
UErrorCode *pErrorCode) {
|
|
||||||
char *fields[2][2];
|
|
||||||
|
|
||||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* NOTE(review): presumably basename points into the filename buffer, so this completes the path that is parsed next - confirm */
writeUCDFilename(basename, sen->ucdFile, suffix);
|
|
||||||
|
|
||||||
u_parseDelimitedFile(filename, ';', fields, 2, singleEnumLineFn, (void *)sen, pErrorCode);
|
|
||||||
if(U_FAILURE(*pErrorCode)) {
|
|
||||||
fprintf(stderr, "error parsing %s.txt: %s\n", sen->ucdFile, u_errorName(*pErrorCode));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* -------------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------------- */
|
||||||
|
|
||||||
static void
|
static void
|
||||||
@ -271,56 +126,8 @@ generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pEr
|
|||||||
/* add Han numeric types & values */
|
/* add Han numeric types & values */
|
||||||
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
|
parseMultiFieldFile(filename, basename, "DerivedNumericValues", suffix, 2, numericLineFn, pErrorCode);
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &scriptSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);
|
parseTwoFieldFile(filename, basename, "ScriptExtensions", suffix, scriptExtensionsLineFn, pErrorCode);
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &blockSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &graphemeClusterBreakSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &wordBreakSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &sentenceBreakSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* LineBreak-4.0.0.txt:
|
|
||||||
* - All code points, assigned and unassigned, that are not listed
|
|
||||||
* explicitly are given the value "XX".
|
|
||||||
*
|
|
||||||
* XX==U_LB_UNKNOWN==0 - nothing to do
|
|
||||||
*/
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &lineBreakSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Preset East Asian Width defaults:
|
|
||||||
*
|
|
||||||
* http://www.unicode.org/reports/tr11/#Unassigned
|
|
||||||
* 7.1 Unassigned and Private Use characters
|
|
||||||
*
|
|
||||||
* All unassigned characters are by default classified as non-East Asian neutral,
|
|
||||||
* except for the range U+20000 to U+2FFFD,
|
|
||||||
* since all code positions from U+20000 to U+2FFFD are intended for CJK ideographs (W).
|
|
||||||
* All Private use characters are by default classified as ambiguous,
|
|
||||||
* since their definition depends on context.
|
|
||||||
*
|
|
||||||
* N for all ==0 - nothing to do
|
|
||||||
* A for Private Use
|
|
||||||
* W for plane 2
|
|
||||||
*/
|
|
||||||
*pErrorCode=U_ZERO_ERROR;
|
|
||||||
upvec_setValue(pv, 0xe000, 0xf8ff, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
|
|
||||||
upvec_setValue(pv, 0xf0000, 0xffffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
|
|
||||||
upvec_setValue(pv, 0x100000, 0x10fffd, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
|
|
||||||
upvec_setValue(pv, 0x20000, 0x2fffd, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode);
|
|
||||||
if(U_FAILURE(*pErrorCode)) {
|
|
||||||
fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
|
|
||||||
exit(*pErrorCode);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* parse EastAsianWidth.txt */
|
|
||||||
parseSingleEnumFile(filename, basename, suffix, &eawSingleEnum, pErrorCode);
|
|
||||||
|
|
||||||
newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode);
|
newTrie=upvec_compactToUTrie2WithRowIndexes(pv, pErrorCode);
|
||||||
// TODO: remove
|
// TODO: remove
|
||||||
#if 0
|
#if 0
|
||||||
@ -736,7 +543,8 @@ propToBinaries[]={
|
|||||||
// Note: The Noncharacter_Code_Point property is probably stable enough
|
// Note: The Noncharacter_Code_Point property is probably stable enough
|
||||||
// so that it could be hardcoded.
|
// so that it could be hardcoded.
|
||||||
{ UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
|
{ UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
|
||||||
// Note: The Grapheme_Link property is deprecated since Unicode 5.0.
|
// Note: The Grapheme_Link property is deprecated since Unicode 5.0
|
||||||
|
// because it is a "Duplication of ccc=9" (UAX #44).
|
||||||
{ UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
|
{ UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
|
||||||
{ UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
|
{ UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
|
||||||
{ UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
|
{ UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
|
||||||
@ -761,6 +569,23 @@ propToBinaries[]={
|
|||||||
{ UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
|
{ UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Maps one enumerated property to the bit field that holds its value
// in the properties vectors (used by the propToEnums[] table).
struct PropToEnum {
|
||||||
|
int32_t prop; // UProperty
|
||||||
|
// vecWord: column passed to upvec_setValue(); vecShift: left shift applied to the property value
int32_t vecWord, vecShift;
|
||||||
|
// mask passed to upvec_setValue(); the shifted value is asserted to fit inside it
uint32_t vecMask;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Table of enumerated properties and their bit fields:
// { property, vector word, shift, mask }.
// Props2Writer::setProps() loops over this table and stores each property
// that is contained in newValues.
static const PropToEnum
|
||||||
|
propToEnums[]={
|
||||||
|
{ UCHAR_SCRIPT, 0, 0, UPROPS_SCRIPT_MASK },
|
||||||
|
{ UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
|
||||||
|
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
|
||||||
|
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
|
||||||
|
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
|
||||||
|
{ UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
|
||||||
|
{ UCHAR_LINE_BREAK, 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
|
||||||
|
};
|
||||||
|
|
||||||
void
|
void
|
||||||
Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
|
Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErrorCode &errorCode) {
|
||||||
if(U_FAILURE(errorCode)) { return; }
|
if(U_FAILURE(errorCode)) { return; }
|
||||||
@ -781,6 +606,18 @@ Props2Writer::setProps(const UniProps &props, const UnicodeSet &newValues, UErro
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
|
||||||
|
for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
|
||||||
|
const PropToEnum &p2e=propToEnums[i];
|
||||||
|
U_ASSERT(p2e.vecShift<32);
|
||||||
|
if(newValues.contains(p2e.prop)) {
|
||||||
|
uint32_t mask=p2e.vecMask;
|
||||||
|
uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
|
||||||
|
U_ASSERT((value&mask)==value);
|
||||||
|
upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if(newValues.contains(UCHAR_DECOMPOSITION_TYPE)) {
|
if(newValues.contains(UCHAR_DECOMPOSITION_TYPE)) {
|
||||||
upvec_setValue(pv, start, end,
|
upvec_setValue(pv, start, end,
|
||||||
2, (uint32_t)props.getIntProp(UCHAR_DECOMPOSITION_TYPE), UPROPS_DT_MASK,
|
2, (uint32_t)props.getIntProp(UCHAR_DECOMPOSITION_TYPE), UPROPS_DT_MASK,
|
||||||
|
Loading…
Reference in New Issue
Block a user