7e92471c93
X-SVN-Rev: 9295
819 lines
25 KiB
C
819 lines
25 KiB
C
/*
*******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  props2.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002feb24
*   created by: Markus W. Scherer
*
*   Parse more Unicode Character Database files and store
*   additional Unicode character properties in bit set vectors.
*/
|
|
|
|
#include <stdio.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/uscript.h"
|
|
#include "cstring.h"
|
|
#include "cmemory.h"
|
|
#include "utrie.h"
|
|
#include "uprops.h"
|
|
#include "propsvec.h"
|
|
#include "uparse.h"
|
|
#include "genprops.h"
|
|
|
|
/* Build a 32-bit mask in which only bit number n is set. */
#define FLAG(n) (((uint32_t)1)<<(n))
|
|
|
|
/* data --------------------------------------------------------------------- */
|
|
|
|
/* Trie that generateAdditionalProperties() fills from the property vectors. */
static UNewTrie *trie;

/* Property vectors, opened by initAdditionalProperties(); not file-local. */
uint32_t *pv;

/* Result of upvec_toTrie(); writeAdditionalData() copies pvCount*4 bytes of pv. */
static int32_t pvCount;

/*
 * Range and width value that eaWidthLineFn() has collected but not yet
 * stored into pv.
 */
static uint32_t prevStart=0;
static uint32_t prevLimit=0;
static uint32_t prevValue=0;
|
|
|
|
/* prototypes --------------------------------------------------------------- */
|
|
|
|
static void
|
|
parseTwoFieldFile(char *filename, char *basename,
|
|
const char *ucdFile, const char *suffix,
|
|
UParseLineFn *lineFn,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void
|
|
parseArabicShaping(char *filename, char *basename,
|
|
const char *suffix,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
ageLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
scriptsLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
blocksLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
propListLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
derivedPropListLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
eaWidthLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
static void U_CALLCONV
|
|
lineBreakLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode);
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
U_CFUNC void
|
|
initAdditionalProperties() {
|
|
pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
|
|
}
|
|
|
|
U_CFUNC void
|
|
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
|
|
char *basename;
|
|
|
|
basename=filename+uprv_strlen(filename);
|
|
|
|
/* process various UCD .txt files */
|
|
parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);
|
|
|
|
/*
|
|
* UTR 24 says:
|
|
* Section 2:
|
|
* "Common - For characters that may be used
|
|
* within multiple scripts,
|
|
* or any unassigned code points."
|
|
*
|
|
* Section 4:
|
|
* "The value COMMON is the default value,
|
|
* given to all code points that are not
|
|
* explicitly mentioned in the data file."
|
|
*/
|
|
if(!upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)USCRIPT_COMMON, UPROPS_SCRIPT_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set script code: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
parseTwoFieldFile(filename, basename, "Scripts", suffix, scriptsLineFn, pErrorCode);
|
|
|
|
parseTwoFieldFile(filename, basename, "Blocks", suffix, blocksLineFn, pErrorCode);
|
|
|
|
parseTwoFieldFile(filename, basename, "PropList", suffix, propListLineFn, pErrorCode);
|
|
|
|
parseTwoFieldFile(filename, basename, "DerivedCoreProperties", suffix, derivedPropListLineFn, pErrorCode);
|
|
|
|
parseTwoFieldFile(filename, basename, "LineBreak", suffix, lineBreakLineFn, pErrorCode);
|
|
|
|
parseArabicShaping(filename, basename, suffix, pErrorCode);
|
|
|
|
/*
|
|
* Preset East Asian Width defaults:
|
|
* N for all
|
|
* A for Private Use
|
|
* W for plane 2
|
|
*/
|
|
*pErrorCode=U_ZERO_ERROR;
|
|
if( !upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)(U_EA_NEUTRAL<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
|
|
!upvec_setValue(pv, 0xe000, 0xf900, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
|
|
!upvec_setValue(pv, 0xf0000, 0xffffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
|
|
!upvec_setValue(pv, 0x100000, 0x10fffe, 0, (uint32_t)(U_EA_AMBIGUOUS<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode) ||
|
|
!upvec_setValue(pv, 0x20000, 0x2fffe, 0, (uint32_t)(U_EA_WIDE<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)
|
|
) {
|
|
fprintf(stderr, "genprops: unable to set default East Asian Widths: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
prevStart=prevLimit=prevValue=0;
|
|
/* parse EastAsianWidth.txt */
|
|
parseTwoFieldFile(filename, basename, "EastAsianWidth", suffix, eaWidthLineFn, pErrorCode);
|
|
/* set last range */
|
|
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
trie=utrie_open(NULL, NULL, 50000, 0, FALSE);
|
|
if(trie==NULL) {
|
|
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
upvec_close(pv);
|
|
return;
|
|
}
|
|
|
|
pvCount=upvec_toTrie(pv, trie, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseTwoFieldFile(char *filename, char *basename,
|
|
const char *ucdFile, const char *suffix,
|
|
UParseLineFn *lineFn,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[2][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, ucdFile, suffix);
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 2, lineFn, NULL, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing %s.txt: %s\n", ucdFile, u_errorName(*pErrorCode));
|
|
}
|
|
}
|
|
|
|
/* DerivedAge.txt ----------------------------------------------------------- */
|
|
|
|
static void U_CALLCONV
|
|
ageLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *s, *end;
|
|
uint32_t value, start, limit, version;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse version number */
|
|
s=(char *)u_skipWhitespace(fields[1][0]);
|
|
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
|
if(s==end || value==0 || value>15 || (*end!='.' && *end!=' ' && *end!='\t' && *end!=0)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
version=value<<4;
|
|
|
|
/* parse minor version number */
|
|
if(*end=='.') {
|
|
s=(char *)u_skipWhitespace(end+1);
|
|
value=(uint32_t)uprv_strtoul(s, &end, 10);
|
|
if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
version|=value;
|
|
}
|
|
|
|
if(!upvec_setValue(pv, start, limit, 0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set character age: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* Scripts.txt -------------------------------------------------------------- */
|
|
|
|
static void U_CALLCONV
|
|
scriptsLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
char *s, *end;
|
|
uint32_t start, limit;
|
|
UScriptCode script;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in Scripts.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse script name */
|
|
s=(char *)u_skipWhitespace(fields[1][0]);
|
|
|
|
/* trim trailing whitespace */
|
|
end=fields[1][1];
|
|
while(s<end && (*(end-1)==' ' || *(end-1)=='\t')) {
|
|
--end;
|
|
}
|
|
*end=0;
|
|
if( 1!=uscript_getCode(s, &script, 1, pErrorCode) ||
|
|
U_FAILURE(*pErrorCode) ||
|
|
script<=USCRIPT_INVALID_CODE
|
|
) {
|
|
fprintf(stderr, "genprops error: unknown script name in Scripts.txt field 1 at %s\n", fields[1][0]);
|
|
if(U_SUCCESS(*pErrorCode)) {
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
}
|
|
exit(*pErrorCode);
|
|
}
|
|
|
|
if(!upvec_setValue(pv, start, limit, 0, (uint32_t)script, UPROPS_SCRIPT_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set script code: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* Blocks.txt --------------------------------------------------------------- */
|
|
|
|
/* Blocks.txt block names in the order of the parallel UBlockCode constants */
|
|
static const char *const
|
|
blockNames[UBLOCK_COUNT]={
|
|
NULL, /* 0 */
|
|
"Basic Latin",
|
|
"Latin-1 Supplement",
|
|
"Latin Extended-A",
|
|
"Latin Extended-B",
|
|
"IPA Extensions",
|
|
"Spacing Modifier Letters",
|
|
"Combining Diacritical Marks",
|
|
"Greek and Coptic", /* used to be just "Greek" before Unicode 3.2 */
|
|
"Cyrillic",
|
|
"Armenian", /* 10 */
|
|
"Hebrew",
|
|
"Arabic",
|
|
"Syriac",
|
|
"Thaana",
|
|
"Devanagari",
|
|
"Bengali",
|
|
"Gurmukhi",
|
|
"Gujarati",
|
|
"Oriya",
|
|
"Tamil", /* 20 */
|
|
"Telugu",
|
|
"Kannada",
|
|
"Malayalam",
|
|
"Sinhala",
|
|
"Thai",
|
|
"Lao",
|
|
"Tibetan",
|
|
"Myanmar",
|
|
"Georgian",
|
|
"Hangul Jamo", /* 30 */
|
|
"Ethiopic",
|
|
"Cherokee",
|
|
"Unified Canadian Aboriginal Syllabics",
|
|
"Ogham",
|
|
"Runic",
|
|
"Khmer",
|
|
"Mongolian",
|
|
"Latin Extended Additional",
|
|
"Greek Extended",
|
|
"General Punctuation", /* 40 */
|
|
"Superscripts and Subscripts",
|
|
"Currency Symbols",
|
|
"Combining Diacritical Marks for Symbols", /* used to be "Combining Marks for Symbols" before Unicode 3.2 */
|
|
"Letterlike Symbols",
|
|
"Number Forms",
|
|
"Arrows",
|
|
"Mathematical Operators",
|
|
"Miscellaneous Technical",
|
|
"Control Pictures",
|
|
"Optical Character Recognition", /* 50 */
|
|
"Enclosed Alphanumerics",
|
|
"Box Drawing",
|
|
"Block Elements",
|
|
"Geometric Shapes",
|
|
"Miscellaneous Symbols",
|
|
"Dingbats",
|
|
"Braille Patterns",
|
|
"CJK Radicals Supplement",
|
|
"Kangxi Radicals",
|
|
"Ideographic Description Characters", /* 60 */
|
|
"CJK Symbols and Punctuation",
|
|
"Hiragana",
|
|
"Katakana",
|
|
"Bopomofo",
|
|
"Hangul Compatibility Jamo",
|
|
"Kanbun",
|
|
"Bopomofo Extended",
|
|
"Enclosed CJK Letters and Months",
|
|
"CJK Compatibility",
|
|
"CJK Unified Ideographs Extension A", /* 70 */
|
|
"CJK Unified Ideographs",
|
|
"Yi Syllables",
|
|
"Yi Radicals",
|
|
"Hangul Syllables",
|
|
"High Surrogates",
|
|
"High Private Use Surrogates",
|
|
"Low Surrogates",
|
|
"Private Use Area", /* used to be "Private Use" before Unicode 3.2 */
|
|
"CJK Compatibility Ideographs",
|
|
"Alphabetic Presentation Forms", /* 80 */
|
|
"Arabic Presentation Forms-A",
|
|
"Combining Half Marks",
|
|
"CJK Compatibility Forms",
|
|
"Small Form Variants",
|
|
"Arabic Presentation Forms-B",
|
|
"Specials",
|
|
"Halfwidth and Fullwidth Forms",
|
|
"Old Italic",
|
|
"Gothic",
|
|
"Deseret", /* 90 */
|
|
"Byzantine Musical Symbols",
|
|
"Musical Symbols",
|
|
"Mathematical Alphanumeric Symbols",
|
|
"CJK Unified Ideographs Extension B",
|
|
"CJK Compatibility Ideographs Supplement",
|
|
"Tags",
|
|
"Cyrillic Supplementary", /* first new block in Unicode 3.2 */
|
|
"Tagalog",
|
|
"Hanunoo",
|
|
"Buhid", /* 100 */
|
|
"Tagbanwa",
|
|
"Miscellaneous Mathematical Symbols-A",
|
|
"Supplemental Arrows-A",
|
|
"Supplemental Arrows-B",
|
|
"Miscellaneous Mathematical Symbols-B",
|
|
"Supplemental Mathematical Operators",
|
|
"Katakana Phonetic Extensions",
|
|
"Variation Selectors",
|
|
"Supplementary Private Use Area-A",
|
|
"Supplementary Private Use Area-B" /* 110 */
|
|
};
|
|
|
|
static void U_CALLCONV
|
|
blocksLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t i;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in Blocks.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse block name */
|
|
i=getTokenIndex(blockNames, UBLOCK_COUNT, fields[1][0]);
|
|
if(i<0) {
|
|
if(isToken("Greek", fields[1][0])) {
|
|
i=UBLOCK_GREEK; /* Unicode 3.2 renames this to "Greek and Coptic" */
|
|
} else if(isToken("Combining Marks for Symbols", fields[1][0])) {
|
|
i=UBLOCK_COMBINING_MARKS_FOR_SYMBOLS; /* Unicode 3.2 renames this to "Combining Diacritical Marks for Symbols" */
|
|
} else if(isToken("Private Use", fields[1][0])) {
|
|
i=UBLOCK_PRIVATE_USE; /* Unicode 3.2 renames this to "Private Use Area" */
|
|
} else {
|
|
fprintf(stderr, "genprops error: unknown block name \"%s\" in Blocks.txt\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
}
|
|
|
|
if(!upvec_setValue(pv, start, limit, 0, (uint32_t)i<<UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set block code: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* PropList.txt ------------------------------------------------------------- */
|
|
|
|
/*
 * Binary property names from PropList.txt; the array index of a name is
 * the bit number that propListLineFn() sets for it.
 *
 * Keep this list of property names in sync with
 * enums in icu/source/common/uprops.h, see UPROPS_BINARY_1_TOP!
 *
 * Careful: Since UPROPS_ also contain derivedPropListNames[] entries,
 * they would need to be skipped here with NULL entries if new properties
 * are added to PropList.txt.
 */
static const char *const
propListNames[]={
    /* 0..4 */
    "White_Space",
    "Bidi_Control",
    "Join_Control",
    "Dash",
    "Hyphen",

    /* 5..9 */
    "Quotation_Mark",
    "Terminal_Punctuation",
    "Other_Math",
    "Hex_Digit",
    "ASCII_Hex_Digit",

    /* 10..14 */
    "Other_Alphabetic",
    "Ideographic",
    "Diacritic",
    "Extender",
    "Other_Lowercase",

    /* 15..19 */
    "Other_Uppercase",
    "Noncharacter_Code_Point",
    "Other_Grapheme_Extend",
    "Grapheme_Link",
    "IDS_Binary_Operator",

    /* 20..24 */
    "IDS_Trinary_Operator",
    "Radical",
    "Unified_Ideograph",
    "Other_Default_Ignorable_Code_Point",
    "Deprecated",

    /* 25..26 */
    "Soft_Dotted",
    "Logical_Order_Exception"
};
|
|
|
|
static void U_CALLCONV
|
|
propListLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t i;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in PropList.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse binary property name */
|
|
i=getTokenIndex(propListNames, sizeof(propListNames)/sizeof(*propListNames), fields[1][0]);
|
|
if(i<0) {
|
|
if(isToken("White_space", fields[1][0])) {
|
|
i=0; /* accept misspelled property name in Unicode 3.1.1 */
|
|
} else {
|
|
fprintf(stderr, "genprops warning: unknown binary property name \"%s\" in PropList.txt\n", fields[1][0]);
|
|
return;
|
|
}
|
|
}
|
|
if(!upvec_setValue(pv, start, limit, 1, FLAG(i), FLAG(i), pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set binary property: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* DerivedCoreProperties ---------------------------------------------------- */
|
|
|
|
/*
 * Names from DerivedCoreProperties.txt that derivedPropListLineFn() stores;
 * the entry at index i is mapped to bit UPROPS_XID_START+i.
 */
static const char *const
derivedPropListNames[]={ "XID_Start", "XID_Continue" };
|
|
|
|
static void U_CALLCONV
|
|
derivedPropListLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t i;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in DerivedCoreProperties.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse derived binary property name, ignore unknown names */
|
|
i=getTokenIndex(derivedPropListNames, sizeof(derivedPropListNames)/sizeof(*derivedPropListNames), fields[1][0]);
|
|
if(i>=0) {
|
|
uint32_t flag=FLAG(UPROPS_XID_START+i);
|
|
if(!upvec_setValue(pv, start, limit, 1, flag, flag, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set derived binary property: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* East Asian Width --------------------------------------------------------- */
|
|
|
|
/* keep this list in sync with UEAWidthCode in uprops.h or uchar.h */
|
|
static const char *const
|
|
eaNames[U_EA_COUNT]={
|
|
"N", /* Non-East Asian Neutral, default for unassigned code points */
|
|
"A", /* Ambiguous, default for Private Use code points */
|
|
"H", /* Half-width */
|
|
"F", /* Full-width */
|
|
"Na", /* Narrow */
|
|
"W" /* Wide, default for plane 2 */
|
|
};
|
|
|
|
static void U_CALLCONV
|
|
eaWidthLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t i;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in EastAsianWidth.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse binary property name */
|
|
i=getTokenIndex(eaNames, U_EA_COUNT, fields[1][0]);
|
|
if(i<0) {
|
|
fprintf(stderr, "genprops error: unknown width name \"%s\" in EastAsianWidth.txt\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* collect maximum ranges */
|
|
if(prevLimit==start && (uint32_t)i==prevValue) {
|
|
prevLimit=limit;
|
|
} else {
|
|
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<<UPROPS_EA_SHIFT), UPROPS_EA_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set East Asian Width: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
prevStart=start;
|
|
prevLimit=limit;
|
|
prevValue=(uint32_t)i;
|
|
}
|
|
}
|
|
|
|
/* LineBreak.txt ------------------------------------------------------------ */
|
|
|
|
/* LineBreak.txt block names in the order of the parallel ULineBreak constants */
|
|
static const char *const
|
|
lbNames[U_LB_COUNT]={
|
|
"XX",
|
|
"AI",
|
|
"AL",
|
|
"B2",
|
|
"BA",
|
|
"BB",
|
|
"BK",
|
|
"CB",
|
|
"CL",
|
|
"CM",
|
|
"CR",
|
|
"EX",
|
|
"GL",
|
|
"HY",
|
|
"ID",
|
|
"IN",
|
|
"IS",
|
|
"LF",
|
|
"NS",
|
|
"NU",
|
|
"OP",
|
|
"PO",
|
|
"PR",
|
|
"QU",
|
|
"SA",
|
|
"SG",
|
|
"SP",
|
|
"SY",
|
|
"ZW"
|
|
};
|
|
|
|
static void U_CALLCONV
|
|
lineBreakLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t i;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in LineBreak.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse block name */
|
|
i=getTokenIndex(lbNames, U_LB_COUNT, fields[1][0]);
|
|
if(i<0) {
|
|
fprintf(stderr, "genprops error: unknown line break name \"%s\" in LineBreak.txt\n", fields[1][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
if(!upvec_setValue(pv, start, limit, 0, (uint32_t)i<<UPROPS_LB_SHIFT, UPROPS_LB_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set line break code: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
/* ArabicShaping.txt -------------------------------------------------------- */
|
|
|
|
/* Joining Type/Joining Group names in the order of the parallel UJoiningType/UJoiningGroup constants */
|
|
static const char *const
|
|
jtNames[U_JT_COUNT]={
|
|
"U",
|
|
"C",
|
|
"D",
|
|
"L",
|
|
"R",
|
|
"T"
|
|
};
|
|
|
|
static const char *const
|
|
jgNames[U_JG_COUNT]={
|
|
"<no shaping>",
|
|
"AIN",
|
|
"ALAPH",
|
|
"ALEF",
|
|
"BEH",
|
|
"BETH",
|
|
"DAL",
|
|
"DALATH RISH",
|
|
"E",
|
|
"FEH",
|
|
"FINAL SEMKATH",
|
|
"GAF",
|
|
"GAMAL",
|
|
"HAH",
|
|
"HAMZA ON HEH GOAL",
|
|
"HE",
|
|
"HEH",
|
|
"HEH GOAL",
|
|
"HETH",
|
|
"KAF",
|
|
"KAPH",
|
|
"KNOTTED HEH",
|
|
"LAM",
|
|
"LAMADH",
|
|
"MEEM",
|
|
"MIM",
|
|
"NOON",
|
|
"NUN",
|
|
"PE",
|
|
"QAF",
|
|
"QAPH",
|
|
"REH",
|
|
"REVERSED PE",
|
|
"SAD",
|
|
"SADHE",
|
|
"SEEN",
|
|
"SEMKATH",
|
|
"SHIN",
|
|
"SWASH KAF",
|
|
"SYRIAC WAW",
|
|
"TAH",
|
|
"TAW",
|
|
"TEH MARBUTA",
|
|
"TETH",
|
|
"WAW",
|
|
"YEH",
|
|
"YEH BARREE",
|
|
"YEH WITH TAIL",
|
|
"YUDH",
|
|
"YUDH HE",
|
|
"ZAIN"
|
|
};
|
|
|
|
static void U_CALLCONV
|
|
arabicShapingLineFn(void *context,
|
|
char *fields[][2], int32_t fieldCount,
|
|
UErrorCode *pErrorCode) {
|
|
uint32_t start, limit;
|
|
int32_t jt, jg;
|
|
|
|
u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "genprops: syntax error in ArabicShaping.txt field 0 at %s\n", fields[0][0]);
|
|
exit(*pErrorCode);
|
|
}
|
|
++limit;
|
|
|
|
/* parse joining type */
|
|
jt=getTokenIndex(jtNames, U_JT_COUNT, fields[2][0]);
|
|
if(jt<0) {
|
|
fprintf(stderr, "genprops error: unknown joining type in \"%s\" in ArabicShaping.txt\n", fields[2][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
/* parse joining group */
|
|
jg=getTokenIndex(jgNames, U_JG_COUNT, fields[3][0]);
|
|
if(jg<0) {
|
|
fprintf(stderr, "genprops error: unknown joining group in \"%s\" in ArabicShaping.txt\n", fields[3][0]);
|
|
*pErrorCode=U_PARSE_ERROR;
|
|
exit(U_PARSE_ERROR);
|
|
}
|
|
|
|
if(!upvec_setValue(pv, start, limit, 2, ((uint32_t)jt<<UPROPS_JT_SHIFT)|((uint32_t)jg<<UPROPS_JG_SHIFT), UPROPS_JT_MASK|UPROPS_JG_MASK, pErrorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set joining type/group code: %s\n", u_errorName(*pErrorCode));
|
|
exit(*pErrorCode);
|
|
}
|
|
}
|
|
|
|
static void
|
|
parseArabicShaping(char *filename, char *basename,
|
|
const char *suffix,
|
|
UErrorCode *pErrorCode) {
|
|
char *fields[4][2];
|
|
|
|
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
|
return;
|
|
}
|
|
|
|
writeUCDFilename(basename, "ArabicShaping", suffix);
|
|
|
|
u_parseDelimitedFile(filename, ';', fields, 4, arabicShapingLineFn, NULL, pErrorCode);
|
|
if(U_FAILURE(*pErrorCode)) {
|
|
fprintf(stderr, "error parsing ArabicShaping.txt: %s\n", u_errorName(*pErrorCode));
|
|
}
|
|
}
|
|
|
|
/* data serialization ------------------------------------------------------- */
|
|
|
|
U_CFUNC int32_t
|
|
writeAdditionalData(uint8_t *p, int32_t capacity, int32_t indexes[UPROPS_INDEX_COUNT]) {
|
|
int32_t length;
|
|
UErrorCode errorCode;
|
|
|
|
errorCode=U_ZERO_ERROR;
|
|
length=utrie_serialize(trie, p, capacity, getFoldedPropsValue, TRUE, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to serialize trie for additional properties: %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
if(p!=NULL) {
|
|
p+=length;
|
|
capacity-=length;
|
|
if(beVerbose) {
|
|
printf("size in bytes of additional props trie:%5u\n", length);
|
|
}
|
|
|
|
/* set indexes */
|
|
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=
|
|
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]+length/4;
|
|
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
|
|
indexes[UPROPS_RESERVED_INDEX]=
|
|
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]+pvCount;
|
|
|
|
indexes[UPROPS_MAX_VALUES_INDEX]=
|
|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
|
((int32_t)USCRIPT_CODE_LIMIT-1);
|
|
}
|
|
|
|
if(p!=NULL && (pvCount*4)<=capacity) {
|
|
uprv_memcpy(p, pv, pvCount*4);
|
|
if(beVerbose) {
|
|
printf("number of additional props vectors: %5u\n", pvCount/UPROPS_VECTOR_WORDS);
|
|
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
|
|
}
|
|
}
|
|
length+=pvCount*4;
|
|
|
|
if(p!=NULL) {
|
|
utrie_close(trie);
|
|
upvec_close(pv);
|
|
}
|
|
return length;
|
|
}
|