/*
*******************************************************************************
*
*   Copyright (C) 2002, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  props2.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002feb24
*   created by: Markus W. Scherer
*
*   Parse more Unicode Character Database files and store
*   additional Unicode character properties in bit set vectors.
*/

/*
 * NOTE(review): the first #include below carries no header name in this copy
 * of the file; the name appears to have been lost. The code in this file
 * calls fprintf() and exit(), so a standard header is presumably intended --
 * restore it from the upstream source.
 */
#include
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "cstring.h"
#include "cmemory.h"
#include "utrie.h"
#include "uprops.h"
#include "propsvec.h"
#include "uparse.h"
#include "genprops.h"

/* Build a uint32_t mask that has only bit n set. */
#define FLAG(n) ((uint32_t)1<<(n))

/* data --------------------------------------------------------------------- */

/*
 * NOTE(review): trie and pvCount are not used in the code visible in this
 * review; presumably they are used elsewhere in the file -- confirm.
 */
static UNewTrie *trie;

/*
 * Properties vectors: assigned from upvec_open() in initAdditionalProperties()
 * and filled through upvec_setValue(). Declared without static, so it has
 * external linkage.
 */
uint32_t *pv;
static int32_t pvCount;

/*
 * Pending code point range and value index kept between calls of
 * eaWidthLineFn(), which extends the range as long as the next line starts
 * at prevLimit and carries the same value.
 */
static uint32_t prevStart=0, prevLimit=0, prevValue=0;

/* prototypes --------------------------------------------------------------- */

/* File-level parse drivers called from generateAdditionalProperties(). */
static void
parseTwoFieldFile(char *filename, char *basename,
                  const char *ucdFile, const char *suffix,
                  UParseLineFn *lineFn,
                  UErrorCode *pErrorCode);

static void
parseArabicShaping(char *filename, char *basename,
                   const char *suffix,
                   UErrorCode *pErrorCode);

/*
 * Per-line callbacks; they all share the signature that is passed as the
 * lineFn argument of parseTwoFieldFile().
 */
static void U_CALLCONV
ageLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
scriptsLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
blocksLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
propListLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
derivedPropListLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
eaWidthLineFn(void *context, char *fields[][2],
int32_t fieldCount, UErrorCode *pErrorCode);

static void U_CALLCONV
lineBreakLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode);

/* -------------------------------------------------------------------------- */

/*
 * Opens the properties vectors and stores the result in the global pv.
 * The result of upvec_open() is not checked here.
 * NOTE(review): 20000 is presumably a capacity limit for the vectors --
 * confirm against propsvec.h.
 */
U_CFUNC void
initAdditionalProperties() {
    pv=upvec_open(UPROPS_VECTOR_WORDS, 20000);
}

/*
 * Parses the additional UCD data files and stores their values in pv.
 *
 * filename    path buffer handed to the parse helpers; basename is set to the
 *             position of its terminating NUL (presumably the place where the
 *             helpers append the UCD file name -- their bodies are not
 *             visible here, confirm)
 * suffix      passed through unchanged to the parse helpers
 * pErrorCode  ICU error code passed to every helper; it is reset to
 *             U_ZERO_ERROR before the East Asian Width defaults are preset
 *
 * When the upvec_setValue() call for the script default fails, a message is
 * printed to stderr and the process exits with the error code.
 */
U_CFUNC void
generateAdditionalProperties(char *filename, const char *suffix, UErrorCode *pErrorCode) {
    char *basename;

    /* points at the terminating NUL of filename */
    basename=filename+uprv_strlen(filename);

    /* process various UCD .txt files */
    parseTwoFieldFile(filename, basename, "DerivedAge", suffix, ageLineFn, pErrorCode);

    /*
     * UTR 24 says:
     * Section 2:
     *   "Common - For characters that may be used
     *             within multiple scripts,
     *             or any unassigned code points."
     *
     * Section 4:
     *   "The value COMMON is the default value,
     *    given to all code points that are not
     *    explicitly mentioned in the data file."
     */
    /* preset USCRIPT_COMMON for 0..0x110000 before Scripts.txt is parsed */
    if(!upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)USCRIPT_COMMON, UPROPS_SCRIPT_MASK, pErrorCode)) {
        fprintf(stderr, "genprops error: unable to set script code: %s\n", u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }
    parseTwoFieldFile(filename, basename, "Scripts", suffix, scriptsLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "Blocks", suffix, blocksLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "PropList", suffix, propListLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "DerivedCoreProperties", suffix, derivedPropListLineFn, pErrorCode);

    parseTwoFieldFile(filename, basename, "LineBreak", suffix, lineBreakLineFn, pErrorCode);

    parseArabicShaping(filename, basename, suffix, pErrorCode);

    /*
     * Preset East Asian Width defaults:
     *
     * N for all
     * A for Private Use
     * W for plane 2
     */
    /*
     * The error code is cleared unconditionally here, so any failure state
     * left behind by the parse calls above is discarded.
     * NOTE(review): confirm that the helpers report and exit on errors
     * themselves.
     */
    *pErrorCode=U_ZERO_ERROR;
    /*
     * NOTE(review): the text is corrupted from here on in this copy of the
     * file. The expression after U_EA_NEUTRAL breaks off, the remainder of
     * this function is missing, and the code continues inside a version
     * number check that apparently belongs to ageLineFn(). Restore the
     * missing part from the upstream source before building.
     */
    if( !upvec_setValue(pv, 0, 0x110000, 0, (uint32_t)(U_EA_NEUTRAL<15 || (*end!='.'
/*
 * NOTE(review): this stretch starts in the middle of a condition. Judging by
 * the error messages it is the tail of ageLineFn() (DerivedAge.txt), whose
 * beginning is not intact in this copy of the file.
 */
&& *end!=' ' && *end!='\t' && *end!=0)) {
        fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }
    /* shift the first number up by 4 bits so that the minor number can be OR-ed into the low 4 bits */
    version=value<<4;

    /* parse minor version number */
    if(*end=='.') {
        s=(char *)u_skipWhitespace(end+1);
        value=(uint32_t)uprv_strtoul(s, &end, 10);
        /* reject: no digits, a value above 15, or anything other than space, tab or NUL after the number */
        if(s==end || value>15 || (*end!=' ' && *end!='\t' && *end!=0)) {
            fprintf(stderr, "genprops: syntax error in DerivedAge.txt field 1 at %s\n", fields[1][0]);
            *pErrorCode=U_PARSE_ERROR;
            exit(U_PARSE_ERROR);
        }
        version|=value;
    }

    /*
     * NOTE(review): the next statement is corrupted in this copy of the file.
     * The upvec_setValue() call that stores the version breaks off after
     * the word version, and the code continues inside another function,
     * apparently derivedPropListLineFn() (it sets a flag based on
     * UPROPS_XID_START and reports a derived binary property error).
     * Everything in between is missing; restore it from the upstream source.
     */
    if(!upvec_setValue(pv, start, limit, 0, version<=0) {
        /* single-bit mask, used both as value and as mask in the call below */
        uint32_t flag=FLAG(UPROPS_XID_START+i);
        if(!upvec_setValue(pv, start, limit, 1, flag, flag, pErrorCode)) {
            fprintf(stderr, "genprops error: unable to set derived binary property: %s\n", u_errorName(*pErrorCode));
            exit(*pErrorCode);
        }
    }
}

/* East Asian Width --------------------------------------------------------- */

/* keep this list in sync with UEAWidthCode in uprops.h or uchar.h */
static const char *const
eaNames[U_EA_COUNT]={
    "N",        /* Non-East Asian Neutral, default for unassigned code points */
    "A",        /* Ambiguous, default for Private Use code points */
    "H",        /* Half-width */
    "F",        /* Full-width */
    "Na",       /* Narrow */
    "W"         /* Wide, default for plane 2 */
};

/*
 * Line callback for EastAsianWidth.txt (file name taken from the error
 * messages below).
 *
 * fields[0][0]  code point or code point range
 * fields[1][0]  width name, looked up in eaNames
 * pErrorCode    passed to u_parseCodePointRange(); set to U_PARSE_ERROR
 *               before exiting on an unknown width name
 *
 * context and fieldCount are not used in the visible part of the function.
 * Any parse error prints a message to stderr and exits the process.
 */
static void U_CALLCONV
eaWidthLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t start, limit;
    int32_t i;

    u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genprops: syntax error in EastAsianWidth.txt field 0 at %s\n", fields[0][0]);
        exit(*pErrorCode);
    }
    /* presumably turns the inclusive range end into an exclusive limit -- confirm against uparse.h */
    ++limit;

    /* look up the East Asian Width name; a negative index means the name is unknown */
    i=getTokenIndex(eaNames, U_EA_COUNT, fields[1][0]);
    if(i<0) {
        fprintf(stderr, "genprops error: unknown width name \"%s\" in EastAsianWidth.txt\n", fields[1][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* collect maximum ranges */
    /* same value and directly adjacent to the pending range: just extend it */
    if(prevLimit==start && (uint32_t)i==prevValue) {
        prevLimit=limit;
    } else {
/*
 * NOTE(review): this line is corrupted in this copy of the file and is
 * therefore kept verbatim below.
 *  - The statement that stores the pending East Asian Width range
 *    (prevStart, prevLimit, prevValue) breaks off after prevValue; the rest
 *    of eaWidthLineFn() and the beginning of the joining group name table
 *    (referenced below as jgNames) are missing.
 *  - One double quote is orphaned right after that point, so the string
 *    literals are unbalanced for the rest of the line.
 *  - arabicShapingLineFn() is cut off inside its last upvec_setValue() call.
 * Restore the missing parts from the upstream source before building.
 *
 * What remains readable of arabicShapingLineFn(): it parses the code point
 * range from fields[0][0], looks up fields[2][0] in jtNames (U_JT_COUNT
 * entries) and fields[3][0] in jgNames (U_JG_COUNT entries), prints a
 * message to stderr and exits with U_PARSE_ERROR on an unknown name, and
 * then begins a upvec_setValue() call on pv with 2 as its fourth argument.
 */
if(!upvec_setValue(pv, prevStart, prevLimit, 0, (uint32_t)(prevValue<", "AIN", "ALAPH", "ALEF", "BEH", "BETH", "DAL", "DALATH RISH", "E", "FEH", "FINAL SEMKATH", "GAF", "GAMAL", "HAH", "HAMZA ON HEH GOAL", "HE", "HEH", "HEH GOAL", "HETH", "KAF", "KAPH", "KNOTTED HEH", "LAM", "LAMADH", "MEEM", "MIM", "NOON", "NUN", "PE", "QAF", "QAPH", "REH", "REVERSED PE", "SAD", "SADHE", "SEEN", "SEMKATH", "SHIN", "SWASH KAF", "SYRIAC WAW", "TAH", "TAW", "TEH MARBUTA", "TETH", "WAW", "YEH", "YEH BARREE", "YEH WITH TAIL", "YUDH", "YUDH HE", "ZAIN" }; static void U_CALLCONV arabicShapingLineFn(void *context, char *fields[][2], int32_t fieldCount, UErrorCode *pErrorCode) { uint32_t start, limit; int32_t jt, jg; u_parseCodePointRange(fields[0][0], &start, &limit, pErrorCode); if(U_FAILURE(*pErrorCode)) { fprintf(stderr, "genprops: syntax error in ArabicShaping.txt field 0 at %s\n", fields[0][0]); exit(*pErrorCode); } ++limit; /* parse joining type */ jt=getTokenIndex(jtNames, U_JT_COUNT, fields[2][0]); if(jt<0) { fprintf(stderr, "genprops error: unknown joining type in \"%s\" in ArabicShaping.txt\n", fields[2][0]); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } /* parse joining group */ jg=getTokenIndex(jgNames, U_JG_COUNT, fields[3][0]); if(jg<0) { fprintf(stderr, "genprops error: unknown joining group in \"%s\" in ArabicShaping.txt\n", fields[3][0]); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); } if(!upvec_setValue(pv, start, limit, 2, ((uint32_t)jt<