diff --git a/tools/unicode/c/gennorm/gennorm.c b/tools/unicode/c/gennorm/gennorm.c index 9687451077..d1b1fba45f 100644 --- a/tools/unicode/c/gennorm/gennorm.c +++ b/tools/unicode/c/gennorm/gennorm.c @@ -39,7 +39,7 @@ U_CDECL_BEGIN #include "gennorm.h" U_CDECL_END -UBool beVerbose=FALSE, haveCopyright=TRUE; +UBool beVerbose=FALSE; /* prototypes --------------------------------------------------------------- */ @@ -55,28 +55,18 @@ enum { HELP_H, HELP_QUESTION_MARK, VERBOSE, - COPYRIGHT, DESTDIR, SOURCEDIR, - UNICODE_VERSION, - ICUDATADIR, - CSOURCE, - STORE_FLAGS, - WRITE_NORM2 + ICUDATADIR }; static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, - UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_SOURCEDIR, - UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), - UOPTION_ICUDATADIR, - UOPTION_DEF("csource", 'C', UOPT_NO_ARG), - UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG), - UOPTION_DEF("write-norm2", '\1', UOPT_NO_ARG) + UOPTION_ICUDATADIR }; extern int @@ -91,9 +81,8 @@ main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); /* preset then read command line options */ - options[4].value=u_getDataDirectory(); - options[5].value=""; - options[6].value="3.0.0"; + options[DESTDIR].value=u_getDataDirectory(); + options[SOURCEDIR].value=""; options[ICUDATADIR].value=u_getDataDirectory(); argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); @@ -103,7 +92,7 @@ main(int argc, char* argv[]) { "error in command line argument \"%s\"\n", argv[-argc]); } - if(argc<0 || options[0].doesOccur || options[1].doesOccur) { + if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { /* * Broken into chucks because the C89 standard says the minimum * required supported string length is 509 bytes. @@ -112,27 +101,13 @@ main(int argc, char* argv[]) { "Usage: %s [-options] [suffix]\n" "\n" "Read the UnicodeData.txt file and other Unicode properties files and\n" - "create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n" + "write nfc.txt and nfkc.txt files for gennorm2\n" "\n", argv[0]); fprintf(stderr, "Options:\n" "\t-h or -? or --help this usage text\n" - "\t-v or --verbose verbose output\n" - "\t-c or --copyright include a copyright notice\n" - "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" - "\t-C or --csource generate a .c source file rather than the .icu binary\n"); - fprintf(stderr, - "\t-p or --prune flags Prune for data modularization:\n" - "\t Determine what data is to be stored.\n" - "\t 0 (zero) stores minimal data (only for NFD)\n" - "\t lowercase letters turn off data, uppercase turn on (use with 0)\n"); - fprintf(stderr, - "\t k: compatibility decompositions (NFKC, NFKD)\n" - "\t c: composition data (NFC, NFKC)\n" - "\t f: FCD data (will be generated at load time)\n" - "\t a: auxiliary data (canonical closure etc.)\n" - "\t x: exclusion sets (Unicode 3.2-level normalization)\n"); + "\t-v or --verbose verbose output\n"); fprintf(stderr, "\t-d or --destdir destination directory, followed by the path\n" "\t-s or --sourcedir source directory, followed by the path\n" @@ -142,16 +117,13 @@ main(int argc, char* argv[]) { "\t to the source file basenames before opening;\n" "\t 'gennorm new' will read UnicodeData-new.txt etc.\n", u_getDataDirectory()); - fprintf(stderr, - "\t--write-norm2 write nfc.txt and nfkc.txt files for gennorm2\n"); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } /* get the options values */ - beVerbose=options[2].doesOccur; - haveCopyright=options[3].doesOccur; - srcDir=options[5].value; - destDir=options[4].value; + beVerbose=options[VERBOSE].doesOccur; + srcDir=options[SOURCEDIR].value; + destDir=options[DESTDIR].value; if(argc>=2) { suffix=argv[1]; @@ -159,73 +131,12 @@ main(int argc, char* argv[]) { suffix=NULL; } -#if UCONFIG_NO_NORMALIZATION - - fprintf(stderr, - "gennorm writes a dummy " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE - " because UCONFIG_NO_NORMALIZATION is set, \n" - "see icu/source/common/unicode/uconfig.h\n"); - generateData(destDir, options[CSOURCE].doesOccur); - -#else - - setUnicodeVersion(options[6].value); +#if !UCONFIG_NO_NORMALIZATION if (options[ICUDATADIR].doesOccur) { u_setDataDirectory(options[ICUDATADIR].value); } - if(options[STORE_FLAGS].doesOccur) { - const char *s=options[STORE_FLAGS].value; - char c; - - while((c=*s++)!=0) { - switch(c) { - case '0': - gStoreFlags=0; /* store minimal data (only for NFD) */ - break; - - /* lowercase letters: omit data */ - case 'k': - gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPAT); - break; - case 'c': - gStoreFlags&=~U_MASK(UGENNORM_STORE_COMPOSITION); - break; - case 'f': - gStoreFlags&=~U_MASK(UGENNORM_STORE_FCD); - break; - case 'a': - gStoreFlags&=~U_MASK(UGENNORM_STORE_AUX); - break; - case 'x': - gStoreFlags&=~U_MASK(UGENNORM_STORE_EXCLUSIONS); - break; - - /* uppercase letters: include data (use with 0) */ - case 'K': - gStoreFlags|=U_MASK(UGENNORM_STORE_COMPAT); - break; - case 'C': - gStoreFlags|=U_MASK(UGENNORM_STORE_COMPOSITION); - break; - case 'F': - gStoreFlags|=U_MASK(UGENNORM_STORE_FCD); - break; - case 'A': - gStoreFlags|=U_MASK(UGENNORM_STORE_AUX); - break; - case 'X': - gStoreFlags|=U_MASK(UGENNORM_STORE_EXCLUSIONS); - break; - - default: - fprintf(stderr, "ignoring undefined prune flag '%c'\n", c); - break; - } - } - } - /* * Verify that we can work with properties * but don't call u_init() because that needs unorm.icu which we are just @@ -290,14 +201,7 @@ main(int argc, char* argv[]) { /* process parsed data */ if(U_SUCCESS(errorCode)) { - if(options[WRITE_NORM2].doesOccur) { - writeNorm2(destDir); - } - - processData(); - - /* write the properties data file */ - generateData(destDir, options[CSOURCE].doesOccur); + writeNorm2(destDir); cleanUpData(); } @@ -319,7 +223,6 @@ derivedNormalizationPropertiesLineFn(void *context, char *s; uint32_t start, end; int32_t count; - uint8_t qcFlags; /* get code point range */ count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); @@ -335,78 +238,11 @@ derivedNormalizationPropertiesLineFn(void *context, /* get property - ignore unrecognized ones */ s=(char *)u_skipWhitespace(fields[1][0]); - if(*s=='N' && s[1]=='F') { - /* quick check flag */ - qcFlags=0x11; - s+=2; - if(*s=='K') { - qcFlags<<=1; - ++s; - } - - if(*s=='C' && s[1]=='_') { - s+=2; - } else if(*s=='D' && s[1]=='_') { - qcFlags<<=2; - s+=2; - } else { - return; - } - - if(0==uprv_strncmp(s, "NO", 2)) { - qcFlags&=0xf; - } else if(0==uprv_strncmp(s, "MAYBE", 5)) { - qcFlags&=0x30; - } else if(0==uprv_strncmp(s, "QC", 2) && *(s=(char *)u_skipWhitespace(s+2))==';') { - /* - * Unicode 4.0.1: - * changes single field "NFD_NO" -> two fields "NFD_QC; N" etc. - */ - /* start of the field */ - s=(char *)u_skipWhitespace(s+1); - if(*s=='N') { - qcFlags&=0xf; - } else if(*s=='M') { - qcFlags&=0x30; - } else { - return; /* do nothing for "Yes" because it's the default value */ - } - } else { - return; /* do nothing for "Yes" because it's the default value */ - } - - /* set this flag for all code points in this range */ - while(start<=end) { - setQCFlags(start++, qcFlags); - } - } else if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) { + if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) { /* full composition exclusion */ while(start<=end) { setCompositionExclusion(start++); } - } else if( - ((0==uprv_memcmp(s, "FNC", 3) && *(s=(char *)u_skipWhitespace(s+3))==';') || - (0==uprv_memcmp(s, "FC_NFKC", 7) && *(s=(char *)u_skipWhitespace(s+7))==';')) - - ) { - /* FC_NFKC_Closure, parse field 2 to get the string */ - char *t; - - /* start of the field */ - s=(char *)u_skipWhitespace(s+1); - - /* find the end of the field */ - for(t=s; *t!=';' && *t!='#' && *t!=0 && *t!='\n' && *t!='\r'; ++t) {} - *t=0; - - string[0]=(UChar)u_parseString(s, string+1, 31, NULL, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "gennorm error: illegal FNC string at %s\n", fields[0][0]); - exit(*pErrorCode); - } - while(start<=end) { - setFNC(start++, string); - } } } @@ -450,12 +286,6 @@ unicodeDataLineFn(void *context, /* reset the properties */ uprv_memset(&norm, 0, sizeof(Norm)); - /* - * The combiningIndex must not be initialized to 0 because 0 is the - * combiningIndex of the first forward-combining character. - */ - norm.combiningIndex=0xffff; - /* get the character code, field 0 */ code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); if(end<=fields[0][0] || end!=fields[0][1]) { diff --git a/tools/unicode/c/gennorm/gennorm.h b/tools/unicode/c/gennorm/gennorm.h index aacadfd01b..faa9ba737d 100644 --- a/tools/unicode/c/gennorm/gennorm.h +++ b/tools/unicode/c/gennorm/gennorm.h @@ -18,11 +18,6 @@ #define __GENPROPS_H__ #include "unicode/utypes.h" -#include "unicode/uset.h" - -/* file definitions */ -#define DATA_NAME "unorm" -#define DATA_TYPE "icu" /* * data structure that holds the normalization properties for one or more @@ -30,70 +25,26 @@ */ typedef struct Norm { uint8_t udataCC, lenNFD, lenNFKD; - uint8_t qcFlags, combiningFlags; - uint16_t canonBothCCs, compatBothCCs, combiningIndex, specialTag; uint32_t *nfd, *nfkd; - uint32_t value32; /* temporary variable for generating runtime norm32 and fcd values */ - int32_t fncIndex; - USet *canonStart; - UBool unsafeStart; } Norm; -/* - * modularization flags - * - * Corresponding bits in gStoreFlags control whether certain kinds of data - * are to be stored in (1) or omitted from (0) the data file. - * The flags are controlled by a command-line argument, with a letter - * per flag. - */ -enum { - UGENNORM_STORE_COMPAT, /* (k) compatibility decompositions */ - UGENNORM_STORE_COMPOSITION, /* (c) composition data */ - UGENNORM_STORE_FCD, /* (f) FCD data */ - UGENNORM_STORE_AUX, /* (a) auxiliary trie and associated data */ - UGENNORM_STORE_EXCLUSIONS, /* (x) exclusion sets */ - UGENNORM_STORE_COUNT -}; - -extern uint32_t gStoreFlags; - -#define DO_STORE(flag) (0!=(gStoreFlags&U_MASK(flag))) -#define DO_NOT_STORE(flag) (0==(gStoreFlags&U_MASK(flag))) - /* global flags */ -extern UBool beVerbose, haveCopyright; +extern UBool beVerbose; /* prototypes */ -extern void -setUnicodeVersion(const char *v); - extern void init(void); extern void storeNorm(uint32_t code, Norm *norm); -extern void -setQCFlags(uint32_t code, uint8_t qcFlags); - extern void setCompositionExclusion(uint32_t code); -U_CFUNC void -setFNC(uint32_t c, UChar *s); - extern void writeNorm2(const char *dataDir); -extern void -processData(void); - -extern void -generateData(const char *dataDir, UBool csource); - extern void cleanUpData(void); #endif - diff --git a/tools/unicode/c/gennorm/store.c b/tools/unicode/c/gennorm/store.c index ddd4f7fd8f..45dc743fd6 100644 --- a/tools/unicode/c/gennorm/store.c +++ b/tools/unicode/c/gennorm/store.c @@ -19,17 +19,12 @@ #include #include #include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "unicode/ustring.h" -#include "cmemory.h" -#include "cstring.h" -#include "filestrm.h" #include "unicode/udata.h" -#include "utrie.h" -#include "utrie2.h" #include "unicode/uset.h" +#include "cmemory.h" +#include "filestrm.h" +#include "utrie.h" #include "toolutil.h" -#include "unewdata.h" #include "writesrc.h" #include "unormimp.h" #include "gennorm.h" @@ -38,133 +33,23 @@ #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) -/* - * The new implementation of the normalization code loads its data from - * unorm.icu, which is generated with this gennorm tool. - * The format of that file is described in unormimp.h . - */ - -/* file data ---------------------------------------------------------------- */ - -#if UCONFIG_NO_NORMALIZATION - -/* dummy UDataInfo cf. udata.h */ -static UDataInfo dataInfo = { - sizeof(UDataInfo), - 0, - - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - U_SIZEOF_UCHAR, - 0, - - { 0, 0, 0, 0 }, /* dummy dataFormat */ - { 0, 0, 0, 0 }, /* dummy formatVersion */ - { 0, 0, 0, 0 } /* dummy dataVersion */ -}; - -#else - -/* UDataInfo cf. udata.h */ -static UDataInfo dataInfo={ - sizeof(UDataInfo), - 0, - - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - U_SIZEOF_UCHAR, - 0, - - { 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */ - { 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */ - { 3, 2, 0, 0 } /* dataVersion (Unicode version) */ -}; - -extern void -setUnicodeVersion(const char *v) { - UVersionInfo version; - u_versionFromString(version, v); - uprv_memcpy(dataInfo.dataVersion, version, 4); -} - -static int32_t indexes[_NORM_INDEX_TOP]={ 0 }; +#if !UCONFIG_NO_NORMALIZATION /* builder data ------------------------------------------------------------- */ -/* modularization flags, see gennorm.h (default to "store everything") */ -uint32_t gStoreFlags=0xffffffff; +static UNewTrie *normTrie; -typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); - -static UNewTrie - *normTrie, - *norm32Trie, - *fcdTrie, - *auxTrie; - -static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem; +static UToolMemory *normMem, *utf32Mem; static Norm *norms; -#if GENNORM_OBSOLETE -/* - * set a flag for each code point that was seen in decompositions - - * avoid to decompose ones that have not been used before - */ -static uint32_t haveSeenFlags[256]; -#endif - -/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */ -static USet *nfdQCNoSet; - -/* see addCombiningCP() for details */ -static uint32_t combiningCPs[2000]; - -/* - * after processCombining() this contains for each code point in combiningCPs[] - * the runtime combining index - */ -static uint16_t combiningIndexes[2000]; - -/* section limits for combiningCPs[], see addCombiningCP() */ -static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0; - -/** - * Structure for a triple of code points, stored in combiningTriplesMem. - * The lead and trail code points combine into the the combined one, - * i.e., there is a canonical decomposition of combined-> . - * - * Before processCombining() is called, leadIndex and trailIndex are 0. - * After processCombining(), they contain the indexes of the lead and trail - * code point in the combiningCPs[] array. - * They are then sorted by leadIndex, then trailIndex. - * They are not sorted by code points. - */ -typedef struct CombiningTriple { - uint16_t leadIndex, trailIndex; - uint32_t lead, trail, combined; -} CombiningTriple; - -/* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */ -static uint16_t combiningTable[0x8000]; -static uint16_t combiningTableTop=0; - -#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000 -static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH - +10000]; /* +10000 for exclusion sets */ -static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP; -static int32_t canonSetsCount=0; +static USet *compositionExclusions; /* allocate and initialize a Norm unit */ static Norm * allocNorm() { /* allocate Norm */ Norm *p=(Norm *)utm_alloc(normMem); - /* - * The combiningIndex must not be initialized to 0 because 0 is the - * combiningIndex of the first forward-combining character. - */ - p->combiningIndex=0xffff; return p; } @@ -174,12 +59,6 @@ init() { normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); uprv_memset(normTrie, 0, sizeof(UNewTrie)); - norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); - uprv_memset(norm32Trie, 0, sizeof(UNewTrie)); - fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); - uprv_memset(fcdTrie, 0, sizeof(UNewTrie)); - auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie)); - uprv_memset(auxTrie, 0, sizeof(UNewTrie)); /* initialize the two tries */ if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) { @@ -194,31 +73,7 @@ init() { /* allocate UTF-32 string memory */ utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4); -#if GENNORM_OBSOLETE - /* reset all "have seen" flags */ - uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); -#endif - - /* open an empty set */ - nfdQCNoSet=uset_open(1, 0); - - /* allocate extra data memory for UTF-16 decomposition strings and other values */ - extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2); - /* initialize the extraMem counter for the top of FNC strings */ - p16=(uint16_t *)utm_alloc(extraMem); - *p16=1; - - /* allocate temporary memory for combining triples */ - combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple)); - - /* set the minimum code points for no/maybe quick check values to the end of the BMP */ - indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; - indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff; - indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff; - indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff; - - /* preset the indexes portion of canonStartSets */ - uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2); + compositionExclusions=uset_openEmpty(); } /* @@ -244,1607 +99,36 @@ createNorm(uint32_t code) { return p; } -/* get an existing Norm unit */ -static Norm * -getNorm(uint32_t code) { - uint32_t i; - - i=utrie_get32(normTrie, (UChar32)code, NULL); - if(i==0) { - return NULL; - } - return norms+i; -} - -/* get the canonical combining class of a character */ -static uint8_t -getCCFromCP(uint32_t code) { - Norm *norm=getNorm(code); - if(norm==NULL) { - return 0; - } else { - return norm->udataCC; - } -} - -/* - * enumerate all code points with their Norm structs and call a function for each - * return the number of code points with data - */ -static uint32_t -enumTrie(EnumTrieFn *fn, void *context) { - uint32_t count, i; - UChar32 code; - UBool isInBlockZero; - - count=0; - for(code=0; code<=0x10ffff;) { - i=utrie_get32(normTrie, code, &isInBlockZero); - if(isInBlockZero) { - code+=UTRIE_DATA_BLOCK_LENGTH; - } else { - if(i!=0) { - fn(context, (uint32_t)code, norms+i); - ++count; - } - ++code; - } - } - return count; -} - -#if GENNORM_OBSOLETE -static void -setHaveSeenString(const uint32_t *s, int32_t length) { - uint32_t c; - - while(length>0) { - c=*s++; - haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f)); - --length; - } -} - -#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f))) -#endif - -/* handle combining data ---------------------------------------------------- */ - -/* - * Insert an entry into combiningCPs[] for the new code point code with its flags. - * The flags indicate if code combines forward, backward, or both. - * - * combiningCPs[] contains three sections: - * 1. code points that combine forward - * 2. code points that combine forward and backward - * 3. code points that combine backward - * - * Search for code in the entire array. - * If it is found and already is in the right section (old flags==new flags) - * then we are done. - * If it is found but the flags are different, then remove it, - * union the old and new flags, and reinsert it into its correct section. - * If it is not found, then just insert it. - * - * Within each section, the code points are not sorted. - */ -static void -addCombiningCP(uint32_t code, uint8_t flags) { - uint32_t newEntry; - uint16_t i; - - newEntry=code|((uint32_t)flags<<24); - - /* search for this code point */ - for(i=0; i=sizeof(combiningCPs)/4) { - fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n", - (long)(sizeof(combiningCPs)/4)); - exit(U_MEMORY_ALLOCATION_ERROR); - } - - /* set i to the insertion point */ - flags=(uint8_t)(newEntry>>24); - if(flags==1) { - i=combineFwdTop++; - ++combineBothTop; - } else if(flags==3) { - i=combineBothTop++; - } else /* flags==2 */ { - i=combineBackTop; - } - - /* move the following code points up one and insert newEntry at i */ - if(icombiningFlags|=1; /* combines forward */ - createNorm(trail)->combiningFlags|=2; /* combines backward */ - - addCombiningCP(lead, 1); - addCombiningCP(trail, 2); - - triple=(CombiningTriple *)utm_alloc(combiningTriplesMem); - triple->lead=lead; - triple->trail=trail; - triple->combined=combined; -} -#endif - -static int -compareTriples(const void *l, const void *r) { - int diff; - diff=(int)((CombiningTriple *)l)->leadIndex- - (int)((CombiningTriple *)r)->leadIndex; - if(diff==0) { - diff=(int)((CombiningTriple *)l)->trailIndex- - (int)((CombiningTriple *)r)->trailIndex; - } - return diff; -} - -static void -processCombining() { - CombiningTriple *triples; - uint16_t *p; - uint32_t combined; - uint16_t i, j, count, tableTop, finalIndex, combinesFwd; - - triples=utm_getStart(combiningTriplesMem); - - /* add lead and trail indexes to the triples for sorting */ - count=(uint16_t)utm_countItems(combiningTriplesMem); - for(i=0; icombiningIndex=combiningIndexes[i]=tableTop; - - /* calculate the length of the combining data for this lead code point in the combiningTable */ - while(jcombiningIndex=combiningIndexes[i]=finalIndex++; - } - - /* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */ - if(finalIndex>0x8000) { - fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n", - tableTop, (long)(sizeof(combiningTable)/4)); - exit(U_MEMORY_ALLOCATION_ERROR); - } - - combiningTableTop=tableTop; - - /* store the combining data in the combiningTable, with the final indexes from above */ - p=combiningTable; - j=0; /* triples counter */ - - /* - * this is essentially the same loop as above, but - * it writes the table data instead of calculating and setting the final indexes; - * it is necessary to have two passes so that all the final indexes are known before - * they are written into the table - */ - for(i=0; icombiningFlags&1)<<13); - - *p++=finalIndex; - if(combined<=0x1fff) { - *p++=(uint16_t)(combinesFwd|combined); - } else if(combined<=0xffff) { - *p++=(uint16_t)(0x8000|combinesFwd); - *p++=(uint16_t)combined; - } else { - *p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10)); - *p++=(uint16_t)(0xdc00|(combined&0x3ff)); - } - } - - /* set a marker on the last final trail index in this lead's table */ - if(combined<=0x1fff) { - *(p-2)|=0x8000; - } else { - *(p-3)|=0x8000; - } - } - - /* post condition: tableTop==(p-combiningTable) */ -} - /* processing incoming normalization data ----------------------------------- */ -#if GENNORM_OBSOLETE -/* - * Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct. - * c must be a Hangul syllable code point. - */ -static void -getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) { - /* Hangul syllable: decompose algorithmically */ - uint32_t c2; - uint8_t length; - - uprv_memset(pHangulNorm, 0, sizeof(Norm)); - - c-=HANGUL_BASE; - - c2=c%JAMO_T_COUNT; - c/=JAMO_T_COUNT; - if(c2>0) { - hangulBuffer[2]=JAMO_T_BASE+c2; - length=3; - } else { - hangulBuffer[2]=0; - length=2; - } - - hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT; - hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT; - - pHangulNorm->nfd=hangulBuffer; - pHangulNorm->lenNFD=length; - if(DO_STORE(UGENNORM_STORE_COMPAT)) { - pHangulNorm->nfkd=hangulBuffer; - pHangulNorm->lenNFKD=length; - } -} -#endif - -/* - * decompose the one decomposition further, may generate two decompositions - * apply all previous characters' decompositions to this one - */ -static void -decompStoreNewNF(uint32_t code, Norm *norm) { -#if !GENNORM_OBSOLETE - /* always allocate the original string */ - uint32_t *s32; - uint8_t length; - if((length=norm->lenNFD)!=0) { - s32=utm_allocN(utf32Mem, norm->lenNFD); - uprv_memcpy(s32, norm->nfd, norm->lenNFD*4); - norm->nfd=s32; - } else if((length=norm->lenNFKD)!=0) { - s32=utm_allocN(utf32Mem, norm->lenNFKD); - uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4); - norm->nfkd=s32; - } -#else - uint32_t nfd[40], nfkd[40], hangulBuffer[3]; - Norm hangulNorm; - - uint32_t *s32; - Norm *p; - uint32_t c; - int32_t i, length; - uint8_t lenNFD=0, lenNFKD=0; - UBool changedNFD=FALSE, changedNFKD=FALSE; - - if((length=norm->lenNFD)!=0) { - /* always allocate the original string */ - changedNFD=TRUE; - s32=norm->nfd; - } else if((length=norm->lenNFKD)!=0) { - /* always allocate the original string */ - changedNFKD=TRUE; - s32=norm->nfkd; - } else { - /* no decomposition here, nothing to do */ - return; - } - - /* decompose each code point */ - for(i=0; ilenNFD!=0) { - uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4); - lenNFD+=p->lenNFD; - } else { - nfd[lenNFD++]=c; - } - } - - /* compatibility-decompose c */ - if(p->lenNFKD!=0) { - uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4); - lenNFKD+=p->lenNFKD; - changedNFKD=TRUE; - } else if(p->lenNFD!=0) { - uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4); - lenNFKD+=p->lenNFD; - /* - * not changedNFKD=TRUE; - * so that we do not store a new nfkd if there was no nfkd string before - * and we only see canonical decompositions - */ - } else { - nfkd[lenNFKD++]=c; - } - } - - /* assume that norm->lenNFD==1 or ==2 */ - if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) { - addCombiningTriple(s32[0], s32[1], code); - } - - if(changedNFD) { - if(lenNFD!=0) { - s32=utm_allocN(utf32Mem, lenNFD); - uprv_memcpy(s32, nfd, lenNFD*4); - } else { - s32=NULL; - } - norm->lenNFD=lenNFD; - norm->nfd=s32; - setHaveSeenString(nfd, lenNFD); - } - if(changedNFKD) { - if(lenNFKD!=0) { - s32=utm_allocN(utf32Mem, lenNFKD); - uprv_memcpy(s32, nfkd, lenNFKD*4); - } else { - s32=NULL; - } - norm->lenNFKD=lenNFKD; - norm->nfkd=s32; - setHaveSeenString(nfkd, lenNFKD); - } -#endif -} - -#if GENNORM_OBSOLETE -typedef struct DecompSingle { - uint32_t c; - Norm *norm; -} DecompSingle; - -/* - * apply this one character's decompositions (there is at least one!) to - * all previous characters' decompositions to decompose them further - */ -static void -decompWithSingleFn(void *context, uint32_t code, Norm *norm) { - uint32_t nfd[40], nfkd[40]; - uint32_t *s32; - DecompSingle *me=(DecompSingle *)context; - uint32_t c, myC; - int32_t i, length; - uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD; - UBool changedNFD=FALSE, changedNFKD=FALSE; - - /* get the new character's data */ - myC=me->c; - myLenNFD=me->norm->lenNFD; - myLenNFKD=me->norm->lenNFKD; - /* assume that myC has at least one decomposition */ - - if((length=norm->lenNFD)!=0 && myLenNFD!=0) { - /* apply NFD(myC) to norm->nfd */ - s32=norm->nfd; - for(i=0; inorm->nfd, myLenNFD*4); - lenNFD+=myLenNFD; - changedNFD=TRUE; - } else { - nfd[lenNFD++]=c; - } - } - } - - if((length=norm->lenNFKD)!=0) { - /* apply NFD(myC) and NFKD(myC) to norm->nfkd */ - s32=norm->nfkd; - for(i=0; inorm->nfkd, myLenNFKD*4); - lenNFKD+=myLenNFKD; - } else /* assume myLenNFD!=0 */ { - uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4); - lenNFKD+=myLenNFD; - } - changedNFKD=TRUE; - } else { - nfkd[lenNFKD++]=c; - } - } - } else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) { - /* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */ - s32=norm->nfd; - for(i=0; inorm->nfkd, myLenNFKD*4); - lenNFKD+=myLenNFKD; - changedNFKD=TRUE; - } else { - nfkd[lenNFKD++]=c; - } - } - } - - /* set the new decompositions, forget the old ones */ - if(changedNFD) { - if(lenNFD!=0) { - if(lenNFD>norm->lenNFD) { - s32=utm_allocN(utf32Mem, lenNFD); - } else { - s32=norm->nfd; - } - uprv_memcpy(s32, nfd, lenNFD*4); - } else { - s32=NULL; - } - norm->lenNFD=lenNFD; - norm->nfd=s32; - } - if(changedNFKD) { - if(lenNFKD!=0) { - if(lenNFKD>norm->lenNFKD) { - s32=utm_allocN(utf32Mem, lenNFKD); - } else { - s32=norm->nfkd; - } - uprv_memcpy(s32, nfkd, lenNFKD*4); - } else { - s32=NULL; - } - norm->lenNFKD=lenNFKD; - norm->nfkd=s32; - } -} -#endif - /* * process the data for one code point listed in UnicodeData; * UnicodeData itself never maps a code point to both NFD and NFKD */ extern void storeNorm(uint32_t code, Norm *norm) { -#if GENNORM_OBSOLETE - DecompSingle decompSingle; -#endif - Norm *p; - - if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) { - /* ignore compatibility decomposition */ - norm->lenNFKD=0; - } - - /* copy existing derived normalization properties */ - p=createNorm(code); - norm->qcFlags=p->qcFlags; - norm->combiningFlags=p->combiningFlags; - norm->fncIndex=p->fncIndex; - - /* process the decomposition if there is one here */ - if((norm->lenNFD|norm->lenNFKD)!=0) { - /* decompose this one decomposition further, may generate two decompositions */ - decompStoreNewNF(code, norm); - -#if GENNORM_OBSOLETE - /* has this code point been used in previous decompositions? */ - if(HAVE_SEEN(code)) { - /* use this decomposition to decompose other decompositions further */ - decompSingle.c=code; - decompSingle.norm=norm; - enumTrie(decompWithSingleFn, &decompSingle); - } -#endif - } + Norm *p=createNorm(code); /* store the data */ uprv_memcpy(p, norm, sizeof(Norm)); -} -extern void -setQCFlags(uint32_t code, uint8_t qcFlags) { - if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) { - /* ignore compatibility decomposition: unset the KC/KD flags */ - qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD); - - /* set the KC/KD flags to the same values as the C/D flags */ - qcFlags|=qcFlags<<1; - } - if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) { - /* ignore composition data: unset the C/KC flags */ - qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC); - - /* set the C/KC flags to the same values as the D/KD flags */ - qcFlags|=qcFlags>>2; - } - - createNorm(code)->qcFlags|=qcFlags; - - /* adjust the minimum code point for quick check no/maybe */ - if(code<0xffff) { - if((qcFlags&_NORM_QC_NFC) && (uint16_t)codelenNFD!=0) { + uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFD); + uprv_memcpy(s32, norm->nfd, norm->lenNFD*4); + p->nfd=s32; + } else if(norm->lenNFKD!=0) { + uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFKD); + uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4); + p->nfkd=s32; } } extern void setCompositionExclusion(uint32_t code) { - if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { - createNorm(code)->combiningFlags|=0x80; - } + uset_add(compositionExclusions, (UChar32)code); } -static void -setHangulJamoSpecials() { - Norm *norm; - uint32_t c, hangul; - - /* - * Hangul syllables are algorithmically decomposed into Jamos, - * and Jamos are algorithmically composed into Hangul syllables. - * The quick check flags are parsed, except for Hangul. - */ - - /* set Jamo L specials */ - hangul=0xac00; - for(c=0x1100; c<=0x1112; ++c) { - norm=createNorm(c); - norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L; - if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { - norm->combiningFlags=1; - } - - /* for each Jamo L create a set with its associated Hangul block */ - norm->canonStart=uset_open(hangul, hangul+21*28-1); - hangul+=21*28; - } - - /* set Jamo V specials */ - for(c=0x1161; c<=0x1175; ++c) { - norm=createNorm(c); - norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V; - if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { - norm->combiningFlags=2; - } - norm->unsafeStart=TRUE; - } - - /* set Jamo T specials */ - for(c=0x11a8; c<=0x11c2; ++c) { - norm=createNorm(c); - norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T; - if(DO_STORE(UGENNORM_STORE_COMPOSITION)) { - norm->combiningFlags=2; - } - norm->unsafeStart=TRUE; - } - - /* set Hangul specials, precompacted */ - norm=allocNorm(); - norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL; - if(DO_STORE(UGENNORM_STORE_COMPAT)) { - norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD; - } else { - norm->qcFlags=_NORM_QC_NFD; - } - - if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) { - fprintf(stderr, "error: too many normalization entries (setting Hangul)\n"); - exit(U_BUFFER_OVERFLOW_ERROR); - } -} - -/* - * set FC-NFKC-Closure string - * s contains the closure string; s[0]==length, s[1..length] is the actual string - * may modify s[0] - */ -U_CFUNC void -setFNC(uint32_t c, UChar *s) { - uint16_t *p; - int32_t length, i, count; - UChar first; - - if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) || - DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) || - DO_NOT_STORE(UGENNORM_STORE_AUX) - ) { - return; - } - - count=utm_countItems(extraMem); - length=s[0]; - first=s[1]; - - /* try to overlay single-unit strings with existing ones */ - if(length==1 && first<0xff00) { - p=utm_getStart(extraMem); - for(i=1; i_NORM_AUX_MAX_FNC) { - fprintf(stderr, "gennorm error: too many FNC strings\n"); - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } - - /* prepend 0xffxx with xx==length */ - s[0]=(uint16_t)(0xff00+length); - ++length; - p=(uint16_t *)utm_allocN(extraMem, length); - uprv_memcpy(p, s, length*2); - - /* update the top index in extraMem[0] */ - count+=length; - ((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count; - } - - /* store the index to the string */ - createNorm(c)->fncIndex=i; -} - -/* build runtime structures ------------------------------------------------- */ - -/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */ -static uint16_t -reorderString(uint32_t *s, int32_t length) { - uint8_t ccs[40]; - uint32_t c; - int32_t i, j; - uint8_t cc, prevCC; - - if(length<=0) { - return 0; - } - - for(i=0; ilenNFD; - if(length>0) { - norm->canonBothCCs=reorderString(norm->nfd, length); - } - - /* canonically reorder the NFKD */ - length=norm->lenNFKD; - if(length>0) { - norm->compatBothCCs=reorderString(norm->nfkd, length); - } - - /* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */ - if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) { - fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags); - } - if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) { - fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags); - } - - /* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */ -#if 0 - combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1; -#endif - - if(norm->combiningFlags&1) { - if(norm->udataCC!=0) { - /* illegal - data-derivable composition exclusion */ - fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC); - } - } - if(norm->combiningFlags&2) { - if((norm->qcFlags&0x11)==0) { - fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code); - } -#if 0 - /* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */ - if(norm->udataCC==0) { - printf("U+%04lx combines backward but udataCC==0\n", (long)code); - } -#endif - } - if((norm->combiningFlags&3)==3 && beVerbose) { - printf("U+%04lx combines both ways\n", (long)code); - } - - /* - * process canonical decompositions for canonical closure - * - * in each canonical decomposition: - * add the current character (code) to the set of canonical starters of its norm->nfd[0] - * set the "unsafe starter" flag for each norm->nfd[1..] - */ - length=norm->lenNFD; - if(length>0) { - Norm *otherNorm; - UChar32 c; - int32_t i; - - /* nfd[0].canonStart.add(code) */ - c=norm->nfd[0]; - otherNorm=createNorm(c); - if(otherNorm->canonStart==NULL) { - otherNorm->canonStart=uset_open(code, code); - if(otherNorm->canonStart==NULL) { - fprintf(stderr, "gennorm error: out of memory in uset_open()\n"); - exit(U_MEMORY_ALLOCATION_ERROR); - } - } else { - uset_add(otherNorm->canonStart, code); - if(!uset_contains(otherNorm->canonStart, code)) { - fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code); - exit(U_INTERNAL_PROGRAM_ERROR); - } - } - - /* for(i=1..length-1) nfd[i].unsafeStart=TRUE */ - for(i=1; infd[i])->unsafeStart=TRUE; - } - } -} - -static uint32_t -make32BitNorm(Norm *norm) { - UChar extra[100]; - const Norm *other; - uint32_t word; - int32_t i, length, beforeZero=0, count, start; - - /* - * Check for assumptions: - * - * Test that if a "true starter" (cc==0 && NF*C_YES) decomposes, - * then the decomposition also begins with a true starter. - */ - if(norm->udataCC==0) { - /* this is a starter */ - if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) { - /* a "true" NFC starter with a canonical decomposition */ - if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */ - ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */ - ) { - fprintf(stderr, - "error: true NFC starter canonical decomposition[%u] does not begin\n" - " with a true NFC starter: U+%04lx U+%04lx%s\n", - norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1], - norm->lenNFD<=2 ? "" : " ..."); - exit(U_INVALID_TABLE_FILE); - } - } - - if((norm->qcFlags&_NORM_QC_NFKC)==0) { - if(norm->lenNFKD>0) { - /* a "true" NFKC starter with a compatibility decomposition */ - if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */ - ((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */ - ) { - fprintf(stderr, - "error: true NFKC starter compatibility decomposition[%u] does not begin\n" - " with a true NFKC starter: U+%04lx U+%04lx%s\n", - norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1], - norm->lenNFKD<=2 ? "" : " ..."); - exit(U_INVALID_TABLE_FILE); - } - } else if(norm->lenNFD>0) { - /* a "true" NFKC starter with only a canonical decomposition */ - if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */ - ((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */ - ) { - fprintf(stderr, - "error: true NFKC starter canonical decomposition[%u] does not begin\n" - " with a true NFKC starter: U+%04lx U+%04lx%s\n", - norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1], - norm->lenNFD<=2 ? "" : " ..."); - exit(U_INVALID_TABLE_FILE); - } - } - } - } - - /* reset the 32-bit word and set the quick check flags */ - word=norm->qcFlags; - - /* set the UnicodeData combining class */ - word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT; - - /* set the combining flag and index */ - if(norm->combiningFlags&3) { - word|=(uint32_t)(norm->combiningFlags&3)<<6; - } - - /* set the combining index value into the extra data */ - /* 0xffff: no combining index; 0..0x7fff: combining index */ - if(norm->combiningIndex!=0xffff) { - extra[0]=norm->combiningIndex; - beforeZero=1; - } - - count=beforeZero; - - /* write the decompositions */ - if((norm->lenNFD|norm->lenNFKD)!=0) { - extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */ - - length=norm->lenNFD; - if(length>0) { - if(norm->canonBothCCs!=0) { - extra[beforeZero]|=0x80; - extra[count++]=norm->canonBothCCs; - } - start=count; - for(i=0; infd[i]); - } - extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */ - } - - length=norm->lenNFKD; - if(length>0) { - if(norm->compatBothCCs!=0) { - extra[beforeZero]|=0x8000; - extra[count++]=norm->compatBothCCs; - } - start=count; - for(i=0; infkd[i]); - } - extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */ - } - } - - /* allocate and copy the extra data */ - if(count!=0) { - UChar *p; - - if(norm->specialTag!=0) { - fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag); - exit(U_ILLEGAL_ARGUMENT_ERROR); - } - - p=(UChar *)utm_allocN(extraMem, count); - uprv_memcpy(p, extra, count*2); - - /* set the extra index, offset by beforeZero */ - word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT; - } else if(norm->specialTag!=0) { - /* set a special tag instead of an extra index */ - word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT; - } - - return word; -} - -/* turn all Norm structs into corresponding 32-bit norm values */ -static void -makeAll32() { - uint32_t *pNormData; - uint32_t n; - int32_t i, normLength, count; - - count=(int32_t)utm_countItems(normMem); - for(i=0; icanonStart!=NULL && !uset_isEmpty(norm->canonStart)) { - uint16_t *table; - int32_t c, tableLength; - UErrorCode errorCode=U_ZERO_ERROR; - - /* does the set contain exactly one code point? */ - c=usetContainsOne(norm->canonStart); - - /* add an entry to the BMP or supplementary search table */ - if(code<=0xffff) { - table=canonStartSets+_NORM_MAX_CANON_SETS; - tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; - - table[tableLength++]=(uint16_t)code; - - if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) { - /* single-code point BMP result for BMP code point */ - table[tableLength++]=(uint16_t)c; - } else { - table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop); - c=-1; - } - canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength; - } else { - table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH; - tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; - - table[tableLength++]=(uint16_t)(code>>16); - table[tableLength++]=(uint16_t)code; - - if(c>=0) { - /* single-code point result for supplementary code point */ - table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00)); - table[tableLength++]=(uint16_t)c; - } else { - table[tableLength++]=(uint16_t)canonStartSetsTop; - } - canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength; - } - - if(c<0) { - /* write a USerializedSet */ - ++canonSetsCount; - canonStartSetsTop+= - uset_serialize(norm->canonStart, - canonStartSets+canonStartSetsTop, - _NORM_MAX_CANON_SETS-canonStartSetsTop, - &errorCode); - } - canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop; - - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop); - exit(errorCode); - } - if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) { - fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n"); - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } - } -} - -/* for getSkippableFlags ---------------------------------------------------- */ - -/* combine the lead and trail code points; return <0 if they do not combine */ -static int32_t -combine(uint32_t lead, uint32_t trail) { - CombiningTriple *triples; - uint32_t i, count; - - /* search for all triples with c as lead code point */ - triples=utm_getStart(combiningTriplesMem); - count=utm_countItems(combiningTriplesMem); - - /* triples are not sorted by code point but for each lead CP there is one contiguous block */ - for(i=0; i1 && cc[%d], U+%04x, %u)\n", - (int)s[0], (int)s[1], (int)length, (int)c, cc); - exit(U_INTERNAL_PROGRAM_ERROR); - } - } - - /* try to combine/consume c, return TRUE if it is consumed */ - return combine((uint32_t)starter, c)>=0; -} - -/* does the starter s[0] combine forward with another char that is below trailCC? */ -static UBool -canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) { - if(trailCC<=1) { - /* no character will combine ahead of the trailing char of the decomposition */ - return FALSE; - } - - /* - * We are only checking skippable condition (f). - * Therefore, the original character does not have quick check flag NFC_NO (c), - * i.e., the decomposition recomposes completely back into the original code point. - * So s[0] must be a true starter with cc==0 and - * combining with following code points. - * - * Similarly, length==1 is not possible because that would be a singleton - * decomposition which is marked with NFC_NO and does not pass (c). - * - * Only a character with cc=trailCC would order after decomposition s[], - * composition would consume all of the decomposition, and here we know that - * the original char passed check d), i.e., it does not combine forward, - * therefore does not combine with anything after the decomposition is consumed. - * - * Now see if there is a character that - * 1. combines backward - * 2. has cc2 is a little harder: - * - * Since we will get different starters during recomposition, we need to - * enumerate each backward-combining character (1.) - * with cc0 && cc0 && ccspecialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) { - return 0; - } - - /* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */ - - /* - * Note: - * This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not. - * - * This means that (a)..(e) must always be derived from the runtime norm32 value, - * and (f) be checked from the auxTrie if the character is skippable per (a)..(e), - * the form is NF*C and there is a canonical decomposition (NFD_NO). - * - * (a) unassigned code points get "not skippable"==false because they - * don't have a Norm struct so they won't get here - */ - - /* (b) not skippable if cc!=0 */ - if(norm->udataCC!=0) { - return 0; /* non-zero flag for (f) only */ - } - - /* - * not NFC_Skippable if - * (c) quick check flag == NO or - * (d) combines forward or - * (e) combines back or - * (f) can change if another character is added - * - * for (f): - * For NF*C: Get corresponding decomposition, get its last starter (cc==0), - * check its composition list, - * see if any of the second code points in the list - * has cc less than the trailCC of the decomposition. - * - * For FCC: Test at runtime if the decomposition has a trailCC>1 - * -> there are characters with cc==1, they would order before the trail char - * and prevent contiguous combination with the trail char. - */ - if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 || - (norm->combiningFlags&3)!=0) { - return 0; /* non-zero flag for (f) only */ - } - if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) { - return _NORM_AUX_NFC_SKIP_F_MASK; - } - - return 0; /* skippable */ -} - -static void -makeAux() { - Norm *norm; - uint32_t *pData; - int32_t i, length; - - pData=utrie_getData(auxTrie, &length); - - for(i=0; icombiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))| - (uint32_t)norm->fncIndex; - - if(norm->unsafeStart || norm->udataCC!=0) { - pData[i]|=_NORM_AUX_UNSAFE_MASK; - } - - pData[i]|=getSkippableFlags(norm); - } -} - -/* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */ -static uint32_t U_CALLCONV -getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) { - uint32_t value, leadNorm32=0; - UChar32 limit; - UBool inBlockZero; - - limit=start+0x400; - while(start>UTRIE_SURROGATE_BLOCK_BITS) - )<<_NORM_EXTRA_SHIFT; - - return leadNorm32; -} - -/* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */ - -/* - * folding value for auxiliary data: - * store the non-zero offset in bits 9..0 (FNC bits) - * if there is any non-0 entry; - * "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points - */ -static uint32_t U_CALLCONV -getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) { - uint32_t value, oredValues; - UChar32 limit; - UBool inBlockZero; - - oredValues=0; - limit=start+0x400; - while(start>=UTRIE_SURROGATE_BLOCK_BITS; - if(offset>_NORM_AUX_FNC_MASK) { - fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n"); - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } - return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK); - } else { - return 0; - } -} - -extern void -processData() { -#if 0 - uint16_t i; -#endif - - processCombining(); - - /* canonically reorder decompositions and assign combining classes for decompositions */ - enumTrie(postParseFn, NULL); - -#if 0 - for(i=1; i<64; ++i) { - if(combineAndQC[i]) { - printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33); - } - } -#endif - - /* add hangul/jamo specials */ - setHangulJamoSpecials(); - - /* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */ - canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop; - - /* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */ - if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) { - enumTrie(makeCanonSetFn, NULL); - } - - /* clone the normalization builder trie to make the final data tries */ - if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) || - NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) || - NULL==utrie_clone(auxTrie, normTrie, NULL, 0) - ) { - fprintf(stderr, "error: unable to clone the normalization trie\n"); - exit(U_MEMORY_ALLOCATION_ERROR); - } - - /* --- finalize data for quick checks & normalization --- */ - - /* turn the Norm structs (stage2, norms) into 32-bit data words */ - makeAll32(); - - /* --- finalize data for FCD checks --- */ - - /* FCD data: take Norm.canonBothCCs and store them in the FCD table */ - makeFCD(); - - /* --- finalize auxiliary normalization data --- */ - makeAux(); - - if(beVerbose) { -#if 0 - printf("number of stage 2 entries: %ld\n", stage2Mem->index); - printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2); -#endif - printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop); - printf("combining table count: %u\n", combiningTableTop); - } -} - -/* is this a norm32 with a special index for a lead surrogate? */ -static U_INLINE UBool -isNorm32LeadSurrogate(uint32_t norm32) { - return _NORM_MIN_SPECIAL<=norm32 && norm32<_NORM_SURROGATES_TOP; -} - -/* normTrie: 32-bit trie result may contain a special extraData index with the folding offset */ -static int32_t U_CALLCONV -getFoldingNormOffset(uint32_t norm32) { - if(isNorm32LeadSurrogate(norm32)) { - return - UTRIE_BMP_INDEX_LENGTH+ - (((int32_t)norm32>>(_NORM_EXTRA_SHIFT-UTRIE_SURROGATE_BLOCK_BITS))& - (0x3ff<udataCC!=0 || - norm->combiningFlags&0x80 || + uset_contains(compositionExclusions, (UChar32)code) || hasMapping(norm->nfd[1]) ) { return TRUE; @@ -1926,7 +207,6 @@ hasOneWayMapping(uint32_t code, UBool withCompat) { } } } -#endif /* !UCONFIG_NO_NORMALIZATION */ static void writeAllMappings(FILE *f, UBool withCompat) { @@ -1938,7 +218,6 @@ writeAllMappings(FILE *f, UBool withCompat) { } else { fprintf(f, "\n# Canonical decomposition mappings\n"); } -#if !UCONFIG_NO_NORMALIZATION for(code=0; code<=0x10ffff;) { i=utrie_get32(normTrie, code, &isInBlockZero); if(isInBlockZero) { @@ -1970,7 +249,6 @@ writeAllMappings(FILE *f, UBool withCompat) { ++code; } } -#endif /* !UCONFIG_NO_NORMALIZATION */ } static void @@ -1990,357 +268,13 @@ writeNorm2(const char *dataDir) { writeNorm2TextFile(dataDir, "nfkc.txt", TRUE); } -extern void -generateData(const char *dataDir, UBool csource) { - static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000]; - - UNewDataMemory *pData; - UErrorCode errorCode=U_ZERO_ERROR; - int32_t size, dataLength; - -#if UCONFIG_NO_NORMALIZATION - - size=0; - -#else - - U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15); - U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12); - USet *set; - int32_t normTrieSize, fcdTrieSize, auxTrieSize; - - normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - - if(DO_STORE(UGENNORM_STORE_FCD)) { - fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - } else { - fcdTrieSize=0; - } - - if(DO_STORE(UGENNORM_STORE_AUX)) { - auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - } else { - auxTrieSize=0; - } - - /* move the parts of canonStartSets[] together into a contiguous block */ - if( canonStartSetsTop<_NORM_MAX_CANON_SETS && - canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0 - ) { - uprv_memmove(canonStartSets+canonStartSetsTop, - canonStartSets+_NORM_MAX_CANON_SETS, - canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2); - } - canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]; - - if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) && - canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0 - ) { - uprv_memmove(canonStartSets+canonStartSetsTop, - canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH, - canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2); - } - canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; - - /* create the normalization exclusion sets */ - /* - * nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]] - * but we cannot use NFD_QC from the pattern because that would require - * unorm.icu which we are just going to generate. - * Therefore we have manually collected nfdQCNoSet and intersect Ideographic - * with that. - */ - U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15); - U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12); - - canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop; - set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - uset_retainAll(set, nfdQCNoSet); - if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) { - uset_clear(set); - } - canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - uset_close(set); - - canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop; - set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) { - uset_clear(set); - } - canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode)); - exit(errorCode); - } - uset_close(set); - - canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop; - - /* make sure that the FCD trie is 4-aligned */ - if((utm_countItems(extraMem)+combiningTableTop)&1) { - combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */ - } - - /* pad canonStartSets to 4-alignment, too */ - if(canonStartSetsTop&1) { - canonStartSets[canonStartSetsTop++]=0x1235; - } - - size= - _NORM_INDEX_TOP*4+ - normTrieSize+ - utm_countItems(extraMem)*2+ - combiningTableTop*2+ - fcdTrieSize+ - auxTrieSize+ - canonStartSetsTop*2; - - if(beVerbose) { - printf("size of normalization trie %5u bytes\n", (int)normTrieSize); - printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem)); - printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]); - printf("size of combining table %5u uint16_t\n", combiningTableTop); - printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize); - printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize); - printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop); - printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP); - printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP); - printf(" number of sets %5d\n", (int)canonSetsCount); - printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]); - printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]); - printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]); - printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size); - } - - indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize; - indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem); - - indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop; - indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop; - indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop); - indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop); - - /* the quick check minimum code points are already set */ - - indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize; - indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize; - indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop; - -#endif - - if(csource) { -#if UCONFIG_NO_NORMALIZATION - /* no csource for dummy mode..? */ - fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n"); - exit(1); -#else - /* write .c file for hardcoded data */ - UTrie normRuntimeTrie={ NULL }, fcdRuntimeTrie={ NULL }, auxRuntimeTrie={ NULL }; - UTrie2 *normRuntimeTrie2, *fcdRuntimeTrie2=NULL, *auxRuntimeTrie2=NULL; - FILE *f; - - utrie_unserialize(&normRuntimeTrie, normTrieBlock, normTrieSize, &errorCode); - normRuntimeTrie.getFoldingOffset=getFoldingNormOffset; - if(fcdTrieSize>0) { - utrie_unserialize(&fcdRuntimeTrie, fcdTrieBlock, fcdTrieSize, &errorCode); - } - if(auxTrieSize>0) { - utrie_unserialize(&auxRuntimeTrie, auxTrieBlock, auxTrieSize, &errorCode); - auxRuntimeTrie.getFoldingOffset=getFoldingAuxOffset; - } - if(U_FAILURE(errorCode)) { - fprintf( - stderr, - "gennorm error: failed to utrie_unserialize() one of the tries - %s\n", - u_errorName(errorCode)); - exit(errorCode); - } - - /* use UTrie2 */ - normRuntimeTrie2=utrie2_fromUTrie(&normRuntimeTrie, 0, &errorCode); - if(fcdTrieSize>0) { - fcdRuntimeTrie2=utrie2_fromUTrie(&fcdRuntimeTrie, 0, &errorCode); - } - if(auxTrieSize>0) { - auxRuntimeTrie2=utrie2_fromUTrie(&auxRuntimeTrie, 0, &errorCode); - } - if(U_FAILURE(errorCode)) { - fprintf( - stderr, - "gennorm error: utrie2_fromUTrie() failed - %s\n", - u_errorName(errorCode)); - exit(errorCode); - } - if(auxTrieSize>0) { - /* delete lead surrogate code unit values */ - UChar lead; - auxRuntimeTrie2=utrie2_cloneAsThawed(auxRuntimeTrie2, &errorCode); - for(lead=0xd800; lead<0xdc00; ++lead) { - utrie2_set32ForLeadSurrogateCodeUnit(auxRuntimeTrie2, lead, auxRuntimeTrie2->initialValue, &errorCode); - } - utrie2_freeze(auxRuntimeTrie2, UTRIE2_16_VALUE_BITS, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf( - stderr, - "gennorm error: deleting lead surrogate code unit values failed - %s\n", - u_errorName(errorCode)); - exit(errorCode); - } - } - - f=usrc_create(dataDir, "unorm_props_data.c"); - if(f!=NULL) { - /* unused - usrc_writeArray(f, - "static const UVersionInfo formatVersion={ ", - dataInfo.formatVersion, 8, 4, - " };\n\n"); - */ - usrc_writeArray(f, - "static const UVersionInfo dataVersion={ ", - dataInfo.dataVersion, 8, 4, - " };\n\n"); - usrc_writeArray(f, - "static const int32_t indexes[_NORM_INDEX_TOP]={\n", - indexes, 32, _NORM_INDEX_TOP, - "\n};\n\n"); - usrc_writeUTrie2Arrays(f, - "static const uint16_t normTrie_index[%ld]={\n", - "static const uint32_t normTrie_data32[%ld]={\n", - normRuntimeTrie2, - "\n};\n\n"); - usrc_writeUTrie2Struct(f, - "static const UTrie2 normTrie={\n", - normRuntimeTrie2, "normTrie_index", "normTrie_data32", - "};\n\n"); - usrc_writeArray(f, - "static const uint16_t extraData[%ld]={\n", - utm_getStart(extraMem), 16, utm_countItems(extraMem), - "\n};\n\n"); - usrc_writeArray(f, - "static const uint16_t combiningTable[%ld]={\n", - combiningTable, 16, combiningTableTop, - "\n};\n\n"); - if(fcdTrieSize>0) { - usrc_writeUTrie2Arrays(f, - "static const uint16_t fcdTrie_index[%ld]={\n", NULL, - fcdRuntimeTrie2, - "\n};\n\n"); - usrc_writeUTrie2Struct(f, - "static const UTrie2 fcdTrie={\n", - fcdRuntimeTrie2, "fcdTrie_index", NULL, - "};\n\n"); - } else { - fputs( "static const UTrie2 fcdTrie={ NULL };\n\n", f); - } - if(auxTrieSize>0) { - usrc_writeUTrie2Arrays(f, - "static const uint16_t auxTrie_index[%ld]={\n", NULL, - auxRuntimeTrie2, - "\n};\n\n"); - usrc_writeUTrie2Struct(f, - "static const UTrie2 auxTrie={\n", - auxRuntimeTrie2, "auxTrie_index", NULL, - "};\n\n"); - } else { - fputs( "static const UTrie2 auxTrie={ NULL };\n\n", f); - } - usrc_writeArray(f, - "static const uint16_t canonStartSets[%ld]={\n", - canonStartSets, 16, canonStartSetsTop, - "\n};\n\n"); - fclose(f); - } - utrie2_close(normRuntimeTrie2); - utrie2_close(fcdRuntimeTrie2); - utrie2_close(auxRuntimeTrie2); -#endif - } else { - /* write the data */ - pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo, - haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode); - exit(errorCode); - } - -#if !UCONFIG_NO_NORMALIZATION - - udata_writeBlock(pData, indexes, sizeof(indexes)); - udata_writeBlock(pData, normTrieBlock, normTrieSize); - udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2); - udata_writeBlock(pData, combiningTable, combiningTableTop*2); - udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize); - udata_writeBlock(pData, auxTrieBlock, auxTrieSize); - udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2); - -#endif - - /* finish up */ - dataLength=udata_finish(pData, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode); - exit(errorCode); - } - - if(dataLength!=size) { - fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n", - (long)dataLength, (long)size); - exit(U_INTERNAL_PROGRAM_ERROR); - } - } -} - -#if !UCONFIG_NO_NORMALIZATION - extern void cleanUpData(void) { - int32_t i, count; - - count=utm_countItems(normMem); - for(i=0; i