b579ec4770
X-SVN-Rev: 23435
2115 lines
68 KiB
C
2115 lines
68 KiB
C
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 1999-2008, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
* file name: store.c
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2001may25
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* Store Unicode normalization data in a memory-mappable file.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/ustring.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
#include "filestrm.h"
|
|
#include "unicode/udata.h"
|
|
#include "utrie.h"
|
|
#include "unicode/uset.h"
|
|
#include "toolutil.h"
|
|
#include "unewdata.h"
|
|
#include "writesrc.h"
|
|
#include "unormimp.h"
|
|
#include "gennorm.h"
|
|
|
|
#define DO_DEBUG_OUT 0
|
|
|
|
/*
 * Number of elements of a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves as a
 * single primary expression wherever it is used.
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
|
|
|
|
/*
|
|
* The new implementation of the normalization code loads its data from
|
|
* unorm.icu, which is generated with this gennorm tool.
|
|
* The format of that file is described in unormimp.h .
|
|
*/
|
|
|
|
/* file data ---------------------------------------------------------------- */
|
|
|
|
#if UCONFIG_NO_NORMALIZATION
|
|
|
|
/* dummy UDataInfo cf. udata.h */
|
|
static UDataInfo dataInfo = {
|
|
sizeof(UDataInfo),
|
|
0,
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
U_CHARSET_FAMILY,
|
|
U_SIZEOF_UCHAR,
|
|
0,
|
|
|
|
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
|
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
|
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
|
};
|
|
|
|
#else
|
|
|
|
/* UDataInfo cf. udata.h */
|
|
static UDataInfo dataInfo={
|
|
sizeof(UDataInfo),
|
|
0,
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
U_CHARSET_FAMILY,
|
|
U_SIZEOF_UCHAR,
|
|
0,
|
|
|
|
{ 0x4e, 0x6f, 0x72, 0x6d }, /* dataFormat="Norm" */
|
|
{ 2, 3, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
|
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
|
|
};
|
|
|
|
extern void
|
|
setUnicodeVersion(const char *v) {
|
|
UVersionInfo version;
|
|
u_versionFromString(version, v);
|
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
|
}
|
|
|
|
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
|
|
|
|
/* builder data ------------------------------------------------------------- */
|
|
|
|
/* modularization flags, see gennorm.h (default to "store everything") */
|
|
uint32_t gStoreFlags=0xffffffff;
|
|
|
|
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
|
|
|
|
static UNewTrie
|
|
*normTrie,
|
|
*norm32Trie,
|
|
*fcdTrie,
|
|
*auxTrie;
|
|
|
|
static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
|
|
|
|
static Norm *norms;
|
|
|
|
/*
|
|
* set a flag for each code point that was seen in decompositions -
|
|
* avoid decomposing code points that have not been used before
|
|
*/
|
|
static uint32_t haveSeenFlags[256];
|
|
|
|
/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
|
|
static USet *nfdQCNoSet;
|
|
|
|
/* see addCombiningCP() for details */
|
|
static uint32_t combiningCPs[2000];
|
|
|
|
/*
|
|
* after processCombining() this contains for each code point in combiningCPs[]
|
|
* the runtime combining index
|
|
*/
|
|
static uint16_t combiningIndexes[2000];
|
|
|
|
/* section limits for combiningCPs[], see addCombiningCP() */
|
|
static uint16_t combineFwdTop=0, combineBothTop=0, combineBackTop=0;
|
|
|
|
/**
|
|
* Structure for a triple of code points, stored in combiningTriplesMem.
|
|
* The lead and trail code points combine into the combined one,
|
|
* i.e., there is a canonical decomposition of combined-> <lead, trail>.
|
|
*
|
|
* Before processCombining() is called, leadIndex and trailIndex are 0.
|
|
* After processCombining(), they contain the indexes of the lead and trail
|
|
* code point in the combiningCPs[] array.
|
|
* They are then sorted by leadIndex, then trailIndex.
|
|
* They are not sorted by code points.
|
|
*/
|
|
typedef struct CombiningTriple {
    /* positions of lead and trail in combiningCPs[], filled in by processCombining() */
    uint16_t leadIndex;
    uint16_t trailIndex;

    /* the code points themselves: lead + trail compose into combined */
    uint32_t lead;
    uint32_t trail;
    uint32_t combined;
} CombiningTriple;
|
|
|
|
/* 15b in the combining index -> <=0x8000 uint16_t values in the combining table */
|
|
static uint16_t combiningTable[0x8000];
|
|
static uint16_t combiningTableTop=0;
|
|
|
|
#define _NORM_MAX_SET_SEARCH_TABLE_LENGTH 0x4000
|
|
static uint16_t canonStartSets[_NORM_MAX_CANON_SETS+2*_NORM_MAX_SET_SEARCH_TABLE_LENGTH
|
|
+10000]; /* +10000 for exclusion sets */
|
|
static int32_t canonStartSetsTop=_NORM_SET_INDEX_TOP;
|
|
static int32_t canonSetsCount=0;
|
|
|
|
/* allocate and initialize a Norm unit */
|
|
static Norm *
|
|
allocNorm() {
|
|
/* allocate Norm */
|
|
Norm *p=(Norm *)utm_alloc(normMem);
|
|
/*
|
|
* The combiningIndex must not be initialized to 0 because 0 is the
|
|
* combiningIndex of the first forward-combining character.
|
|
*/
|
|
p->combiningIndex=0xffff;
|
|
return p;
|
|
}
|
|
|
|
extern void
|
|
init() {
|
|
uint16_t *p16;
|
|
|
|
normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
|
uprv_memset(normTrie, 0, sizeof(UNewTrie));
|
|
norm32Trie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
|
uprv_memset(norm32Trie, 0, sizeof(UNewTrie));
|
|
fcdTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
|
uprv_memset(fcdTrie, 0, sizeof(UNewTrie));
|
|
auxTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
|
uprv_memset(auxTrie, 0, sizeof(UNewTrie));
|
|
|
|
/* initialize the two tries */
|
|
if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
|
|
fprintf(stderr, "error: failed to initialize tries\n");
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
}
|
|
|
|
/* allocate Norm structures and reset the first one */
|
|
normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
|
|
norms=allocNorm();
|
|
|
|
/* allocate UTF-32 string memory */
|
|
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
|
|
|
|
/* reset all "have seen" flags */
|
|
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
|
|
|
|
/* open an empty set */
|
|
nfdQCNoSet=uset_open(1, 0);
|
|
|
|
/* allocate extra data memory for UTF-16 decomposition strings and other values */
|
|
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
|
|
/* initialize the extraMem counter for the top of FNC strings */
|
|
p16=(uint16_t *)utm_alloc(extraMem);
|
|
*p16=1;
|
|
|
|
/* allocate temporary memory for combining triples */
|
|
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
|
|
|
|
/* set the minimum code points for no/maybe quick check values to the end of the BMP */
|
|
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
|
|
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=0xffff;
|
|
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=0xffff;
|
|
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=0xffff;
|
|
|
|
/* preset the indexes portion of canonStartSets */
|
|
uprv_memset(canonStartSets, 0, _NORM_SET_INDEX_TOP*2);
|
|
}
|
|
|
|
/*
|
|
* get or create a Norm unit;
|
|
* get or create the intermediate trie entries for it as well
|
|
*/
|
|
static Norm *
|
|
createNorm(uint32_t code) {
|
|
Norm *p;
|
|
uint32_t i;
|
|
|
|
i=utrie_get32(normTrie, (UChar32)code, NULL);
|
|
if(i!=0) {
|
|
p=norms+i;
|
|
} else {
|
|
/* allocate Norm */
|
|
p=allocNorm();
|
|
if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
|
|
fprintf(stderr, "error: too many normalization entries\n");
|
|
exit(U_BUFFER_OVERFLOW_ERROR);
|
|
}
|
|
}
|
|
return p;
|
|
}
|
|
|
|
/* get an existing Norm unit */
|
|
static Norm *
|
|
getNorm(uint32_t code) {
|
|
uint32_t i;
|
|
|
|
i=utrie_get32(normTrie, (UChar32)code, NULL);
|
|
if(i==0) {
|
|
return NULL;
|
|
}
|
|
return norms+i;
|
|
}
|
|
|
|
/* get the canonical combining class of a character */
|
|
static uint8_t
|
|
getCCFromCP(uint32_t code) {
|
|
Norm *norm=getNorm(code);
|
|
if(norm==NULL) {
|
|
return 0;
|
|
} else {
|
|
return norm->udataCC;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* enumerate all code points with their Norm structs and call a function for each
|
|
* return the number of code points with data
|
|
*/
|
|
static uint32_t
|
|
enumTrie(EnumTrieFn *fn, void *context) {
|
|
uint32_t count, i;
|
|
UChar32 code;
|
|
UBool isInBlockZero;
|
|
|
|
count=0;
|
|
for(code=0; code<=0x10ffff;) {
|
|
i=utrie_get32(normTrie, code, &isInBlockZero);
|
|
if(isInBlockZero) {
|
|
code+=UTRIE_DATA_BLOCK_LENGTH;
|
|
} else {
|
|
if(i!=0) {
|
|
fn(context, (uint32_t)code, norms+i);
|
|
++count;
|
|
}
|
|
++code;
|
|
}
|
|
}
|
|
return count;
|
|
}
|
|
|
|
static void
|
|
setHaveSeenString(const uint32_t *s, int32_t length) {
|
|
uint32_t c;
|
|
|
|
while(length>0) {
|
|
c=*s++;
|
|
haveSeenFlags[(c>>5)&0xff]|=(1<<(c&0x1f));
|
|
--length;
|
|
}
|
|
}
|
|
|
|
/*
 * Non-zero if the flag for code point c is set in haveSeenFlags[].
 * Uses an unsigned 1 for the mask so that bit 31 is tested without
 * shifting into the sign bit of a signed int (undefined behavior).
 */
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&((uint32_t)1<<((c)&0x1f)))
|
|
|
|
/* handle combining data ---------------------------------------------------- */
|
|
|
|
/*
|
|
* Insert an entry into combiningCPs[] for the new code point code with its flags.
|
|
* The flags indicate if code combines forward, backward, or both.
|
|
*
|
|
* combiningCPs[] contains three sections:
|
|
* 1. code points that combine forward
|
|
* 2. code points that combine forward and backward
|
|
* 3. code points that combine backward
|
|
*
|
|
* Search for code in the entire array.
|
|
* If it is found and already is in the right section (old flags==new flags)
|
|
* then we are done.
|
|
* If it is found but the flags are different, then remove it,
|
|
* union the old and new flags, and reinsert it into its correct section.
|
|
* If it is not found, then just insert it.
|
|
*
|
|
* Within each section, the code points are not sorted.
|
|
*/
|
|
static void
|
|
addCombiningCP(uint32_t code, uint8_t flags) {
|
|
uint32_t newEntry;
|
|
uint16_t i;
|
|
|
|
newEntry=code|((uint32_t)flags<<24);
|
|
|
|
/* search for this code point */
|
|
for(i=0; i<combineBackTop; ++i) {
|
|
if(code==(combiningCPs[i]&0xffffff)) {
|
|
/* found it */
|
|
if(newEntry==combiningCPs[i]) {
|
|
return; /* no change */
|
|
}
|
|
|
|
/* combine the flags, remove the old entry from the old place, and insert the new one */
|
|
newEntry|=combiningCPs[i];
|
|
if(i!=--combineBackTop) {
|
|
uprv_memmove(combiningCPs+i, combiningCPs+i+1, (combineBackTop-i)*4);
|
|
}
|
|
if(i<combineBothTop) {
|
|
--combineBothTop;
|
|
}
|
|
if(i<combineFwdTop) {
|
|
--combineFwdTop;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* not found or modified, insert it */
|
|
if(combineBackTop>=sizeof(combiningCPs)/4) {
|
|
fprintf(stderr, "error: gennorm combining code points - trying to use more than %ld units\n",
|
|
(long)(sizeof(combiningCPs)/4));
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
}
|
|
|
|
/* set i to the insertion point */
|
|
flags=(uint8_t)(newEntry>>24);
|
|
if(flags==1) {
|
|
i=combineFwdTop++;
|
|
++combineBothTop;
|
|
} else if(flags==3) {
|
|
i=combineBothTop++;
|
|
} else /* flags==2 */ {
|
|
i=combineBackTop;
|
|
}
|
|
|
|
/* move the following code points up one and insert newEntry at i */
|
|
if(i<combineBackTop) {
|
|
uprv_memmove(combiningCPs+i+1, combiningCPs+i, (combineBackTop-i)*4);
|
|
}
|
|
combiningCPs[i]=newEntry;
|
|
|
|
/* finally increment the total counter */
|
|
++combineBackTop;
|
|
}
|
|
|
|
/**
|
|
* Find the index in combiningCPs[] where code point code is stored.
|
|
* @param code code point to look for
|
|
* @param isLead is code a forward combining code point?
|
|
* @return index in combiningCPs[] where code is stored
|
|
*/
|
|
static uint16_t
|
|
findCombiningCP(uint32_t code, UBool isLead) {
|
|
uint16_t i, limit;
|
|
|
|
if(isLead) {
|
|
i=0;
|
|
limit=combineBothTop;
|
|
} else {
|
|
i=combineFwdTop;
|
|
limit=combineBackTop;
|
|
}
|
|
|
|
/* search for this code point */
|
|
for(; i<limit; ++i) {
|
|
if(code==(combiningCPs[i]&0xffffff)) {
|
|
/* found it */
|
|
return i;
|
|
}
|
|
}
|
|
|
|
/* not found */
|
|
return 0xffff;
|
|
}
|
|
|
|
static void
|
|
addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
|
|
CombiningTriple *triple;
|
|
|
|
if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
return;
|
|
}
|
|
|
|
/*
|
|
* set combiningFlags for the two code points
|
|
* do this after decomposition so that getNorm() above returns NULL
|
|
* if we do not have actual sub-decomposition data for the initial NFD here
|
|
*/
|
|
createNorm(lead)->combiningFlags|=1; /* combines forward */
|
|
createNorm(trail)->combiningFlags|=2; /* combines backward */
|
|
|
|
addCombiningCP(lead, 1);
|
|
addCombiningCP(trail, 2);
|
|
|
|
triple=(CombiningTriple *)utm_alloc(combiningTriplesMem);
|
|
triple->lead=lead;
|
|
triple->trail=trail;
|
|
triple->combined=combined;
|
|
}
|
|
|
|
static int
|
|
compareTriples(const void *l, const void *r) {
|
|
int diff;
|
|
diff=(int)((CombiningTriple *)l)->leadIndex-
|
|
(int)((CombiningTriple *)r)->leadIndex;
|
|
if(diff==0) {
|
|
diff=(int)((CombiningTriple *)l)->trailIndex-
|
|
(int)((CombiningTriple *)r)->trailIndex;
|
|
}
|
|
return diff;
|
|
}
|
|
|
|
static void
|
|
processCombining() {
|
|
CombiningTriple *triples;
|
|
uint16_t *p;
|
|
uint32_t combined;
|
|
uint16_t i, j, count, tableTop, finalIndex, combinesFwd;
|
|
|
|
triples=utm_getStart(combiningTriplesMem);
|
|
|
|
/* add lead and trail indexes to the triples for sorting */
|
|
count=(uint16_t)utm_countItems(combiningTriplesMem);
|
|
for(i=0; i<count; ++i) {
|
|
/* findCombiningCP() must always find the code point */
|
|
triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
|
|
triples[i].trailIndex=findCombiningCP(triples[i].trail, FALSE);
|
|
}
|
|
|
|
/* sort them by leadIndex, trailIndex */
|
|
qsort(triples, count, sizeof(CombiningTriple), compareTriples);
|
|
|
|
/* calculate final combining indexes and store them in the Norm entries */
|
|
tableTop=0;
|
|
j=0; /* triples counter */
|
|
|
|
/* first, combining indexes of fwd/both characters are indexes into the combiningTable */
|
|
for(i=0; i<combineBothTop; ++i) {
|
|
/* start a new table */
|
|
|
|
/* assign combining index */
|
|
createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=tableTop;
|
|
|
|
/* calculate the length of the combining data for this lead code point in the combiningTable */
|
|
while(j<count && i==triples[j].leadIndex) {
|
|
/* count 2 to 3 16-bit units per composition entry (back-index, code point) */
|
|
combined=triples[j++].combined;
|
|
if(combined<=0x1fff) {
|
|
tableTop+=2;
|
|
} else {
|
|
tableTop+=3;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* second, combining indexes of back-only characters are simply incremented from here to be unique */
|
|
finalIndex=tableTop;
|
|
for(; i<combineBackTop; ++i) {
|
|
createNorm(combiningCPs[i]&0xffffff)->combiningIndex=combiningIndexes[i]=finalIndex++;
|
|
}
|
|
|
|
/* it must be finalIndex<=0x8000 because bit 15 is used in combiningTable as an end-for-this-lead marker */
|
|
if(finalIndex>0x8000) {
|
|
fprintf(stderr, "error: gennorm combining table - trying to use %u units, more than the %ld units available\n",
|
|
tableTop, (long)(sizeof(combiningTable)/4));
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
}
|
|
|
|
combiningTableTop=tableTop;
|
|
|
|
/* store the combining data in the combiningTable, with the final indexes from above */
|
|
p=combiningTable;
|
|
j=0; /* triples counter */
|
|
|
|
/*
|
|
* this is essentially the same loop as above, but
|
|
* it writes the table data instead of calculating and setting the final indexes;
|
|
* it is necessary to have two passes so that all the final indexes are known before
|
|
* they are written into the table
|
|
*/
|
|
for(i=0; i<combineBothTop; ++i) {
|
|
/* start a new table */
|
|
|
|
combined=0; /* avoid compiler warning */
|
|
|
|
/* store the combining data for this lead code point in the combiningTable */
|
|
while(j<count && i==triples[j].leadIndex) {
|
|
Norm *normPtr;
|
|
finalIndex=combiningIndexes[triples[j].trailIndex];
|
|
combined=triples[j++].combined;
|
|
normPtr = getNorm(combined);
|
|
|
|
if (normPtr == NULL) {
|
|
fprintf(stderr, "error: processCombining did not get expected result. combined=%d\n", combined);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
|
|
/* is combined a starter? (i.e., cc==0 && combines forward) */
|
|
combinesFwd=(uint16_t)((normPtr->combiningFlags&1)<<13);
|
|
|
|
*p++=finalIndex;
|
|
if(combined<=0x1fff) {
|
|
*p++=(uint16_t)(combinesFwd|combined);
|
|
} else if(combined<=0xffff) {
|
|
*p++=(uint16_t)(0x8000|combinesFwd);
|
|
*p++=(uint16_t)combined;
|
|
} else {
|
|
*p++=(uint16_t)(0xc000|combinesFwd|((combined-0x10000)>>10));
|
|
*p++=(uint16_t)(0xdc00|(combined&0x3ff));
|
|
}
|
|
}
|
|
|
|
/* set a marker on the last final trail index in this lead's table */
|
|
if(combined<=0x1fff) {
|
|
*(p-2)|=0x8000;
|
|
} else {
|
|
*(p-3)|=0x8000;
|
|
}
|
|
}
|
|
|
|
/* post condition: tableTop==(p-combiningTable) */
|
|
}
|
|
|
|
/* processing incoming normalization data ----------------------------------- */
|
|
|
|
/*
|
|
* Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
|
|
* c must be a Hangul syllable code point.
|
|
*/
|
|
static void
|
|
getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3]) {
|
|
/* Hangul syllable: decompose algorithmically */
|
|
uint32_t c2;
|
|
uint8_t length;
|
|
|
|
uprv_memset(pHangulNorm, 0, sizeof(Norm));
|
|
|
|
c-=HANGUL_BASE;
|
|
|
|
c2=c%JAMO_T_COUNT;
|
|
c/=JAMO_T_COUNT;
|
|
if(c2>0) {
|
|
hangulBuffer[2]=JAMO_T_BASE+c2;
|
|
length=3;
|
|
} else {
|
|
hangulBuffer[2]=0;
|
|
length=2;
|
|
}
|
|
|
|
hangulBuffer[1]=JAMO_V_BASE+c%JAMO_V_COUNT;
|
|
hangulBuffer[0]=JAMO_L_BASE+c/JAMO_V_COUNT;
|
|
|
|
pHangulNorm->nfd=hangulBuffer;
|
|
pHangulNorm->lenNFD=length;
|
|
if(DO_STORE(UGENNORM_STORE_COMPAT)) {
|
|
pHangulNorm->nfkd=hangulBuffer;
|
|
pHangulNorm->lenNFKD=length;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* decompose the one decomposition further, may generate two decompositions
|
|
* apply all previous characters' decompositions to this one
|
|
*/
|
|
static void
|
|
decompStoreNewNF(uint32_t code, Norm *norm) {
|
|
uint32_t nfd[40], nfkd[40], hangulBuffer[3];
|
|
Norm hangulNorm;
|
|
|
|
uint32_t *s32;
|
|
Norm *p;
|
|
uint32_t c;
|
|
int32_t i, length;
|
|
uint8_t lenNFD=0, lenNFKD=0;
|
|
UBool changedNFD=FALSE, changedNFKD=FALSE;
|
|
|
|
if((length=norm->lenNFD)!=0) {
|
|
/* always allocate the original string */
|
|
changedNFD=TRUE;
|
|
s32=norm->nfd;
|
|
} else if((length=norm->lenNFKD)!=0) {
|
|
/* always allocate the original string */
|
|
changedNFKD=TRUE;
|
|
s32=norm->nfkd;
|
|
} else {
|
|
/* no decomposition here, nothing to do */
|
|
return;
|
|
}
|
|
|
|
/* decompose each code point */
|
|
for(i=0; i<length; ++i) {
|
|
c=s32[i];
|
|
p=getNorm(c);
|
|
if(p==NULL) {
|
|
if(HANGUL_BASE<=c && c<(HANGUL_BASE+HANGUL_COUNT)) {
|
|
getHangulDecomposition(c, &hangulNorm, hangulBuffer);
|
|
p=&hangulNorm;
|
|
} else {
|
|
/* no data, no decomposition */
|
|
nfd[lenNFD++]=c;
|
|
nfkd[lenNFKD++]=c;
|
|
continue;
|
|
}
|
|
}
|
|
|
|
/* canonically decompose c */
|
|
if(changedNFD) {
|
|
if(p->lenNFD!=0) {
|
|
uprv_memcpy(nfd+lenNFD, p->nfd, p->lenNFD*4);
|
|
lenNFD+=p->lenNFD;
|
|
} else {
|
|
nfd[lenNFD++]=c;
|
|
}
|
|
}
|
|
|
|
/* compatibility-decompose c */
|
|
if(p->lenNFKD!=0) {
|
|
uprv_memcpy(nfkd+lenNFKD, p->nfkd, p->lenNFKD*4);
|
|
lenNFKD+=p->lenNFKD;
|
|
changedNFKD=TRUE;
|
|
} else if(p->lenNFD!=0) {
|
|
uprv_memcpy(nfkd+lenNFKD, p->nfd, p->lenNFD*4);
|
|
lenNFKD+=p->lenNFD;
|
|
/*
|
|
* not changedNFKD=TRUE;
|
|
* so that we do not store a new nfkd if there was no nfkd string before
|
|
* and we only see canonical decompositions
|
|
*/
|
|
} else {
|
|
nfkd[lenNFKD++]=c;
|
|
}
|
|
}
|
|
|
|
/* assume that norm->lenNFD==1 or ==2 */
|
|
if(norm->lenNFD==2 && !(norm->combiningFlags&0x80)) {
|
|
addCombiningTriple(s32[0], s32[1], code);
|
|
}
|
|
|
|
if(changedNFD) {
|
|
if(lenNFD!=0) {
|
|
s32=utm_allocN(utf32Mem, lenNFD);
|
|
uprv_memcpy(s32, nfd, lenNFD*4);
|
|
} else {
|
|
s32=NULL;
|
|
}
|
|
norm->lenNFD=lenNFD;
|
|
norm->nfd=s32;
|
|
setHaveSeenString(nfd, lenNFD);
|
|
}
|
|
if(changedNFKD) {
|
|
if(lenNFKD!=0) {
|
|
s32=utm_allocN(utf32Mem, lenNFKD);
|
|
uprv_memcpy(s32, nfkd, lenNFKD*4);
|
|
} else {
|
|
s32=NULL;
|
|
}
|
|
norm->lenNFKD=lenNFKD;
|
|
norm->nfkd=s32;
|
|
setHaveSeenString(nfkd, lenNFKD);
|
|
}
|
|
}
|
|
|
|
typedef struct DecompSingle {
|
|
uint32_t c;
|
|
Norm *norm;
|
|
} DecompSingle;
|
|
|
|
/*
|
|
* apply this one character's decompositions (there is at least one!) to
|
|
* all previous characters' decompositions to decompose them further
|
|
*/
|
|
static void
|
|
decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
|
|
uint32_t nfd[40], nfkd[40];
|
|
uint32_t *s32;
|
|
DecompSingle *me=(DecompSingle *)context;
|
|
uint32_t c, myC;
|
|
int32_t i, length;
|
|
uint8_t lenNFD=0, lenNFKD=0, myLenNFD, myLenNFKD;
|
|
UBool changedNFD=FALSE, changedNFKD=FALSE;
|
|
|
|
/* get the new character's data */
|
|
myC=me->c;
|
|
myLenNFD=me->norm->lenNFD;
|
|
myLenNFKD=me->norm->lenNFKD;
|
|
/* assume that myC has at least one decomposition */
|
|
|
|
if((length=norm->lenNFD)!=0 && myLenNFD!=0) {
|
|
/* apply NFD(myC) to norm->nfd */
|
|
s32=norm->nfd;
|
|
for(i=0; i<length; ++i) {
|
|
c=s32[i];
|
|
if(c==myC) {
|
|
uprv_memcpy(nfd+lenNFD, me->norm->nfd, myLenNFD*4);
|
|
lenNFD+=myLenNFD;
|
|
changedNFD=TRUE;
|
|
} else {
|
|
nfd[lenNFD++]=c;
|
|
}
|
|
}
|
|
}
|
|
|
|
if((length=norm->lenNFKD)!=0) {
|
|
/* apply NFD(myC) and NFKD(myC) to norm->nfkd */
|
|
s32=norm->nfkd;
|
|
for(i=0; i<length; ++i) {
|
|
c=s32[i];
|
|
if(c==myC) {
|
|
if(myLenNFKD!=0) {
|
|
uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
|
|
lenNFKD+=myLenNFKD;
|
|
} else /* assume myLenNFD!=0 */ {
|
|
uprv_memcpy(nfkd+lenNFKD, me->norm->nfd, myLenNFD*4);
|
|
lenNFKD+=myLenNFD;
|
|
}
|
|
changedNFKD=TRUE;
|
|
} else {
|
|
nfkd[lenNFKD++]=c;
|
|
}
|
|
}
|
|
} else if((length=norm->lenNFD)!=0 && myLenNFKD!=0) {
|
|
/* apply NFKD(myC) to norm->nfd, forming a new norm->nfkd */
|
|
s32=norm->nfd;
|
|
for(i=0; i<length; ++i) {
|
|
c=s32[i];
|
|
if(c==myC) {
|
|
uprv_memcpy(nfkd+lenNFKD, me->norm->nfkd, myLenNFKD*4);
|
|
lenNFKD+=myLenNFKD;
|
|
changedNFKD=TRUE;
|
|
} else {
|
|
nfkd[lenNFKD++]=c;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* set the new decompositions, forget the old ones */
|
|
if(changedNFD) {
|
|
if(lenNFD!=0) {
|
|
if(lenNFD>norm->lenNFD) {
|
|
s32=utm_allocN(utf32Mem, lenNFD);
|
|
} else {
|
|
s32=norm->nfd;
|
|
}
|
|
uprv_memcpy(s32, nfd, lenNFD*4);
|
|
} else {
|
|
s32=NULL;
|
|
}
|
|
norm->lenNFD=lenNFD;
|
|
norm->nfd=s32;
|
|
}
|
|
if(changedNFKD) {
|
|
if(lenNFKD!=0) {
|
|
if(lenNFKD>norm->lenNFKD) {
|
|
s32=utm_allocN(utf32Mem, lenNFKD);
|
|
} else {
|
|
s32=norm->nfkd;
|
|
}
|
|
uprv_memcpy(s32, nfkd, lenNFKD*4);
|
|
} else {
|
|
s32=NULL;
|
|
}
|
|
norm->lenNFKD=lenNFKD;
|
|
norm->nfkd=s32;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* process the data for one code point listed in UnicodeData;
|
|
* UnicodeData itself never maps a code point to both NFD and NFKD
|
|
*/
|
|
extern void
|
|
storeNorm(uint32_t code, Norm *norm) {
|
|
DecompSingle decompSingle;
|
|
Norm *p;
|
|
|
|
if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
|
|
/* ignore compatibility decomposition */
|
|
norm->lenNFKD=0;
|
|
}
|
|
|
|
/* copy existing derived normalization properties */
|
|
p=createNorm(code);
|
|
norm->qcFlags=p->qcFlags;
|
|
norm->combiningFlags=p->combiningFlags;
|
|
norm->fncIndex=p->fncIndex;
|
|
|
|
/* process the decomposition if there is one here */
|
|
if((norm->lenNFD|norm->lenNFKD)!=0) {
|
|
/* decompose this one decomposition further, may generate two decompositions */
|
|
decompStoreNewNF(code, norm);
|
|
|
|
/* has this code point been used in previous decompositions? */
|
|
if(HAVE_SEEN(code)) {
|
|
/* use this decomposition to decompose other decompositions further */
|
|
decompSingle.c=code;
|
|
decompSingle.norm=norm;
|
|
enumTrie(decompWithSingleFn, &decompSingle);
|
|
}
|
|
}
|
|
|
|
/* store the data */
|
|
uprv_memcpy(p, norm, sizeof(Norm));
|
|
}
|
|
|
|
extern void
|
|
setQCFlags(uint32_t code, uint8_t qcFlags) {
|
|
if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
|
|
/* ignore compatibility decomposition: unset the KC/KD flags */
|
|
qcFlags&=~(_NORM_QC_NFKC|_NORM_QC_NFKD);
|
|
|
|
/* set the KC/KD flags to the same values as the C/D flags */
|
|
qcFlags|=qcFlags<<1;
|
|
}
|
|
if(DO_NOT_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
/* ignore composition data: unset the C/KC flags */
|
|
qcFlags&=~(_NORM_QC_NFC|_NORM_QC_NFKC);
|
|
|
|
/* set the C/KC flags to the same values as the D/KD flags */
|
|
qcFlags|=qcFlags>>2;
|
|
}
|
|
|
|
createNorm(code)->qcFlags|=qcFlags;
|
|
|
|
/* adjust the minimum code point for quick check no/maybe */
|
|
if(code<0xffff) {
|
|
if((qcFlags&_NORM_QC_NFC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]) {
|
|
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=(uint16_t)code;
|
|
}
|
|
if((qcFlags&_NORM_QC_NFKC) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]) {
|
|
indexes[_NORM_INDEX_MIN_NFKC_NO_MAYBE]=(uint16_t)code;
|
|
}
|
|
if((qcFlags&_NORM_QC_NFD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]) {
|
|
indexes[_NORM_INDEX_MIN_NFD_NO_MAYBE]=(uint16_t)code;
|
|
}
|
|
if((qcFlags&_NORM_QC_NFKD) && (uint16_t)code<indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]) {
|
|
indexes[_NORM_INDEX_MIN_NFKD_NO_MAYBE]=(uint16_t)code;
|
|
}
|
|
}
|
|
|
|
if(qcFlags&_NORM_QC_NFD) {
|
|
uset_add(nfdQCNoSet, (UChar32)code);
|
|
}
|
|
}
|
|
|
|
extern void
|
|
setCompositionExclusion(uint32_t code) {
|
|
if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
createNorm(code)->combiningFlags|=0x80;
|
|
}
|
|
}
|
|
|
|
static void
|
|
setHangulJamoSpecials() {
|
|
Norm *norm;
|
|
uint32_t c, hangul;
|
|
|
|
/*
|
|
* Hangul syllables are algorithmically decomposed into Jamos,
|
|
* and Jamos are algorithmically composed into Hangul syllables.
|
|
* The quick check flags are parsed, except for Hangul.
|
|
*/
|
|
|
|
/* set Jamo L specials */
|
|
hangul=0xac00;
|
|
for(c=0x1100; c<=0x1112; ++c) {
|
|
norm=createNorm(c);
|
|
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
|
|
if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
norm->combiningFlags=1;
|
|
}
|
|
|
|
/* for each Jamo L create a set with its associated Hangul block */
|
|
norm->canonStart=uset_open(hangul, hangul+21*28-1);
|
|
hangul+=21*28;
|
|
}
|
|
|
|
/* set Jamo V specials */
|
|
for(c=0x1161; c<=0x1175; ++c) {
|
|
norm=createNorm(c);
|
|
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
|
|
if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
norm->combiningFlags=2;
|
|
}
|
|
norm->unsafeStart=TRUE;
|
|
}
|
|
|
|
/* set Jamo T specials */
|
|
for(c=0x11a8; c<=0x11c2; ++c) {
|
|
norm=createNorm(c);
|
|
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
|
|
if(DO_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
norm->combiningFlags=2;
|
|
}
|
|
norm->unsafeStart=TRUE;
|
|
}
|
|
|
|
/* set Hangul specials, precompacted */
|
|
norm=allocNorm();
|
|
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL;
|
|
if(DO_STORE(UGENNORM_STORE_COMPAT)) {
|
|
norm->qcFlags=_NORM_QC_NFD|_NORM_QC_NFKD;
|
|
} else {
|
|
norm->qcFlags=_NORM_QC_NFD;
|
|
}
|
|
|
|
if(!utrie_setRange32(normTrie, 0xac00, 0xd7a4, (uint32_t)(norm-norms), TRUE)) {
|
|
fprintf(stderr, "error: too many normalization entries (setting Hangul)\n");
|
|
exit(U_BUFFER_OVERFLOW_ERROR);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* set FC-NFKC-Closure string
|
|
* s contains the closure string; s[0]==length, s[1..length] is the actual string
|
|
* may modify s[0]
|
|
*/
|
|
U_CFUNC void
|
|
setFNC(uint32_t c, UChar *s) {
|
|
uint16_t *p;
|
|
int32_t length, i, count;
|
|
UChar first;
|
|
|
|
if( DO_NOT_STORE(UGENNORM_STORE_COMPAT) ||
|
|
DO_NOT_STORE(UGENNORM_STORE_COMPOSITION) ||
|
|
DO_NOT_STORE(UGENNORM_STORE_AUX)
|
|
) {
|
|
return;
|
|
}
|
|
|
|
count=utm_countItems(extraMem);
|
|
length=s[0];
|
|
first=s[1];
|
|
|
|
/* try to overlay single-unit strings with existing ones */
|
|
if(length==1 && first<0xff00) {
|
|
p=utm_getStart(extraMem);
|
|
for(i=1; i<count; ++i) {
|
|
if(first==p[i]) {
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
i=count;
|
|
}
|
|
|
|
/* append the new string if it cannot be overlayed with an old one */
|
|
if(i==count) {
|
|
if(count>_NORM_AUX_MAX_FNC) {
|
|
fprintf(stderr, "gennorm error: too many FNC strings\n");
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
|
|
/* prepend 0xffxx with xx==length */
|
|
s[0]=(uint16_t)(0xff00+length);
|
|
++length;
|
|
p=(uint16_t *)utm_allocN(extraMem, length);
|
|
uprv_memcpy(p, s, length*2);
|
|
|
|
/* update the top index in extraMem[0] */
|
|
count+=length;
|
|
((uint16_t *)utm_getStart(extraMem))[0]=(uint16_t)count;
|
|
}
|
|
|
|
/* store the index to the string */
|
|
createNorm(c)->fncIndex=i;
|
|
}
|
|
|
|
/* build runtime structures ------------------------------------------------- */
|
|
|
|
/* canonically reorder a UTF-32 string; return { leadCC, trailCC } */
|
|
static uint16_t
|
|
reorderString(uint32_t *s, int32_t length) {
|
|
uint8_t ccs[40];
|
|
uint32_t c;
|
|
int32_t i, j;
|
|
uint8_t cc, prevCC;
|
|
|
|
if(length<=0) {
|
|
return 0;
|
|
}
|
|
|
|
for(i=0; i<length; ++i) {
|
|
/* get the i-th code point and its combining class */
|
|
c=s[i];
|
|
cc=getCCFromCP(c);
|
|
if(cc!=0 && i!=0) {
|
|
/* it is a combining mark, see if it needs to be moved back */
|
|
j=i;
|
|
do {
|
|
prevCC=ccs[j-1];
|
|
if(prevCC<=cc) {
|
|
break; /* found the right place */
|
|
}
|
|
/* move the previous code point here and go back */
|
|
s[j]=s[j-1];
|
|
ccs[j]=prevCC;
|
|
} while(--j!=0);
|
|
s[j]=c;
|
|
ccs[j]=cc;
|
|
} else {
|
|
/* just store the combining class */
|
|
ccs[i]=cc;
|
|
}
|
|
}
|
|
|
|
return (uint16_t)(((uint16_t)ccs[0]<<8)|ccs[length-1]);
|
|
}
|
|
|
|
#if 0
|
|
static UBool combineAndQC[64]={ 0 };
|
|
#endif
|
|
|
|
/*
|
|
* canonically reorder the up to two decompositions
|
|
* and store the leading and trailing combining classes accordingly
|
|
*
|
|
* also process canonical decompositions for canonical closure
|
|
*/
|
|
static void
|
|
postParseFn(void *context, uint32_t code, Norm *norm) {
|
|
int32_t length;
|
|
|
|
/* canonically order the NFD */
|
|
length=norm->lenNFD;
|
|
if(length>0) {
|
|
norm->canonBothCCs=reorderString(norm->nfd, length);
|
|
}
|
|
|
|
/* canonically reorder the NFKD */
|
|
length=norm->lenNFKD;
|
|
if(length>0) {
|
|
norm->compatBothCCs=reorderString(norm->nfkd, length);
|
|
}
|
|
|
|
/* verify that code has a decomposition if and only if the quick check flags say "no" on NF(K)D */
|
|
if((norm->lenNFD!=0) != ((norm->qcFlags&_NORM_QC_NFD)!=0)) {
|
|
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->qcFlags);
|
|
}
|
|
if(((norm->lenNFD|norm->lenNFKD)!=0) != ((norm->qcFlags&(_NORM_QC_NFD|_NORM_QC_NFKD))!=0)) {
|
|
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
|
|
}
|
|
|
|
/* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
|
|
#if 0
|
|
combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
|
|
#endif
|
|
|
|
if(norm->combiningFlags&1) {
|
|
if(norm->udataCC!=0) {
|
|
/* illegal - data-derivable composition exclusion */
|
|
fprintf(stderr, "gennorm warning: U+%04lx combines forward but udataCC==%u\n", (long)code, norm->udataCC);
|
|
}
|
|
}
|
|
if(norm->combiningFlags&2) {
|
|
if((norm->qcFlags&0x11)==0) {
|
|
fprintf(stderr, "gennorm warning: U+%04lx combines backward but qcNF?C==0\n", (long)code);
|
|
}
|
|
#if 0
|
|
/* occurs sometimes, this one is ok (therefore #if 0) - still here for documentation */
|
|
if(norm->udataCC==0) {
|
|
printf("U+%04lx combines backward but udataCC==0\n", (long)code);
|
|
}
|
|
#endif
|
|
}
|
|
if((norm->combiningFlags&3)==3 && beVerbose) {
|
|
printf("U+%04lx combines both ways\n", (long)code);
|
|
}
|
|
|
|
/*
|
|
* process canonical decompositions for canonical closure
|
|
*
|
|
* in each canonical decomposition:
|
|
* add the current character (code) to the set of canonical starters of its norm->nfd[0]
|
|
* set the "unsafe starter" flag for each norm->nfd[1..]
|
|
*/
|
|
length=norm->lenNFD;
|
|
if(length>0) {
|
|
Norm *otherNorm;
|
|
UChar32 c;
|
|
int32_t i;
|
|
|
|
/* nfd[0].canonStart.add(code) */
|
|
c=norm->nfd[0];
|
|
otherNorm=createNorm(c);
|
|
if(otherNorm->canonStart==NULL) {
|
|
otherNorm->canonStart=uset_open(code, code);
|
|
if(otherNorm->canonStart==NULL) {
|
|
fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
}
|
|
} else {
|
|
uset_add(otherNorm->canonStart, code);
|
|
if(!uset_contains(otherNorm->canonStart, code)) {
|
|
fprintf(stderr, "gennorm error: uset_add(setOf(U+%4x), U+%4x)\n", (int)c, (int)code);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
}
|
|
|
|
/* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
|
|
for(i=1; i<length; ++i) {
|
|
createNorm(norm->nfd[i])->unsafeStart=TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
static uint32_t
|
|
make32BitNorm(Norm *norm) {
|
|
UChar extra[100];
|
|
const Norm *other;
|
|
uint32_t word;
|
|
int32_t i, length, beforeZero=0, count, start;
|
|
|
|
/*
|
|
* Check for assumptions:
|
|
*
|
|
* Test that if a "true starter" (cc==0 && NF*C_YES) decomposes,
|
|
* then the decomposition also begins with a true starter.
|
|
*/
|
|
if(norm->udataCC==0) {
|
|
/* this is a starter */
|
|
if((norm->qcFlags&_NORM_QC_NFC)==0 && norm->lenNFD>0) {
|
|
/* a "true" NFC starter with a canonical decomposition */
|
|
if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
|
|
((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFC)!=0) /* nfd[0] not NFC_YES */
|
|
) {
|
|
fprintf(stderr,
|
|
"error: true NFC starter canonical decomposition[%u] does not begin\n"
|
|
" with a true NFC starter: U+%04lx U+%04lx%s\n",
|
|
norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
|
|
norm->lenNFD<=2 ? "" : " ...");
|
|
exit(U_INVALID_TABLE_FILE);
|
|
}
|
|
}
|
|
|
|
if((norm->qcFlags&_NORM_QC_NFKC)==0) {
|
|
if(norm->lenNFKD>0) {
|
|
/* a "true" NFKC starter with a compatibility decomposition */
|
|
if( norm->compatBothCCs>=0x100 || /* lead cc!=0 or */
|
|
((other=getNorm(norm->nfkd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfkd[0] not NFKC_YES */
|
|
) {
|
|
fprintf(stderr,
|
|
"error: true NFKC starter compatibility decomposition[%u] does not begin\n"
|
|
" with a true NFKC starter: U+%04lx U+%04lx%s\n",
|
|
norm->lenNFKD, (long)norm->nfkd[0], (long)norm->nfkd[1],
|
|
norm->lenNFKD<=2 ? "" : " ...");
|
|
exit(U_INVALID_TABLE_FILE);
|
|
}
|
|
} else if(norm->lenNFD>0) {
|
|
/* a "true" NFKC starter with only a canonical decomposition */
|
|
if( norm->canonBothCCs>=0x100 || /* lead cc!=0 or */
|
|
((other=getNorm(norm->nfd[0]))!=NULL && (other->qcFlags&_NORM_QC_NFKC)!=0) /* nfd[0] not NFKC_YES */
|
|
) {
|
|
fprintf(stderr,
|
|
"error: true NFKC starter canonical decomposition[%u] does not begin\n"
|
|
" with a true NFKC starter: U+%04lx U+%04lx%s\n",
|
|
norm->lenNFD, (long)norm->nfd[0], (long)norm->nfd[1],
|
|
norm->lenNFD<=2 ? "" : " ...");
|
|
exit(U_INVALID_TABLE_FILE);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* reset the 32-bit word and set the quick check flags */
|
|
word=norm->qcFlags;
|
|
|
|
/* set the UnicodeData combining class */
|
|
word|=(uint32_t)norm->udataCC<<_NORM_CC_SHIFT;
|
|
|
|
/* set the combining flag and index */
|
|
if(norm->combiningFlags&3) {
|
|
word|=(uint32_t)(norm->combiningFlags&3)<<6;
|
|
}
|
|
|
|
/* set the combining index value into the extra data */
|
|
/* 0xffff: no combining index; 0..0x7fff: combining index */
|
|
if(norm->combiningIndex!=0xffff) {
|
|
extra[0]=norm->combiningIndex;
|
|
beforeZero=1;
|
|
}
|
|
|
|
count=beforeZero;
|
|
|
|
/* write the decompositions */
|
|
if((norm->lenNFD|norm->lenNFKD)!=0) {
|
|
extra[count++]=0; /* set the pieces when available, into extra[beforeZero] */
|
|
|
|
length=norm->lenNFD;
|
|
if(length>0) {
|
|
if(norm->canonBothCCs!=0) {
|
|
extra[beforeZero]|=0x80;
|
|
extra[count++]=norm->canonBothCCs;
|
|
}
|
|
start=count;
|
|
for(i=0; i<length; ++i) {
|
|
UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfd[i]);
|
|
}
|
|
extra[beforeZero]|=(UChar)(count-start); /* set the decomp length as the number of UTF-16 code units */
|
|
}
|
|
|
|
length=norm->lenNFKD;
|
|
if(length>0) {
|
|
if(norm->compatBothCCs!=0) {
|
|
extra[beforeZero]|=0x8000;
|
|
extra[count++]=norm->compatBothCCs;
|
|
}
|
|
start=count;
|
|
for(i=0; i<length; ++i) {
|
|
UTF_APPEND_CHAR_UNSAFE(extra, count, norm->nfkd[i]);
|
|
}
|
|
extra[beforeZero]|=(UChar)((count-start)<<8); /* set the decomp length as the number of UTF-16 code units */
|
|
}
|
|
}
|
|
|
|
/* allocate and copy the extra data */
|
|
if(count!=0) {
|
|
UChar *p;
|
|
|
|
if(norm->specialTag!=0) {
|
|
fprintf(stderr, "error: gennorm - illegal to have both extra data and a special tag (0x%x)\n", norm->specialTag);
|
|
exit(U_ILLEGAL_ARGUMENT_ERROR);
|
|
}
|
|
|
|
p=(UChar *)utm_allocN(extraMem, count);
|
|
uprv_memcpy(p, extra, count*2);
|
|
|
|
/* set the extra index, offset by beforeZero */
|
|
word|=(uint32_t)(beforeZero+(p-(UChar *)utm_getStart(extraMem)))<<_NORM_EXTRA_SHIFT;
|
|
} else if(norm->specialTag!=0) {
|
|
/* set a special tag instead of an extra index */
|
|
word|=(uint32_t)norm->specialTag<<_NORM_EXTRA_SHIFT;
|
|
}
|
|
|
|
return word;
|
|
}
|
|
|
|
/* turn all Norm structs into corresponding 32-bit norm values */
|
|
static void
|
|
makeAll32() {
|
|
uint32_t *pNormData;
|
|
uint32_t n;
|
|
int32_t i, normLength, count;
|
|
|
|
count=(int32_t)utm_countItems(normMem);
|
|
for(i=0; i<count; ++i) {
|
|
norms[i].value32=make32BitNorm(norms+i);
|
|
}
|
|
|
|
pNormData=utrie_getData(norm32Trie, &normLength);
|
|
|
|
count=0; /* count is now just used for debugging */
|
|
for(i=0; i<normLength; ++i) {
|
|
n=pNormData[i];
|
|
if(0!=(pNormData[i]=norms[n].value32)) {
|
|
++count;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* extract all Norm.canonBothCCs into the FCD table
|
|
* set 32-bit values to use the common fold and compact functions
|
|
*/
|
|
static void
|
|
makeFCD() {
|
|
uint32_t *pFCDData;
|
|
uint32_t n;
|
|
int32_t i, count, fcdLength;
|
|
uint16_t bothCCs;
|
|
|
|
count=utm_countItems(normMem);
|
|
for(i=0; i<count; ++i) {
|
|
bothCCs=norms[i].canonBothCCs;
|
|
if(bothCCs==0) {
|
|
/* if there are no decomposition cc's then use the udataCC twice */
|
|
bothCCs=norms[i].udataCC;
|
|
bothCCs|=bothCCs<<8;
|
|
}
|
|
norms[i].value32=bothCCs;
|
|
}
|
|
|
|
pFCDData=utrie_getData(fcdTrie, &fcdLength);
|
|
|
|
for(i=0; i<fcdLength; ++i) {
|
|
n=pFCDData[i];
|
|
pFCDData[i]=norms[n].value32;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* If the given set contains exactly one character, then return it.
|
|
* Otherwise return -1.
|
|
*/
|
|
static int32_t
|
|
usetContainsOne(const USet* set) {
|
|
if(uset_getItemCount(set)==1) {
|
|
/* there is a single item (a single range) */
|
|
UChar32 start, end;
|
|
UErrorCode ec=U_ZERO_ERROR;
|
|
int32_t len=uset_getItem(set, 0, &start, &end, NULL, 0, &ec);
|
|
if (len==0 && start==end) { /* a range (len==0) with a single code point */
|
|
return start;
|
|
}
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
static void
|
|
makeCanonSetFn(void *context, uint32_t code, Norm *norm) {
|
|
if(norm->canonStart!=NULL && !uset_isEmpty(norm->canonStart)) {
|
|
uint16_t *table;
|
|
int32_t c, tableLength;
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
|
|
/* does the set contain exactly one code point? */
|
|
c=usetContainsOne(norm->canonStart);
|
|
|
|
/* add an entry to the BMP or supplementary search table */
|
|
if(code<=0xffff) {
|
|
table=canonStartSets+_NORM_MAX_CANON_SETS;
|
|
tableLength=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
|
|
|
table[tableLength++]=(uint16_t)code;
|
|
|
|
if(c>=0 && c<=0xffff && (c&_NORM_CANON_SET_BMP_MASK)!=_NORM_CANON_SET_BMP_IS_INDEX) {
|
|
/* single-code point BMP result for BMP code point */
|
|
table[tableLength++]=(uint16_t)c;
|
|
} else {
|
|
table[tableLength++]=(uint16_t)(_NORM_CANON_SET_BMP_IS_INDEX|canonStartSetsTop);
|
|
c=-1;
|
|
}
|
|
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]=(uint16_t)tableLength;
|
|
} else {
|
|
table=canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH;
|
|
tableLength=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
|
|
|
table[tableLength++]=(uint16_t)(code>>16);
|
|
table[tableLength++]=(uint16_t)code;
|
|
|
|
if(c>=0) {
|
|
/* single-code point result for supplementary code point */
|
|
table[tableLength-2]|=(uint16_t)(0x8000|((c>>8)&0x1f00));
|
|
table[tableLength++]=(uint16_t)c;
|
|
} else {
|
|
table[tableLength++]=(uint16_t)canonStartSetsTop;
|
|
}
|
|
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]=(uint16_t)tableLength;
|
|
}
|
|
|
|
if(c<0) {
|
|
/* write a USerializedSet */
|
|
++canonSetsCount;
|
|
canonStartSetsTop+=
|
|
uset_serialize(norm->canonStart,
|
|
canonStartSets+canonStartSetsTop,
|
|
_NORM_MAX_CANON_SETS-canonStartSetsTop,
|
|
&errorCode);
|
|
}
|
|
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), (int)canonStartSetsTop);
|
|
exit(errorCode);
|
|
}
|
|
if(tableLength>_NORM_MAX_SET_SEARCH_TABLE_LENGTH) {
|
|
fprintf(stderr, "gennorm error: search table for canonical starter sets too long\n");
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* for getSkippableFlags ---------------------------------------------------- */
|
|
|
|
/* combine the lead and trail code points; return <0 if they do not combine */
|
|
static int32_t
|
|
combine(uint32_t lead, uint32_t trail) {
|
|
CombiningTriple *triples;
|
|
uint32_t i, count;
|
|
|
|
/* search for all triples with c as lead code point */
|
|
triples=utm_getStart(combiningTriplesMem);
|
|
count=utm_countItems(combiningTriplesMem);
|
|
|
|
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
|
|
for(i=0; i<count && lead!=triples[i].lead; ++i) {}
|
|
|
|
/* check each triple for this code point */
|
|
for(; i<count && lead==triples[i].lead; ++i) {
|
|
if(trail==triples[i].trail) {
|
|
return (int32_t)triples[i].combined;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
|
|
* Starting from the canonical decomposition s[0..length[ of a single code point,
|
|
* is the code point c consumed in an NFC/FCC recomposition?
|
|
*
|
|
* No need to handle discontiguous composition because that would not consume some
|
|
* intermediate character, so would not compose back to the original character.
|
|
* See comments in canChangeWithFollowing().
|
|
*
|
|
* No need to compose beyond where c canonically orders because if it is consumed
|
|
* then the result differs from the original anyway.
|
|
*
|
|
* Possible optimization:
|
|
* - Verify that there are no cases of the same combining mark stacking twice.
|
|
* - return FALSE right away if c inserts after a copy of itself
|
|
* without attempting to recompose; will happen because each mark in
|
|
* the decomposition will be enumerated and passed in as c.
|
|
* More complicated and fragile though than it is already.
|
|
*
|
|
* markus 2002nov04
|
|
*/
|
|
static UBool
|
|
doesComposeConsume(const uint32_t *s, int32_t length, uint32_t c, uint8_t cc) {
|
|
int32_t starter, i;
|
|
|
|
/* ignore trailing characters where cc<prevCC */
|
|
while(length>1 && cc<getCCFromCP(s[length-1])) {
|
|
--length;
|
|
}
|
|
|
|
/* start consuming/combining from the beginning */
|
|
starter=(int32_t)s[0];
|
|
for(i=1; i<length; ++i) {
|
|
starter=combine((uint32_t)starter, s[i]);
|
|
if(starter<0) {
|
|
fprintf(stderr, "error: unable to consume normal decomposition in doesComposeConsume(<%04x, %04x, ...>[%d], U+%04x, %u)\n",
|
|
(int)s[0], (int)s[1], (int)length, (int)c, cc);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
}
|
|
|
|
/* try to combine/consume c, return TRUE if it is consumed */
|
|
return combine((uint32_t)starter, c)>=0;
|
|
}
|
|
|
|
/* does the starter s[0] combine forward with another char that is below trailCC? */
|
|
static UBool
|
|
canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
|
|
if(trailCC<=1) {
|
|
/* no character will combine ahead of the trailing char of the decomposition */
|
|
return FALSE;
|
|
}
|
|
|
|
/*
|
|
* We are only checking skippable condition (f).
|
|
* Therefore, the original character does not have quick check flag NFC_NO (c),
|
|
* i.e., the decomposition recomposes completely back into the original code point.
|
|
* So s[0] must be a true starter with cc==0 and
|
|
* combining with following code points.
|
|
*
|
|
* Similarly, length==1 is not possible because that would be a singleton
|
|
* decomposition which is marked with NFC_NO and does not pass (c).
|
|
*
|
|
* Only a character with cc<trailCC can change the composition.
|
|
* Reason: A char with cc>=trailCC would order after decomposition s[],
|
|
* composition would consume all of the decomposition, and here we know that
|
|
* the original char passed check d), i.e., it does not combine forward,
|
|
* therefore does not combine with anything after the decomposition is consumed.
|
|
*
|
|
* Now see if there is a character that
|
|
* 1. combines backward
|
|
* 2. has cc<trailCC
|
|
* 3. is consumed in recomposition
|
|
*
|
|
* length==2 is simple:
|
|
*
|
|
* Characters that fulfill these conditions are exactly the ones that combine directly
|
|
* with the starter c==s[0] because there is no intervening character after
|
|
* reordering.
|
|
* We can just enumerate all chars with which c combines (they all pass 1. and 3.)
|
|
* and see if one has cc<trailCC (passes 2.).
|
|
*
|
|
* length>2 is a little harder:
|
|
*
|
|
* Since we will get different starters during recomposition, we need to
|
|
* enumerate each backward-combining character (1.)
|
|
* with cc<trailCC (2.) and
|
|
* see if it gets consumed in recomposition. (3.)
|
|
* No need to enumerate both-ways combining characters because they must have cc==0.
|
|
*/
|
|
if(length==2) {
|
|
/* enumerate all chars that combine with this one and check their cc */
|
|
CombiningTriple *triples;
|
|
uint32_t c, i, count;
|
|
uint8_t cc;
|
|
|
|
/* search for all triples with c as lead code point */
|
|
triples=utm_getStart(combiningTriplesMem);
|
|
count=utm_countItems(combiningTriplesMem);
|
|
c=s[0];
|
|
|
|
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
|
|
for(i=0; i<count && c!=triples[i].lead; ++i) {}
|
|
|
|
/* check each triple for this code point */
|
|
for(; i<count && c==triples[i].lead; ++i) {
|
|
cc=getCCFromCP(triples[i].trail);
|
|
if(cc>0 && cc<trailCC) {
|
|
/* this trail code point combines with c and has cc<trailCC */
|
|
return TRUE;
|
|
}
|
|
}
|
|
} else {
|
|
/* enumerate all chars that combine backward */
|
|
uint32_t c2;
|
|
uint16_t i;
|
|
uint8_t cc;
|
|
|
|
for(i=combineBothTop; i<combineBackTop; ++i) {
|
|
c2=combiningCPs[i]&0xffffff;
|
|
cc=getCCFromCP(c2);
|
|
/* pass in length-1 because we already know that c2 will insert before the last character with trailCC */
|
|
if(cc>0 && cc<trailCC && doesComposeConsume(s, length-1, c2, cc)) {
|
|
return TRUE;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* this decomposition is not modified by any appended character */
|
|
return FALSE;
|
|
}
|
|
|
|
/* see unormimp.h for details on NF*C Skippable flags */
|
|
static uint32_t
|
|
getSkippableFlags(const Norm *norm) {
|
|
/* ignore NF*D skippable properties because they are covered by norm32, test at runtime */
|
|
|
|
/* ignore Hangul, test those at runtime (LV Hangul are not skippable) */
|
|
if(norm->specialTag==_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_HANGUL) {
|
|
return 0;
|
|
}
|
|
|
|
/* ### TODO check other data generation functions whether they should & do ignore Hangul/Jamo specials */
|
|
|
|
/*
|
|
* Note:
|
|
* This function returns a non-zero flag only if (a)..(e) indicate skippable but (f) does not.
|
|
*
|
|
* This means that (a)..(e) must always be derived from the runtime norm32 value,
|
|
* and (f) be checked from the auxTrie if the character is skippable per (a)..(e),
|
|
* the form is NF*C and there is a canonical decomposition (NFD_NO).
|
|
*
|
|
* (a) unassigned code points get "not skippable"==false because they
|
|
* don't have a Norm struct so they won't get here
|
|
*/
|
|
|
|
/* (b) not skippable if cc!=0 */
|
|
if(norm->udataCC!=0) {
|
|
return 0; /* non-zero flag for (f) only */
|
|
}
|
|
|
|
/*
|
|
* not NFC_Skippable if
|
|
* (c) quick check flag == NO or
|
|
* (d) combines forward or
|
|
* (e) combines back or
|
|
* (f) can change if another character is added
|
|
*
|
|
* for (f):
|
|
* For NF*C: Get corresponding decomposition, get its last starter (cc==0),
|
|
* check its composition list,
|
|
* see if any of the second code points in the list
|
|
* has cc less than the trailCC of the decomposition.
|
|
*
|
|
* For FCC: Test at runtime if the decomposition has a trailCC>1
|
|
* -> there are characters with cc==1, they would order before the trail char
|
|
* and prevent contiguous combination with the trail char.
|
|
*/
|
|
if( (norm->qcFlags&(_NORM_QC_NFC&_NORM_QC_ANY_NO))!=0 ||
|
|
(norm->combiningFlags&3)!=0) {
|
|
return 0; /* non-zero flag for (f) only */
|
|
}
|
|
if(norm->lenNFD!=0 && canChangeWithFollowing(norm->nfd, norm->lenNFD, (uint8_t)norm->canonBothCCs)) {
|
|
return _NORM_AUX_NFC_SKIP_F_MASK;
|
|
}
|
|
|
|
return 0; /* skippable */
|
|
}
|
|
|
|
static void
|
|
makeAux() {
|
|
Norm *norm;
|
|
uint32_t *pData;
|
|
int32_t i, length;
|
|
|
|
pData=utrie_getData(auxTrie, &length);
|
|
|
|
for(i=0; i<length; ++i) {
|
|
norm=norms+pData[i];
|
|
/*
|
|
* 16-bit auxiliary normalization properties
|
|
* see unormimp.h
|
|
*/
|
|
pData[i]=
|
|
((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
|
|
(uint32_t)norm->fncIndex;
|
|
|
|
if(norm->unsafeStart || norm->udataCC!=0) {
|
|
pData[i]|=_NORM_AUX_UNSAFE_MASK;
|
|
}
|
|
|
|
pData[i]|=getSkippableFlags(norm);
|
|
}
|
|
}
|
|
|
|
/* folding value for normalization: just store the offset (16 bits) if there is any non-0 entry */
|
|
static uint32_t U_CALLCONV
|
|
getFoldedNormValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
|
uint32_t value, leadNorm32=0;
|
|
UChar32 limit;
|
|
UBool inBlockZero;
|
|
|
|
limit=start+0x400;
|
|
while(start<limit) {
|
|
value=utrie_get32(trie, start, &inBlockZero);
|
|
if(inBlockZero) {
|
|
start+=UTRIE_DATA_BLOCK_LENGTH;
|
|
} else {
|
|
if(value!=0) {
|
|
leadNorm32|=value;
|
|
}
|
|
++start;
|
|
}
|
|
}
|
|
|
|
/* turn multi-bit fields into the worst-case value */
|
|
if(leadNorm32&_NORM_CC_MASK) {
|
|
leadNorm32|=_NORM_CC_MASK;
|
|
}
|
|
|
|
/* clean up unnecessarily ored bit fields */
|
|
leadNorm32&=~((uint32_t)0xffffffff<<_NORM_EXTRA_SHIFT);
|
|
|
|
if(leadNorm32==0) {
|
|
/* nothing to do (only composition exclusions?) */
|
|
return 0;
|
|
}
|
|
|
|
/* add the extra surrogate index, offset by the BMP top, for the new stage 1 location */
|
|
leadNorm32|=(
|
|
(uint32_t)_NORM_EXTRA_INDEX_TOP+
|
|
(uint32_t)((offset-UTRIE_BMP_INDEX_LENGTH)>>UTRIE_SURROGATE_BLOCK_BITS)
|
|
)<<_NORM_EXTRA_SHIFT;
|
|
|
|
return leadNorm32;
|
|
}
|
|
|
|
/* folding value for FCD: use default function (just store the offset (16 bits) if there is any non-0 entry) */
|
|
|
|
/*
|
|
* folding value for auxiliary data:
|
|
* store the non-zero offset in bits 9..0 (FNC bits)
|
|
* if there is any non-0 entry;
|
|
* "or" [verb!] together data bits 15..10 of all of the 1024 supplementary code points
|
|
*/
|
|
static uint32_t U_CALLCONV
|
|
getFoldedAuxValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
|
uint32_t value, oredValues;
|
|
UChar32 limit;
|
|
UBool inBlockZero;
|
|
|
|
oredValues=0;
|
|
limit=start+0x400;
|
|
while(start<limit) {
|
|
value=utrie_get32(trie, start, &inBlockZero);
|
|
if(inBlockZero) {
|
|
start+=UTRIE_DATA_BLOCK_LENGTH;
|
|
} else {
|
|
oredValues|=value;
|
|
++start;
|
|
}
|
|
}
|
|
|
|
if(oredValues!=0) {
|
|
/* move the 10 significant offset bits into bits 9..0 */
|
|
offset>>=UTRIE_SURROGATE_BLOCK_BITS;
|
|
if(offset>_NORM_AUX_FNC_MASK) {
|
|
fprintf(stderr, "gennorm error: folding offset too large (auxTrie)\n");
|
|
exit(U_INDEX_OUTOFBOUNDS_ERROR);
|
|
}
|
|
return (uint32_t)offset|(oredValues&~_NORM_AUX_FNC_MASK);
|
|
} else {
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
extern void
|
|
processData() {
|
|
#if 0
|
|
uint16_t i;
|
|
#endif
|
|
|
|
processCombining();
|
|
|
|
/* canonically reorder decompositions and assign combining classes for decompositions */
|
|
enumTrie(postParseFn, NULL);
|
|
|
|
#if 0
|
|
for(i=1; i<64; ++i) {
|
|
if(combineAndQC[i]) {
|
|
printf("combiningFlags==0x%02x qcFlags(NF?C)==0x%02x\n", (i&0xc)>>2, i&0x33);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/* add hangul/jamo specials */
|
|
setHangulJamoSpecials();
|
|
|
|
/* set this value; will be updated as makeCanonSetFn() adds sets (if there are any, see gStoreFlags) */
|
|
canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]=(uint16_t)canonStartSetsTop;
|
|
|
|
/* store search tables and USerializedSets for canonical starters (after Hangul/Jamo specials!) */
|
|
if(DO_STORE(UGENNORM_STORE_AUX) && DO_STORE(UGENNORM_STORE_COMPOSITION)) {
|
|
enumTrie(makeCanonSetFn, NULL);
|
|
}
|
|
|
|
/* clone the normalization builder trie to make the final data tries */
|
|
if( NULL==utrie_clone(norm32Trie, normTrie, NULL, 0) ||
|
|
NULL==utrie_clone(fcdTrie, normTrie, NULL, 0) ||
|
|
NULL==utrie_clone(auxTrie, normTrie, NULL, 0)
|
|
) {
|
|
fprintf(stderr, "error: unable to clone the normalization trie\n");
|
|
exit(U_MEMORY_ALLOCATION_ERROR);
|
|
}
|
|
|
|
/* --- finalize data for quick checks & normalization --- */
|
|
|
|
/* turn the Norm structs (stage2, norms) into 32-bit data words */
|
|
makeAll32();
|
|
|
|
/* --- finalize data for FCD checks --- */
|
|
|
|
/* FCD data: take Norm.canonBothCCs and store them in the FCD table */
|
|
makeFCD();
|
|
|
|
/* --- finalize auxiliary normalization data --- */
|
|
makeAux();
|
|
|
|
if(beVerbose) {
|
|
#if 0
|
|
printf("number of stage 2 entries: %ld\n", stage2Mem->index);
|
|
printf("size of stage 1 (BMP) & 2 (uncompacted) + extra data: %ld bytes\n", _NORM_STAGE_1_BMP_COUNT*2+stage2Mem->index*4+extraMem->index*2);
|
|
#endif
|
|
printf("combining CPs tops: fwd %u both %u back %u\n", combineFwdTop, combineBothTop, combineBackTop);
|
|
printf("combining table count: %u\n", combiningTableTop);
|
|
}
|
|
}
|
|
|
|
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
|
|
|
extern void
|
|
generateData(const char *dataDir, UBool csource) {
|
|
static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
|
|
|
|
UNewDataMemory *pData;
|
|
UErrorCode errorCode=U_ZERO_ERROR;
|
|
int32_t size, dataLength;
|
|
|
|
#if UCONFIG_NO_NORMALIZATION
|
|
|
|
size=0;
|
|
|
|
#else
|
|
|
|
U_STRING_DECL(nxCJKCompatPattern, "[:Ideographic:]", 15);
|
|
U_STRING_DECL(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
|
|
USet *set;
|
|
int32_t normTrieSize, fcdTrieSize, auxTrieSize;
|
|
|
|
normTrieSize=utrie_serialize(norm32Trie, normTrieBlock, sizeof(normTrieBlock), getFoldedNormValue, FALSE, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: utrie_serialize(normalization properties) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
|
|
if(DO_STORE(UGENNORM_STORE_FCD)) {
|
|
fcdTrieSize=utrie_serialize(fcdTrie, fcdTrieBlock, sizeof(fcdTrieBlock), NULL, TRUE, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: utrie_serialize(FCD data) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
} else {
|
|
fcdTrieSize=0;
|
|
}
|
|
|
|
if(DO_STORE(UGENNORM_STORE_AUX)) {
|
|
auxTrieSize=utrie_serialize(auxTrie, auxTrieBlock, sizeof(auxTrieBlock), getFoldedAuxValue, TRUE, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: utrie_serialize(auxiliary data) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
} else {
|
|
auxTrieSize=0;
|
|
}
|
|
|
|
/* move the parts of canonStartSets[] together into a contiguous block */
|
|
if( canonStartSetsTop<_NORM_MAX_CANON_SETS &&
|
|
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]!=0
|
|
) {
|
|
uprv_memmove(canonStartSets+canonStartSetsTop,
|
|
canonStartSets+_NORM_MAX_CANON_SETS,
|
|
canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]*2);
|
|
}
|
|
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH];
|
|
|
|
if( canonStartSetsTop<(_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH) &&
|
|
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]!=0
|
|
) {
|
|
uprv_memmove(canonStartSets+canonStartSetsTop,
|
|
canonStartSets+_NORM_MAX_CANON_SETS+_NORM_MAX_SET_SEARCH_TABLE_LENGTH,
|
|
canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]*2);
|
|
}
|
|
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
|
|
|
|
/* create the normalization exclusion sets */
|
|
/*
|
|
* nxCJKCompatPattern should be [[:Ideographic:]&[:NFD_QC=No:]]
|
|
* but we cannot use NFD_QC from the pattern because that would require
|
|
* unorm.icu which we are just going to generate.
|
|
* Therefore we have manually collected nfdQCNoSet and intersect Ideographic
|
|
* with that.
|
|
*/
|
|
U_STRING_INIT(nxCJKCompatPattern, "[:Ideographic:]", 15);
|
|
U_STRING_INIT(nxUnicode32Pattern, "[:^Age=3.2:]", 12);
|
|
|
|
canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]=canonStartSetsTop;
|
|
set=uset_openPattern(nxCJKCompatPattern, -1, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: uset_openPattern([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
uset_retainAll(set, nfdQCNoSet);
|
|
if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
|
|
uset_clear(set);
|
|
}
|
|
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: uset_serialize([:Ideographic:]&[:NFD_QC=No:]) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
uset_close(set);
|
|
|
|
canonStartSets[_NORM_SET_INDEX_NX_UNICODE32_OFFSET]=canonStartSetsTop;
|
|
set=uset_openPattern(nxUnicode32Pattern, -1, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: uset_openPattern([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
if(DO_NOT_STORE(UGENNORM_STORE_EXCLUSIONS)) {
|
|
uset_clear(set);
|
|
}
|
|
canonStartSetsTop+=uset_serialize(set, canonStartSets+canonStartSetsTop, LENGTHOF(canonStartSets)-canonStartSetsTop, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: uset_serialize([:^Age=3.2:]) failed, %s\n", u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
uset_close(set);
|
|
|
|
canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]=canonStartSetsTop;
|
|
|
|
/* make sure that the FCD trie is 4-aligned */
|
|
if((utm_countItems(extraMem)+combiningTableTop)&1) {
|
|
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
|
|
}
|
|
|
|
/* pad canonStartSets to 4-alignment, too */
|
|
if(canonStartSetsTop&1) {
|
|
canonStartSets[canonStartSetsTop++]=0x1235;
|
|
}
|
|
|
|
size=
|
|
_NORM_INDEX_TOP*4+
|
|
normTrieSize+
|
|
utm_countItems(extraMem)*2+
|
|
combiningTableTop*2+
|
|
fcdTrieSize+
|
|
auxTrieSize+
|
|
canonStartSetsTop*2;
|
|
|
|
if(beVerbose) {
|
|
printf("size of normalization trie %5u bytes\n", (int)normTrieSize);
|
|
printf("size of 16-bit extra memory %5u UChars/uint16_t\n", (int)utm_countItems(extraMem));
|
|
printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
|
|
printf("size of combining table %5u uint16_t\n", combiningTableTop);
|
|
printf("size of FCD trie %5u bytes\n", (int)fcdTrieSize);
|
|
printf("size of auxiliary trie %5u bytes\n", (int)auxTrieSize);
|
|
printf("size of canonStartSets[] %5u uint16_t\n", (int)canonStartSetsTop);
|
|
printf(" number of indexes %5u uint16_t\n", _NORM_SET_INDEX_TOP);
|
|
printf(" size of sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP);
|
|
printf(" number of sets %5d\n", (int)canonSetsCount);
|
|
printf(" size of BMP search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]);
|
|
printf(" size of supplementary search table %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]);
|
|
printf(" length of exclusion sets %5u uint16_t\n", canonStartSets[_NORM_SET_INDEX_NX_RESERVED_OFFSET]-canonStartSets[_NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET]);
|
|
printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
|
}
|
|
|
|
indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
|
|
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
|
|
|
|
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
|
|
indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
|
|
indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=(uint16_t)(combineBothTop-combineFwdTop);
|
|
indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=(uint16_t)(combineBackTop-combineBothTop);
|
|
|
|
/* the quick check minimum code points are already set */
|
|
|
|
indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
|
|
indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
|
|
indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
|
|
|
|
#endif
|
|
|
|
if(csource) {
|
|
#if UCONFIG_NO_NORMALIZATION
|
|
/* no csource for dummy mode..? */
|
|
fprintf(stderr, "gennorm error: UCONFIG_NO_NORMALIZATION is on in csource mode.\n");
|
|
exit(1);
|
|
#else
|
|
/* write .c file for hardcoded data */
|
|
UTrie normTrie2={ NULL }, fcdTrie2={ NULL }, auxTrie2={ NULL };
|
|
FILE *f;
|
|
|
|
utrie_unserialize(&normTrie2, normTrieBlock, normTrieSize, &errorCode);
|
|
if(fcdTrieSize>0) {
|
|
utrie_unserialize(&fcdTrie2, fcdTrieBlock, fcdTrieSize, &errorCode);
|
|
}
|
|
if(auxTrieSize>0) {
|
|
utrie_unserialize(&auxTrie2, auxTrieBlock, auxTrieSize, &errorCode);
|
|
}
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(
|
|
stderr,
|
|
"gennorm error: failed to utrie_unserialize() one of the tries - %s\n",
|
|
u_errorName(errorCode));
|
|
exit(errorCode);
|
|
}
|
|
|
|
f=usrc_create(dataDir, "unorm_props_data.c");
|
|
if(f!=NULL) {
|
|
usrc_writeArray(f,
|
|
"static const UVersionInfo formatVersion={ ",
|
|
dataInfo.formatVersion, 8, 4,
|
|
" };\n\n");
|
|
usrc_writeArray(f,
|
|
"static const UVersionInfo dataVersion={ ",
|
|
dataInfo.dataVersion, 8, 4,
|
|
" };\n\n");
|
|
usrc_writeArray(f,
|
|
"static const int32_t indexes[_NORM_INDEX_TOP]={\n",
|
|
indexes, 32, _NORM_INDEX_TOP,
|
|
"\n};\n\n");
|
|
usrc_writeUTrieArrays(f,
|
|
"static const uint16_t normTrie_index[%ld]={\n",
|
|
"static const uint32_t normTrie_data32[%ld]={\n",
|
|
&normTrie2,
|
|
"\n};\n\n");
|
|
usrc_writeUTrieStruct(f,
|
|
"static const UTrie normTrie={\n",
|
|
&normTrie2, "normTrie_index", "normTrie_data32", "getFoldingNormOffset",
|
|
"};\n\n");
|
|
usrc_writeArray(f,
|
|
"static const uint16_t extraData[%ld]={\n",
|
|
utm_getStart(extraMem), 16, utm_countItems(extraMem),
|
|
"\n};\n\n");
|
|
usrc_writeArray(f,
|
|
"static const uint16_t combiningTable[%ld]={\n",
|
|
combiningTable, 16, combiningTableTop,
|
|
"\n};\n\n");
|
|
if(fcdTrieSize>0) {
|
|
usrc_writeUTrieArrays(f,
|
|
"static const uint16_t fcdTrie_index[%ld]={\n", NULL,
|
|
&fcdTrie2,
|
|
"\n};\n\n");
|
|
usrc_writeUTrieStruct(f,
|
|
"static const UTrie fcdTrie={\n",
|
|
&fcdTrie2, "fcdTrie_index", NULL, NULL,
|
|
"};\n\n");
|
|
} else {
|
|
fputs( "static const UTrie fcdTrie={ NULL };\n\n", f);
|
|
}
|
|
if(auxTrieSize>0) {
|
|
usrc_writeUTrieArrays(f,
|
|
"static const uint16_t auxTrie_index[%ld]={\n", NULL,
|
|
&auxTrie2,
|
|
"\n};\n\n");
|
|
usrc_writeUTrieStruct(f,
|
|
"static const UTrie auxTrie={\n",
|
|
&auxTrie2, "auxTrie_index", NULL, "getFoldingAuxOffset",
|
|
"};\n\n");
|
|
} else {
|
|
fputs( "static const UTrie auxTrie={ NULL };\n\n", f);
|
|
}
|
|
usrc_writeArray(f,
|
|
"static const uint16_t canonStartSets[%ld]={\n",
|
|
canonStartSets, 16, canonStartSetsTop,
|
|
"\n};\n\n");
|
|
fclose(f);
|
|
}
|
|
#endif
|
|
} else {
|
|
/* write the data */
|
|
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
|
|
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
|
|
exit(errorCode);
|
|
}
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
udata_writeBlock(pData, indexes, sizeof(indexes));
|
|
udata_writeBlock(pData, normTrieBlock, normTrieSize);
|
|
udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
|
|
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
|
|
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
|
|
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
|
|
udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
|
|
|
|
#endif
|
|
|
|
/* finish up */
|
|
dataLength=udata_finish(pData, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "gennorm: error %d writing the output file\n", errorCode);
|
|
exit(errorCode);
|
|
}
|
|
|
|
if(dataLength!=size) {
|
|
fprintf(stderr, "gennorm error: data length %ld != calculated size %ld\n",
|
|
(long)dataLength, (long)size);
|
|
exit(U_INTERNAL_PROGRAM_ERROR);
|
|
}
|
|
}
|
|
}
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
extern void
|
|
cleanUpData(void) {
|
|
int32_t i, count;
|
|
|
|
count=utm_countItems(normMem);
|
|
for(i=0; i<count; ++i) {
|
|
uset_close(norms[i].canonStart);
|
|
}
|
|
|
|
utm_close(normMem);
|
|
utm_close(utf32Mem);
|
|
utm_close(extraMem);
|
|
utm_close(combiningTriplesMem);
|
|
utrie_close(normTrie);
|
|
utrie_close(norm32Trie);
|
|
utrie_close(fcdTrie);
|
|
utrie_close(auxTrie);
|
|
|
|
uset_close(nfdQCNoSet);
|
|
|
|
uprv_free(normTrie);
|
|
uprv_free(norm32Trie);
|
|
uprv_free(fcdTrie);
|
|
uprv_free(auxTrie);
|
|
}
|
|
|
|
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
|
|
|
/*
 * Hey, Emacs, please set the following:
 *
 * Local Variables:
 * indent-tabs-mode: nil
 * End:
 *
 */