ICU-1754 store data for canonical closure

X-SVN-Rev: 7927
This commit is contained in:
Markus Scherer 2002-03-09 06:00:42 +00:00
parent 74d240ec1e
commit dad663425c
3 changed files with 102 additions and 12 deletions

View File

@ -73,15 +73,17 @@ enum {
/* value constants for auxTrie */
enum {
_NORM_AUX_CANON_FLAG_SHIFT=11,
_NORM_AUX_UNSAFE_SHIFT=14,
_NORM_AUX_FNC_SHIFT=20,
_NORM_AUX_COMP_EX_SHIFT=30,
_NORM_AUX_IS_LEAD_SHIFT=31
};
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
#define _NORM_AUX_MAX_CANON_SET ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
#define _NORM_AUX_CANON_SET_MASK (((uint32_t)1<<_NORM_AUX_CANON_FLAG_SHIFT)-1)
#define _NORM_AUX_CANON_SET_MASK (_NORM_AUX_MAX_CANON_SET-1)
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_FNC_MASK ((uint32_t)(_NORM_AUX_MAX_FNC-1)<<_NORM_AUX_FNC_SHIFT)
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_IS_LEAD_MASK ((uint32_t)1<<_NORM_AUX_IS_LEAD_SHIFT)
@ -104,7 +106,7 @@ enum {
_NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */
_NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */
_NORM_INDEX_UNICODE_SET_COUNT, /* number of int32_t in the UnicodeSet array */
_NORM_INDEX_CANON_SET_COUNT, /* number of uint16_t in the array of serialized USet */
_NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */
};
@ -280,7 +282,8 @@ unorm_internalIsFullCompositionExclusion(UChar32 c);
*
* UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
*
* int32_t unicodeSets[unicodeSetsTop] -- unicodeSetsTop=indexes[_NORM_INDEX_UNICODE_SET_COUNT]
* uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
* serialized USets, see uset.c
*
*
* The indexes array contains lengths and sizes of the following arrays and structures
@ -459,9 +462,12 @@ unorm_internalIsFullCompositionExclusion(UChar32 c);
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
* or lead surrogate offset (bit 31==1)
* 19..16 skippable flags
* 15..13 reserved
* 11 flag: not a safe starter for canonical closure
* 10.. 0 index to UnicodeSet for canonical closure
* 15 reserved
* 14 flag: not a safe starter for canonical closure
* 13.. 0 index to serialized USet for canonical closure
* the set lists the code points whose canonical decompositions start with
* the code point that this data is for
* for how USets are serialized see uset.c
*
* - FC_NFKC_Closure strings in extraData[]
*

View File

@ -18,6 +18,7 @@
#define __GENPROPS_H__
#include "unicode/utypes.h"
#include "uset.h"
/* file definitions */
#define DATA_NAME "unorm"
@ -34,6 +35,8 @@ typedef struct Norm {
uint32_t *nfd, *nfkd;
uint32_t value32; /* temporary variable for generating runtime norm32 and fcd values */
int32_t fncIndex;
USet *canonStart;
UBool unsafeStart;
} Norm;
/* global flags */

View File

@ -25,6 +25,7 @@
#include "filestrm.h"
#include "unicode/udata.h"
#include "utrie.h"
#include "uset.h"
#include "unewdata.h"
#include "unormimp.h"
#include "gennorm.h"
@ -180,6 +181,9 @@ typedef struct CombiningTriple {
static uint16_t combiningTable[0x8000];
static uint16_t combiningTableTop=0;
/*
 * Output buffer for the serialized USets used for canonical closure,
 * counted in uint16_t units; its capacity is _NORM_AUX_MAX_CANON_SET units.
 * canonStartSetsTop is the number of units used so far, i.e. the index at
 * which the next serialized set is appended.
 * It starts at 1, so unit 0 stays 0 and no set is ever stored at index 0.
 * NOTE(review): presumably this lets a set index of 0 in an auxTrie value
 * mean "no set" - confirm against the runtime lookup code.
 */
static uint16_t canonStartSets[_NORM_AUX_MAX_CANON_SET]={ 0 };
static int32_t canonStartSetsTop=1;
extern void
init() {
uint16_t *p16;
@ -781,7 +785,7 @@ setCompositionExclusion(uint32_t code) {
static void
setHangulJamoSpecials() {
Norm *norm;
uint32_t c;
uint32_t c, hangul;
/*
* Hangul syllables are algorithmically decomposed into Jamos,
@ -790,10 +794,15 @@ setHangulJamoSpecials() {
*/
/* set Jamo L specials */
hangul=0xac00;
for(c=0x1100; c<=0x1112; ++c) {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
norm->combiningFlags=1;
/* for each Jamo L create a set with its associated Hangul block */
norm->canonStart=uset_open(hangul, hangul+21*28);
hangul+=21*28;
}
/* set Jamo V specials */
@ -801,6 +810,7 @@ setHangulJamoSpecials() {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
norm->combiningFlags=2;
norm->unsafeStart=TRUE;
}
/* set Jamo T specials */
@ -808,6 +818,7 @@ setHangulJamoSpecials() {
norm=createNorm(c);
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
norm->combiningFlags=2;
norm->unsafeStart=TRUE;
}
/* set Hangul specials, precompacted */
@ -916,6 +927,8 @@ static UBool combineAndQC[64]={ 0 };
/*
* canonically reorder the up to two decompositions
* and store the leading and trailing combining classes accordingly
*
* also process canonical decompositions for canonical closure
*/
static void
postParseFn(void *context, uint32_t code, Norm *norm) {
@ -941,7 +954,7 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
}
/* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
/* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
if(norm->combiningFlags&1) {
@ -964,6 +977,41 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
if((norm->combiningFlags&3)==3 && beVerbose) {
printf("U+%04lx combines both ways\n", (long)code);
}
/*
* process canonical decompositions for canonical closure
*
* in each canonical decomposition:
* add the current character (code) to the set of canonical starters of its norm->nfd[0]
* set the "unsafe starter" flag for each norm->nfd[1..]
*/
length=norm->lenNFD;
if(length>0) {
Norm *otherNorm;
UChar32 c;
int32_t i;
/* nfd[0].canonStart.add(code) */
c=norm->nfd[0];
otherNorm=createNorm(c);
if(otherNorm->canonStart==NULL) {
otherNorm->canonStart=uset_open(code, code+1);
if(otherNorm->canonStart==NULL) {
fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
} else {
if(!uset_add(otherNorm->canonStart, code)) {
fprintf(stderr, "gennorm error: uset_add(setOf(U+%04x), U+%04x)\n", c, code);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}
/* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
for(i=1; i<length; ++i) {
createNorm(norm->nfd[i])->unsafeStart=TRUE;
}
}
}
static uint32_t
@ -1153,6 +1201,7 @@ makeAux() {
Norm *norm;
uint32_t *pData;
int32_t i, length;
UErrorCode errorCode=U_ZERO_ERROR;
pData=utrie_getData(&auxTrie, &length);
@ -1165,6 +1214,23 @@ makeAux() {
pData[i]=
((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
(uint32_t)(norm->fncIndex<<_NORM_AUX_FNC_SHIFT);
if(norm->unsafeStart || norm->udataCC!=0) {
pData[i]|=_NORM_AUX_UNSAFE_MASK;
}
if(!uset_isEmpty(norm->canonStart)) {
pData[i]|=(uint32_t)canonStartSetsTop;
canonStartSetsTop+=
uset_serialize(norm->canonStart,
canonStartSets+canonStartSetsTop,
_NORM_AUX_MAX_CANON_SET-canonStartSetsTop,
&errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
exit(errorCode);
}
}
}
}
@ -1357,13 +1423,19 @@ generateData(const char *dataDir) {
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
}
/* pad canonStartSets to 4-alignment, too */
if(canonStartSetsTop&1) {
canonStartSets[canonStartSetsTop++]=0x1235;
}
size=
_NORM_INDEX_TOP*4+
normTrieSize+
extraMem->index*2+
combiningTableTop*2+
fcdTrieSize+
auxTrieSize;
auxTrieSize+
canonStartSetsTop*2;
if(beVerbose) {
printf("size of normalization trie %5lu bytes\n", normTrieSize);
@ -1372,6 +1444,7 @@ generateData(const char *dataDir) {
printf("size of combining table %5lu uint16_t\n", combiningTableTop);
printf("size of FCD trie %5lu bytes\n", fcdTrieSize);
printf("size of auxiliary trie %5lu bytes\n", auxTrieSize);
printf("size of canonStartSets %5lu uint16_t\n", canonStartSetsTop);
printf("size of " DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
}
@ -1387,7 +1460,7 @@ generateData(const char *dataDir) {
indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
/* ### TODO indexes[_NORM_INDEX_UNICODE_SET_COUNT]=###; */
indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
/* write the data */
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
@ -1403,6 +1476,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
/* finish up */
dataLength=udata_finish(pData, &errorCode);
@ -1420,6 +1494,13 @@ generateData(const char *dataDir) {
extern void
cleanUpData(void) {
int32_t i, count;
count=(int32_t)normMem->index;
for(i=0; i<count; ++i) {
uset_close(norms[i].canonStart);
}
utm_close(normMem);
utm_close(utf32Mem);
utm_close(extraMem);