ICU-1754 store data for canonical closure
X-SVN-Rev: 7927
This commit is contained in:
parent
74d240ec1e
commit
dad663425c
@ -73,15 +73,17 @@ enum {
|
||||
|
||||
/* value constants for auxTrie */
|
||||
enum {
|
||||
_NORM_AUX_CANON_FLAG_SHIFT=11,
|
||||
_NORM_AUX_UNSAFE_SHIFT=14,
|
||||
_NORM_AUX_FNC_SHIFT=20,
|
||||
_NORM_AUX_COMP_EX_SHIFT=30,
|
||||
_NORM_AUX_IS_LEAD_SHIFT=31
|
||||
};
|
||||
|
||||
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
|
||||
#define _NORM_AUX_MAX_CANON_SET ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
|
||||
#define _NORM_AUX_MAX_FNC ((int32_t)1<<(_NORM_AUX_COMP_EX_SHIFT-_NORM_AUX_FNC_SHIFT))
|
||||
|
||||
#define _NORM_AUX_CANON_SET_MASK (((uint32_t)1<<_NORM_AUX_CANON_FLAG_SHIFT)-1)
|
||||
#define _NORM_AUX_CANON_SET_MASK (_NORM_AUX_MAX_CANON_SET-1)
|
||||
#define _NORM_AUX_UNSAFE_MASK ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
|
||||
#define _NORM_AUX_FNC_MASK ((uint32_t)(_NORM_AUX_MAX_FNC-1)<<_NORM_AUX_FNC_SHIFT)
|
||||
#define _NORM_AUX_COMP_EX_MASK ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
|
||||
#define _NORM_AUX_IS_LEAD_MASK ((uint32_t)1<<_NORM_AUX_IS_LEAD_SHIFT)
|
||||
@ -104,7 +106,7 @@ enum {
|
||||
_NORM_INDEX_FCD_TRIE_SIZE, /* number of bytes in FCD trie */
|
||||
|
||||
_NORM_INDEX_AUX_TRIE_SIZE, /* number of bytes in the auxiliary trie */
|
||||
_NORM_INDEX_UNICODE_SET_COUNT, /* number of int32_t in the UnicodeSet array */
|
||||
_NORM_INDEX_CANON_SET_COUNT, /* number of uint16_t in the array of serialized USet */
|
||||
|
||||
_NORM_INDEX_TOP=32 /* changing this requires a new formatVersion */
|
||||
};
|
||||
@ -280,7 +282,8 @@ unorm_internalIsFullCompositionExclusion(UChar32 c);
|
||||
*
|
||||
* UTrie auxTrie; -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
|
||||
*
|
||||
* int32_t unicodeSets[unicodeSetsTop] -- unicodeSetsTop=indexes[_NORM_INDEX_UNICODE_SET_COUNT]
|
||||
* uint16_t canonStartSets[canonStartSetsTop] -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
|
||||
* serialized USets, see uset.c
|
||||
*
|
||||
*
|
||||
* The indexes array contains lengths and sizes of the following arrays and structures
|
||||
@ -459,9 +462,12 @@ unorm_internalIsFullCompositionExclusion(UChar32 c);
|
||||
* 29..20 index into extraData[] to FC_NFKC_Closure string (bit 31==0),
|
||||
* or lead surrogate offset (bit 31==1)
|
||||
* 19..16 skippable flags
|
||||
* 15..13 reserved
|
||||
* 11 flag: not a safe starter for canonical closure
|
||||
* 10.. 0 index to UnicodeSet for canonical closure
|
||||
* 15 reserved
|
||||
* 14 flag: not a safe starter for canonical closure
|
||||
* 13.. 0 index to serialized USet for canonical closure
|
||||
* the set lists the code points whose canonical decompositions start with
|
||||
* the code point that this data is for
|
||||
* for how USets are serialized see uset.c
|
||||
*
|
||||
* - FC_NFKC_Closure strings in extraData[]
|
||||
*
|
||||
|
@ -18,6 +18,7 @@
|
||||
#define __GENPROPS_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "uset.h"
|
||||
|
||||
/* file definitions */
|
||||
#define DATA_NAME "unorm"
|
||||
@ -34,6 +35,8 @@ typedef struct Norm {
|
||||
uint32_t *nfd, *nfkd;
|
||||
uint32_t value32; /* temporary variable for generating runtime norm32 and fcd values */
|
||||
int32_t fncIndex;
|
||||
USet *canonStart;
|
||||
UBool unsafeStart;
|
||||
} Norm;
|
||||
|
||||
/* global flags */
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include "filestrm.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "utrie.h"
|
||||
#include "uset.h"
|
||||
#include "unewdata.h"
|
||||
#include "unormimp.h"
|
||||
#include "gennorm.h"
|
||||
@ -180,6 +181,9 @@ typedef struct CombiningTriple {
|
||||
static uint16_t combiningTable[0x8000];
|
||||
static uint16_t combiningTableTop=0;
|
||||
|
||||
static uint16_t canonStartSets[_NORM_AUX_MAX_CANON_SET]={ 0 };
|
||||
static int32_t canonStartSetsTop=1;
|
||||
|
||||
extern void
|
||||
init() {
|
||||
uint16_t *p16;
|
||||
@ -781,7 +785,7 @@ setCompositionExclusion(uint32_t code) {
|
||||
static void
|
||||
setHangulJamoSpecials() {
|
||||
Norm *norm;
|
||||
uint32_t c;
|
||||
uint32_t c, hangul;
|
||||
|
||||
/*
|
||||
* Hangul syllables are algorithmically decomposed into Jamos,
|
||||
@ -790,10 +794,15 @@ setHangulJamoSpecials() {
|
||||
*/
|
||||
|
||||
/* set Jamo L specials */
|
||||
hangul=0xac00;
|
||||
for(c=0x1100; c<=0x1112; ++c) {
|
||||
norm=createNorm(c);
|
||||
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_L;
|
||||
norm->combiningFlags=1;
|
||||
|
||||
/* for each Jamo L create a set with its associated Hangul block */
|
||||
norm->canonStart=uset_open(hangul, hangul+21*28);
|
||||
hangul+=21*28;
|
||||
}
|
||||
|
||||
/* set Jamo V specials */
|
||||
@ -801,6 +810,7 @@ setHangulJamoSpecials() {
|
||||
norm=createNorm(c);
|
||||
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_V;
|
||||
norm->combiningFlags=2;
|
||||
norm->unsafeStart=TRUE;
|
||||
}
|
||||
|
||||
/* set Jamo T specials */
|
||||
@ -808,6 +818,7 @@ setHangulJamoSpecials() {
|
||||
norm=createNorm(c);
|
||||
norm->specialTag=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_JAMO_T;
|
||||
norm->combiningFlags=2;
|
||||
norm->unsafeStart=TRUE;
|
||||
}
|
||||
|
||||
/* set Hangul specials, precompacted */
|
||||
@ -916,6 +927,8 @@ static UBool combineAndQC[64]={ 0 };
|
||||
/*
|
||||
* canonically reorder the up to two decompositions
|
||||
* and store the leading and trailing combining classes accordingly
|
||||
*
|
||||
* also process canonical decompositions for canonical closure
|
||||
*/
|
||||
static void
|
||||
postParseFn(void *context, uint32_t code, Norm *norm) {
|
||||
@ -941,7 +954,7 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
|
||||
fprintf(stderr, "gennorm warning: U+%04lx has NFD[%d] NFKD[%d] but quick check 0x%02x\n", (long)code, norm->lenNFD, norm->lenNFKD, norm->qcFlags);
|
||||
}
|
||||
|
||||
/* ### see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
|
||||
/* see which combinations of combiningFlags and qcFlags are used for NFC/NFKC */
|
||||
combineAndQC[(norm->qcFlags&0x33)|((norm->combiningFlags&3)<<2)]=1;
|
||||
|
||||
if(norm->combiningFlags&1) {
|
||||
@ -964,6 +977,41 @@ postParseFn(void *context, uint32_t code, Norm *norm) {
|
||||
if((norm->combiningFlags&3)==3 && beVerbose) {
|
||||
printf("U+%04lx combines both ways\n", (long)code);
|
||||
}
|
||||
|
||||
/*
|
||||
* process canonical decompositions for canonical closure
|
||||
*
|
||||
* in each canonical decomposition:
|
||||
* add the current character (code) to the set of canonical starters of its norm->nfd[0]
|
||||
* set the "unsafe starter" flag for each norm->nfd[1..]
|
||||
*/
|
||||
length=norm->lenNFD;
|
||||
if(length>0) {
|
||||
Norm *otherNorm;
|
||||
UChar32 c;
|
||||
int32_t i;
|
||||
|
||||
/* nfd[0].canonStart.add(code) */
|
||||
c=norm->nfd[0];
|
||||
otherNorm=createNorm(c);
|
||||
if(otherNorm->canonStart==NULL) {
|
||||
otherNorm->canonStart=uset_open(code, code+1);
|
||||
if(otherNorm->canonStart==NULL) {
|
||||
fprintf(stderr, "gennorm error: out of memory in uset_open()\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
} else {
|
||||
if(!uset_add(otherNorm->canonStart, code)) {
|
||||
fprintf(stderr, "gennorm error: uset_add(setOf(U+%04x), U+%04x)\n", c, code);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/* for(i=1..length-1) nfd[i].unsafeStart=TRUE */
|
||||
for(i=1; i<length; ++i) {
|
||||
createNorm(norm->nfd[i])->unsafeStart=TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static uint32_t
|
||||
@ -1153,6 +1201,7 @@ makeAux() {
|
||||
Norm *norm;
|
||||
uint32_t *pData;
|
||||
int32_t i, length;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
pData=utrie_getData(&auxTrie, &length);
|
||||
|
||||
@ -1165,6 +1214,23 @@ makeAux() {
|
||||
pData[i]=
|
||||
((uint32_t)(norm->combiningFlags&0x80)<<(_NORM_AUX_COMP_EX_SHIFT-7))|
|
||||
(uint32_t)(norm->fncIndex<<_NORM_AUX_FNC_SHIFT);
|
||||
|
||||
if(norm->unsafeStart || norm->udataCC!=0) {
|
||||
pData[i]|=_NORM_AUX_UNSAFE_MASK;
|
||||
}
|
||||
|
||||
if(!uset_isEmpty(norm->canonStart)) {
|
||||
pData[i]|=(uint32_t)canonStartSetsTop;
|
||||
canonStartSetsTop+=
|
||||
uset_serialize(norm->canonStart,
|
||||
canonStartSets+canonStartSetsTop,
|
||||
_NORM_AUX_MAX_CANON_SET-canonStartSetsTop,
|
||||
&errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "gennorm error: uset_serialize()->%s (canonStartSetsTop=%d)\n", u_errorName(errorCode), canonStartSetsTop);
|
||||
exit(errorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1357,13 +1423,19 @@ generateData(const char *dataDir) {
|
||||
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
|
||||
}
|
||||
|
||||
/* pad canonStartSets to 4-byte alignment, too */
|
||||
if(canonStartSetsTop&1) {
|
||||
canonStartSets[canonStartSetsTop++]=0x1235;
|
||||
}
|
||||
|
||||
size=
|
||||
_NORM_INDEX_TOP*4+
|
||||
normTrieSize+
|
||||
extraMem->index*2+
|
||||
combiningTableTop*2+
|
||||
fcdTrieSize+
|
||||
auxTrieSize;
|
||||
auxTrieSize+
|
||||
canonStartSetsTop*2;
|
||||
|
||||
if(beVerbose) {
|
||||
printf("size of normalization trie %5lu bytes\n", normTrieSize);
|
||||
@ -1372,6 +1444,7 @@ generateData(const char *dataDir) {
|
||||
printf("size of combining table %5lu uint16_t\n", combiningTableTop);
|
||||
printf("size of FCD trie %5lu bytes\n", fcdTrieSize);
|
||||
printf("size of auxiliary trie %5lu bytes\n", auxTrieSize);
|
||||
printf("size of canonStartSets %5lu uint16_t\n", canonStartSetsTop);
|
||||
printf("size of " DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
||||
}
|
||||
|
||||
@ -1387,7 +1460,7 @@ generateData(const char *dataDir) {
|
||||
|
||||
indexes[_NORM_INDEX_FCD_TRIE_SIZE]=fcdTrieSize;
|
||||
indexes[_NORM_INDEX_AUX_TRIE_SIZE]=auxTrieSize;
|
||||
/* ### TODO indexes[_NORM_INDEX_UNICODE_SET_COUNT]=###; */
|
||||
indexes[_NORM_INDEX_CANON_SET_COUNT]=canonStartSetsTop;
|
||||
|
||||
/* write the data */
|
||||
pData=udata_create(dataDir, DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
@ -1403,6 +1476,7 @@ generateData(const char *dataDir) {
|
||||
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
|
||||
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
|
||||
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
|
||||
udata_writeBlock(pData, canonStartSets, canonStartSetsTop*2);
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
@ -1420,6 +1494,13 @@ generateData(const char *dataDir) {
|
||||
|
||||
extern void
|
||||
cleanUpData(void) {
|
||||
int32_t i, count;
|
||||
|
||||
count=(int32_t)normMem->index;
|
||||
for(i=0; i<count; ++i) {
|
||||
uset_close(norms[i].canonStart);
|
||||
}
|
||||
|
||||
utm_close(normMem);
|
||||
utm_close(utf32Mem);
|
||||
utm_close(extraMem);
|
||||
|
Loading…
Reference in New Issue
Block a user