ICU-1586 better compaction, find all identical blocks like ucmpX_compact()

X-SVN-Rev: 7787
This commit is contained in:
Markus Scherer 2002-02-26 02:06:12 +00:00
parent bb6a024c9a
commit 21bbcb0e19

View File

@ -474,34 +474,69 @@ utrie_fold(UNewTrie *trie, UNewTrieGetFoldedValue *getFoldedValue, UErrorCode *p
#endif
trie->indexLength=indexLength;
return;
}
/*
 * Flag every data block in trie->map as referenced or unreferenced,
 * based on the entries of the trie index.
 * utrie_compact() drops the data blocks that are flagged as unreferenced.
 *
 * After this function, a map entry is
 * -  0 if the block is referenced by at least one index entry
 * - -1 if no index entry refers to the block
 */
static void
_findUnusedBlocks(UNewTrie *trie) {
    const int32_t indexCount=trie->indexLength;
    int32_t entry=0;

    /* start with every block flagged as unreferenced (all bits set) */
    uprv_memset(trie->map, 0xff, (UTRIE_MAX_BUILD_TIME_DATA_LENGTH>>UTRIE_SHIFT)*4);

    /* clear the flag of each block that some index entry points into */
    while(entry<indexCount) {
        /* index values may be negative, only their magnitude locates the block */
        int32_t blockNumber=ABS(trie->index[entry])>>UTRIE_SHIFT;

        trie->map[blockNumber]=0;
        ++entry;
    }

    /* block 0, the all-initial-value block, always counts as referenced */
    trie->map[0]=0;
}
/*
 * Search the leading part of a data array for a block that is identical
 * to a given block.
 *
 * Candidate blocks start at offset 0 and advance by step. The search includes
 * the block at offset 0: utrie_compact() never moves that all-initial-value
 * block, so any later block with the same contents can be mapped onto it.
 *
 * @param data       the data array
 * @param dataLength number of leading values of data that may be searched;
 *                   a candidate block must lie entirely below this limit
 * @param otherBlock start offset of the block for which a duplicate is sought
 * @param step       distance between candidate start offsets; must be positive,
 *                   otherwise the loop does not advance
 * @return the start offset of the first candidate whose
 *         UTRIE_DATA_BLOCK_LENGTH values all equal those at otherBlock,
 *         or -1 if there is no such candidate
 */
static int32_t
_findSameDataBlock(const uint32_t *data, int32_t dataLength,
                   int32_t otherBlock, int32_t step) {
    int32_t block, i;

    /* ensure that we do not even partially get past dataLength */
    dataLength-=UTRIE_DATA_BLOCK_LENGTH;

    /* start at 0 so that the first block is a candidate as well */
    for(block=0; block<=dataLength; block+=step) {
        for(i=0; i<UTRIE_DATA_BLOCK_LENGTH; ++i) {
            if(data[block+i]!=data[otherBlock+i]) {
                break;
            }
        }
        if(i==UTRIE_DATA_BLOCK_LENGTH) {
            /* all values matched */
            return block;
        }
    }
    return -1;
}
/*
* Compact a folded build-time trie.
*
* The compaction
* - removes all-initial-value blocks
* - maps all blocks that are completely filled with the same values to only one of them
* - overlaps adjacent blocks as much as possible
* - removes blocks that are identical with earlier ones
* - overlaps adjacent blocks as much as possible (if overlap==TRUE)
* - moves blocks in steps of the data granularity
*
* It does not
* - find blocks that are identical but not completely filled with the same value
* - try to move and overlap blocks that are not already adjacent
* - try to move and overlap blocks that overlap with multiple values in the overlap region
*/
static void
utrie_compact(UNewTrie *trie, UErrorCode *pErrorCode) {
/*
* Map of whole blocks that are filled with all the same value.
* The first such block per value is stored in this lookup table,
* and following blocks will be replaced with the previous block's index.
*/
uint32_t wholeBlockValues[64];
int32_t wholeBlockIndexes[64];
utrie_compact(UNewTrie *trie, UBool overlap, UErrorCode *pErrorCode) {
uint32_t x;
int32_t i, start, prevEnd, newStart, overlapStart, countWholeBlocks;
UBool addWholeBlock;
int32_t i, start, prevEnd, newStart, overlapStart;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
return;
@ -518,13 +553,8 @@ utrie_compact(UNewTrie *trie, UErrorCode *pErrorCode) {
/* compaction */
/* never move the all-initial-value block 0 */
trie->map[0]=0;
/* prime the whole blocks lookup table with the all-initial-value block 0 */
wholeBlockValues[0]=trie->data[0];
wholeBlockIndexes[0]=0;
countWholeBlocks=1;
/* initialize the index map with "block is used/unused" flags */
_findUnusedBlocks(trie);
/* if Latin-1 is preallocated and linear, then do not compact Latin-1 data */
if(trie->isLatin1Linear && UTRIE_SHIFT<=8) {
@ -542,58 +572,35 @@ utrie_compact(UNewTrie *trie, UErrorCode *pErrorCode) {
* newStart: index where the current block is to be moved
*/
/* x: first value in the current block */
x=trie->data[start];
addWholeBlock=FALSE;
/* skip blocks that are not used */
if(trie->map[start>>UTRIE_SHIFT]<0) {
/* advance start to the next block */
start+=UTRIE_DATA_BLOCK_LENGTH;
/* see if the current block is filled with this value x */
for(i=1; i<UTRIE_DATA_BLOCK_LENGTH && x==trie->data[start+i]; ++i) {}
if(i==UTRIE_DATA_BLOCK_LENGTH) {
/*
* yes, the block is filled with x
* if this is the first such block, then add it to the whole block lookup table,
* but defer that until after overlap checking
*/
if(countWholeBlocks<sizeof(wholeBlockValues)/4) {
addWholeBlock=TRUE;
}
/* leave prevEnd and newStart with the previous block! */
continue;
}
/* did we already see another block that is also filled with x? */
for(i=0; i<countWholeBlocks; ++i) {
if(x==wholeBlockValues[i]) {
if(start>=overlapStart) {
/* yes, set the other block's index value for the current block */
trie->map[start>>UTRIE_SHIFT]=wholeBlockIndexes[i];
/* search for an identical block */
if( start>=overlapStart &&
(i=_findSameDataBlock(trie->data, newStart, start,
overlap ? UTRIE_DATA_GRANULARITY : UTRIE_DATA_BLOCK_LENGTH))
>=0
) {
/* found an identical block, set the other block's index value for the current block */
trie->map[start>>UTRIE_SHIFT]=i;
/* advance start to the next block */
start+=UTRIE_DATA_BLOCK_LENGTH;
/* advance start to the next block */
start+=UTRIE_DATA_BLOCK_LENGTH;
/* leave prevEnd and newStart with the previous block! */
break; /* Java: continue outerLoop; */
} else {
/*
* Latin-1 is linear and this is a Latin-1 block
* do not replace its index value (to keep it linear)
* do not add it into the whole blocks lookup table
* (because an equivalent block is in there already)
* finish the rest of the outer loop
*/
addWholeBlock=FALSE;
}
}
}
if(i<countWholeBlocks) {
/*
* this is from the break in the previous loop
* because C does not allow a multi-level "continue outer loop";
* we have replaced this block's index value by a repeat block
*/
continue;
}
/* leave prevEnd and newStart with the previous block! */
continue;
}
/* see if the beginning of this block can be overlapped with the end of the previous block */
if(x==trie->data[prevEnd] && start>=overlapStart) {
/* x: first value in the current block */
x=trie->data[start];
if(x==trie->data[prevEnd] && overlap && start>=overlapStart) {
/* overlap by at least one */
for(i=1; i<UTRIE_DATA_BLOCK_LENGTH && x==trie->data[start+i] && x==trie->data[prevEnd-i]; ++i) {}
@ -603,13 +610,6 @@ utrie_compact(UNewTrie *trie, UErrorCode *pErrorCode) {
i=0;
}
if(addWholeBlock) {
/* add this block to the lookup table */
wholeBlockValues[countWholeBlocks]=x;
wholeBlockIndexes[countWholeBlocks]=newStart-i;
++countWholeBlocks;
}
if(i>0) {
/* some overlap */
trie->map[start>>UTRIE_SHIFT]=newStart-i;
@ -646,7 +646,6 @@ utrie_compact(UNewTrie *trie, UErrorCode *pErrorCode) {
#endif
trie->dataLength=newStart;
return;
}
/* serialization ------------------------------------------------------------ */
@ -722,8 +721,15 @@ utrie_serialize(UNewTrie *trie, uint8_t *data, int32_t capacity,
/* fold and compact if necessary, also checks that indexLength is within limits */
if(!trie->isCompacted) {
/* compact once without overlap to improve folding */
utrie_compact(trie, FALSE, pErrorCode);
/* fold the supplementary part of the index array */
utrie_fold(trie, getFoldedValue, pErrorCode);
utrie_compact(trie, pErrorCode);
/* compact again with overlap for minimum data array length */
utrie_compact(trie, TRUE, pErrorCode);
trie->isCompacted=TRUE;
if(U_FAILURE(*pErrorCode)) {
return 0;