ICU-861 Added last-char-in-contraction hash table

X-SVN-Rev: 4642
This commit is contained in:
Andy Heninger 2001-05-10 22:12:53 +00:00
parent 9bbdff614f
commit ccb47e551d
4 changed files with 104 additions and 39 deletions

View File

@ -484,7 +484,9 @@ static const uint16_t *FCD_STAGE_3_;
inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
if (c < coll->minUnsafeCP) return FALSE;
if (c < coll->minUnsafeCP) {
return FALSE;
}
int32_t hash = c;
uint8_t htbyte;
@ -513,7 +515,30 @@ inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
}
inline UBool ucol_contractionEndCP(UChar c, const UCollator *coll) {
return TRUE;
return true;
if (c < coll->minContrEndCP) {
return FALSE;
}
int32_t hash = c;
uint8_t htbyte;
if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
}
htbyte = coll->contrEndCP[hash>>3];
if (((htbyte >> (hash & 7)) & 1) == 1) {
return TRUE;
}
/* TODO: main UCA table data needs to be merged into tailoring tables, */
/* and this second level of test removed from here. */
if (coll == UCA || UCA == NULL) {
return FALSE;
}
htbyte = UCA->contrEndCP[hash>>3];
return ((htbyte >> (hash & 7)) & 1) == 1;
}
@ -588,6 +613,13 @@ UCollator* ucol_initCollator(const UCATableHeader *image, UCollator *fillIn, UEr
}
result->minUnsafeCP = c;
result->contrEndCP = (uint8_t *)result->image + result->image->contrEndCP;
result->minContrEndCP = 0;
for (c=0; c<0x300; c++) { // Find the Contraction-ending char.
if (ucol_contractionEndCP(c, result)) break;
}
result->minContrEndCP = c;
/* max expansion tables */
result->endExpansionCE = (uint32_t*)((uint8_t*)result->image +
result->image->endExpansionCE);
@ -2737,15 +2769,22 @@ int32_t ucol_getSortKeySize(const UCollator *coll, collIterate *s, int32_t curre
int32_t i = 0;
int32_t c, prev=0x50;
int32_t diff;
while(i<len) {
UTF_NEXT_CHAR(ident, i, len, c);
diff = c-prev;
if(diff>=SLOPE_REACH_NEG_1) {
currentSize += (diff<=SLOPE_REACH_POS_1)?1:((diff<=SLOPE_REACH_POS_2)?2:3);
} else {
currentSize += (diff>=SLOPE_REACH_NEG_2)?2:3;
}
prev=c;
// while(i<len) {
for (;;) {
if (len >=0 && i>=len) {
break;
}
UTF_NEXT_CHAR(ident, i, len, c);
if (c==0) {
break;
}
diff = c-prev;
if(diff>=SLOPE_REACH_NEG_1) {
currentSize += (diff<=SLOPE_REACH_POS_1)?1:((diff<=SLOPE_REACH_POS_2)?2:3);
} else {
currentSize += (diff>=SLOPE_REACH_NEG_2)?2:3;
}
prev=c;
}
}
return currentSize;
@ -3378,6 +3417,7 @@ ucol_calcSortKey(const UCollator *coll,
return sortKeySize;
}
int32_t
ucol_calcSortKeySimpleTertiary(const UCollator *coll,
const UChar *source,
@ -3387,6 +3427,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
UBool allocatePrimary,
UErrorCode *status)
{
U_ALIGN_CODE(16);
uint32_t i = 0; /* general purpose counter */
/* Stack allocated buffers for buffers we use */
@ -3411,7 +3452,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
UChar *normSource = normBuffer;
int32_t normSourceLen = UCOL_NORMALIZATION_MAX_BUFFER;
int32_t len = (sourceLength == -1 ? u_strlen(source) : sourceLength);
int32_t len = sourceLength;
collIterate s;
@ -3419,25 +3460,22 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
/* If we need to normalize, we'll do it all at once at the beggining! */
UColAttributeValue normMode = coll->normalizationMode;
if((normMode != UCOL_OFF)
/* && (unorm_quickCheck(source, len, UNORM_NFD, status) != UNORM_YES)
&& (unorm_quickCheck(source, len, UNORM_NFC, status) != UNORM_YES)) */
/* changed by synwee */
&& !checkFCD(source, len, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
*status=U_ZERO_ERROR;
normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar));
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, (normSourceLen+1), status);
if(normMode != UCOL_OFF) {
if (!checkFCD(source, len, status))
{
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, normSourceLen, status);
if(U_FAILURE(*status)) {
*status=U_ZERO_ERROR;
normSource = (UChar *) uprv_malloc((normSourceLen+1)*sizeof(UChar));
normSourceLen = unorm_normalize(source, sourceLength, UNORM_NFD, 0, normSource, (normSourceLen+1), status);
}
normSource[normSourceLen] = 0;
IInit_collIterate(coll, normSource, -1, &s);
s.flags &= ~(UCOL_ITER_NORM);
len = normSourceLen;
}
normSource[normSourceLen] = 0;
IInit_collIterate(coll, normSource, -1, &s);
s.flags &= ~(UCOL_ITER_NORM);
len = normSourceLen;
}
if(resultLength == 0 || primaries == NULL) {
return ucol_getSortKeySize(coll, &s, sortKeySize, coll->strength, len);
@ -3542,7 +3580,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
}
#endif
if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
if(secondary > 0) { /* I think that != 0 test should be != IGNORABLE */
/* This is compression code. */
if (secondary == UCOL_COMMON2 && notIsContinuation) {
++count2;
@ -3568,7 +3606,7 @@ ucol_calcSortKeySimpleTertiary(const UCollator *coll,
}
if(tertiary > 0) {
if(tertiary > 0) {
/* This is compression code. */
/* sequence size check is included in the if clause */
if (tertiary == UCOL_COMMON3 && notIsContinuation) {
@ -4287,11 +4325,7 @@ ucol_strcoll( const UCollator *coll,
const UChar *target,
int32_t targetLength)
{
#ifdef _MSC_VER
/* TODO: this really does speed thing up significantly on MSVC builds on P6 processors. */
/* What's the best way to ifdef it in? */
// __asm align 16
#endif
U_ALIGN_CODE(16);
/* Scan the strings. Find: */
/* The length of any leading portion that is equal */

View File

@ -151,8 +151,10 @@ tempUCATable * uprv_uca_initTempTable(UCATableHeader *image, UColOptionSet *opts
}
t->unsafeCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
t->contrEndCP = (uint8_t *)uprv_malloc(UCOL_UNSAFECP_TABLE_SIZE);
uprv_memset(t->unsafeCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
return t;
uprv_memset(t->contrEndCP, 0, UCOL_UNSAFECP_TABLE_SIZE);
return t;
}
void uprv_uca_closeTempTable(tempUCATable *t) {
@ -167,6 +169,7 @@ void uprv_uca_closeTempTable(tempUCATable *t) {
uprv_free(t->maxExpansions->expansionCESize);
uprv_free(t->maxExpansions);
uprv_free(t->unsafeCP);
uprv_free(t->contrEndCP);
uprv_free(t);
}
@ -301,6 +304,19 @@ int uprv_uca_setMaxExpansion(uint32_t endexpansion,
}
static void ContrEndCPSet(uint8_t *table, UChar c) {
uint32_t hash;
uint8_t *htByte;
hash = c;
if (hash >= UCOL_UNSAFECP_TABLE_SIZE*8) {
hash = (hash & UCOL_UNSAFECP_TABLE_MASK) + 256;
}
htByte = &table[hash>>3];
*htByte |= (1 << (hash & 7));
}
static void unsafeCPSet(uint8_t *table, UChar c) {
uint32_t hash;
uint8_t *htByte;
@ -334,6 +350,9 @@ uint32_t uprv_uca_addContraction(tempUCATable *t, uint32_t CE, UCAElements *elem
for (i=1; i<element->cSize; i++) { /* First add contraction chars to unsafe CP hash table */
unsafeCPSet(t->unsafeCP, element->cPoints[i]);
}
// Add the last char of the contraction to the contraction-end hash table.
ContrEndCPSet(t->contrEndCP, element->cPoints[element->cSize -1]);
if(UCOL_ISJAMO(element->cPoints[0])) {
t->image->jamoSpecial = TRUE;
}
@ -568,7 +587,9 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
+ paddedsize(maxexpansion->position * sizeof(uint32_t)) +
/* maxexpansion size array */
paddedsize(maxexpansion->position * sizeof(uint8_t)) +
paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
paddedsize(UCOL_UNSAFECP_TABLE_SIZE) + /* Unsafe chars */
paddedsize(UCOL_UNSAFECP_TABLE_SIZE); /* Contraction Ending chars */
dataStart = (uint8_t *)malloc(toAllocate);
@ -654,6 +675,11 @@ UCATableHeader *uprv_uca_assembleTable(tempUCATable *t, UErrorCode *status) {
tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
/* Contraction Ending chars hash table. Copy it out. */
myData->contrEndCP = tableOffset;
uprv_memcpy(dataStart + tableOffset, t->contrEndCP, UCOL_UNSAFECP_TABLE_SIZE);
tableOffset += paddedsize(UCOL_UNSAFECP_TABLE_SIZE);
if(tableOffset != toAllocate) {
fprintf(stderr, "calculation screwup!!! Expected to write %i but wrote %i instead!!!\n", toAllocate, tableOffset);
*status = U_INTERNAL_PROGRAM_ERROR;

View File

@ -64,6 +64,7 @@ typedef struct {
UColOptionSet *options;
MaxExpansionTable *maxExpansions;
uint8_t *unsafeCP;
uint8_t *contrEndCP;
const UCollator *UCA;
} tempUCATable;

View File

@ -532,7 +532,9 @@ typedef struct {
collation elements with last element
in endExpansionCE*/
int32_t endExpansionCECount; /* size of endExpansionCE */
uint32_t unsafeCP;
uint32_t unsafeCP; /* hash table of unsafe code points */
uint32_t contrEndCP; /* hash table of final code points */
/* in contractions. */
int32_t CEcount;
UBool jamoSpecial; /* is jamoSpecial */
@ -611,7 +613,9 @@ struct UCollator {
corresponding to endExpansionCE,
terminated with a null */
const uint8_t *unsafeCP; /* unsafe code points hashtable */
const uint8_t *contrEndCP; /* Contraction ending chars hash table */
UChar minUnsafeCP; /* Smallest unsafe Code Point. */
UChar minContrEndCP; /* Smallest code point at end of a contraction */
};
/* various internal functions */