ICU-1851 CODAN implementation, take 1

X-SVN-Rev: 12686
This commit is contained in:
Vladimir Weinstein 2003-07-25 05:31:54 +00:00
parent e371874f36
commit 29de7f8abe
5 changed files with 623 additions and 86 deletions

View File

@ -2657,7 +2657,7 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
}
// Now we have the character that needs to be decomposed
// if the normalizing buffer was not used, we can just use our structure and be happy.
if(source->flags & UCOL_ITER_INNORMBUF == 0) {
if((source->flags & UCOL_ITER_INNORMBUF) == 0) {
// decompose into writable buffer
int32_t decompLen = unorm_getDecomposition(cp, FALSE, &(source->writableBuffer[1]), UCOL_WRITABLE_BUFFER_SIZE-1);
if(decompLen < 0) {
@ -3027,6 +3027,221 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
}
return CE;
}
case DIGIT_TAG:
  {
    /*
     Digit handling for forward iteration.
     When numeric collation is on, the whole run of digits starting at the
     current character is consumed and turned into one generated key: a lead
     CE carrying a header byte and an exponent byte, followed by continuation
     CEs carrying the digits packed two per byte (base 100).
     When numeric collation is off, the CE stashed in the expansion table is
     picked up and processing continues with it.
    */
    uint32_t i; /* general counter */
    collIterateState digitState;
    if (coll->numericCollation == UCOL_ON){
      UChar32 char32 = 0;
      uint32_t digIndx = 0;
      uint32_t endIndex = 0;
      uint32_t trailingZeroIndex = 0;
      uint32_t primWeight = 0;
      uint32_t digVal = 0;
      uint8_t collateVal = 0;
      UBool nonZeroValReached = false;
      uint8_t *numTempBuf;
      uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // temporary place to store the generated key bytes
      uint32_t numTempBufSize = UCOL_MAX_BUFFER;

      numTempBuf = stackNumTempBuf;
      /*
       We parse the source string until we hit a char that's NOT a digit.
       If we have arrived here, possible supplementaries that triggered the
       digit tag have already been processed into cp.
      */
      digVal = u_charDigitValue(cp);
      /*
       We pad a zero in front of the first element anyways. This takes
       care of the (probably) most common case where people are sorting things
       followed by a single digit.
      */
      digIndx++;
      for(;;){
        // Make sure we have enough space. Slots 0 and 1 are reserved and the
        // digit at position d lands in slot d/2 + 2, so the buffer has to grow
        // before d reaches 2 * (size - 2).
        if (digIndx >= (numTempBufSize - 2) * 2)
        {
          uint32_t grownSize = numTempBufSize * 2;
          uint8_t *grown;
          if (numTempBuf == stackNumTempBuf){
            grown = (uint8_t *)malloc(sizeof(uint8_t) * grownSize);
            if (grown != NULL) {
              memcpy(grown, stackNumTempBuf, UCOL_MAX_BUFFER);
            }
          } else {
            // realloc may move the block: the returned pointer must be kept.
            grown = (uint8_t *)realloc(numTempBuf, grownSize);
            if (grown == NULL) {
              free(numTempBuf); // the old block is still ours on failure
            }
          }
          if (grown == NULL) {
            // NOTE(review): the enclosing function's error channel is not
            // visible from here; if it has an error-status out-parameter,
            // report the allocation failure through it as well.
            return 0;
          }
          numTempBuf = grown;
          numTempBufSize = grownSize;
        }
        // Skipping over leading zeroes.
        if (digVal != 0 || nonZeroValReached){
          if (digVal != 0 && !nonZeroValReached) {
            nonZeroValReached = true;
          }
          /*
           We parse the digit string into base 100 numbers (this fits into a byte).
           We only add to the buffer in twos: at an odd digIndx the digit is the
           'ones' digit that completes the pair, at an even digIndx it is the
           'tens' digit that starts one. Each base 100 value (collateVal) is
           stored as collateVal*2 + 6. The last byte gets one subtracted after
           the loop.
          */
          if (digIndx % 2 == 1){
            collateVal += (uint8_t)digVal;
            // This removes trailing zeroes.
            if (collateVal == 0 && !trailingZeroIndex) {
              trailingZeroIndex = ((digIndx-1)/2) + 2;
            } else if (trailingZeroIndex) {
              trailingZeroIndex = 0;
            }
            numTempBuf[((digIndx-1)/2) + 2] = collateVal*2 + 6;
            collateVal = 0;
          }
          else{
            // We drop the collation value into the buffer so if we need to do
            // a "front patch" we don't have to check to see if we're hitting the
            // last element.
            collateVal = (uint8_t)(digVal * 10);
            numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
          }
          digIndx++;
        }
        // Get next character.
        if (!collIter_eos(source)){
          ch = getNextNormalizedChar(source);
          if (U16_IS_LEAD(ch)){
            if (!collIter_eos(source)) {
              backupState(source, &digitState);
              UChar trail = getNextNormalizedChar(source);
              if(U16_IS_TRAIL(trail)) {
                char32 = U16_GET_SUPPLEMENTARY(ch, trail);
              } else {
                loadState(source, &digitState, TRUE);
                char32 = ch;
              }
            } else {
              // Lone lead surrogate at the end of input: test it by itself
              // instead of re-testing the previous code point.
              char32 = ch;
            }
          } else {
            char32 = ch;
          }
          // digVal is unsigned; the -1 "not a digit" result still compares
          // equal here after the usual arithmetic conversions.
          if ((digVal = u_charDigitValue(char32)) == -1){
            // Resetting position to point to the next unprocessed char. We
            // overshot it when doing our test/set for numbers.
            if (char32 > 0xFFFF) { // For surrogates.
              loadState(source, &digitState, TRUE);
            }
            goBackOne(source);
            break;
          }
        } else {
          break;
        }
      }
      if (nonZeroValReached == false){
        // Nothing but zeroes: encode a single zero pair.
        digIndx = 2;
        numTempBuf[2] = 6;
      }
      endIndex = trailingZeroIndex ? trailingZeroIndex : ((digIndx/2) + 2) ;
      if (digIndx % 2 != 0){
        /*
         We missed a value. Since digIndx isn't even, we stuck too many values
         into the buffer (this is what we get for padding the first byte with
         a zero). "Front-patch" now by pushing all nybbles forward.
        */
        for(i = 2; i < endIndex; i++){
          numTempBuf[i] = (((((numTempBuf[i] - 6)/2) % 10) * 10) +
                           (((numTempBuf[i+1])-6)/2) / 10) * 2 + 6;
        }
        --digIndx;
      }
      // Subtract one off of the last byte.
      numTempBuf[endIndex-1] -= 1;
      /*
       We want to skip over the first two slots in the buffer. The first slot
       is reserved for the header byte 0x1B. The second slot is for the
       sign/exponent byte: 0x80 + ((digIndx/2) & 0x7F).
      */
      numTempBuf[0] = 0x1B;
      numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
      // Now transfer the collation key to our collIterate struct: the lead CE
      // is returned, the digit bytes follow as continuation CEs.
      CE = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
           (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
           UCOL_BYTE_COMMON; // Tertiary weight.
      i = 2; // Reset the index into the buffer.
      while(i < endIndex)
      {
        primWeight = numTempBuf[i++] << 8;
        if ( i < endIndex) {
          primWeight |= numTempBuf[i++];
        }
        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
      }
      if (numTempBuf != stackNumTempBuf) {
        free(numTempBuf);
      }
    } else {
      // no numeric mode, we'll just switch to whatever we stashed and continue
      CEOffset = (uint32_t *)coll->image+getExpansionOffset(CE); /* find the offset to expansion table */
      CE = *CEOffset++;
      break;
    }
    return CE;
  }
/* various implicits optimization */
// TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
@ -3492,6 +3707,217 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
source->CEpos = source->CEs;
}
return *(source->toReturn);
case DIGIT_TAG:
  {
    /*
     Digit handling for backward iteration.
     When numeric collation is on, the run of digits ending at the current
     character is consumed back to front and turned into one generated key,
     written into the CE buffer as a lead CE (header byte and exponent byte)
     followed by continuation CEs with the digits packed two per byte
     (base 100); the last CE written is returned.
     When numeric collation is off, the CE stashed in the expansion table is
     picked up and processing continues with it.
    */
    uint32_t i; /* general counter */
    collIterateState state;
    if (coll->numericCollation == UCOL_ON){
      UChar32 char32 = 0;
      uint32_t digIndx = 0;
      uint32_t endIndex = 0;
      uint32_t leadingZeroIndex = 0;
      uint32_t primWeight = 0;
      uint32_t digVal = 0;
      uint8_t collateVal = 0;
      UBool nonZeroValReached = false;
      uint8_t *numTempBuf;
      uint8_t stackNumTempBuf[UCOL_MAX_BUFFER]; // temporary place to store the generated key bytes
      uint32_t numTempBufSize = UCOL_MAX_BUFFER;

      numTempBuf = stackNumTempBuf;
      /*
       We parse the source string until we hit a char that's NOT a digit.
       Use u_charDigitValue. This might be slow because we have to
       handle surrogates...
      */
      if (U16_IS_TRAIL (ch)){
        if (!collIter_bos(source)){
          UChar lead = getPrevNormalizedChar(source);
          if(U16_IS_LEAD(lead)) {
            char32 = U16_GET_SUPPLEMENTARY(lead,ch);
            goBackOne(source);
          } else {
            char32 = ch;
          }
        } else {
          char32 = ch;
        }
      } else {
        char32 = ch;
      }
      digVal = u_charDigitValue(char32);
      for(;;){
        // Make sure we have enough space. Slots 0 and 1 are reserved and the
        // pair containing position d lands in slot d/2 + 2 (including the
        // write after the loop), so the buffer has to grow before d reaches
        // 2 * (size - 2).
        if (digIndx >= (numTempBufSize - 2) * 2)
        {
          uint32_t grownSize = numTempBufSize * 2;
          uint8_t *grown;
          if (numTempBuf == stackNumTempBuf){
            grown = (uint8_t *)malloc(sizeof(uint8_t) * grownSize);
            if (grown != NULL) {
              memcpy(grown, stackNumTempBuf, UCOL_MAX_BUFFER);
            }
          } else {
            // realloc may move the block: the returned pointer must be kept.
            grown = (uint8_t *)realloc(numTempBuf, grownSize);
            if (grown == NULL) {
              free(numTempBuf); // the old block is still ours on failure
            }
          }
          if (grown == NULL) {
            // NOTE(review): the enclosing function's error channel is not
            // visible from here; if it has an error-status out-parameter,
            // report the allocation failure through it as well.
            return 0;
          }
          numTempBuf = grown;
          numTempBufSize = grownSize;
        }
        // Skipping over "trailing" zeroes but we still add to digIndx.
        if (digVal != 0 || nonZeroValReached){
          if (digVal != 0 && !nonZeroValReached) {
            nonZeroValReached = true;
          }
          /*
           We parse the digit string into base 100 numbers (this fits into a byte).
           Since we're doing this in reverse we put the first digit encountered
           into the ones place and the second digit encountered into the tens
           place. Each base 100 value (collateVal) is stored as collateVal*2 + 6.
          */
          if (digIndx % 2 == 1){
            collateVal += (uint8_t)(digVal * 10);
            // This removes leading zeroes.
            if (collateVal == 0 && !leadingZeroIndex) {
              leadingZeroIndex = ((digIndx-1)/2) + 2;
            } else if (leadingZeroIndex) {
              leadingZeroIndex = 0;
            }
            numTempBuf[((digIndx-1)/2) + 2] = collateVal*2 + 6;
            collateVal = 0;
          }
          else{
            collateVal = (uint8_t)digVal;
          }
        }
        digIndx++;
        if (!collIter_bos(source)){
          ch = getPrevNormalizedChar(source);
          if (U16_IS_TRAIL(ch)){
            backupState(source, &state);
            if (!collIter_bos(source))
            {
              goBackOne(source);
              UChar lead = getPrevNormalizedChar(source);
              if(U16_IS_LEAD(lead)) {
                char32 = U16_GET_SUPPLEMENTARY(lead,ch);
              } else {
                loadState(source, &state, FALSE);
                char32 = ch;
              }
            } else {
              // Lone trail surrogate at the start of input: test it by itself
              // instead of re-testing the previous code point.
              char32 = ch;
            }
          }
          else {
            char32 = ch;
          }
          // digVal is unsigned; the -1 "not a digit" result still compares
          // equal here after the usual arithmetic conversions.
          if ((digVal = u_charDigitValue(char32)) == -1){
            if (char32 > 0xFFFF) {// For surrogates.
              loadState(source, &state, FALSE);
            }
            // Don't need to "reverse" the goBackOne call,
            // as this points to the next position to process..
            break;
          }
          goBackOne(source);
        }else {
          break;
        }
      }
      if (nonZeroValReached == false){
        // Nothing but zeroes: encode a single zero pair.
        digIndx = 2;
        numTempBuf[2] = 6;
      }
      if (digIndx % 2 != 0){
        // An unpaired digit is pending in collateVal: flush it as its own byte.
        numTempBuf[((digIndx)/2) + 2] = collateVal*2 + 6;
        digIndx += 1;
      }
      endIndex = leadingZeroIndex ? leadingZeroIndex : ((digIndx/2) + 2) ;
      // Subtract one off of the last byte. Really the first byte here, but it's reversed...
      // NOTE(review): zero digits met before the first non-zero one are skipped
      // above while digIndx still advances, so slot 2 (and other low slots) may
      // not have been written on this path (e.g. a run ending in "00") - verify.
      numTempBuf[2] -= 1;
      /*
       We want to skip over the first two slots in the buffer. The first slot
       is reserved for the header byte 0x1B. The second slot is for the
       sign/exponent byte: 0x80 + ((digIndx/2) & 0x7F).
      */
      numTempBuf[0] = 0x1B;
      numTempBuf[1] = (uint8_t)(0x80 + ((digIndx/2) & 0x7F));
      // Now transfer the collation key to our collIterate struct: lead CE
      // first, then the digit bytes from the highest slot down to slot 2.
      *(source->CEpos++) = (((numTempBuf[0] << 8) | numTempBuf[1]) << UCOL_PRIMARYORDERSHIFT) | //Primary weight
                           (UCOL_BYTE_COMMON << UCOL_SECONDARYORDERSHIFT) | // Secondary weight
                           UCOL_BYTE_COMMON; // Tertiary weight.
      i = endIndex - 1; // Reset the index into the buffer.
      while(i >= 2)
      {
        primWeight = numTempBuf[i--] << 8;
        if ( i >= 2) {
          primWeight |= numTempBuf[i--];
        }
        *(source->CEpos++) = (primWeight << UCOL_PRIMARYORDERSHIFT) | UCOL_CONTINUATION_MARKER;
      }
      if (numTempBuf != stackNumTempBuf) {
        free(numTempBuf);
      }
      source->toReturn = source->CEpos -1;
      return *(source->toReturn);
    }
    else {
      // no numeric mode, we'll just switch to whatever we stashed and continue
      CEOffset = (uint32_t *)coll->image + getExpansionOffset(CE);
      CE = *(CEOffset++);
      break;
    }
  }
case HANGUL_SYLLABLE_TAG: /* AC00-D7AF*/
{
const uint32_t
@ -6485,6 +6911,20 @@ ucol_setAttribute(UCollator *coll, UColAttribute attr, UColAttributeValue value,
UColAttributeValue oldFrench = coll->frenchCollation;
UColAttributeValue oldCaseFirst = coll->caseFirst;
switch(attr) {
case UCOL_NUMERIC_COLLATION: /* sort substrings of digits as numbers */
  if(value == UCOL_ON || value == UCOL_OFF) {
    /* explicit on/off: store it and remember it is no longer the default */
    coll->numericCollation = value;
    coll->numericCollationisDefault = FALSE;
  } else if(value == UCOL_DEFAULT) {
    /* fall back to the value recorded in the collator's option set */
    coll->numericCollation = (UColAttributeValue)coll->options->numericCollation;
    coll->numericCollationisDefault = TRUE;
  } else {
    /* any other value is not meaningful for this attribute */
    *status = U_ILLEGAL_ARGUMENT_ERROR;
  }
  break;
case UCOL_HIRAGANA_QUATERNARY_MODE: /* special quaternary values for Hiragana */
if(value == UCOL_ON) {
coll->hiraganaQ = UCOL_ON;
@ -6602,6 +7042,8 @@ ucol_getAttribute(const UCollator *coll, UColAttribute attr, UErrorCode *status)
return UCOL_DEFAULT;
}
switch(attr) {
case UCOL_NUMERIC_COLLATION:
return coll->numericCollation;
case UCOL_HIRAGANA_QUATERNARY_MODE:
return coll->hiraganaQ;
case UCOL_FRENCH_COLLATION: /* attribute for direction of secondary weights*/
@ -8399,3 +8841,4 @@ returnResult:
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -470,7 +470,7 @@ int uprv_uca_setMaxExpansion(uint32_t endexpansion,
start = mid;
}
}
if (*start == endexpansion) {
result = start - pendexpansionce;
}
@ -478,7 +478,7 @@ int uprv_uca_setMaxExpansion(uint32_t endexpansion,
if (*limit == endexpansion) {
result = limit - pendexpansionce;
}
if (result > -1) {
/* found the ce in expansion, we'll just modify the size if it is
smaller */
@ -494,7 +494,7 @@ int uprv_uca_setMaxExpansion(uint32_t endexpansion,
int shiftsize = (pendexpansionce + pos) - start;
uint32_t *shiftpos = start + 1;
uint8_t *sizeshiftpos = pexpansionsize + (shiftpos - pendexpansionce);
/* okay need to rearrange the array into sorted order */
if (shiftsize == 0 || *(pendexpansionce + pos) < endexpansion) {
*(pendexpansionce + pos + 1) = endexpansion;
@ -631,7 +631,7 @@ int uprv_uca_setMaxJamoExpansion(UChar ch,
*(pendexpansionce + maxexpansion->position) = endexpansion;
*(maxexpansion->isV + maxexpansion->position) = isV;
maxexpansion->position ++;
return maxexpansion->position;
}
@ -1012,9 +1012,12 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
if(U_FAILURE(*status)) {
return 0xFFFF;
}
element->mapCE = 0; // clear mapCE so that we can catch expansions
if(element->noOfCEs == 1) {
if(element->isThai == FALSE) {
element->mapCE = element->CEs[0];
element->mapCE = element->CEs[0];
} else { /* add thai - totally bad here */
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (THAI_TAG<<UCOL_TAG_SHIFT)
| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
@ -1048,10 +1051,10 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
| ((element->CEs[0]>>8) & 0xFFFF00) // first and second byte of primary
| ((element->CEs[1]>>24) & 0xFF); // third byte of primary
} else {
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
& 0xFFFFF0);
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (EXPANSION_TAG<<UCOL_TAG_SHIFT)
| ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4)
& 0xFFFFF0);
for(i = 1; i<element->noOfCEs; i++) {
uprv_uca_addExpansion(expansions, element->CEs[i], status);
}
@ -1076,6 +1079,32 @@ uprv_uca_addAnElement(tempUCATable *t, UCAElements *element, UErrorCode *status)
}
}
// We treat digits differently - they are "uber special" and should be
// processed differently if numeric collation is on.
UChar32 uniChar = 0;
//printElement(element);
if ((element->cSize == 2) && U16_IS_LEAD(element->uchars[0])){
uniChar = U16_GET_SUPPLEMENTARY(element->uchars[0], element->uchars[1]);
} else if (element->cSize == 1){
uniChar = element->uchars[0];
}
// Here, we either have one normal CE OR mapCE is set. Therefore, we stuff only
// one element to the expansion buffer. When we encounter a digit and we don't
// do numeric collation, we will just pick the CE we have and break out of case
// (see ucol.cpp ucol_prv_getSpecialCE && ucol_prv_getSpecialPrevCE). If we picked
// a special, further processing will occur. If it's a simple CE, we'll return due
// to how the loop is constructed.
if (uniChar != 0 && u_isdigit(uniChar)){
expansion = (uint32_t)(UCOL_SPECIAL_FLAG | (DIGIT_TAG<<UCOL_TAG_SHIFT) | 1); // prepare the element
if(element->mapCE) { // if there is an expansion, we'll pick it here
expansion |= ((uprv_uca_addExpansion(expansions, element->mapCE, status)+(headersize>>2))<<4);
} else {
expansion |= ((uprv_uca_addExpansion(expansions, element->CEs[0], status)+(headersize>>2))<<4);
}
element->mapCE = expansion;
}
// here we want to add the prefix structure.
// I will try to process it as a reverse contraction, if possible.
// prefix buffer is already reversed.
@ -1157,7 +1186,7 @@ void uprv_uca_getMaxExpansionJamo(UNewTrie *mapping,
const uint32_t TBASE = 0x11A8;
const uint32_t VCOUNT = 21;
const uint32_t TCOUNT = 28;
uint32_t v = VBASE + VCOUNT - 1;
uint32_t t = TBASE + TCOUNT - 1;
uint32_t ce;
@ -1556,3 +1585,5 @@ uprv_uca_canonicalClosure(tempUCATable *t, UErrorCode *status)
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -635,6 +635,9 @@ typedef enum {
LONG_PRIMARY_TAG = 12, /* This is a three byte primary with starting secondaries and tertiaries */
/* It fits in a single 32 bit CE and is used instead of expansion to save */
/* space without affecting the performance (hopefully) */
DIGIT_TAG = 13, /* COllate Digits As Numbers (CODAN) implementation */
CE_TAGS_COUNT
} UColCETags;
@ -653,7 +656,8 @@ typedef struct {
/*UColAttributeValue*/ int32_t caseLevel; /* do we have an extra case level */
/*UColAttributeValue*/ int32_t normalizationMode; /* attribute for normalization */
/*UColAttributeValue*/ int32_t strength; /* attribute for strength */
/*UColAttributeValue*/ int32_t hiraganaQ; /* attribuge for special Hiragana */
/*UColAttributeValue*/ int32_t hiraganaQ; /* attribute for special Hiragana */
/*UColAttributeValue*/ int32_t numericCollation;
uint8_t reserved[64]; /* for future use */
} UColOptionSet;
@ -862,6 +866,7 @@ struct UCollator {
UColAttributeValue normalizationMode; /* attribute for normalization */
UColAttributeValue strength; /* attribute for strength */
UColAttributeValue hiraganaQ; /* attribute for Hiragana */
UColAttributeValue numericCollation;
UBool variableTopValueisDefault;
UBool frenchCollationisDefault;
UBool alternateHandlingisDefault; /* attribute for handling variable elements*/
@ -870,6 +875,7 @@ struct UCollator {
UBool normalizationModeisDefault; /* attribute for normalization */
UBool strengthisDefault; /* attribute for strength */
UBool hiraganaQisDefault; /* attribute for Hiragana */
UBool numericCollationisDefault;
UBool hasRealData; /* some collators have only options, like French, no rules */
/* to speed up things, we use the UCA image, but we don't want it */
/* to run around */
@ -980,4 +986,3 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
#endif /* #if !UCONFIG_NO_COLLATION */
#endif

View File

@ -212,13 +212,16 @@ void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, U
case UCOL_STRENGTH:
opts->strength = value;
break;
case UCOL_NUMERIC_COLLATION:
opts->numericCollation = value;
break;
case UCOL_ATTRIBUTE_COUNT:
default:
break;
}
}
#define UTOK_OPTION_COUNT 19
#define UTOK_OPTION_COUNT 20
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
@ -265,6 +268,7 @@ U_STRING_DECL(option_15, "first", 5);
U_STRING_DECL(option_16, "last", 4);
U_STRING_DECL(option_17, "optimize", 8);
U_STRING_DECL(option_18, "suppressContractions", 20);
U_STRING_DECL(option_19, "numericOrdering", 15);
/*
@ -320,6 +324,30 @@ static const ucolTokSuboption firstLastSub[7] = {
{suboption_17, 8, UCOL_PRIMARY},
};
/*
 * Symbolic names for the entries of rulesOptions[]. Each value is the index
 * of the matching entry in that table, so the two must be kept in the same
 * order.
 */
enum OptionNumber {
    /* options that take a suboption and set an attribute in the image */
    OPTION_ALTERNATE_HANDLING    = 0,
    OPTION_FRENCH_COLLATION      = 1,
    OPTION_CASE_LEVEL            = 2,
    OPTION_CASE_FIRST            = 3,
    OPTION_NORMALIZATION_MODE    = 4,
    OPTION_HIRAGANA_QUATERNARY   = 5,
    OPTION_STRENGTH              = 6,
    OPTION_NUMERIC_COLLATION     = 7,
    /* alias for the last of the attribute-setting options above */
    OPTION_NORMAL_OPTIONS_LIMIT  = OPTION_NUMERIC_COLLATION,
    /* options with their own handling */
    OPTION_VARIABLE_TOP          = 8,
    OPTION_REARRANGE             = 9,
    OPTION_BEFORE                = 10,
    OPTION_TOP                   = 11,
    /* OPTION_FIRST and OPTION_LAST must stay adjacent, in this order */
    OPTION_FIRST                 = 12,
    OPTION_LAST                  = 13,
    OPTION_OPTIMIZE              = 14,
    OPTION_SUPPRESS_CONTRACTIONS = 15,
    OPTION_UNDEFINED             = 16,
    OPTION_SCRIPT_ORDER          = 17,
    OPTION_CHARSET_NAME          = 18,
    OPTION_CHARSET               = 19
};
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
/*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
/*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
@ -328,18 +356,19 @@ static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
/*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
/*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
/*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
/*07*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
/*08*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
/*09*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
/*10*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
/*11*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
/*12*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
/*13*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
/*14*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
/*15*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
/*16*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
/*17*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
/*18*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
/*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
/*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
/*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
/*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
/*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
/*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
/*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
/*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
/*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
/*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
/*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
/*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
/*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
};
static
@ -406,6 +435,7 @@ void ucol_uprv_tok_initData() {
U_STRING_INIT(option_16, "last", 4);
U_STRING_INIT(option_17, "optimize", 8);
U_STRING_INIT(option_18, "suppressContractions", 20);
U_STRING_INIT(option_19, "numericOrdering", 15);
didInit = TRUE;
}
}
@ -558,65 +588,85 @@ uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status)
if(i < 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
} else if(i<7) {
if(optionArg) {
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
result = UCOL_TOK_SUCCESS;
}
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else if(i == 7) { /* variable top */
result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
} else if(i == 8) { /*rearange */
result = UCOL_TOK_SUCCESS;
} else if(i == 9) { /*before*/
if(optionArg) {
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
}
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else if(i == 10) { /*top */ /* we are going to have an array with structures of limit CEs */
/* index to this array will be src->parsedToken.indirectIndex*/
src->parsedToken.indirectIndex = 0;
result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
} else if(i == 11 || i ==12) { /* first, last */
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
src->parsedToken.indirectIndex = (uint16_t)(i-10+j*2);
result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
} else if(i == 13 || i == 14) { // copy and remove are handled before normalization
// we need to move end here
int32_t noOpenBraces = 1;
src->current++; // skip opening brace
while(src->current < src->end && noOpenBraces != 0) {
if(*src->current == 0x005b) {
noOpenBraces++;
} else if(*src->current == 0x005D) { // closing brace
noOpenBraces--;
}
src->current++;
}
result = UCOL_TOK_SUCCESS;
} else {
*status = U_UNSUPPORTED_ERROR;
int32_t noOpenBraces = 1;
switch(i) {
case OPTION_ALTERNATE_HANDLING:
case OPTION_FRENCH_COLLATION:
case OPTION_CASE_LEVEL:
case OPTION_CASE_FIRST:
case OPTION_NORMALIZATION_MODE:
case OPTION_HIRAGANA_QUATERNARY:
case OPTION_STRENGTH:
case OPTION_NUMERIC_COLLATION:
if(optionArg) {
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
result = UCOL_TOK_SUCCESS;
}
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
break;
case OPTION_VARIABLE_TOP:
result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
break;
case OPTION_REARRANGE:
result = UCOL_TOK_SUCCESS;
break;
case OPTION_BEFORE:
if(optionArg) {
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
result = UCOL_TOK_SUCCESS | rulesOptions[i].subopts[j].attrVal + 1;
}
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
break;
case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
/* index to this array will be src->parsedToken.indirectIndex*/
src->parsedToken.indirectIndex = 0;
result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
break;
case OPTION_FIRST:
case OPTION_LAST: /* first, last */
for(j = 0; j<rulesOptions[i].subSize; j++) {
if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
// the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
// element of indirect boundaries is reserved for top.
src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;;
}
}
if(result == 0) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
}
break;
case OPTION_OPTIMIZE:
case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
// we need to move end here
src->current++; // skip opening brace
while(src->current < src->end && noOpenBraces != 0) {
if(*src->current == 0x005b) {
noOpenBraces++;
} else if(*src->current == 0x005D) { // closing brace
noOpenBraces--;
}
src->current++;
}
result = UCOL_TOK_SUCCESS;
break;
default:
*status = U_UNSUPPORTED_ERROR;
break;
}
}
//src->current = u_strchr(src->current, 0x005d /*']'*/);
src->current = u_memchr(src->current, 0x005d, src->end-src->current);
return result;
}
@ -1546,7 +1596,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
// while((openBrace = u_strchr(openBrace, 0x005B)) != NULL) { // find open braces
//optionNumber = ucol_uprv_tok_readOption(openBrace+1, rules+rulesLength, &setStart);
optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);
if(optionNumber == 13) { /* copy - parts of UCA to tailoring */
if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
if(U_SUCCESS(*status)) {
if(src->copySet == NULL) {
@ -1558,7 +1608,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
} else {
return;
}
} else if(optionNumber == 14) {
} else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
if(U_SUCCESS(*status)) {
if(src->removeSet == NULL) {
@ -1698,3 +1748,4 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -228,7 +228,13 @@ typedef enum {
* non-ignorables on quaternary level
* This is a sneaky way to produce JIS
* sort order */
UCOL_HIRAGANA_QUATERNARY_MODE,
UCOL_HIRAGANA_QUATERNARY_MODE,
/** when turned on, this attribute
* generates a collation key
* for the numeric value of substrings
* of digits. This is a way to get '100'
* to sort AFTER '2'.*/
UCOL_NUMERIC_COLLATION,
UCOL_ATTRIBUTE_COUNT
} UColAttribute;
@ -793,3 +799,4 @@ ucol_getTailoredSet(const UCollator *coll, UErrorCode *status);
#endif /* #if !UCONFIG_NO_COLLATION */
#endif