ICU-7015 Implementing compact collation tailoring syntax by introducing a star-notation.

X-SVN-Rev: 27521
This commit is contained in:
Umesh Nair 2010-02-09 19:59:06 +00:00
parent 6d69ce9111
commit b11c49bcd5
3 changed files with 96 additions and 10 deletions

View File

@ -760,6 +760,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
uint32_t extensionOffset = 0; uint32_t extensionOffset = 0;
uint32_t newStrength = UCOL_TOK_UNSET; uint32_t newStrength = UCOL_TOK_UNSET;
UChar buff[10]; UChar buff[10];
UChar32 codepoint;
src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
@ -823,6 +824,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop; goto EndOfLoop;
} }
newStrength = UCOL_IDENTICAL; newStrength = UCOL_IDENTICAL;
if(*(src->current+1) == 0x002A) {/*'*'*/
src->current++;
src->prevStrength = newStrength;
}else{
src->prevStrength = UCOL_TOK_UNSET;
}
break; break;
case 0x002C/*','*/: case 0x002C/*','*/:
@ -838,6 +845,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop; goto EndOfLoop;
} }
newStrength = UCOL_TERTIARY; newStrength = UCOL_TERTIARY;
src->prevStrength = UCOL_TOK_UNSET;
break; break;
case 0x003B/*';'*/: case 0x003B/*';'*/:
@ -853,6 +861,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop; goto EndOfLoop;
} }
newStrength = UCOL_SECONDARY; newStrength = UCOL_SECONDARY;
src->prevStrength = UCOL_TOK_UNSET;
break; break;
case 0x003C/*'<'*/: case 0x003C/*'<'*/:
@ -880,6 +889,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
} else { /* just one */ } else { /* just one */
newStrength = UCOL_PRIMARY; newStrength = UCOL_PRIMARY;
} }
if(*(src->current+1) == 0x002A) {/*'*'*/
src->current++;
src->prevStrength = newStrength;
}else{
src->prevStrength = UCOL_TOK_UNSET;
}
break; break;
case 0x0026/*'&'*/: case 0x0026/*'&'*/:
@ -889,6 +904,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
} }
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
src->prevStrength = UCOL_TOK_UNSET;
break; break;
case 0x005b/*'['*/: case 0x005b/*'['*/:
@ -953,11 +969,15 @@ ucol_tok_parseNextToken(UColTokenParser *src,
/* found a quote, we're gonna start copying */ /* found a quote, we're gonna start copying */
case 0x0027/*'\''*/: case 0x0027/*'\''*/:
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
*status = U_INVALID_FORMAT_ERROR; if(src->prevStrength == UCOL_TOK_UNSET){
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); *status = U_INVALID_FORMAT_ERROR;
return NULL; syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
// enabling rules to start with a non-token character a < b return NULL;
// newStrength = UCOL_TOK_RESET; // enabling rules to start with a non-token character a < b
// newStrength = UCOL_TOK_RESET;
}else{
newStrength = src->prevStrength;
}
} }
inQuote = TRUE; inQuote = TRUE;
@ -1036,9 +1056,13 @@ ucol_tok_parseNextToken(UColTokenParser *src,
break; break;
default: default:
if (newStrength == UCOL_TOK_UNSET) { if (newStrength == UCOL_TOK_UNSET) {
*status = U_INVALID_FORMAT_ERROR; if(src->prevStrength == UCOL_TOK_UNSET){
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); *status = U_INVALID_FORMAT_ERROR;
return NULL; syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}else{
newStrength = src->prevStrength;
}
} }
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
@ -1056,6 +1080,11 @@ ucol_tok_parseNextToken(UColTokenParser *src,
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
} }
src->parsedToken.charsLen++; src->parsedToken.charsLen++;
if(src->prevStrength != UCOL_TOK_UNSET){
U16_NEXT(0, src->current, src->end, codepoint);
src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
goto EndOfLoop;
}
} else { } else {
if(newExtensionLen == 0) { if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->current - src->source); extensionOffset = (uint32_t)(src->current - src->source);
@ -1069,6 +1098,10 @@ ucol_tok_parseNextToken(UColTokenParser *src,
} }
if(wasInQuote) { if(wasInQuote) {
if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
src->current++;
goto EndOfLoop;
}
if(ch != 0x27) { if(ch != 0x27) {
if(inQuote || !uprv_isRuleWhiteSpace(ch)) { if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
ucol_tok_addToExtraCurrent(src, &ch, 1, status); ucol_tok_addToExtraCurrent(src, &ch, 1, status);
@ -1826,6 +1859,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
src->parsedToken.flags = 0; src->parsedToken.flags = 0;
src->parsedToken.strength = UCOL_TOK_UNSET; src->parsedToken.strength = UCOL_TOK_UNSET;
src->buildCCTabFlag = FALSE; src->buildCCTabFlag = FALSE;
src->prevStrength = UCOL_TOK_UNSET;
if(U_FAILURE(*status)) { if(U_FAILURE(*status)) {
return; return;
@ -1915,4 +1949,3 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
} }
#endif /* #if !UCONFIG_NO_COLLATION */ #endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -1,7 +1,7 @@
/* /*
******************************************************************************* *******************************************************************************
* *
* Copyright (C) 2001-2008, International Business Machines * Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
* *
******************************************************************************* *******************************************************************************
@ -123,6 +123,7 @@ typedef struct {
USet *copySet; USet *copySet;
USet *removeSet; USet *removeSet;
UBool buildCCTabFlag; /* Tailoring rule requirs building combining class table. */ UBool buildCCTabFlag; /* Tailoring rule requirs building combining class table. */
uint32_t prevStrength;
} UColTokenParser; } UColTokenParser;
typedef struct { typedef struct {

View File

@ -5433,6 +5433,57 @@ static void TestHiragana(void) {
ucol_close(ucol); ucol_close(ucol);
} }
/*
 * Left-hand strings for TestSameStrengthList. Entry i is compared against
 * testSameStrengthTargetCases[i]; code points are spelled in hex.
 */
static const UChar testSameStrengthSourceCases[][MAX_TOKEN_LEN] = {
    { 0x0061 },                 /* "a"   */
    { 0x0061 },                 /* "a"   */
    { 0x006c, 0x0061 },         /* "la"  */
    { 0x0061, 0x0061, 0x0061 }, /* "aaa" */
    { 0x0062 }                  /* "b"   */
};
/*
 * Right-hand strings for TestSameStrengthList. Entry i is compared against
 * testSameStrengthSourceCases[i]; code points are spelled in hex.
 */
static const UChar testSameStrengthTargetCases[][MAX_TOKEN_LEN] = {
    { 0x0031 },                 /* "1"   */
    { 0x006d },                 /* "m"   */
    { 0x006b, 0x0062 },         /* "kb"  */
    { 0x0031, 0x0032, 0x0033 }, /* "123" */
    { 0x007a }                  /* "z"   */
};
/*
 * Expected outcome of comparing testSameStrengthSourceCases[i] with
 * testSameStrengthTargetCases[i] in TestSameStrengthList.
 */
static const UCollationResult sameStrengthResults[] = {
    UCOL_EQUAL, /* "a"   vs "1"   */
    UCOL_LESS,  /* "a"   vs "m"   */
    UCOL_LESS,  /* "la"  vs "kb"  */
    UCOL_EQUAL, /* "aaa" vs "123" */
    UCOL_LESS   /* "b"   vs "z"   */
};
static void TestSameStrengthList(void)
{
int32_t i;
UParseError error;
UErrorCode status = U_ZERO_ERROR;
UCollator *myCollation;
char srules[500] = "&a<*bcd &b<<*klm &k<<<*xyz &a=*123";
UChar rules[500];
uint32_t length = 0;
u_strFromUTF8(rules, 500, &length, srules, strlen(srules), &status);
myCollation = ucol_openRules(rules, length, UCOL_ON, UCOL_TERTIARY, &error, &status);
if(U_FAILURE(status)){
log_err_status(status, "ERROR: in creation of rule based collator: %s\n", myErrorName(status));
return;
}
log_verbose("Testing the <<* syntax\n");
/*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
ucol_setStrength(myCollation, UCOL_TERTIARY);*/
for (i = 0; i < 5 ; i++)
{
doTest(myCollation, testSameStrengthSourceCases[i], testSameStrengthTargetCases[i], sameStrengthResults[i]);
}
ucol_close(myCollation);
}
#define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x) #define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x)
void addMiscCollTest(TestNode** root) void addMiscCollTest(TestNode** root)
@ -5508,6 +5559,7 @@ void addMiscCollTest(TestNode** root)
TEST(TestTailor6179); TEST(TestTailor6179);
TEST(TestUCAPrecontext); TEST(TestUCAPrecontext);
TEST(TestOutOfBuffer5468); TEST(TestOutOfBuffer5468);
TEST(TestSameStrengthList);
} }
#endif /* #if !UCONFIG_NO_COLLATION */ #endif /* #if !UCONFIG_NO_COLLATION */