ICU-7015 Implementing compact collation tailoring syntax by introducing a star-notation.

X-SVN-Rev: 27521
This commit is contained in:
Umesh Nair 2010-02-09 19:59:06 +00:00
parent 6d69ce9111
commit b11c49bcd5
3 changed files with 96 additions and 10 deletions

View File

@ -760,6 +760,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
uint32_t extensionOffset = 0;
uint32_t newStrength = UCOL_TOK_UNSET;
UChar buff[10];
UChar32 codepoint;
src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
@ -823,6 +824,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
if(*(src->current+1) == 0x002A) {/*'*'*/
src->current++;
src->prevStrength = newStrength;
}else{
src->prevStrength = UCOL_TOK_UNSET;
}
break;
case 0x002C/*','*/:
@ -838,6 +845,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop;
}
newStrength = UCOL_TERTIARY;
src->prevStrength = UCOL_TOK_UNSET;
break;
case 0x003B/*';'*/:
@ -853,6 +861,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
src->prevStrength = UCOL_TOK_UNSET;
break;
case 0x003C/*'<'*/:
@ -880,6 +889,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
} else { /* just one */
newStrength = UCOL_PRIMARY;
}
if(*(src->current+1) == 0x002A) {/*'*'*/
src->current++;
src->prevStrength = newStrength;
}else{
src->prevStrength = UCOL_TOK_UNSET;
}
break;
case 0x0026/*'&'*/:
@ -889,6 +904,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
}
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
src->prevStrength = UCOL_TOK_UNSET;
break;
case 0x005b/*'['*/:
@ -953,11 +969,15 @@ ucol_tok_parseNextToken(UColTokenParser *src,
/* found a quote, we're gonna start copying */
case 0x0027/*'\''*/:
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
// enabling rules to start with a non-token character a < b
// newStrength = UCOL_TOK_RESET;
if(src->prevStrength == UCOL_TOK_UNSET){
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
// enabling rules to start with a non-token character a < b
// newStrength = UCOL_TOK_RESET;
}else{
newStrength = src->prevStrength;
}
}
inQuote = TRUE;
@ -1036,9 +1056,13 @@ ucol_tok_parseNextToken(UColTokenParser *src,
break;
default:
if (newStrength == UCOL_TOK_UNSET) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
if(src->prevStrength == UCOL_TOK_UNSET){
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}else{
newStrength = src->prevStrength;
}
}
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
@ -1056,6 +1080,11 @@ ucol_tok_parseNextToken(UColTokenParser *src,
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
}
src->parsedToken.charsLen++;
if(src->prevStrength != UCOL_TOK_UNSET){
U16_NEXT(0, src->current, src->end, codepoint);
src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
goto EndOfLoop;
}
} else {
if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->current - src->source);
@ -1069,6 +1098,10 @@ ucol_tok_parseNextToken(UColTokenParser *src,
}
if(wasInQuote) {
if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
src->current++;
goto EndOfLoop;
}
if(ch != 0x27) {
if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
ucol_tok_addToExtraCurrent(src, &ch, 1, status);
@ -1826,6 +1859,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
src->parsedToken.flags = 0;
src->parsedToken.strength = UCOL_TOK_UNSET;
src->buildCCTabFlag = FALSE;
src->prevStrength = UCOL_TOK_UNSET;
if(U_FAILURE(*status)) {
return;
@ -1915,4 +1949,3 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
}
#endif /* #if !UCONFIG_NO_COLLATION */

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -123,6 +123,7 @@ typedef struct {
USet *copySet;
USet *removeSet;
UBool buildCCTabFlag; /* Tailoring rule requires building combining class table. */
uint32_t prevStrength;
} UColTokenParser;
typedef struct {

View File

@ -5433,6 +5433,57 @@ static void TestHiragana(void) {
ucol_close(ucol);
}
/*
 * Data for TestSameStrengthList. Row i of the source table is compared with
 * row i of the target table and must produce sameStrengthResults[i] under the
 * tailoring "&a<*bcd &b<<*klm &k<<<*xyz &a=*123" built by that test.
 * Rows shorter than MAX_TOKEN_LEN are zero-filled by the initializer.
 */
static const UChar testSameStrengthSourceCases[][MAX_TOKEN_LEN] = {
    {0x0061},                 /* "a"   */
    {0x0061},                 /* "a"   */
    {0x006c, 0x0061},         /* "la"  */
    {0x0061, 0x0061, 0x0061}, /* "aaa" */
    {0x0062}                  /* "b"   */
};

static const UChar testSameStrengthTargetCases[][MAX_TOKEN_LEN] = {
    {0x0031},                 /* "1"   */
    {0x006d},                 /* "m"   */
    {0x006b, 0x0062},         /* "kb"  */
    {0x0031, 0x0032, 0x0033}, /* "123" */
    {0x007a}                  /* "z"   */
};

/* Expected outcome of comparing source row i against target row i. */
static const UCollationResult sameStrengthResults[] = {
    UCOL_EQUAL, /* "a"   vs "1"   */
    UCOL_LESS,  /* "a"   vs "m"   */
    UCOL_LESS,  /* "la"  vs "kb"  */
    UCOL_EQUAL, /* "aaa" vs "123" */
    UCOL_LESS   /* "b"   vs "z"   */
};
static void TestSameStrengthList(void)
{
int32_t i;
UParseError error;
UErrorCode status = U_ZERO_ERROR;
UCollator *myCollation;
char srules[500] = "&a<*bcd &b<<*klm &k<<<*xyz &a=*123";
UChar rules[500];
uint32_t length = 0;
u_strFromUTF8(rules, 500, &length, srules, strlen(srules), &status);
myCollation = ucol_openRules(rules, length, UCOL_ON, UCOL_TERTIARY, &error, &status);
if(U_FAILURE(status)){
log_err_status(status, "ERROR: in creation of rule based collator: %s\n", myErrorName(status));
return;
}
log_verbose("Testing the <<* syntax\n");
/*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
ucol_setStrength(myCollation, UCOL_TERTIARY);*/
for (i = 0; i < 5 ; i++)
{
doTest(myCollation, testSameStrengthSourceCases[i], testSameStrengthTargetCases[i], sameStrengthResults[i]);
}
ucol_close(myCollation);
}
/*
 * Passes the address of x to addTest() together with the path string
 * "tscoll/cmsccoll/" followed by the stringized name of x. Expands to a use
 * of an identifier named root, so it only works where such a variable is in
 * scope. x is expected to be a plain identifier (it is used unparenthesized
 * after '&').
 */
#define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x)
void addMiscCollTest(TestNode** root)
@ -5508,6 +5559,7 @@ void addMiscCollTest(TestNode** root)
TEST(TestTailor6179);
TEST(TestUCAPrecontext);
TEST(TestOutOfBuffer5468);
TEST(TestSameStrengthList);
}
#endif /* #if !UCONFIG_NO_COLLATION */