From b11c49bcd5f1cf543df5e38de59702e8b1b6560e Mon Sep 17 00:00:00 2001 From: Umesh Nair Date: Tue, 9 Feb 2010 19:59:06 +0000 Subject: [PATCH] ICU-7015 Implementing compact collation tailoring syntax by introducing a star-notation. X-SVN-Rev: 27521 --- icu4c/source/i18n/ucol_tok.cpp | 51 +++++++++++++++++++++----- icu4c/source/i18n/ucol_tok.h | 3 +- icu4c/source/test/cintltst/cmsccoll.c | 52 +++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/icu4c/source/i18n/ucol_tok.cpp b/icu4c/source/i18n/ucol_tok.cpp index 193a422028..b30286544f 100644 --- a/icu4c/source/i18n/ucol_tok.cpp +++ b/icu4c/source/i18n/ucol_tok.cpp @@ -760,6 +760,7 @@ ucol_tok_parseNextToken(UColTokenParser *src, uint32_t extensionOffset = 0; uint32_t newStrength = UCOL_TOK_UNSET; UChar buff[10]; + UChar32 codepoint; src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0; src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0; @@ -823,6 +824,12 @@ ucol_tok_parseNextToken(UColTokenParser *src, goto EndOfLoop; } newStrength = UCOL_IDENTICAL; + if(*(src->current+1) == 0x002A) {/*'*'*/ + src->current++; + src->prevStrength = newStrength; + }else{ + src->prevStrength = UCOL_TOK_UNSET; + } break; case 0x002C/*','*/: @@ -838,6 +845,7 @@ ucol_tok_parseNextToken(UColTokenParser *src, goto EndOfLoop; } newStrength = UCOL_TERTIARY; + src->prevStrength = UCOL_TOK_UNSET; break; case 0x003B/*';'*/: @@ -853,6 +861,7 @@ ucol_tok_parseNextToken(UColTokenParser *src, goto EndOfLoop; } newStrength = UCOL_SECONDARY; + src->prevStrength = UCOL_TOK_UNSET; break; case 0x003C/*'<'*/: @@ -880,6 +889,12 @@ ucol_tok_parseNextToken(UColTokenParser *src, } else { /* just one */ newStrength = UCOL_PRIMARY; } + if(*(src->current+1) == 0x002A) {/*'*'*/ + src->current++; + src->prevStrength = newStrength; + }else{ + src->prevStrength = UCOL_TOK_UNSET; + } break; case 0x0026/*'&'*/: @@ -889,6 +904,7 @@ ucol_tok_parseNextToken(UColTokenParser *src, } newStrength = 
UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ + src->prevStrength = UCOL_TOK_UNSET; break; case 0x005b/*'['*/: @@ -953,11 +969,15 @@ ucol_tok_parseNextToken(UColTokenParser *src, /* found a quote, we're gonna start copying */ case 0x0027/*'\''*/: if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */ - *status = U_INVALID_FORMAT_ERROR; - syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); - return NULL; - // enabling rules to start with a non-token character a < b - // newStrength = UCOL_TOK_RESET; + if(src->prevStrength == UCOL_TOK_UNSET){ + *status = U_INVALID_FORMAT_ERROR; + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); + return NULL; + // enabling rules to start with a non-token character a < b + // newStrength = UCOL_TOK_RESET; + }else{ + newStrength = src->prevStrength; + } } inQuote = TRUE; @@ -1036,9 +1056,13 @@ ucol_tok_parseNextToken(UColTokenParser *src, break; default: if (newStrength == UCOL_TOK_UNSET) { - *status = U_INVALID_FORMAT_ERROR; - syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); - return NULL; + if(src->prevStrength == UCOL_TOK_UNSET){ + *status = U_INVALID_FORMAT_ERROR; + syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); + return NULL; + }else{ + newStrength = src->prevStrength; + } } if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) { @@ -1056,6 +1080,11 @@ ucol_tok_parseNextToken(UColTokenParser *src, src->parsedToken.charsOffset = (uint32_t)(src->current - src->source); } src->parsedToken.charsLen++; + if(src->prevStrength != UCOL_TOK_UNSET){ + U16_NEXT(0, src->current, src->end, codepoint); + src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1; + goto EndOfLoop; + } } else { if(newExtensionLen == 0) { extensionOffset = (uint32_t)(src->current - src->source); @@ -1069,6 +1098,10 @@ 
ucol_tok_parseNextToken(UColTokenParser *src, } if(wasInQuote) { + if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){ + src->current++; + goto EndOfLoop; + } if(ch != 0x27) { if(inQuote || !uprv_isRuleWhiteSpace(ch)) { ucol_tok_addToExtraCurrent(src, &ch, 1, status); @@ -1826,6 +1859,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint src->parsedToken.flags = 0; src->parsedToken.strength = UCOL_TOK_UNSET; src->buildCCTabFlag = FALSE; + src->prevStrength = UCOL_TOK_UNSET; if(U_FAILURE(*status)) { return; @@ -1915,4 +1949,3 @@ void ucol_tok_closeTokenList(UColTokenParser *src) { } #endif /* #if !UCONFIG_NO_COLLATION */ - diff --git a/icu4c/source/i18n/ucol_tok.h b/icu4c/source/i18n/ucol_tok.h index 0662110059..87beb6469f 100644 --- a/icu4c/source/i18n/ucol_tok.h +++ b/icu4c/source/i18n/ucol_tok.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2001-2008, International Business Machines +* Copyright (C) 2001-2010, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -123,6 +123,7 @@ typedef struct { USet *copySet; USet *removeSet; UBool buildCCTabFlag; /* Tailoring rule requirs building combining class table. 
*/ + uint32_t prevStrength; /* strength of the most recent relation operator that was immediately followed by '*'; UCOL_TOK_UNSET when no star list is active */ } UColTokenParser; typedef struct { diff --git a/icu4c/source/test/cintltst/cmsccoll.c b/icu4c/source/test/cintltst/cmsccoll.c index f9364e3957..a077e3d916 100644 --- a/icu4c/source/test/cintltst/cmsccoll.c +++ b/icu4c/source/test/cintltst/cmsccoll.c @@ -5433,6 +5433,57 @@ static void TestHiragana(void) { ucol_close(ucol); } +const static UChar testSameStrengthSourceCases[][MAX_TOKEN_LEN] = { /* left-hand strings passed to doTest by TestSameStrengthList, one row per comparison */ + {0x0061}, + {0x0061}, + {0x006c, 0x0061}, + {0x0061, 0x0061, 0x0061}, + {0x0062} +}; + /* right-hand strings; row i is compared with row i of the source table */ +const static UChar testSameStrengthTargetCases[][MAX_TOKEN_LEN] = { + {0x0031}, + {0x006d}, + {0x006b, 0x0062}, + {0x0031, 0x0032, 0x0033}, + {0x007a} +}; + /* expected doTest outcome for each source/target row */ +const static UCollationResult sameStrengthResults[] = { + UCOL_EQUAL, + UCOL_LESS, + UCOL_LESS, + UCOL_EQUAL, + UCOL_LESS +}; + /* Opens a collator from the star-notation rules in srules and runs doTest on the five source/target rows of the tables above. */ +static void TestSameStrengthList(void) +{ + + int32_t i; + UParseError error; + UErrorCode status = U_ZERO_ERROR; + UCollator *myCollation; + char srules[500] = "&a<*bcd &b<<*klm &k<<<*xyz &a=*123"; /* one '*' list for each of the operators <, <<, <<< and = */ + UChar rules[500]; + uint32_t length = 0; /* NOTE(review): passed as &length to u_strFromUTF8 and by value to ucol_openRules; confirm those parameters are not declared with the signed int32_t type */ + + u_strFromUTF8(rules, 500, &length, srules, strlen(srules), &status); /* status is not inspected here; the U_FAILURE check below runs only after ucol_openRules */ + myCollation = ucol_openRules(rules, length, UCOL_ON, UCOL_TERTIARY, &error, &status); + if(U_FAILURE(status)){ + log_err_status(status, "ERROR: in creation of rule based collator: %s\n", myErrorName(status)); + return; + } + log_verbose("Testing the <<* syntax\n"); + /*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); + ucol_setStrength(myCollation, UCOL_TERTIARY);*/ + for (i = 0; i < 5 ; i++) /* 5 is the number of rows in each of the three tables above */ + { + doTest(myCollation, testSameStrengthSourceCases[i], testSameStrengthTargetCases[i], sameStrengthResults[i]); + } + ucol_close(myCollation); +} + #define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x) void addMiscCollTest(TestNode** root) @@ -5508,6 +5559,7 @@ void addMiscCollTest(TestNode** root) TEST(TestTailor6179); TEST(TestUCAPrecontext); TEST(TestOutOfBuffer5468); + TEST(TestSameStrengthList); /* registers the star-notation test defined above */ } #endif /* #if !UCONFIG_NO_COLLATION */