ICU-7015 Implementing compact collation tailoring syntax by introducing a star-notation.
X-SVN-Rev: 27521
This commit is contained in:
parent
6d69ce9111
commit
b11c49bcd5
@ -760,6 +760,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
uint32_t extensionOffset = 0;
|
uint32_t extensionOffset = 0;
|
||||||
uint32_t newStrength = UCOL_TOK_UNSET;
|
uint32_t newStrength = UCOL_TOK_UNSET;
|
||||||
UChar buff[10];
|
UChar buff[10];
|
||||||
|
UChar32 codepoint;
|
||||||
|
|
||||||
src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
|
src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
|
||||||
src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
|
src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
|
||||||
@ -823,6 +824,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
goto EndOfLoop;
|
goto EndOfLoop;
|
||||||
}
|
}
|
||||||
newStrength = UCOL_IDENTICAL;
|
newStrength = UCOL_IDENTICAL;
|
||||||
|
if(*(src->current+1) == 0x002A) {/*'*'*/
|
||||||
|
src->current++;
|
||||||
|
src->prevStrength = newStrength;
|
||||||
|
}else{
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 0x002C/*','*/:
|
case 0x002C/*','*/:
|
||||||
@ -838,6 +845,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
goto EndOfLoop;
|
goto EndOfLoop;
|
||||||
}
|
}
|
||||||
newStrength = UCOL_TERTIARY;
|
newStrength = UCOL_TERTIARY;
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 0x003B/*';'*/:
|
case 0x003B/*';'*/:
|
||||||
@ -853,6 +861,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
goto EndOfLoop;
|
goto EndOfLoop;
|
||||||
}
|
}
|
||||||
newStrength = UCOL_SECONDARY;
|
newStrength = UCOL_SECONDARY;
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 0x003C/*'<'*/:
|
case 0x003C/*'<'*/:
|
||||||
@ -880,6 +889,12 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
} else { /* just one */
|
} else { /* just one */
|
||||||
newStrength = UCOL_PRIMARY;
|
newStrength = UCOL_PRIMARY;
|
||||||
}
|
}
|
||||||
|
if(*(src->current+1) == 0x002A) {/*'*'*/
|
||||||
|
src->current++;
|
||||||
|
src->prevStrength = newStrength;
|
||||||
|
}else{
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 0x0026/*'&'*/:
|
case 0x0026/*'&'*/:
|
||||||
@ -889,6 +904,7 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
}
|
}
|
||||||
|
|
||||||
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
|
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case 0x005b/*'['*/:
|
case 0x005b/*'['*/:
|
||||||
@ -953,11 +969,15 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
/* found a quote, we're gonna start copying */
|
/* found a quote, we're gonna start copying */
|
||||||
case 0x0027/*'\''*/:
|
case 0x0027/*'\''*/:
|
||||||
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
|
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
if(src->prevStrength == UCOL_TOK_UNSET){
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
return NULL;
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
// enabling rules to start with a non-token character a < b
|
return NULL;
|
||||||
// newStrength = UCOL_TOK_RESET;
|
// enabling rules to start with a non-token character a < b
|
||||||
|
// newStrength = UCOL_TOK_RESET;
|
||||||
|
}else{
|
||||||
|
newStrength = src->prevStrength;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
inQuote = TRUE;
|
inQuote = TRUE;
|
||||||
@ -1036,9 +1056,13 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (newStrength == UCOL_TOK_UNSET) {
|
if (newStrength == UCOL_TOK_UNSET) {
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
if(src->prevStrength == UCOL_TOK_UNSET){
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
return NULL;
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
|
return NULL;
|
||||||
|
}else{
|
||||||
|
newStrength = src->prevStrength;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
|
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
|
||||||
@ -1056,6 +1080,11 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
|
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
|
||||||
}
|
}
|
||||||
src->parsedToken.charsLen++;
|
src->parsedToken.charsLen++;
|
||||||
|
if(src->prevStrength != UCOL_TOK_UNSET){
|
||||||
|
U16_NEXT(0, src->current, src->end, codepoint);
|
||||||
|
src->parsedToken.charsLen+= U16_LENGTH(codepoint) - 1;
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
if(newExtensionLen == 0) {
|
if(newExtensionLen == 0) {
|
||||||
extensionOffset = (uint32_t)(src->current - src->source);
|
extensionOffset = (uint32_t)(src->current - src->source);
|
||||||
@ -1069,6 +1098,10 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if(wasInQuote) {
|
if(wasInQuote) {
|
||||||
|
if(src->prevStrength != UCOL_TOK_UNSET && !inQuote){
|
||||||
|
src->current++;
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
if(ch != 0x27) {
|
if(ch != 0x27) {
|
||||||
if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
|
if(inQuote || !uprv_isRuleWhiteSpace(ch)) {
|
||||||
ucol_tok_addToExtraCurrent(src, &ch, 1, status);
|
ucol_tok_addToExtraCurrent(src, &ch, 1, status);
|
||||||
@ -1826,6 +1859,7 @@ void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint
|
|||||||
src->parsedToken.flags = 0;
|
src->parsedToken.flags = 0;
|
||||||
src->parsedToken.strength = UCOL_TOK_UNSET;
|
src->parsedToken.strength = UCOL_TOK_UNSET;
|
||||||
src->buildCCTabFlag = FALSE;
|
src->buildCCTabFlag = FALSE;
|
||||||
|
src->prevStrength = UCOL_TOK_UNSET;
|
||||||
|
|
||||||
if(U_FAILURE(*status)) {
|
if(U_FAILURE(*status)) {
|
||||||
return;
|
return;
|
||||||
@ -1915,4 +1949,3 @@ void ucol_tok_closeTokenList(UColTokenParser *src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||||
|
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/*
|
/*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*
|
*
|
||||||
* Copyright (C) 2001-2008, International Business Machines
|
* Copyright (C) 2001-2010, International Business Machines
|
||||||
* Corporation and others. All Rights Reserved.
|
* Corporation and others. All Rights Reserved.
|
||||||
*
|
*
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
@ -123,6 +123,7 @@ typedef struct {
|
|||||||
USet *copySet;
|
USet *copySet;
|
||||||
USet *removeSet;
|
USet *removeSet;
|
||||||
UBool buildCCTabFlag; /* Tailoring rule requirs building combining class table. */
|
UBool buildCCTabFlag; /* Tailoring rule requirs building combining class table. */
|
||||||
|
uint32_t prevStrength;
|
||||||
} UColTokenParser;
|
} UColTokenParser;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
@ -5433,6 +5433,57 @@ static void TestHiragana(void) {
|
|||||||
ucol_close(ucol);
|
ucol_close(ucol);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Left-hand strings for TestSameStrengthList(): entry i is compared against
 * testSameStrengthTargetCases[i] and must yield sameStrengthResults[i].
 * Code units are given as hex UTF-16 values; unlisted trailing elements of
 * each row are zero-initialized.
 */
static const UChar testSameStrengthSourceCases[][MAX_TOKEN_LEN] = {
    {0x0061},                 /* "a"   */
    {0x0061},                 /* "a"   */
    {0x006c, 0x0061},         /* "la"  */
    {0x0061, 0x0061, 0x0061}, /* "aaa" */
    {0x0062}                  /* "b"   */
};
|
||||||
|
|
||||||
|
/*
 * Right-hand strings for TestSameStrengthList(): entry i is compared against
 * testSameStrengthSourceCases[i] and must yield sameStrengthResults[i].
 * Code units are given as hex UTF-16 values; unlisted trailing elements of
 * each row are zero-initialized.
 */
static const UChar testSameStrengthTargetCases[][MAX_TOKEN_LEN] = {
    {0x0031},                 /* "1"   */
    {0x006d},                 /* "m"   */
    {0x006b, 0x0062},         /* "kb"  */
    {0x0031, 0x0032, 0x0033}, /* "123" */
    {0x007a}                  /* "z"   */
};
|
||||||
|
|
||||||
|
const static UCollationResult sameStrengthResults[] = {
|
||||||
|
UCOL_EQUAL,
|
||||||
|
UCOL_LESS,
|
||||||
|
UCOL_LESS,
|
||||||
|
UCOL_EQUAL,
|
||||||
|
UCOL_LESS
|
||||||
|
};
|
||||||
|
|
||||||
|
static void TestSameStrengthList(void)
|
||||||
|
{
|
||||||
|
|
||||||
|
int32_t i;
|
||||||
|
UParseError error;
|
||||||
|
UErrorCode status = U_ZERO_ERROR;
|
||||||
|
UCollator *myCollation;
|
||||||
|
char srules[500] = "&a<*bcd &b<<*klm &k<<<*xyz &a=*123";
|
||||||
|
UChar rules[500];
|
||||||
|
uint32_t length = 0;
|
||||||
|
|
||||||
|
u_strFromUTF8(rules, 500, &length, srules, strlen(srules), &status);
|
||||||
|
myCollation = ucol_openRules(rules, length, UCOL_ON, UCOL_TERTIARY, &error, &status);
|
||||||
|
if(U_FAILURE(status)){
|
||||||
|
log_err_status(status, "ERROR: in creation of rule based collator: %s\n", myErrorName(status));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
log_verbose("Testing the <<* syntax\n");
|
||||||
|
/*ucol_setAttribute(myCollation, UCOL_NORMALIZATION_MODE, UCOL_ON, &status);
|
||||||
|
ucol_setStrength(myCollation, UCOL_TERTIARY);*/
|
||||||
|
for (i = 0; i < 5 ; i++)
|
||||||
|
{
|
||||||
|
doTest(myCollation, testSameStrengthSourceCases[i], testSameStrengthTargetCases[i], sameStrengthResults[i]);
|
||||||
|
}
|
||||||
|
ucol_close(myCollation);
|
||||||
|
}
|
||||||
|
|
||||||
#define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x)
|
#define TEST(x) addTest(root, &x, "tscoll/cmsccoll/" # x)
|
||||||
|
|
||||||
void addMiscCollTest(TestNode** root)
|
void addMiscCollTest(TestNode** root)
|
||||||
@ -5508,6 +5559,7 @@ void addMiscCollTest(TestNode** root)
|
|||||||
TEST(TestTailor6179);
|
TEST(TestTailor6179);
|
||||||
TEST(TestUCAPrecontext);
|
TEST(TestUCAPrecontext);
|
||||||
TEST(TestOutOfBuffer5468);
|
TEST(TestOutOfBuffer5468);
|
||||||
|
TEST(TestSameStrengthList);
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||||
|
Loading…
Reference in New Issue
Block a user