diff --git a/icu4c/source/i18n/ucol.cpp b/icu4c/source/i18n/ucol.cpp index cd6287d2ee..6b686d99ae 100644 --- a/icu4c/source/i18n/ucol.cpp +++ b/icu4c/source/i18n/ucol.cpp @@ -173,11 +173,14 @@ ucol_openRules( const UChar *rules, return 0; } - /* do we need to normalize the string beforehand? */ - - src.source = rules; - src.current = rules; - src.end = rules+rulesLength; + /*src.source = rules;*/ + src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar)); + uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar)); + src.current = src.source; + src.end = src.source+rulesLength; + src.sourceCurrent = src.source; + src.extraCurrent = src.end; + src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE; src.UCA = UCA; src.invUCA = ucol_initInverseUCA(status); src.resultLen = 0; diff --git a/icu4c/source/i18n/ucol_bld.cpp b/icu4c/source/i18n/ucol_bld.cpp index c2d425f970..7301348dd7 100644 --- a/icu4c/source/i18n/ucol_bld.cpp +++ b/icu4c/source/i18n/ucol_bld.cpp @@ -291,13 +291,13 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_ } if(strength == UCOL_SECONDARY) { /* similar as simple */ - if(low > UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) { + if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) { low = UCOL_COMMON_TOP2<<24; } if(high > UCOL_COMMON_BOT2<<24 && high < UCOL_COMMON_TOP2<<24) { high = UCOL_COMMON_TOP2<<24; } - if(low <= UCOL_COMMON_BOT2<<24) { + if(low < UCOL_COMMON_BOT2<<24) { g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, g->ranges); g->current = UCOL_COMMON_BOT2; return g->current; diff --git a/icu4c/source/i18n/ucol_tok.c b/icu4c/source/i18n/ucol_tok.c index cbd6daf1fc..7971562c63 100644 --- a/icu4c/source/i18n/ucol_tok.c +++ b/icu4c/source/i18n/ucol_tok.c @@ -324,6 +324,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu UBool inChars = TRUE; UBool inQuote = FALSE; + UBool wasInQuote = FALSE; UChar *optionEnd = NULL; newStrength = UCOL_TOK_UNSET; @@ -339,12 +340,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu } else { if ((newCharsLen == 0) || inChars) { if(newCharsLen == 0) { - charsOffset = src->current - src->source; + charsOffset = src->extraCurrent - src->source; } newCharsLen++; } else { if(newExtensionsLen == 0) { - extensionOffset = src->current - src->source; + extensionOffset = src->extraCurrent - src->source; } newExtensionsLen++; } @@ -357,6 +358,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu goto EndOfLoop; } + /* if we start with strength, we'll reset to top */ + if(lastToken == NULL) { + top = TRUE; + newStrength = UCOL_TOK_RESET; + goto EndOfLoop; + } newStrength = UCOL_IDENTICAL; break; @@ -365,6 +372,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu goto EndOfLoop; } + /* if we start with strength, we'll reset to top */ + if(lastToken == NULL) { + top = TRUE; + newStrength = UCOL_TOK_RESET; + goto EndOfLoop; + } newStrength = UCOL_TERTIARY; break; @@ -373,6 +386,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu goto EndOfLoop; } + /* if we start with strength, we'll reset to top */ + if(lastToken == NULL) { + top = TRUE; + newStrength = UCOL_TOK_RESET; + goto EndOfLoop; + } newStrength = UCOL_SECONDARY; break; @@ -381,6 +400,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu goto EndOfLoop; } + /* if we start with strength, we'll reset to top */ + if(lastToken == NULL) { + top = TRUE; + newStrength = UCOL_TOK_RESET; + goto EndOfLoop; + } /* before this, do a scan to verify whether this is */ /* another strength */ if(*(src->current+1) == 0x003C) { @@ -436,22 +461,30 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu inChars = FALSE; break; + /* found a quote, we're gonna start copying */ case 0x0027/*'\''*/: inQuote = TRUE; - ch = *(++(src->current)); /*pattern[++index]; */ + wasInQuote = TRUE; if (newCharsLen == 0) { - charsOffset = src->current - src->source; + charsOffset = src->extraCurrent - src->source; newCharsLen++; - } else if (inChars) { - if(newCharsLen == 0) { - charsOffset = src->current - src->source; + } else if (inChars) { /* we're reading some chars */ + charsOffset = src->extraCurrent - src->source; + if(newCharsLen != 0) { + uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar)); + src->extraCurrent += newCharsLen; } newCharsLen++; } else { + if(newExtensionsLen != 0) { + uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar)); + src->extraCurrent += newExtensionsLen; + } newExtensionsLen++; } + ch = *(++(src->current)); /*pattern[++index]; */ break; /* '@' is french only if the strength is not currently set */ @@ -491,10 +524,20 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu } } + if(wasInQuote) { + if(ch != 0x27) { + *src->extraCurrent++ = ch; + } + if(src->extraCurrent == src->extraEnd) { + /* reallocate */ + } + } + src->current++; } EndOfLoop: + wasInQuote = FALSE; if (newStrength == UCOL_TOK_UNSET) { return 0; } @@ -721,6 +764,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu src->resultLen++; uhash_put(uchars2tokens, sourceToken, sourceToken, status); } else { /* reset to something already in rules */ + top = FALSE; } } /* 7 After all this, set LAST to point to sourceToken, and goto step 3. */ @@ -739,5 +783,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) { void ucol_tok_closeTokenList(UColTokenParser *src) { uhash_close(uchars2tokens); uprv_free(src->lh); + uprv_free(src->source); } diff --git a/icu4c/source/i18n/ucol_tok.h b/icu4c/source/i18n/ucol_tok.h index 974a868cbe..41e7ed930c 100644 --- a/icu4c/source/i18n/ucol_tok.h +++ b/icu4c/source/i18n/ucol_tok.h @@ -9,6 +9,9 @@ #define UCOL_RESET_TOP_VALUE 0x9F000303 #define UCOL_NEXT_TOP_VALUE 0xD0000303 +/* this is space for the extra strings that need to be unquoted */ +/* during the parsing of the rules */ +#define UCOL_TOK_EXTRA_RULE_SPACE_SIZE 1024 typedef struct UColToken UColToken; typedef struct { @@ -53,9 +56,12 @@ struct UColToken { }; typedef struct { - const UChar *source; - const UChar *end; - const UChar *current; + UChar *source; + UChar *end; + UChar *current; + UChar *sourceCurrent; + UChar *extraCurrent; + UChar *extraEnd; const InverseTableHeader *invUCA; const UCollator *UCA; UCATableHeader *image; diff --git a/icu4c/source/test/cintltst/cg7coll.c b/icu4c/source/test/cintltst/cg7coll.c index b5649467f0..cff1767f8e 100644 --- a/icu4c/source/test/cintltst/cg7coll.c +++ b/icu4c/source/test/cintltst/cg7coll.c @@ -116,7 +116,7 @@ const static int32_t results[TESTLOCALES][TOTALTESTSET] = { /* new table collation with rules "& Question-mark ; ? & Hash-mark ; # & Ampersand ; '&' " loop to TOTALTESTSET */ { 23, 24, 25, 22, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 21, 14, 1, 11, 2, 3, 4, 5, 19, 20, 6, 8, 10, 7, 29 }, /* analogous to Japanese rules " & aa ; a- & ee ; e- & ii ; i- & oo ; o- & uu ; u- " */ /* loop to TOTALTESTSET */ - { 19, 22, 21, 23, 24, 25, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 } + { 19, 22, 21, 24, 23, 25, 12, 13, 9, 0, 17, 16, 28, 26, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 } }; static UChar*