ICU-96 parsing update: use an implicit &[top] reset if there is no reset, and parse quoted strings
X-SVN-Rev: 4054
This commit is contained in:
parent
a072ac7f00
commit
2d87db9275
@ -173,11 +173,14 @@ ucol_openRules( const UChar *rules,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* do we need to normalize the string beforehand? */
|
||||
|
||||
src.source = rules;
|
||||
src.current = rules;
|
||||
src.end = rules+rulesLength;
|
||||
/*src.source = rules;*/
|
||||
src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
|
||||
uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
|
||||
src.current = src.source;
|
||||
src.end = src.source+rulesLength;
|
||||
src.sourceCurrent = src.source;
|
||||
src.extraCurrent = src.end;
|
||||
src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
|
||||
src.UCA = UCA;
|
||||
src.invUCA = ucol_initInverseUCA(status);
|
||||
src.resultLen = 0;
|
||||
|
@ -291,13 +291,13 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
|
||||
}
|
||||
|
||||
if(strength == UCOL_SECONDARY) { /* similar as simple */
|
||||
if(low > UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
|
||||
if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
|
||||
low = UCOL_COMMON_TOP2<<24;
|
||||
}
|
||||
if(high > UCOL_COMMON_BOT2<<24 && high < UCOL_COMMON_TOP2<<24) {
|
||||
high = UCOL_COMMON_TOP2<<24;
|
||||
}
|
||||
if(low <= UCOL_COMMON_BOT2<<24) {
|
||||
if(low < UCOL_COMMON_BOT2<<24) {
|
||||
g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, g->ranges);
|
||||
g->current = UCOL_COMMON_BOT2;
|
||||
return g->current;
|
||||
|
@ -324,6 +324,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
|
||||
UBool inChars = TRUE;
|
||||
UBool inQuote = FALSE;
|
||||
UBool wasInQuote = FALSE;
|
||||
UChar *optionEnd = NULL;
|
||||
|
||||
newStrength = UCOL_TOK_UNSET;
|
||||
@ -339,12 +340,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
} else {
|
||||
if ((newCharsLen == 0) || inChars) {
|
||||
if(newCharsLen == 0) {
|
||||
charsOffset = src->current - src->source;
|
||||
charsOffset = src->extraCurrent - src->source;
|
||||
}
|
||||
newCharsLen++;
|
||||
} else {
|
||||
if(newExtensionsLen == 0) {
|
||||
extensionOffset = src->current - src->source;
|
||||
extensionOffset = src->extraCurrent - src->source;
|
||||
}
|
||||
newExtensionsLen++;
|
||||
}
|
||||
@ -357,6 +358,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
goto EndOfLoop;
|
||||
}
|
||||
|
||||
/* if we start with strength, we'll reset to top */
|
||||
if(lastToken == NULL) {
|
||||
top = TRUE;
|
||||
newStrength = UCOL_TOK_RESET;
|
||||
goto EndOfLoop;
|
||||
}
|
||||
newStrength = UCOL_IDENTICAL;
|
||||
break;
|
||||
|
||||
@ -365,6 +372,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
goto EndOfLoop;
|
||||
}
|
||||
|
||||
/* if we start with strength, we'll reset to top */
|
||||
if(lastToken == NULL) {
|
||||
top = TRUE;
|
||||
newStrength = UCOL_TOK_RESET;
|
||||
goto EndOfLoop;
|
||||
}
|
||||
newStrength = UCOL_TERTIARY;
|
||||
break;
|
||||
|
||||
@ -373,6 +386,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
goto EndOfLoop;
|
||||
}
|
||||
|
||||
/* if we start with strength, we'll reset to top */
|
||||
if(lastToken == NULL) {
|
||||
top = TRUE;
|
||||
newStrength = UCOL_TOK_RESET;
|
||||
goto EndOfLoop;
|
||||
}
|
||||
newStrength = UCOL_SECONDARY;
|
||||
break;
|
||||
|
||||
@ -381,6 +400,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
goto EndOfLoop;
|
||||
}
|
||||
|
||||
/* if we start with strength, we'll reset to top */
|
||||
if(lastToken == NULL) {
|
||||
top = TRUE;
|
||||
newStrength = UCOL_TOK_RESET;
|
||||
goto EndOfLoop;
|
||||
}
|
||||
/* before this, do a scan to verify whether this is */
|
||||
/* another strength */
|
||||
if(*(src->current+1) == 0x003C) {
|
||||
@ -436,22 +461,30 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
inChars = FALSE;
|
||||
break;
|
||||
|
||||
/* found a quote, we're going to start copying */
|
||||
case 0x0027/*'\''*/:
|
||||
inQuote = TRUE;
|
||||
ch = *(++(src->current)); /*pattern[++index]; */
|
||||
wasInQuote = TRUE;
|
||||
|
||||
if (newCharsLen == 0) {
|
||||
charsOffset = src->current - src->source;
|
||||
charsOffset = src->extraCurrent - src->source;
|
||||
newCharsLen++;
|
||||
} else if (inChars) {
|
||||
if(newCharsLen == 0) {
|
||||
charsOffset = src->current - src->source;
|
||||
} else if (inChars) { /* we're reading some chars */
|
||||
charsOffset = src->extraCurrent - src->source;
|
||||
if(newCharsLen != 0) {
|
||||
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
|
||||
src->extraCurrent += newCharsLen;
|
||||
}
|
||||
newCharsLen++;
|
||||
} else {
|
||||
if(newExtensionsLen != 0) {
|
||||
uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar));
|
||||
src->extraCurrent += newExtensionsLen;
|
||||
}
|
||||
newExtensionsLen++;
|
||||
}
|
||||
|
||||
ch = *(++(src->current)); /*pattern[++index]; */
|
||||
break;
|
||||
|
||||
/* '@' is french only if the strength is not currently set */
|
||||
@ -491,10 +524,20 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
}
|
||||
}
|
||||
|
||||
if(wasInQuote) {
|
||||
if(ch != 0x27) {
|
||||
*src->extraCurrent++ = ch;
|
||||
}
|
||||
if(src->extraCurrent == src->extraEnd) {
|
||||
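/* TODO: reallocate - the extra buffer is full at this point, and nothing here yet stops the next write through extraCurrent from running past extraEnd */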
|
||||
}
|
||||
}
|
||||
|
||||
src->current++;
|
||||
}
|
||||
|
||||
EndOfLoop:
|
||||
wasInQuote = FALSE;
|
||||
if (newStrength == UCOL_TOK_UNSET) {
|
||||
return 0;
|
||||
}
|
||||
@ -721,6 +764,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
|
||||
src->resultLen++;
|
||||
uhash_put(uchars2tokens, sourceToken, sourceToken, status);
|
||||
} else { /* reset to something already in rules */
|
||||
top = FALSE;
|
||||
}
|
||||
}
|
||||
/* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
|
||||
@ -739,5 +783,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) {
|
||||
/*
 * Releases the resources associated with a token parser once its token
 * list is no longer needed: closes the uchars2tokens hash table and frees
 * src->lh and src->source.
 *
 * src->source is the buffer that ucol_openRules allocates with uprv_malloc
 * to hold a copy of the rules plus extra space, so it is released here.
 * NOTE(review): uchars2tokens is not reached through src; presumably it is
 * file-level state shared by all parsers - confirm.
 */
void ucol_tok_closeTokenList(UColTokenParser *src) {
|
||||
uhash_close(uchars2tokens);
|
||||
uprv_free(src->lh);
|
||||
uprv_free(src->source); /* rules copy + extra space allocated in ucol_openRules */
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,9 @@
|
||||
#define UCOL_RESET_TOP_VALUE 0x9F000303
|
||||
#define UCOL_NEXT_TOP_VALUE 0xD0000303
|
||||
|
||||
/* this is space for the extra strings that need to be unquoted */
|
||||
/* during the parsing of the rules */
|
||||
#define UCOL_TOK_EXTRA_RULE_SPACE_SIZE 1024
|
||||
typedef struct UColToken UColToken;
|
||||
|
||||
typedef struct {
|
||||
@ -53,9 +56,12 @@ struct UColToken {
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
const UChar *source;
|
||||
const UChar *end;
|
||||
const UChar *current;
|
||||
UChar *source;
|
||||
UChar *end;
|
||||
UChar *current;
|
||||
UChar *sourceCurrent;
|
||||
UChar *extraCurrent;
|
||||
UChar *extraEnd;
|
||||
const InverseTableHeader *invUCA;
|
||||
const UCollator *UCA;
|
||||
UCATableHeader *image;
|
||||
|
@ -116,7 +116,7 @@ const static int32_t results[TESTLOCALES][TOTALTESTSET] = {
|
||||
/* new table collation with rules "& Question-mark ; ? & Hash-mark ; # & Ampersand ; '&' " loop to TOTALTESTSET */
|
||||
{ 23, 24, 25, 22, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 21, 14, 1, 11, 2, 3, 4, 5, 19, 20, 6, 8, 10, 7, 29 },
|
||||
/* analogous to Japanese rules " & aa ; a- & ee ; e- & ii ; i- & oo ; o- & uu ; u- " */ /* loop to TOTALTESTSET */
|
||||
{ 19, 22, 21, 23, 24, 25, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
|
||||
{ 19, 22, 21, 24, 23, 25, 12, 13, 9, 0, 17, 16, 28, 26, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
|
||||
};
|
||||
|
||||
static UChar*
|
||||
|
Loading…
Reference in New Issue
Block a user