ICU-96 parsing update: use an implicit &[top] reset when the rules do not start with a reset, and support parsing of quoted strings

X-SVN-Rev: 4054
This commit is contained in:
Vladimir Weinstein 2001-03-14 00:12:46 +00:00
parent a072ac7f00
commit 2d87db9275
5 changed files with 72 additions and 18 deletions

View File

@ -173,11 +173,14 @@ ucol_openRules( const UChar *rules,
return 0;
}
/* do we need to normalize the string beforehand? */
src.source = rules;
src.current = rules;
src.end = rules+rulesLength;
/*src.source = rules;*/
src.source = (UChar *)uprv_malloc((rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE)*sizeof(UChar));
uprv_memcpy(src.source, rules, rulesLength*sizeof(UChar));
src.current = src.source;
src.end = src.source+rulesLength;
src.sourceCurrent = src.source;
src.extraCurrent = src.end;
src.extraEnd = src.end+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
src.UCA = UCA;
src.invUCA = ucol_initInverseUCA(status);
src.resultLen = 0;

View File

@ -291,13 +291,13 @@ U_CFUNC uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_
}
if(strength == UCOL_SECONDARY) { /* similar as simple */
if(low > UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
if(low >= UCOL_COMMON_BOT2<<24 && low < UCOL_COMMON_TOP2<<24) {
low = UCOL_COMMON_TOP2<<24;
}
if(high > UCOL_COMMON_BOT2<<24 && high < UCOL_COMMON_TOP2<<24) {
high = UCOL_COMMON_TOP2<<24;
}
if(low <= UCOL_COMMON_BOT2<<24) {
if(low < UCOL_COMMON_BOT2<<24) {
g->noOfRanges = ucol_allocWeights(UCOL_COMMON_TOP2<<24, high, count, g->ranges);
g->current = UCOL_COMMON_BOT2;
return g->current;

View File

@ -324,6 +324,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
UBool inChars = TRUE;
UBool inQuote = FALSE;
UBool wasInQuote = FALSE;
UChar *optionEnd = NULL;
newStrength = UCOL_TOK_UNSET;
@ -339,12 +340,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
} else {
if ((newCharsLen == 0) || inChars) {
if(newCharsLen == 0) {
charsOffset = src->current - src->source;
charsOffset = src->extraCurrent - src->source;
}
newCharsLen++;
} else {
if(newExtensionsLen == 0) {
extensionOffset = src->current - src->source;
extensionOffset = src->extraCurrent - src->source;
}
newExtensionsLen++;
}
@ -357,6 +358,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
break;
@ -365,6 +372,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_TERTIARY;
break;
@ -373,6 +386,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
break;
@ -381,6 +400,12 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(lastToken == NULL) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
/* before this, do a scan to verify whether this is */
/* another strength */
if(*(src->current+1) == 0x003C) {
@ -436,22 +461,30 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
inChars = FALSE;
break;
/* found a quote, we're gonna start copying */
case 0x0027/*'\''*/:
inQuote = TRUE;
ch = *(++(src->current)); /*pattern[++index]; */
wasInQuote = TRUE;
if (newCharsLen == 0) {
charsOffset = src->current - src->source;
charsOffset = src->extraCurrent - src->source;
newCharsLen++;
} else if (inChars) {
if(newCharsLen == 0) {
charsOffset = src->current - src->source;
} else if (inChars) { /* we're reading some chars */
charsOffset = src->extraCurrent - src->source;
if(newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
} else {
if(newExtensionsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newExtensionsLen, newExtensionsLen*sizeof(UChar));
src->extraCurrent += newExtensionsLen;
}
newExtensionsLen++;
}
ch = *(++(src->current)); /*pattern[++index]; */
break;
/* '@' is french only if the strength is not currently set */
@ -491,10 +524,20 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
}
}
if(wasInQuote) {
if(ch != 0x27) {
*src->extraCurrent++ = ch;
}
if(src->extraCurrent == src->extraEnd) {
/* reallocate */
}
}
src->current++;
}
EndOfLoop:
wasInQuote = FALSE;
if (newStrength == UCOL_TOK_UNSET) {
return 0;
}
@ -721,6 +764,7 @@ uint32_t ucol_uprv_tok_assembleTokenList(UColTokenParser *src, UErrorCode *statu
src->resultLen++;
uhash_put(uchars2tokens, sourceToken, sourceToken, status);
} else { /* reset to something already in rules */
top = FALSE;
}
}
/* 7 After all this, set LAST to point to sourceToken, and goto step 3. */
@ -739,5 +783,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UErrorCode *status) {
/*
 * Releases the resources associated with a token parser once the token list
 * is no longer needed.
 *
 * Closes the uchars2tokens hash (declared outside this function) and frees
 * the parser's lh and source buffers.
 *
 * src - parser whose buffers are released; it is dereferenced without a NULL
 *       check. The struct itself is not freed here.
 *
 * After this call src->lh and src->source, and any pointer derived from
 * src->source, refer to freed memory and must not be used.
 */
void ucol_tok_closeTokenList(UColTokenParser *src) {
/* NOTE(review): the hash is closed before the buffers are freed; whether its */
/* deleters touch those buffers is not visible here, so keep this order. */
uhash_close(uchars2tokens);
uprv_free(src->lh);
/* src->source is a heap copy of the rules (allocated with uprv_malloc in */
/* ucol_openRules), so the parser is responsible for releasing it. */
uprv_free(src->source);
}

View File

@ -9,6 +9,9 @@
#define UCOL_RESET_TOP_VALUE 0x9F000303
#define UCOL_NEXT_TOP_VALUE 0xD0000303
/* Extra space, in UChars, reserved after the copied rules to hold the */
/* unquoted copies of quoted strings found while parsing the rules. */
#define UCOL_TOK_EXTRA_RULE_SPACE_SIZE 1024
typedef struct UColToken UColToken;
typedef struct {
@ -53,9 +56,12 @@ struct UColToken {
};
typedef struct {
const UChar *source;
const UChar *end;
const UChar *current;
UChar *source;
UChar *end;
UChar *current;
UChar *sourceCurrent;
UChar *extraCurrent;
UChar *extraEnd;
const InverseTableHeader *invUCA;
const UCollator *UCA;
UCATableHeader *image;

View File

@ -116,7 +116,7 @@ const static int32_t results[TESTLOCALES][TOTALTESTSET] = {
/* new table collation with rules "& Question-mark ; ? & Hash-mark ; # & Ampersand ; '&' " loop to TOTALTESTSET */
{ 23, 24, 25, 22, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 21, 14, 1, 11, 2, 3, 4, 5, 19, 20, 6, 8, 10, 7, 29 },
/* analogous to Japanese rules " & aa ; a- & ee ; e- & ii ; i- & oo ; o- & uu ; u- " */ /* loop to TOTALTESTSET */
{ 19, 22, 21, 23, 24, 25, 12, 13, 9, 0, 17, 16, 26, 28, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
{ 19, 22, 21, 24, 23, 25, 12, 13, 9, 0, 17, 16, 28, 26, 27, 15, 18, 14, 1, 11, 2, 3, 4, 5, 20, 6, 8, 10, 7, 29 }
};
static UChar*