ICU-1997 use uprv_isRuleWhiteSpace for parsing collation rules

X-SVN-Rev: 9354
This commit is contained in:
Vladimir Weinstein 2002-07-25 21:59:19 +00:00
parent a151553d16
commit 6eea49d8e7
3 changed files with 296 additions and 300 deletions

View File

@ -434,7 +434,7 @@ ja {
"&\u309A = \u309A" "&\u309A = \u309A"
// Equaling normal and halfwidth/fullwidth characters // Equaling normal and halfwidth/fullwidth characters
"&' '=\u3000" // IDEOGRAPHIC SPACE "&' '='\u3000'" // IDEOGRAPHIC SPACE
"&'\u0020' = '\uFFE3'" // SPACE "&'\u0020' = '\uFFE3'" // SPACE
"&'\u0021' = '\uFF01'" // EXCLAMATION MARK "&'\u0021' = '\uFF01'" // EXCLAMATION MARK
"&'\u0022' = '\uFF02'" // QUOTATION MARK "&'\u0022' = '\uFF02'" // QUOTATION MARK

View File

@ -99,14 +99,14 @@
= ֯ = ֯
= ֽ = ֽ
= ׄ = ׄ
= ۝ = '۝'
= ۞ = '۞'
= ۟ = ۟
= ۠ = ۠
= ۪ = ۪
= ۫ = ۫
= ۬ = ۬
= ܏ = '܏'
= ๎ = ๎
= ༘ = ༘
= ༙ = ༙
@ -117,30 +117,30 @@
= ྆ = ྆
= ྇ = ྇
= ࿆ = ࿆
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
=  = ''
=  = ''
=  = ''
=  = ''
= 𝅥 = 𝅥
= 𝅦 = 𝅦
= 𝅧 = 𝅧
@ -555,23 +555,23 @@
< '\u000C' < '\u000C'
< '\u000D' < '\u000D'
< '\u0085' < '\u0085'
< < '\u2028'
< < ''
< '\u0020' < '\u0020'
<<<   <<< ' '
<<< <<< ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
= = ''
<<<   <<< ' '
= = ''
= = ''
< < ''
< ـ < 'ـ'
< '`' < '`'
<<< <<<
< ´ < ´

View File

@ -23,6 +23,7 @@
#include "ucol_tok.h" #include "ucol_tok.h"
#include "cmemory.h" #include "cmemory.h"
#include "uprops.h"
U_CDECL_BEGIN U_CDECL_BEGIN
static int32_t U_EXPORT2 U_CALLCONV static int32_t U_EXPORT2 U_CALLCONV
@ -712,274 +713,269 @@ ucol_tok_parseNextToken(UColTokenParser *src,
} }
} }
}else { }else {
/* Sets the strength for this entry */ if(!uprv_isRuleWhiteSpace(ch)) {
switch (ch) { /* Sets the strength for this entry */
case 0x003D/*'='*/ : switch (ch) {
if (newStrength != UCOL_TOK_UNSET) { case 0x003D/*'='*/ :
goto EndOfLoop; if (newStrength != UCOL_TOK_UNSET) {
} goto EndOfLoop;
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
break;
case 0x002C/*','*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_TERTIARY;
break;
case 0x003B/*';'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
break;
case 0x003C/*'<'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
/* before this, do a scan to verify whether this is */
/* another strength */
if(*(src->current+1) == 0x003C) {
src->current++;
if(*(src->current+1) == 0x003C) {
src->current++; /* three in a row! */
newStrength = UCOL_TERTIARY;
} else { /* two in a row */
newStrength = UCOL_SECONDARY;
} }
} else { /* just one */
newStrength = UCOL_PRIMARY;
}
break;
case 0x0026/*'&'*/: /* if we start with strength, we'll reset to top */
if (newStrength != UCOL_TOK_UNSET) { if(startOfRules == TRUE) {
/**/ top = TRUE;
goto EndOfLoop; newStrength = UCOL_TOK_RESET;
} goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
break;
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */ case 0x002C/*','*/:
break; if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
case 0x005b/*'['*/: /* if we start with strength, we'll reset to top */
/* options - read an option, analyze it */ if(startOfRules == TRUE) {
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) { top = TRUE;
uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status); newStrength = UCOL_TOK_RESET;
src->current = optionEnd; goto EndOfLoop;
if(U_SUCCESS(*status)) { }
if(result & UCOL_TOK_TOP) { newStrength = UCOL_TERTIARY;
if(newStrength == UCOL_TOK_RESET) { break;
top = TRUE;
charsOffset = (uint32_t)(src->extraCurrent - src->source); case 0x003B/*';'*/:
*src->extraCurrent++ = 0xFFFE; if (newStrength != UCOL_TOK_UNSET) {
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16); goto EndOfLoop;
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF); }
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
newCharsLen = 3; /* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
break;
case 0x003C/*'<'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
top = TRUE;
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
/* before this, do a scan to verify whether this is */
/* another strength */
if(*(src->current+1) == 0x003C) {
src->current++;
if(*(src->current+1) == 0x003C) {
src->current++; /* three in a row! */
newStrength = UCOL_TERTIARY;
} else { /* two in a row */
newStrength = UCOL_SECONDARY;
}
} else { /* just one */
newStrength = UCOL_PRIMARY;
}
break;
case 0x0026/*'&'*/:
if (newStrength != UCOL_TOK_UNSET) {
/**/
goto EndOfLoop;
}
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
break;
case 0x005b/*'['*/:
/* options - read an option, analyze it */
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status);
src->current = optionEnd;
if(U_SUCCESS(*status)) {
if(result & UCOL_TOK_TOP) {
if(newStrength == UCOL_TOK_RESET) {
top = TRUE;
charsOffset = (uint32_t)(src->extraCurrent - src->source);
*src->extraCurrent++ = 0xFFFE;
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
newCharsLen = 3;
} else {
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
newCharsLen = 5;
}
if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
*src->extraCurrent++ = 0x002d;
*src->extraCurrent++ = before;
newCharsLen+=2;
}
src->current++;
goto EndOfLoop;
} else { } else {
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16); *status = U_INVALID_FORMAT_ERROR;
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF); syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
newCharsLen = 5;
}
if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
*src->extraCurrent++ = 0x002d;
*src->extraCurrent++ = before;
newCharsLen+=2;
} }
} else if(result & UCOL_TOK_VARIABLE_TOP) {
if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
variableTop = TRUE;
charsOffset = (uint32_t)(src->extraCurrent - src->source);
newCharsLen = 1;
*src->extraCurrent++ = 0xFFFF;
src->current++;
goto EndOfLoop;
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
}
} else if (result & UCOL_TOK_BEFORE){
if(newStrength == UCOL_TOK_RESET) {
before = result & UCOL_TOK_BEFORE;
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
src->current++; }
goto EndOfLoop; }
} else { } else {
*status = U_INVALID_FORMAT_ERROR; syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); return NULL;
} }
} else if(result & UCOL_TOK_VARIABLE_TOP) { }
if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) { break;
variableTop = TRUE; case 0x0021/*! skip java thai modifier reordering*/:
charsOffset = (uint32_t)(src->extraCurrent - src->source); break;
newCharsLen = 1; case 0x002F/*'/'*/:
*src->extraCurrent++ = 0xFFFF; wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
src->current++; inChars = FALSE; /* we're now processing expansion */
goto EndOfLoop; break;
} else { case 0x005C /* back slash for escaped chars */:
*status = U_INVALID_FORMAT_ERROR; isEscaped = TRUE;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); break;
} /* found a quote, we're gonna start copying */
} else if (result & UCOL_TOK_BEFORE){ case 0x0027/*'\''*/:
if(newStrength == UCOL_TOK_RESET) { if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
before = result & UCOL_TOK_BEFORE; /*
} else { enabling rules to start with a non-token character a < b
*status = U_INVALID_FORMAT_ERROR; *status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
*/
newStrength = UCOL_TOK_RESET;
}
} inQuote = TRUE;
}
} else { if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
} else { /* we're doing an expansion */
if(wasInQuote == FALSE) {
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newExtensionLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
src->extraCurrent += newExtensionLen;
}
newExtensionLen++;
}
wasInQuote = TRUE;
ch = *(++(src->current));
if(ch == 0x0027) { /* copy the double quote */
*src->extraCurrent++ = ch;
inQuote = FALSE;
}
break;
/* '@' is french only if the strength is not currently set */
/* if it is, it's just a regular character in collation rules */
case 0x0040/*'@'*/:
if (newStrength == UCOL_TOK_UNSET) {
src->opts->frenchCollation = UCOL_ON;
break;
}
case 0x007C /*|*/: /* this means we have actually been reading prefix part */
// we want to store read characters to the prefix part and continue reading
// the characters (proper way would be to restart reading the chars, but in
// that case we would have to complicate the token hasher, which I do not
// intend to play with. Instead, we will do prefixes when prefixes are due
// (before adding the elements).
src->parsedToken.prefixOffset = charsOffset;
src->parsedToken.prefixLen = newCharsLen;
if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
}
wasInQuote = TRUE;
ch = *(++(src->current));
break;
//charsOffset = 0;
//newCharsLen = 0;
//break; // We want to store the whole prefix/character sequence. If we break
// the '|' is going to get lost.
default:
if (newStrength == UCOL_TOK_UNSET) {
/* enabling rules to start with non-tokens a < b
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
*/
newStrength = UCOL_TOK_RESET;
}
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError); syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL; return NULL;
} }
}
break; if(ch == 0x0000 && src->current+1 == src->end) {
/* Ignore the white spaces */ break;
case 0x0009/*'\t'*/: }
case 0x000C/*'\f'*/:
case 0x000D/*'\r'*/: if (inChars) {
case 0x000A/*'\n'*/: if(newCharsLen == 0) {
case 0x0020/*' '*/: charsOffset = (uint32_t)(src->current - src->source);
break; /* skip whitespace TODO use Unicode */ }
case 0x0021/*! skip java thai modifier reordering*/: newCharsLen++;
break; } else {
case 0x002F/*'/'*/: if(newExtensionLen == 0) {
wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */ extensionOffset = (uint32_t)(src->current - src->source);
inChars = FALSE; /* we're now processing expansion */ }
break; newExtensionLen++;
case 0x005C /* back slash for escaped chars */: }
isEscaped = TRUE;
break; break;
/* found a quote, we're gonna start copying */ }
case 0x0027/*'\''*/: }
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
/*
enabling rules to start with a non-token character a < b
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
*/
newStrength = UCOL_TOK_RESET;
}
inQuote = TRUE;
if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
} else { /* we're doing an expansion */
if(wasInQuote == FALSE) {
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newExtensionLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
src->extraCurrent += newExtensionLen;
}
newExtensionLen++;
}
wasInQuote = TRUE;
ch = *(++(src->current));
if(ch == 0x0027) { /* copy the double quote */
*src->extraCurrent++ = ch;
inQuote = FALSE;
}
break;
/* '@' is french only if the strength is not currently set */
/* if it is, it's just a regular character in collation rules */
case 0x0040/*'@'*/:
if (newStrength == UCOL_TOK_UNSET) {
src->opts->frenchCollation = UCOL_ON;
break;
}
case 0x007C /*|*/: /* this means we have actually been reading prefix part */
// we want to store read characters to the prefix part and continue reading
// the characters (proper way would be to restart reading the chars, but in
// that case we would have to complicate the token hasher, which I do not
// intend to play with. Instead, we will do prefixes when prefixes are due
// (before adding the elements).
src->parsedToken.prefixOffset = charsOffset;
src->parsedToken.prefixLen = newCharsLen;
if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newCharsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
src->extraCurrent += newCharsLen;
}
newCharsLen++;
}
wasInQuote = TRUE;
ch = *(++(src->current));
break;
//charsOffset = 0;
//newCharsLen = 0;
//break; // We want to store the whole prefix/character sequence. If we break
// the '|' is going to get lost.
default:
if (newStrength == UCOL_TOK_UNSET) {
/* enabling rules to start with non-tokens a < b
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
*/
newStrength = UCOL_TOK_RESET;
}
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}
if(ch == 0x0000 && src->current+1 == src->end) {
break;
}
if (inChars) {
if(newCharsLen == 0) {
charsOffset = (uint32_t)(src->current - src->source);
}
newCharsLen++;
} else {
if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->current - src->source);
}
newExtensionLen++;
}
break;
}
} }
if(wasInQuote) { if(wasInQuote) {