ICU-1997 use uprv_isRuleWhiteSpace for parsing collation rules
X-SVN-Rev: 9354
This commit is contained in:
parent
a151553d16
commit
6eea49d8e7
@ -434,7 +434,7 @@ ja {
|
|||||||
"&\u309A = \u309A"
|
"&\u309A = \u309A"
|
||||||
|
|
||||||
// Equaling normal and halfwidth/fullwidth characters
|
// Equaling normal and halfwidth/fullwidth characters
|
||||||
"&' '=\u3000" // IDEOGRAPHIC SPACE
|
"&' '='\u3000'" // IDEOGRAPHIC SPACE
|
||||||
"&'\u0020' = '\uFFE3'" // SPACE
|
"&'\u0020' = '\uFFE3'" // SPACE
|
||||||
"&'\u0021' = '\uFF01'" // EXCLAMATION MARK
|
"&'\u0021' = '\uFF01'" // EXCLAMATION MARK
|
||||||
"&'\u0022' = '\uFF02'" // QUOTATION MARK
|
"&'\u0022' = '\uFF02'" // QUOTATION MARK
|
||||||
|
@ -99,14 +99,14 @@
|
|||||||
= ֯
|
= ֯
|
||||||
= ֽ
|
= ֽ
|
||||||
= ׄ
|
= ׄ
|
||||||
=
|
= ''
|
||||||
= ۞
|
= '۞'
|
||||||
= ۟
|
= ۟
|
||||||
= ۠
|
= ۠
|
||||||
= ۪
|
= ۪
|
||||||
= ۫
|
= ۫
|
||||||
= ۬
|
= ۬
|
||||||
=
|
= ''
|
||||||
= ๎
|
= ๎
|
||||||
= ༘
|
= ༘
|
||||||
= ༙
|
= ༙
|
||||||
@ -117,30 +117,30 @@
|
|||||||
= ྆
|
= ྆
|
||||||
= ྇
|
= ྇
|
||||||
= ࿆
|
= ࿆
|
||||||
= ᠋
|
= '᠋'
|
||||||
= ᠌
|
= '᠌'
|
||||||
= ᠍
|
= '᠍'
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
=
|
= ''
|
||||||
= 𝅥
|
= 𝅥
|
||||||
= 𝅦
|
= 𝅦
|
||||||
= 𝅧
|
= 𝅧
|
||||||
@ -555,23 +555,23 @@
|
|||||||
< '\u000C'
|
< '\u000C'
|
||||||
< '\u000D'
|
< '\u000D'
|
||||||
< '\u0085'
|
< '\u0085'
|
||||||
<
|
< '\u2028'
|
||||||
<
|
< '
'
|
||||||
< '\u0020'
|
< '\u0020'
|
||||||
<<<
|
<<< ' '
|
||||||
<<<
|
<<< ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
<<<
|
<<< ' '
|
||||||
=
|
= ' '
|
||||||
=
|
= ' '
|
||||||
<
|
< ' '
|
||||||
< ـ
|
< 'ـ'
|
||||||
< '`'
|
< '`'
|
||||||
<<< `
|
<<< `
|
||||||
< ´
|
< ´
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
|
|
||||||
#include "ucol_tok.h"
|
#include "ucol_tok.h"
|
||||||
#include "cmemory.h"
|
#include "cmemory.h"
|
||||||
|
#include "uprops.h"
|
||||||
|
|
||||||
U_CDECL_BEGIN
|
U_CDECL_BEGIN
|
||||||
static int32_t U_EXPORT2 U_CALLCONV
|
static int32_t U_EXPORT2 U_CALLCONV
|
||||||
@ -712,274 +713,269 @@ ucol_tok_parseNextToken(UColTokenParser *src,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}else {
|
}else {
|
||||||
/* Sets the strength for this entry */
|
if(!uprv_isRuleWhiteSpace(ch)) {
|
||||||
switch (ch) {
|
/* Sets the strength for this entry */
|
||||||
case 0x003D/*'='*/ :
|
switch (ch) {
|
||||||
if (newStrength != UCOL_TOK_UNSET) {
|
case 0x003D/*'='*/ :
|
||||||
goto EndOfLoop;
|
if (newStrength != UCOL_TOK_UNSET) {
|
||||||
}
|
goto EndOfLoop;
|
||||||
|
|
||||||
/* if we start with strength, we'll reset to top */
|
|
||||||
if(startOfRules == TRUE) {
|
|
||||||
top = TRUE;
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
newStrength = UCOL_IDENTICAL;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 0x002C/*','*/:
|
|
||||||
if (newStrength != UCOL_TOK_UNSET) {
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if we start with strength, we'll reset to top */
|
|
||||||
if(startOfRules == TRUE) {
|
|
||||||
top = TRUE;
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
newStrength = UCOL_TERTIARY;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 0x003B/*';'*/:
|
|
||||||
if (newStrength != UCOL_TOK_UNSET) {
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if we start with strength, we'll reset to top */
|
|
||||||
if(startOfRules == TRUE) {
|
|
||||||
top = TRUE;
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
newStrength = UCOL_SECONDARY;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 0x003C/*'<'*/:
|
|
||||||
if (newStrength != UCOL_TOK_UNSET) {
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* if we start with strength, we'll reset to top */
|
|
||||||
if(startOfRules == TRUE) {
|
|
||||||
top = TRUE;
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
goto EndOfLoop;
|
|
||||||
}
|
|
||||||
/* before this, do a scan to verify whether this is */
|
|
||||||
/* another strength */
|
|
||||||
if(*(src->current+1) == 0x003C) {
|
|
||||||
src->current++;
|
|
||||||
if(*(src->current+1) == 0x003C) {
|
|
||||||
src->current++; /* three in a row! */
|
|
||||||
newStrength = UCOL_TERTIARY;
|
|
||||||
} else { /* two in a row */
|
|
||||||
newStrength = UCOL_SECONDARY;
|
|
||||||
}
|
}
|
||||||
} else { /* just one */
|
|
||||||
newStrength = UCOL_PRIMARY;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
case 0x0026/*'&'*/:
|
/* if we start with strength, we'll reset to top */
|
||||||
if (newStrength != UCOL_TOK_UNSET) {
|
if(startOfRules == TRUE) {
|
||||||
/**/
|
top = TRUE;
|
||||||
goto EndOfLoop;
|
newStrength = UCOL_TOK_RESET;
|
||||||
}
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
newStrength = UCOL_IDENTICAL;
|
||||||
|
break;
|
||||||
|
|
||||||
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
|
case 0x002C/*','*/:
|
||||||
break;
|
if (newStrength != UCOL_TOK_UNSET) {
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
|
||||||
case 0x005b/*'['*/:
|
/* if we start with strength, we'll reset to top */
|
||||||
/* options - read an option, analyze it */
|
if(startOfRules == TRUE) {
|
||||||
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
|
top = TRUE;
|
||||||
uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status);
|
newStrength = UCOL_TOK_RESET;
|
||||||
src->current = optionEnd;
|
goto EndOfLoop;
|
||||||
if(U_SUCCESS(*status)) {
|
}
|
||||||
if(result & UCOL_TOK_TOP) {
|
newStrength = UCOL_TERTIARY;
|
||||||
if(newStrength == UCOL_TOK_RESET) {
|
break;
|
||||||
top = TRUE;
|
|
||||||
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
case 0x003B/*';'*/:
|
||||||
*src->extraCurrent++ = 0xFFFE;
|
if (newStrength != UCOL_TOK_UNSET) {
|
||||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
|
goto EndOfLoop;
|
||||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
|
}
|
||||||
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
|
|
||||||
newCharsLen = 3;
|
/* if we start with strength, we'll reset to top */
|
||||||
|
if(startOfRules == TRUE) {
|
||||||
|
top = TRUE;
|
||||||
|
newStrength = UCOL_TOK_RESET;
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
newStrength = UCOL_SECONDARY;
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 0x003C/*'<'*/:
|
||||||
|
if (newStrength != UCOL_TOK_UNSET) {
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* if we start with strength, we'll reset to top */
|
||||||
|
if(startOfRules == TRUE) {
|
||||||
|
top = TRUE;
|
||||||
|
newStrength = UCOL_TOK_RESET;
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
/* before this, do a scan to verify whether this is */
|
||||||
|
/* another strength */
|
||||||
|
if(*(src->current+1) == 0x003C) {
|
||||||
|
src->current++;
|
||||||
|
if(*(src->current+1) == 0x003C) {
|
||||||
|
src->current++; /* three in a row! */
|
||||||
|
newStrength = UCOL_TERTIARY;
|
||||||
|
} else { /* two in a row */
|
||||||
|
newStrength = UCOL_SECONDARY;
|
||||||
|
}
|
||||||
|
} else { /* just one */
|
||||||
|
newStrength = UCOL_PRIMARY;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 0x0026/*'&'*/:
|
||||||
|
if (newStrength != UCOL_TOK_UNSET) {
|
||||||
|
/**/
|
||||||
|
goto EndOfLoop;
|
||||||
|
}
|
||||||
|
|
||||||
|
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case 0x005b/*'['*/:
|
||||||
|
/* options - read an option, analyze it */
|
||||||
|
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
|
||||||
|
uint8_t result = ucol_uprv_tok_readAndSetOption(src, optionEnd, status);
|
||||||
|
src->current = optionEnd;
|
||||||
|
if(U_SUCCESS(*status)) {
|
||||||
|
if(result & UCOL_TOK_TOP) {
|
||||||
|
if(newStrength == UCOL_TOK_RESET) {
|
||||||
|
top = TRUE;
|
||||||
|
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
||||||
|
*src->extraCurrent++ = 0xFFFE;
|
||||||
|
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE >> 16);
|
||||||
|
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE & 0xFFFF);
|
||||||
|
if(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE == 0) {
|
||||||
|
newCharsLen = 3;
|
||||||
|
} else {
|
||||||
|
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
|
||||||
|
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
|
||||||
|
newCharsLen = 5;
|
||||||
|
}
|
||||||
|
if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
|
||||||
|
*src->extraCurrent++ = 0x002d;
|
||||||
|
*src->extraCurrent++ = before;
|
||||||
|
newCharsLen+=2;
|
||||||
|
}
|
||||||
|
|
||||||
|
src->current++;
|
||||||
|
goto EndOfLoop;
|
||||||
} else {
|
} else {
|
||||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE >> 16);
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
*src->extraCurrent++ = (UChar)(ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE & 0xFFFF);
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
newCharsLen = 5;
|
|
||||||
}
|
|
||||||
if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
|
|
||||||
*src->extraCurrent++ = 0x002d;
|
|
||||||
*src->extraCurrent++ = before;
|
|
||||||
newCharsLen+=2;
|
|
||||||
}
|
}
|
||||||
|
} else if(result & UCOL_TOK_VARIABLE_TOP) {
|
||||||
|
if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
|
||||||
|
variableTop = TRUE;
|
||||||
|
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
||||||
|
newCharsLen = 1;
|
||||||
|
*src->extraCurrent++ = 0xFFFF;
|
||||||
|
src->current++;
|
||||||
|
goto EndOfLoop;
|
||||||
|
} else {
|
||||||
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
|
}
|
||||||
|
} else if (result & UCOL_TOK_BEFORE){
|
||||||
|
if(newStrength == UCOL_TOK_RESET) {
|
||||||
|
before = result & UCOL_TOK_BEFORE;
|
||||||
|
} else {
|
||||||
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
|
|
||||||
src->current++;
|
}
|
||||||
goto EndOfLoop;
|
}
|
||||||
} else {
|
} else {
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
return NULL;
|
||||||
}
|
}
|
||||||
} else if(result & UCOL_TOK_VARIABLE_TOP) {
|
}
|
||||||
if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
|
break;
|
||||||
variableTop = TRUE;
|
case 0x0021/*! skip java thai modifier reordering*/:
|
||||||
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
break;
|
||||||
newCharsLen = 1;
|
case 0x002F/*'/'*/:
|
||||||
*src->extraCurrent++ = 0xFFFF;
|
wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
|
||||||
src->current++;
|
inChars = FALSE; /* we're now processing expansion */
|
||||||
goto EndOfLoop;
|
break;
|
||||||
} else {
|
case 0x005C /* back slash for escaped chars */:
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
isEscaped = TRUE;
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
break;
|
||||||
}
|
/* found a quote, we're gonna start copying */
|
||||||
} else if (result & UCOL_TOK_BEFORE){
|
case 0x0027/*'\''*/:
|
||||||
if(newStrength == UCOL_TOK_RESET) {
|
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
|
||||||
before = result & UCOL_TOK_BEFORE;
|
/*
|
||||||
} else {
|
enabling rules to start with a non-token character a < b
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
|
return NULL;
|
||||||
|
*/
|
||||||
|
newStrength = UCOL_TOK_RESET;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
inQuote = TRUE;
|
||||||
}
|
|
||||||
} else {
|
if(inChars) { /* we're doing characters */
|
||||||
|
if(wasInQuote == FALSE) {
|
||||||
|
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
||||||
|
}
|
||||||
|
if (newCharsLen != 0) {
|
||||||
|
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
|
||||||
|
src->extraCurrent += newCharsLen;
|
||||||
|
}
|
||||||
|
newCharsLen++;
|
||||||
|
} else { /* we're doing an expansion */
|
||||||
|
if(wasInQuote == FALSE) {
|
||||||
|
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
|
||||||
|
}
|
||||||
|
if (newExtensionLen != 0) {
|
||||||
|
uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
|
||||||
|
src->extraCurrent += newExtensionLen;
|
||||||
|
}
|
||||||
|
newExtensionLen++;
|
||||||
|
}
|
||||||
|
|
||||||
|
wasInQuote = TRUE;
|
||||||
|
|
||||||
|
ch = *(++(src->current));
|
||||||
|
if(ch == 0x0027) { /* copy the double quote */
|
||||||
|
*src->extraCurrent++ = ch;
|
||||||
|
inQuote = FALSE;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* '@' is french only if the strength is not currently set */
|
||||||
|
/* if it is, it's just a regular character in collation rules */
|
||||||
|
case 0x0040/*'@'*/:
|
||||||
|
if (newStrength == UCOL_TOK_UNSET) {
|
||||||
|
src->opts->frenchCollation = UCOL_ON;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
case 0x007C /*|*/: /* this means we have actually been reading prefix part */
|
||||||
|
// we want to store read characters to the prefix part and continue reading
|
||||||
|
// the characters (proper way would be to restart reading the chars, but in
|
||||||
|
// that case we would have to complicate the token hasher, which I do not
|
||||||
|
// intend to play with. Instead, we will do prefixes when prefixes are due
|
||||||
|
// (before adding the elements).
|
||||||
|
src->parsedToken.prefixOffset = charsOffset;
|
||||||
|
src->parsedToken.prefixLen = newCharsLen;
|
||||||
|
|
||||||
|
if(inChars) { /* we're doing characters */
|
||||||
|
if(wasInQuote == FALSE) {
|
||||||
|
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
||||||
|
}
|
||||||
|
if (newCharsLen != 0) {
|
||||||
|
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
|
||||||
|
src->extraCurrent += newCharsLen;
|
||||||
|
}
|
||||||
|
newCharsLen++;
|
||||||
|
}
|
||||||
|
|
||||||
|
wasInQuote = TRUE;
|
||||||
|
|
||||||
|
ch = *(++(src->current));
|
||||||
|
break;
|
||||||
|
|
||||||
|
//charsOffset = 0;
|
||||||
|
//newCharsLen = 0;
|
||||||
|
//break; // We want to store the whole prefix/character sequence. If we break
|
||||||
|
// the '|' is going to get lost.
|
||||||
|
default:
|
||||||
|
if (newStrength == UCOL_TOK_UNSET) {
|
||||||
|
/* enabling rules to start with non-tokens a < b
|
||||||
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
|
return NULL;
|
||||||
|
*/
|
||||||
|
newStrength = UCOL_TOK_RESET;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
|
||||||
|
*status = U_INVALID_FORMAT_ERROR;
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
break;
|
if(ch == 0x0000 && src->current+1 == src->end) {
|
||||||
/* Ignore the white spaces */
|
break;
|
||||||
case 0x0009/*'\t'*/:
|
}
|
||||||
case 0x000C/*'\f'*/:
|
|
||||||
case 0x000D/*'\r'*/:
|
if (inChars) {
|
||||||
case 0x000A/*'\n'*/:
|
if(newCharsLen == 0) {
|
||||||
case 0x0020/*' '*/:
|
charsOffset = (uint32_t)(src->current - src->source);
|
||||||
break; /* skip whitespace TODO use Unicode */
|
}
|
||||||
case 0x0021/*! skip java thai modifier reordering*/:
|
newCharsLen++;
|
||||||
break;
|
} else {
|
||||||
case 0x002F/*'/'*/:
|
if(newExtensionLen == 0) {
|
||||||
wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
|
extensionOffset = (uint32_t)(src->current - src->source);
|
||||||
inChars = FALSE; /* we're now processing expansion */
|
}
|
||||||
break;
|
newExtensionLen++;
|
||||||
case 0x005C /* back slash for escaped chars */:
|
}
|
||||||
isEscaped = TRUE;
|
|
||||||
break;
|
break;
|
||||||
/* found a quote, we're gonna start copying */
|
}
|
||||||
case 0x0027/*'\''*/:
|
}
|
||||||
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
|
|
||||||
/*
|
|
||||||
enabling rules to start with a non-token character a < b
|
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
|
||||||
return NULL;
|
|
||||||
*/
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
}
|
|
||||||
|
|
||||||
inQuote = TRUE;
|
|
||||||
|
|
||||||
if(inChars) { /* we're doing characters */
|
|
||||||
if(wasInQuote == FALSE) {
|
|
||||||
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
|
||||||
}
|
|
||||||
if (newCharsLen != 0) {
|
|
||||||
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
|
|
||||||
src->extraCurrent += newCharsLen;
|
|
||||||
}
|
|
||||||
newCharsLen++;
|
|
||||||
} else { /* we're doing an expansion */
|
|
||||||
if(wasInQuote == FALSE) {
|
|
||||||
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
|
|
||||||
}
|
|
||||||
if (newExtensionLen != 0) {
|
|
||||||
uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
|
|
||||||
src->extraCurrent += newExtensionLen;
|
|
||||||
}
|
|
||||||
newExtensionLen++;
|
|
||||||
}
|
|
||||||
|
|
||||||
wasInQuote = TRUE;
|
|
||||||
|
|
||||||
ch = *(++(src->current));
|
|
||||||
if(ch == 0x0027) { /* copy the double quote */
|
|
||||||
*src->extraCurrent++ = ch;
|
|
||||||
inQuote = FALSE;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
|
|
||||||
/* '@' is french only if the strength is not currently set */
|
|
||||||
/* if it is, it's just a regular character in collation rules */
|
|
||||||
case 0x0040/*'@'*/:
|
|
||||||
if (newStrength == UCOL_TOK_UNSET) {
|
|
||||||
src->opts->frenchCollation = UCOL_ON;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
case 0x007C /*|*/: /* this means we have actually been reading prefix part */
|
|
||||||
// we want to store read characters to the prefix part and continue reading
|
|
||||||
// the characters (proper way would be to restart reading the chars, but in
|
|
||||||
// that case we would have to complicate the token hasher, which I do not
|
|
||||||
// intend to play with. Instead, we will do prefixes when prefixes are due
|
|
||||||
// (before adding the elements).
|
|
||||||
src->parsedToken.prefixOffset = charsOffset;
|
|
||||||
src->parsedToken.prefixLen = newCharsLen;
|
|
||||||
|
|
||||||
if(inChars) { /* we're doing characters */
|
|
||||||
if(wasInQuote == FALSE) {
|
|
||||||
charsOffset = (uint32_t)(src->extraCurrent - src->source);
|
|
||||||
}
|
|
||||||
if (newCharsLen != 0) {
|
|
||||||
uprv_memcpy(src->extraCurrent, src->current - newCharsLen, newCharsLen*sizeof(UChar));
|
|
||||||
src->extraCurrent += newCharsLen;
|
|
||||||
}
|
|
||||||
newCharsLen++;
|
|
||||||
}
|
|
||||||
|
|
||||||
wasInQuote = TRUE;
|
|
||||||
|
|
||||||
ch = *(++(src->current));
|
|
||||||
break;
|
|
||||||
|
|
||||||
//charsOffset = 0;
|
|
||||||
//newCharsLen = 0;
|
|
||||||
//break; // We want to store the whole prefix/character sequence. If we break
|
|
||||||
// the '|' is going to get lost.
|
|
||||||
default:
|
|
||||||
if (newStrength == UCOL_TOK_UNSET) {
|
|
||||||
/* enabling rules to start with non-tokens a < b
|
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
|
||||||
return NULL;
|
|
||||||
*/
|
|
||||||
newStrength = UCOL_TOK_RESET;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
|
|
||||||
*status = U_INVALID_FORMAT_ERROR;
|
|
||||||
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(ch == 0x0000 && src->current+1 == src->end) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inChars) {
|
|
||||||
if(newCharsLen == 0) {
|
|
||||||
charsOffset = (uint32_t)(src->current - src->source);
|
|
||||||
}
|
|
||||||
newCharsLen++;
|
|
||||||
} else {
|
|
||||||
if(newExtensionLen == 0) {
|
|
||||||
extensionOffset = (uint32_t)(src->current - src->source);
|
|
||||||
}
|
|
||||||
newExtensionLen++;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if(wasInQuote) {
|
if(wasInQuote) {
|
||||||
|
Loading…
Reference in New Issue
Block a user