scuffed-code/icu4c/source/i18n/ucol_tok.cpp
2004-05-19 20:42:44 +00:00

1873 lines
70 KiB
C++

/*
*******************************************************************************
*
* Copyright (C) 2001-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucol_tok.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created 02/22/2001
* created by: Vladimir Weinstein
*
* This module reads a tailoring rule string and produces a list of
* tokens that will be turned into collation elements
*
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "ucol_tok.h"
#include "cmemory.h"
#include "uprops.h"
U_CDECL_BEGIN
/**
 * Hash callback for tokens stored in a UHashtable.
 *
 * k.pointer is a UColToken whose 'source' word packs the token text:
 * the top 8 bits hold the length, the low 24 bits hold the offset of the
 * first character inside key->rulesToParse. The text is sampled with a
 * stride of ((len - 32) / 32) + 1 characters.
 *
 * The accumulator is unsigned: the 37-multiplier overflows 32 bits after a
 * handful of characters, and wrap-around is only well defined for unsigned
 * arithmetic (signed overflow is undefined behavior).
 *
 * @param k hashtable key; a NULL pointer hashes to 0
 * @return the hash value
 */
static int32_t U_EXPORT2 U_CALLCONV
uhash_hashTokens(const UHashTok k)
{
    uint32_t hash = 0;
    const UColToken *key = (const UColToken *)k.pointer;
    if (key != 0) {
        int32_t len = (int32_t)((key->source & 0xFF000000) >> 24);
        int32_t inc = ((len - 32) / 32) + 1;
        const UChar *p = key->rulesToParse + (key->source & 0x00FFFFFF);
        const UChar *limit = p + len;
        while (p < limit) {
            hash = (hash * 37U) + *p;
            p += inc;
        }
    }
    return (int32_t)hash;
}
/**
 * Equality callback for tokens stored in a UHashtable.
 *
 * Both keys point to UColToken objects. Each token's 'source' word holds the
 * text length in its top 8 bits and the offset into rulesToParse in its low
 * 24 bits. Two tokens are equal when they are the same object, when their
 * packed source words are identical, or when their texts have the same
 * length and the same characters. A token whose source word is 0 never
 * equals a different token.
 */
static UBool U_EXPORT2 U_CALLCONV
uhash_compareTokens(const UHashTok key1, const UHashTok key2)
{
    UColToken *lhs = (UColToken *)key1.pointer;
    UColToken *rhs = (UColToken *)key2.pointer;
    const UChar *lhsChars = lhs->rulesToParse + (lhs->source & 0x00FFFFFF);
    const UChar *rhsChars = rhs->rulesToParse + (rhs->source & 0x00FFFFFF);
    uint32_t lhsLen = ((lhs->source & 0xFF000000) >> 24);
    uint32_t rhsLen = ((rhs->source & 0xFF000000) >> 24);
    /* address of the last character of the left-hand text */
    const UChar *lastChar = lhsChars + lhsLen - 1;

    if (lhs == rhs) {
        return TRUE;
    }
    if (lhs->source == 0 || rhs->source == 0 || lhsLen != rhsLen) {
        return FALSE;
    }
    if (lhs->source == rhs->source) {
        return TRUE;
    }

    /* walk both texts in lock step; stop on the last character or on a mismatch */
    for (; (lhsChars < lastChar) && *lhsChars == *rhsChars; ++lhsChars, ++rhsChars) {
    }
    return (UBool)(*lhsChars == *rhsChars);
}
U_CDECL_END
/* Forwards obj unchanged to uhash_freeBlock(). */
/* NOTE(review): presumably exists so the call can be passed as a callback */
/* with this file's U_CALLCONV linkage - confirm at the use site. */
static inline void U_CALLCONV
uhash_freeBlockWrapper(void *obj) {
uhash_freeBlock(obj);
}
/* One entry of the indirect positioning table: a start CE and a limit CE, */
/* each stored with its continuation CE. setIndirectBoundaries() fills the */
/* start fields from start[0]/start[1] and the limit fields from */
/* end[0]/end[1], or with 0 when no end is given. */
typedef struct {
uint32_t startCE;      /* start CE (start[0]) */
uint32_t startContCE;  /* continuation of the start CE (start[1]); ucol_tok_doSetTop treats 0 as "none" */
uint32_t limitCE;      /* limit CE (end[0]), 0 when no end was supplied */
uint32_t limitContCE;  /* continuation of the limit CE (end[1]), 0 when no end was supplied */
} indirectBoundaries;
/* these values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic */
/* values. It only works for resets and you cannot tailor indirect names. */
/* An indirect name can define either an anchor point or a range. An */
/* anchor point behaves in exactly the same way as a code point in reset */
/* would, except that it cannot be tailored. A range (we currently only */
/* know of the [top] range) will explicitly set the upper bound for */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed between in the range without performance penalty. */
/* In that respect, we use [top] for tailoring of locales that use CJK */
/* characters. Other indirect values are currently a pure convenience, */
/* they can be used to assure that the CEs will be always positioned in */
/* the same place relative to a point with known properties (e.g. first */
/* primary ignorable). */
/* Table of indirect boundaries, written by setIndirectBoundaries() and read by */
/* ucol_tok_doSetTop() through src->parsedToken.indirectIndex. The option parser */
/* uses index 0 for [top] and (option - OPTION_FIRST + 1 + 2 * suboption) for */
/* [first ...] / [last ...], which gives 14 as the highest index with 7 suboptions. */
static indirectBoundaries ucolIndirectBoundaries[15];
/*
static indirectBoundaries ucolIndirectBoundaries[11] = {
{ UCOL_RESET_TOP_VALUE, 0,
UCOL_NEXT_TOP_VALUE, 0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
0, 0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_VARIABLE, 0,
0, 0 },
{ UCOL_FIRST_NON_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_NON_VARIABLE, 0,
0, 0 },
};
*/
static void setIndirectBoundaries(uint32_t indexR, uint32_t *start, uint32_t *end) {
// Set values for the top - TODO: once we have values for all the indirects, we are going
// to initalize here.
ucolIndirectBoundaries[indexR].startCE = start[0];
ucolIndirectBoundaries[indexR].startContCE = start[1];
if(end) {
ucolIndirectBoundaries[indexR].limitCE = end[0];
ucolIndirectBoundaries[indexR].limitContCE = end[1];
} else {
ucolIndirectBoundaries[indexR].limitCE = 0;
ucolIndirectBoundaries[indexR].limitContCE = 0;
}
}
/**
 * Fills a UParseError with the position of a rule syntax error and with the
 * text surrounding it.
 *
 * Each context buffer receives at most U_PARSE_CONTEXT_LEN-1 characters plus
 * the terminating NUL, so the terminator always stays inside the buffer.
 * (The previous boundary test used '<=', which copied U_PARSE_CONTEXT_LEN
 * characters and wrote the terminator one element past preContext when
 * pos == U_PARSE_CONTEXT_LEN.)
 *
 * @param rules      start of the rule text
 * @param pos        offset of the error in rules; stored in parseError->offset
 * @param rulesLen   length of the rule text
 * @param parseError structure to fill; line is always set to 0
 */
static inline
void syntaxError(const UChar* rules,
                 int32_t pos,
                 int32_t rulesLen,
                 UParseError* parseError) {
    parseError->offset = pos;
    parseError->line = 0 ; /* we are not using line numbers */

    // pre-context: the characters immediately before pos
    int32_t start = (pos < U_PARSE_CONTEXT_LEN) ? 0 : (pos - (U_PARSE_CONTEXT_LEN-1));
    int32_t stop = pos;
    // a non-positive pos leaves nothing to copy; never hand a negative count to u_memcpy
    int32_t preLen = (stop > start) ? (stop - start) : 0;
    if(preLen > 0) {
        u_memcpy(parseError->preContext, rules+start, preLen);
    }
    // null terminate the buffer
    parseError->preContext[preLen] = 0;

    // post-context: the characters following pos
    start = pos+1;
    stop = ((pos+U_PARSE_CONTEXT_LEN) <= rulesLen) ? (pos+(U_PARSE_CONTEXT_LEN-1)) :
                                                     rulesLen;
    if(start < stop) {
        u_memcpy(parseError->postContext, rules+start, stop-start);
        // null terminate the buffer
        parseError->postContext[stop-start] = 0;
    } else {
        parseError->postContext[0] = 0;
    }
}
/**
 * Writes one collation attribute value into the matching field of an option set.
 * Attributes without a field here (including UCOL_ATTRIBUTE_COUNT) are ignored.
 *
 * @param opts   option set to modify
 * @param attrib attribute selecting the field
 * @param value  value to store
 */
static
void ucol_uprv_tok_setOptionInImage(UColOptionSet *opts, UColAttribute attrib, UColAttributeValue value) {
    switch(attrib) {
    case UCOL_FRENCH_COLLATION:         opts->frenchCollation = value;   break;
    case UCOL_ALTERNATE_HANDLING:       opts->alternateHandling = value; break;
    case UCOL_CASE_FIRST:               opts->caseFirst = value;         break;
    case UCOL_CASE_LEVEL:               opts->caseLevel = value;         break;
    case UCOL_NORMALIZATION_MODE:       opts->normalizationMode = value; break;
    case UCOL_STRENGTH:                 opts->strength = value;          break;
    case UCOL_HIRAGANA_QUATERNARY_MODE: opts->hiraganaQ = value;         break;
    case UCOL_NUMERIC_COLLATION:        opts->numericCollation = value;  break;
    case UCOL_ATTRIBUTE_COUNT:
    default:
        /* nothing to store for these */
        break;
    }
}
/* Number of entries in rulesOptions[], i.e. of option names recognized inside "[...]". */
#define UTOK_OPTION_COUNT 20
/* Set to TRUE by ucol_uprv_tok_initData() after the strings below have been initialized. */
static UBool didInit = FALSE;
/* we can be strict, or we can be lenient */
/* I'd surely be lenient with the option arguments */
/* maybe even with options */
/* Sub-option (argument) names. The last macro argument is the string length. */
/* The matching U_STRING_INIT calls are in ucol_uprv_tok_initData(). */
U_STRING_DECL(suboption_00, "non-ignorable", 13);
U_STRING_DECL(suboption_01, "shifted", 7);
U_STRING_DECL(suboption_02, "lower", 5);
U_STRING_DECL(suboption_03, "upper", 5);
U_STRING_DECL(suboption_04, "off", 3);
U_STRING_DECL(suboption_05, "on", 2);
U_STRING_DECL(suboption_06, "1", 1);
U_STRING_DECL(suboption_07, "2", 1);
U_STRING_DECL(suboption_08, "3", 1);
U_STRING_DECL(suboption_09, "4", 1);
U_STRING_DECL(suboption_10, "I", 1);
U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "regular", 7);
U_STRING_DECL(suboption_16, "implicit", 8);
U_STRING_DECL(suboption_17, "trailing", 8);
/* Option names. They are numbered in declaration order here; rulesOptions[] */
/* lists them in OptionNumber order instead. */
U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9);
U_STRING_DECL(option_02, "alternate", 9);
U_STRING_DECL(option_03, "backwards", 9);
U_STRING_DECL(option_04, "variable top", 12);
U_STRING_DECL(option_05, "top", 3);
U_STRING_DECL(option_06, "normalization", 13);
U_STRING_DECL(option_07, "caseLevel", 9);
U_STRING_DECL(option_08, "caseFirst", 9);
U_STRING_DECL(option_09, "scriptOrder", 11);
U_STRING_DECL(option_10, "charsetname", 11);
U_STRING_DECL(option_11, "charset", 7);
U_STRING_DECL(option_12, "before", 6);
U_STRING_DECL(option_13, "hiraganaQ", 9);
U_STRING_DECL(option_14, "strength", 8);
U_STRING_DECL(option_15, "first", 5);
U_STRING_DECL(option_16, "last", 4);
U_STRING_DECL(option_17, "optimize", 8);
U_STRING_DECL(option_18, "suppressContractions", 20);
U_STRING_DECL(option_19, "numericOrdering", 15);
/*
[last variable] last variable value
[last primary ignorable] largest CE for primary ignorable
[last secondary ignorable] largest CE for secondary ignorable
[last tertiary ignorable] largest CE for tertiary ignorable
[top] guaranteed to be above all implicit CEs, for now and in the future (in 1.8)
*/
/* Sub-option tables referenced from rulesOptions[]. Each entry is */
/* { name, name length, attribute value selected by that name }. */

/* arguments of "alternate" */
static const ucolTokSuboption alternateSub[2] = {
{suboption_00, 13, UCOL_NON_IGNORABLE},
{suboption_01, 7, UCOL_SHIFTED}
};
/* arguments of "caseFirst" */
static const ucolTokSuboption caseFirstSub[3] = {
{suboption_02, 5, UCOL_LOWER_FIRST},
{suboption_03, 5, UCOL_UPPER_FIRST},
{suboption_04, 3, UCOL_OFF},
};
/* generic "off" / "on" arguments */
static const ucolTokSuboption onOffSub[2] = {
{suboption_04, 3, UCOL_OFF},
{suboption_05, 2, UCOL_ON}
};
/* argument of "backwards": only "2" is accepted and it maps to UCOL_ON */
static const ucolTokSuboption frenchSub[1] = {
{suboption_07, 1, UCOL_ON}
};
/* arguments of "before": "1", "2", "3" */
static const ucolTokSuboption beforeSub[3] = {
{suboption_06, 1, UCOL_PRIMARY},
{suboption_07, 1, UCOL_SECONDARY},
{suboption_08, 1, UCOL_TERTIARY}
};
/* arguments of "strength": "1" through "4" and "I" */
static const ucolTokSuboption strengthSub[5] = {
{suboption_06, 1, UCOL_PRIMARY},
{suboption_07, 1, UCOL_SECONDARY},
{suboption_08, 1, UCOL_TERTIARY},
{suboption_09, 1, UCOL_QUATERNARY},
{suboption_10, 1, UCOL_IDENTICAL},
};
/* arguments of "first" / "last". Every entry carries UCOL_PRIMARY; */
/* ucol_uprv_tok_readAndSetOption() uses the position of the matching entry */
/* (to compute the indirect index), not the attribute value. */
static const ucolTokSuboption firstLastSub[7] = {
{suboption_11, 7, UCOL_PRIMARY},
{suboption_12, 9, UCOL_PRIMARY},
{suboption_13, 8, UCOL_PRIMARY},
{suboption_14, 8, UCOL_PRIMARY},
{suboption_15, 7, UCOL_PRIMARY},
{suboption_16, 8, UCOL_PRIMARY},
{suboption_17, 8, UCOL_PRIMARY},
};
/* Indexes into rulesOptions[]. ucol_uprv_tok_readOption() returns the index of */
/* the matching entry and ucol_uprv_tok_readAndSetOption() switches on it using */
/* these names, so the order here must match the order of rulesOptions[]. */
enum OptionNumber {
OPTION_ALTERNATE_HANDLING = 0,
OPTION_FRENCH_COLLATION,
OPTION_CASE_LEVEL,
OPTION_CASE_FIRST,
OPTION_NORMALIZATION_MODE,
OPTION_HIRAGANA_QUATERNARY,
OPTION_STRENGTH,
OPTION_NUMERIC_COLLATION,
/* alias for the last option that maps directly to a collation attribute */
OPTION_NORMAL_OPTIONS_LIMIT = OPTION_NUMERIC_COLLATION,
OPTION_VARIABLE_TOP,
OPTION_REARRANGE,
OPTION_BEFORE,
OPTION_TOP,
/* OPTION_FIRST and OPTION_LAST must stay adjacent: the indirect index */
/* computation in ucol_uprv_tok_readAndSetOption() relies on it */
OPTION_FIRST,
OPTION_LAST,
OPTION_OPTIMIZE,
OPTION_SUPPRESS_CONTRACTIONS,
OPTION_UNDEFINED,
OPTION_SCRIPT_ORDER,
OPTION_CHARSET_NAME,
OPTION_CHARSET
} ;
/* Table of recognized options, indexed by OptionNumber. Each entry is */
/* { optionName, optionLen, subopts, subSize, attr }. Options that do not map */
/* to a collation attribute carry UCOL_ATTRIBUTE_COUNT. */
/* Lookup is a case-insensitive prefix comparison in table order, so an option */
/* whose name starts with another option's name must come first */
/* ("charsetname" precedes "charset"). */
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
/*00*/ {option_02, 9, alternateSub, 2, UCOL_ALTERNATE_HANDLING}, /*"alternate" */
/*01*/ {option_03, 9, frenchSub, 1, UCOL_FRENCH_COLLATION}, /*"backwards" */
/*02*/ {option_07, 9, onOffSub, 2, UCOL_CASE_LEVEL}, /*"caseLevel" */
/*03*/ {option_08, 9, caseFirstSub, 3, UCOL_CASE_FIRST}, /*"caseFirst" */
/*04*/ {option_06, 13, onOffSub, 2, UCOL_NORMALIZATION_MODE}, /*"normalization" */
/*05*/ {option_13, 9, onOffSub, 2, UCOL_HIRAGANA_QUATERNARY_MODE}, /*"hiraganaQ" */
/*06*/ {option_14, 8, strengthSub, 5, UCOL_STRENGTH}, /*"strength" */
/*07*/ {option_19, 15, onOffSub, 2, UCOL_NUMERIC_COLLATION}, /*"numericOrdering"*/
/*08*/ {option_04, 12, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"variable top" */
/*09*/ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
/*10*/ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
/*11*/ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
/*12*/ {option_15, 5, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"first" */
/*13*/ {option_16, 4, firstLastSub, 7, UCOL_ATTRIBUTE_COUNT}, /*"last" */
/*14*/ {option_17, 8, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"optimize" */
/*15*/ {option_18, 20, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"suppressContractions" */
/*16*/ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
/*17*/ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
/*18*/ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
/*19*/ {option_11, 7, NULL, 0, UCOL_ATTRIBUTE_COUNT} /*"charset" */
};
/**
 * Compares at most n characters of two strings after lowercasing each
 * character with u_tolower().
 *
 * The comparison ends at the first differing pair, when s1 reaches a NUL, or
 * after n characters, whichever comes first.
 *
 * @return the difference of the lowercased characters at the point where the
 *         comparison stopped (0 when the compared prefixes match or n <= 0)
 */
static
int32_t u_strncmpNoCase(const UChar *s1,
                        const UChar *s2,
                        int32_t n)
{
    int32_t diff = 0;
    while(n > 0) {
        diff = (int32_t)u_tolower(*s1) - (int32_t)u_tolower(*s2);
        if(diff != 0 || *s1 == 0) {
            break;
        }
        ++s1;
        ++s2;
        --n;
    }
    return diff;
}
/**
 * One-time initialization of the option and sub-option name strings.
 * Runs the U_STRING_INIT calls on the first invocation and records that in
 * didInit; later invocations return immediately.
 * NOTE(review): didInit is a plain static flag with no locking visible here -
 * confirm whether callers can reach this concurrently.
 */
static
void ucol_uprv_tok_initData() {
    if(didInit) {
        return;
    }

    /* sub-option (argument) names */
    U_STRING_INIT(suboption_00, "non-ignorable", 13);
    U_STRING_INIT(suboption_01, "shifted", 7);
    U_STRING_INIT(suboption_02, "lower", 5);
    U_STRING_INIT(suboption_03, "upper", 5);
    U_STRING_INIT(suboption_04, "off", 3);
    U_STRING_INIT(suboption_05, "on", 2);
    U_STRING_INIT(suboption_06, "1", 1);
    U_STRING_INIT(suboption_07, "2", 1);
    U_STRING_INIT(suboption_08, "3", 1);
    U_STRING_INIT(suboption_09, "4", 1);
    U_STRING_INIT(suboption_10, "I", 1);
    U_STRING_INIT(suboption_11, "primary", 7);
    U_STRING_INIT(suboption_12, "secondary", 9);
    U_STRING_INIT(suboption_13, "tertiary", 8);
    U_STRING_INIT(suboption_14, "variable", 8);
    U_STRING_INIT(suboption_15, "regular", 7);
    U_STRING_INIT(suboption_16, "implicit", 8);
    U_STRING_INIT(suboption_17, "trailing", 8);

    /* option names */
    U_STRING_INIT(option_00, "undefined", 9);
    U_STRING_INIT(option_01, "rearrange", 9);
    U_STRING_INIT(option_02, "alternate", 9);
    U_STRING_INIT(option_03, "backwards", 9);
    U_STRING_INIT(option_04, "variable top", 12);
    U_STRING_INIT(option_05, "top", 3);
    U_STRING_INIT(option_06, "normalization", 13);
    U_STRING_INIT(option_07, "caseLevel", 9);
    U_STRING_INIT(option_08, "caseFirst", 9);
    U_STRING_INIT(option_09, "scriptOrder", 11);
    U_STRING_INIT(option_10, "charsetname", 11);
    U_STRING_INIT(option_11, "charset", 7);
    U_STRING_INIT(option_12, "before", 6);
    U_STRING_INIT(option_13, "hiraganaQ", 9);
    U_STRING_INIT(option_14, "strength", 8);
    U_STRING_INIT(option_15, "first", 5);
    U_STRING_INIT(option_16, "last", 4);
    U_STRING_INIT(option_17, "optimize", 8);
    U_STRING_INIT(option_18, "suppressContractions", 20);
    U_STRING_INIT(option_19, "numericOrdering", 15);

    didInit = TRUE;
}
// This function reads basic options to set in the runtime collator
// used by data driven tests. Should not support build time options
//
// Parses one argument of the form "[option value]" from the range [start, end).
//   start, end  the text to parse; nothing at or beyond end is examined when
//               skipping whitespace or looking for the closing ']'
//   attrib      receives the attribute of the recognized option
//   value       receives the attribute value of the recognized sub-option
//   status      set to U_ILLEGAL_ARGUMENT_ERROR when the opening '[' is missing,
//               the option or its value is not recognized, or ']' is missing
// Returns the position just after the closing ']', or NULL on error or when
// the range contains only whitespace (status is left untouched in that case).
U_CAPI const UChar * U_EXPORT2
ucol_tok_getNextArgument(const UChar *start, const UChar *end,
                         UColAttribute *attrib, UColAttributeValue *value,
                         UErrorCode *status) {
    uint32_t i = 0;
    int32_t j=0;
    UBool foundOption = FALSE;
    const UChar *optionArg = NULL;

    ucol_uprv_tok_initData();

    while(start < end && u_isWhitespace(*start)) { /* eat whitespace */
        start++;
    }
    if(start >= end) {
        return NULL;
    }
    /* skip opening '[' */
    if(*start == 0x005b) {
        start++;
    } else {
        *status = U_ILLEGAL_ARGUMENT_ERROR; // no opening '['
        return NULL;
    }

    while(i < UTOK_OPTION_COUNT) {
        if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
            foundOption = TRUE;
            if(end - start > rulesOptions[i].optionLen) {
                optionArg = start+rulesOptions[i].optionLen+1; /* start of the options, skip space */
                /* stay inside [start, end) while eating whitespace */
                while(optionArg < end && u_isWhitespace(*optionArg)) {
                    optionArg++;
                }
            }
            break;
        }
        i++;
    }

    if(!foundOption) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* an argument that would start at or beyond end cannot match anything */
    if(optionArg && optionArg < end) {
        for(j = 0; j<rulesOptions[i].subSize; j++) {
            if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                *attrib = rulesOptions[i].attr;
                *value = rulesOptions[i].subopts[j].attrVal;
                optionArg += rulesOptions[i].subopts[j].subLen;
                while(optionArg < end && u_isWhitespace(*optionArg)) { /* eat whitespace */
                    optionArg++;
                }
                if(optionArg < end && *optionArg == 0x005d) {
                    return optionArg + 1;
                }
                *status = U_ILLEGAL_ARGUMENT_ERROR;
                return NULL;
            }
        }
    }
    *status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
}
/**
 * Locates a bracketed set pattern in [start, end) and opens it as a USet.
 *
 * The pattern starts at the first '[' at or after start and extends to the
 * ']' that balances it, since a set may contain nested brackets. At least one
 * more ']' must follow the pattern; that search uses u_strchr and therefore
 * relies on the text being NUL terminated.
 *
 * @param start  where to begin looking for the opening '['
 * @param end    limit of the text; neither the search for '[' nor the bracket
 *               matching goes beyond it
 * @param status set to U_ILLEGAL_ARGUMENT_ERROR when no '[' is found, the
 *               brackets do not balance, or no further ']' follows the set;
 *               otherwise passed on to uset_openPattern()
 * @return the result of uset_openPattern(), or NULL on the errors above
 */
static
USet *ucol_uprv_tok_readAndSetUnicodeSet(const UChar *start, const UChar *end, UErrorCode *status) {
    while(start < end && *start != 0x005b) { /* advance until we find the first '[' */
        start++;
    }
    if(start >= end) {
        /* no opening bracket inside the range */
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // now we need to get a balanced set of '[]'; a set can contain many of them
    int32_t noOpenBraces = 1;
    int32_t current = 1; // skip the opening brace
    while(start+current < end && noOpenBraces != 0) {
        if(start[current] == 0x005b) {
            noOpenBraces++;
        } else if(start[current] == 0x005D) { // closing brace
            noOpenBraces--;
        }
        current++;
    }

    if(noOpenBraces != 0 || u_strchr(start+current, 0x005d /*']'*/) == NULL) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    return uset_openPattern(start, current, status);
}
static
int32_t ucol_uprv_tok_readOption(const UChar *start, const UChar *end, const UChar **optionArg) {
int32_t i = 0;
ucol_uprv_tok_initData();
while(u_isWhitespace(*start)) { /* eat whitespace */
start++;
}
while(i < UTOK_OPTION_COUNT) {
if(u_strncmpNoCase(start, rulesOptions[i].optionName, rulesOptions[i].optionLen) == 0) {
if(end - start > rulesOptions[i].optionLen) {
*optionArg = start+rulesOptions[i].optionLen; /* start of the options*/
while(u_isWhitespace(**optionArg)) { /* eat whitespace */
(*optionArg)++;
}
}
break;
}
i++;
}
if(i == UTOK_OPTION_COUNT) {
i = -1; // didn't find an option
}
return i;
}
// Reads the option that starts at src->current (which points at '[') and
// applies it.
// Attribute options (alternate, backwards, caseLevel, caseFirst,
// normalization, hiraganaQ, strength, numericOrdering) are stored in src->opts.
// [top], [first ...] and [last ...] set src->parsedToken.indirectIndex.
// Some options take a UnicodeSet definition which brings its own closing ']',
// for example: '[copy [\uAC00-\uD7FF]]'. For optimize and
// suppressContractions the nested brackets are skipped here.
// On return src->current points at the closing ']' of the option.
// If no ']' is left before src->end, status is set to
// U_ILLEGAL_ARGUMENT_ERROR and src->current is set to src->end instead of
// being left NULL.
// Returns 0 on failure, otherwise UCOL_TOK_SUCCESS possibly combined with
// UCOL_TOK_TOP, UCOL_TOK_VARIABLE_TOP or the [before] strength + 1.
// status is set to U_ILLEGAL_ARGUMENT_ERROR for an unknown option or
// argument, and to U_UNSUPPORTED_ERROR for recognized options without handling.
static
uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, UErrorCode *status) {
    const UChar* start = src->current;
    int32_t i = 0;
    int32_t j=0;
    const UChar *optionArg = NULL;
    uint8_t result = 0;

    start++; /*skip opening '['*/
    i = ucol_uprv_tok_readOption(start, src->end, &optionArg);
    if(optionArg) {
        src->current = optionArg;
    }

    if(i < 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        int32_t noOpenBraces = 1;
        switch(i) {
        case OPTION_ALTERNATE_HANDLING:
        case OPTION_FRENCH_COLLATION:
        case OPTION_CASE_LEVEL:
        case OPTION_CASE_FIRST:
        case OPTION_NORMALIZATION_MODE:
        case OPTION_HIRAGANA_QUATERNARY:
        case OPTION_STRENGTH:
        case OPTION_NUMERIC_COLLATION:
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        ucol_uprv_tok_setOptionInImage(src->opts, rulesOptions[i].attr, rulesOptions[i].subopts[j].attrVal);
                        result = UCOL_TOK_SUCCESS;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_VARIABLE_TOP:
            result = UCOL_TOK_SUCCESS | UCOL_TOK_VARIABLE_TOP;
            break;
        case OPTION_REARRANGE:
            result = UCOL_TOK_SUCCESS;
            break;
        case OPTION_BEFORE:
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        /* '+' binds tighter than '|': the flag carries strength + 1 */
                        result = (uint8_t)(UCOL_TOK_SUCCESS | (rulesOptions[i].subopts[j].attrVal + 1));
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_TOP: /* we are going to have an array with structures of limit CEs */
            /* index to this array will be src->parsedToken.indirectIndex*/
            src->parsedToken.indirectIndex = 0;
            result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
            break;
        case OPTION_FIRST:
        case OPTION_LAST: /* first, last */
            /* optionArg stays NULL when nothing follows the option name; */
            /* guard it like the other cases do instead of comparing through NULL */
            if(optionArg) {
                for(j = 0; j<rulesOptions[i].subSize; j++) {
                    if(u_strncmpNoCase(optionArg, rulesOptions[i].subopts[j].subName, rulesOptions[i].subopts[j].subLen) == 0) {
                        // the calculation below assumes that OPTION_FIRST and OPTION_LAST are at i and i+1 and that the first
                        // element of indirect boundaries is reserved for top.
                        src->parsedToken.indirectIndex = (uint16_t)(i-OPTION_FIRST+1+j*2);
                        result = UCOL_TOK_SUCCESS | UCOL_TOK_TOP;
                    }
                }
            }
            if(result == 0) {
                *status = U_ILLEGAL_ARGUMENT_ERROR;
            }
            break;
        case OPTION_OPTIMIZE:
        case OPTION_SUPPRESS_CONTRACTIONS: // copy and remove are handled before normalization
            // we need to move past the nested set here
            src->current++; // skip opening brace
            while(src->current < src->end && noOpenBraces != 0) {
                if(*src->current == 0x005b) {
                    noOpenBraces++;
                } else if(*src->current == 0x005D) { // closing brace
                    noOpenBraces--;
                }
                src->current++;
            }
            result = UCOL_TOK_SUCCESS;
            break;
        default:
            *status = U_UNSUPPORTED_ERROR;
            break;
        }
    }

    /* position on the closing ']' of the option; never leave a NULL current behind */
    const UChar *closing = u_memchr(src->current, 0x005d, (int32_t)(src->end-src->current));
    if(closing == NULL) {
        if(U_SUCCESS(*status)) {
            *status = U_ILLEGAL_ARGUMENT_ERROR;
        }
        closing = src->end;
    }
    src->current = closing;
    return result;
}
/**
 * Emits the marker sequence for an indirect reset into the extra buffer and
 * makes src->parsedToken refer to it.
 *
 * The sequence is 0xFFFE followed by the two 16-bit halves of the start CE of
 * the boundary selected by src->parsedToken.indirectIndex. When that boundary
 * has a non-zero continuation CE, its two halves follow as well
 * (charsLen 5 instead of 3).
 * NOTE(review): nothing here checks src->extraCurrent against src->extraEnd -
 * confirm the caller guarantees room for five characters.
 *
 * @return always TRUE
 */
inline UBool ucol_tok_doSetTop(UColTokenParser *src) {
    const indirectBoundaries *anchor = &ucolIndirectBoundaries[src->parsedToken.indirectIndex];
    UChar *out = src->extraCurrent;

    src->parsedToken.charsOffset = (uint32_t)(out - src->source);

    *out++ = 0xFFFE;
    *out++ = (UChar)(anchor->startCE >> 16);
    *out++ = (UChar)(anchor->startCE & 0xFFFF);
    if(anchor->startContCE != 0) {
        *out++ = (UChar)(anchor->startContCE >> 16);
        *out++ = (UChar)(anchor->startContCE & 0xFFFF);
        src->parsedToken.charsLen = 5;
    } else {
        src->parsedToken.charsLen = 3;
    }

    src->extraCurrent = out;
    return TRUE;
}
/**
 * Scans the rule text from src->current towards src->end and collects one
 * token into src->parsedToken (strength, chars, prefix, extension, flags).
 *
 * The first relation character seen ('=', ',', ';', '<', '&') sets the
 * strength of the token; the next relation character ends it (EndOfLoop).
 * Quoted text and, once a quote or '|' was seen, all following characters of
 * the token are copied to the extra buffer at src->extraCurrent.
 *
 * @param src          parser state; current and extraCurrent are advanced and
 *                     the source buffer may be reallocated (all pointers in
 *                     src are rebased in that case)
 * @param startOfRules when TRUE and the first thing seen is '=', ',', ';' or
 *                     '<', a reset on indirect index 5 is produced instead
 * @param parseError   filled in by syntaxError() on format errors
 * @param status       set to U_INVALID_FORMAT_ERROR on syntax errors and to
 *                     U_MEMORY_ALLOCATION_ERROR when the buffer cannot grow;
 *                     option errors come from ucol_uprv_tok_readAndSetOption()
 * @return src->current after the token, or NULL on error or when no
 *         relation character was found before src->end
 */
U_CAPI const UChar* U_EXPORT2
ucol_tok_parseNextToken(UColTokenParser *src,
UBool startOfRules,
UParseError *parseError,
UErrorCode *status) {
/* parsing part */
UBool variableTop = FALSE;
UBool top = FALSE;
UBool inChars = TRUE;       /* TRUE while reading the characters, FALSE after '/' (expansion) */
UBool inQuote = FALSE;
UBool wasInQuote = FALSE;   /* once TRUE, characters are copied to the extra buffer */
UChar *optionEnd = NULL;
uint8_t before = 0;
UBool isEscaped = FALSE;
// TODO: replace these variables with src->parsedToken counterparts
// no need to use them anymore since we have src->parsedToken.
// Ideally, token parser would be a nice class... Once, when I have
// more time (around 2020 probably).
uint32_t newExtensionLen = 0;
uint32_t extensionOffset = 0;
uint32_t newStrength = UCOL_TOK_UNSET;
src->parsedToken.charsOffset = 0; src->parsedToken.charsLen = 0;
src->parsedToken.prefixOffset = 0; src->parsedToken.prefixLen = 0;
src->parsedToken.indirectIndex = 0;
while (src->current < src->end) {
UChar ch = *(src->current);
if (inQuote) {
/* inside '...': everything up to the closing quote is literal text */
if (ch == 0x0027/*'\''*/) {
inQuote = FALSE;
} else {
if ((src->parsedToken.charsLen == 0) || inChars) {
if(src->parsedToken.charsLen == 0) {
src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
src->parsedToken.charsLen++;
} else {
if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
}
newExtensionLen++;
}
}
}else if(isEscaped){
/* character following a backslash: counted as a literal character */
isEscaped =FALSE;
if (newStrength == UCOL_TOK_UNSET) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
// enabling rules to start with non-tokens a < b
// newStrength = UCOL_TOK_RESET;
}
if(ch != 0x0000 && src->current != src->end) {
if (inChars) {
if(src->parsedToken.charsLen == 0) {
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
}
src->parsedToken.charsLen++;
} else {
if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->current - src->source);
}
newExtensionLen++;
}
}
}else {
if(!uprv_isRuleWhiteSpace(ch)) {
/* Sets the strength for this entry */
switch (ch) {
case 0x003D/*'='*/ :
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
src->parsedToken.indirectIndex = 5;
top = ucol_tok_doSetTop(src);
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_IDENTICAL;
break;
case 0x002C/*','*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
src->parsedToken.indirectIndex = 5;
top = ucol_tok_doSetTop(src);
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_TERTIARY;
break;
case 0x003B/*';'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
src->parsedToken.indirectIndex = 5;
top = ucol_tok_doSetTop(src);
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
newStrength = UCOL_SECONDARY;
break;
case 0x003C/*'<'*/:
if (newStrength != UCOL_TOK_UNSET) {
goto EndOfLoop;
}
/* if we start with strength, we'll reset to top */
if(startOfRules == TRUE) {
src->parsedToken.indirectIndex = 5;
top = ucol_tok_doSetTop(src);
newStrength = UCOL_TOK_RESET;
goto EndOfLoop;
}
/* before this, do a scan to verify whether this is */
/* another strength */
/* NOTE(review): the look-ahead reads src->current+1 without comparing it to src->end */
if(*(src->current+1) == 0x003C) {
src->current++;
if(*(src->current+1) == 0x003C) {
src->current++; /* three in a row! */
newStrength = UCOL_TERTIARY;
} else { /* two in a row */
newStrength = UCOL_SECONDARY;
}
} else { /* just one */
newStrength = UCOL_PRIMARY;
}
break;
case 0x0026/*'&'*/:
if (newStrength != UCOL_TOK_UNSET) {
/**/
goto EndOfLoop;
}
newStrength = UCOL_TOK_RESET; /* PatternEntry::RESET = 0 */
break;
case 0x005b/*'['*/:
/* options - read an option, analyze it */
/* NOTE(review): u_strchr is not bounded by src->end; it relies on NUL termination */
if((optionEnd = u_strchr(src->current, 0x005d /*']'*/)) != NULL) {
uint8_t result = ucol_uprv_tok_readAndSetOption(src, status);
//src->current = optionEnd;
if(U_SUCCESS(*status)) {
if(result & UCOL_TOK_TOP) {
/* [top], [first ...], [last ...]: only valid as the target of a reset */
if(newStrength == UCOL_TOK_RESET) {
top = ucol_tok_doSetTop(src);
if(before) { // This is a combination of before and indirection like '&[before 2][first regular]<b'
*src->extraCurrent++ = 0x002d;
*src->extraCurrent++ = before;
src->parsedToken.charsLen+=2;
}
src->current++;
goto EndOfLoop;
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
}
} else if(result & UCOL_TOK_VARIABLE_TOP) {
/* [variable top]: only valid after a relation, not after a reset; the token text is 0xFFFF */
if(newStrength != UCOL_TOK_RESET && newStrength != UCOL_TOK_UNSET) {
variableTop = TRUE;
src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
src->parsedToken.charsLen = 1;
*src->extraCurrent++ = 0xFFFF;
src->current++;
goto EndOfLoop;
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
}
} else if (result & UCOL_TOK_BEFORE){
/* [before n]: remembered and merged into the token flags at the end */
if(newStrength == UCOL_TOK_RESET) {
before = result & UCOL_TOK_BEFORE;
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
}
}
} else {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}
}
break;
case 0x0021/*! skip java thai modifier reordering*/:
break;
case 0x002F/*'/'*/:
wasInQuote = FALSE; /* if we were copying source characters, we want to stop now */
inChars = FALSE; /* we're now processing expansion */
break;
case 0x005C /* back slash for escaped chars */:
isEscaped = TRUE;
break;
/* found a quote, we're gonna start copying */
case 0x0027/*'\''*/:
if (newStrength == UCOL_TOK_UNSET) { /* quote is illegal until we have a strength */
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
// enabling rules to start with a non-token character a < b
// newStrength = UCOL_TOK_RESET;
}
inQuote = TRUE;
if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
/* move what was read so far into the extra buffer so the token stays contiguous */
if (src->parsedToken.charsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen*sizeof(UChar));
src->extraCurrent += src->parsedToken.charsLen;
}
src->parsedToken.charsLen++;
} else { /* we're doing an expansion */
if(wasInQuote == FALSE) {
extensionOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (newExtensionLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - newExtensionLen, newExtensionLen*sizeof(UChar));
src->extraCurrent += newExtensionLen;
}
newExtensionLen++;
}
wasInQuote = TRUE;
ch = *(++(src->current));
if(ch == 0x0027) { /* copy the double quote */
*src->extraCurrent++ = ch;
inQuote = FALSE;
}
break;
/* '@' is french only if the strength is not currently set */
/* if it is, it's just a regular character in collation rules */
case 0x0040/*'@'*/:
if (newStrength == UCOL_TOK_UNSET) {
src->opts->frenchCollation = UCOL_ON;
break;
}
/* NOTE(review): when a strength is set there is no break, so '@' falls through */
/* into the '|' prefix handling below - confirm this is intended */
case 0x007C /*|*/: /* this means we have actually been reading prefix part */
// we want to store read characters to the prefix part and continue reading
// the characters (proper way would be to restart reading the chars, but in
// that case we would have to complicate the token hasher, which I do not
// intend to play with. Instead, we will do prefixes when prefixes are due
// (before adding the elements).
src->parsedToken.prefixOffset = src->parsedToken.charsOffset;
src->parsedToken.prefixLen = src->parsedToken.charsLen;
if(inChars) { /* we're doing characters */
if(wasInQuote == FALSE) {
src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
}
if (src->parsedToken.charsLen != 0) {
uprv_memcpy(src->extraCurrent, src->current - src->parsedToken.charsLen, src->parsedToken.charsLen*sizeof(UChar));
src->extraCurrent += src->parsedToken.charsLen;
}
src->parsedToken.charsLen++;
}
wasInQuote = TRUE;
do {
ch = *(++(src->current));
// skip whitespace between '|' and the character
} while (uprv_isRuleWhiteSpace(ch));
break;
//charsOffset = 0;
//newCharsLen = 0;
//break; // We want to store the whole prefix/character sequence. If we break
// the '|' is going to get lost.
default:
/* ordinary character: it needs a strength in effect and must not be an unquoted special character */
if (newStrength == UCOL_TOK_UNSET) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}
if (ucol_tok_isSpecialChar(ch) && (inQuote == FALSE)) {
*status = U_INVALID_FORMAT_ERROR;
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
return NULL;
}
/* a NUL in the last position of the range is not counted as part of the token */
if(ch == 0x0000 && src->current+1 == src->end) {
break;
}
if (inChars) {
if(src->parsedToken.charsLen == 0) {
src->parsedToken.charsOffset = (uint32_t)(src->current - src->source);
}
src->parsedToken.charsLen++;
} else {
if(newExtensionLen == 0) {
extensionOffset = (uint32_t)(src->current - src->source);
}
newExtensionLen++;
}
break;
}
}
}
if(wasInQuote) {
/* copying mode: append the current character (except a quote) to the extra buffer */
if(ch != 0x27) {
*src->extraCurrent++ = ch;
}
/* NOTE(review): the capacity check runs after the write above and uses '>', */
/* so growth is only triggered once extraCurrent is already past extraEnd - confirm the buffer has slack */
if(src->extraCurrent > src->extraEnd) {
/* reallocate */
UChar *newSrc = (UChar *)uprv_realloc(src->source, (src->extraEnd-src->source)*2*sizeof(UChar));
if(newSrc != NULL) {
/* rebase every pointer into the old buffer onto the new one */
src->current = newSrc + (src->current - src->source);
src->extraCurrent = newSrc + (src->extraCurrent - src->source);
src->end = newSrc + (src->end - src->source);
src->extraEnd = newSrc + (src->extraEnd-src->source)*2;
src->sourceCurrent = newSrc + (src->sourceCurrent-src->source);
src->source = newSrc;
} else {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
}
}
src->current++;
}
EndOfLoop:
wasInQuote = FALSE;
/* nothing but whitespace/options before the end of the rules: no token, no error status */
if (newStrength == UCOL_TOK_UNSET) {
return NULL;
}
/* a relation or reset without any characters is an error unless it is an indirect reset */
if (src->parsedToken.charsLen == 0 && top == FALSE) {
syntaxError(src->source,(int32_t)(src->current-src->source),(int32_t)(src->end-src->source),parseError);
*status = U_INVALID_FORMAT_ERROR;
return NULL;
}
src->parsedToken.strength = newStrength;
src->parsedToken.extensionOffset = extensionOffset;
src->parsedToken.extensionLen = newExtensionLen;
src->parsedToken.flags = (UCOL_TOK_VARIABLE_TOP * (variableTop?1:0)) | (UCOL_TOK_TOP * (top?1:0)) | before;
return src->current;
}
/*
Processing Description
1 Build a ListList. Each list has a header, which contains two lists (positive
and negative), a reset token, a baseCE, nextCE, and previousCE. The lists and
reset may be null.
2 As you process, you keep a LAST pointer that points to the last token you
handled.
*/
/**
 * Creates the reset token for the token just parsed (src->parsedToken) and
 * starts a new list header for it in src->lh[src->resultLen].
 *
 * @param src        parser state; resultLen is incremented and the new token
 *                   is put into src->tailored
 * @param expand     when not NULL and the reset text is longer than one
 *                   character, the position at which the text is split: the
 *                   reset keeps the part before it, the rest is reported
 *                   through expandNext
 * @param expandNext receives the packed (length << 24 | offset) remainder of
 *                   the reset text, or 0; written only when expand is not NULL
 * @param parseError filled in when the reset carries a prefix
 * @param status     U_INVALID_FORMAT_ERROR when the reset has a prefix,
 *                   U_MEMORY_ALLOCATION_ERROR when memory runs out; also
 *                   passed to uhash_put()
 * @return the new reset token, or NULL on error
 */
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
                                      UParseError *parseError, UErrorCode *status) {
    /* A reset cannot have a prefix. Reject it before anything is allocated so */
    /* that the error path has nothing to release. */
    if(src->parsedToken.prefixOffset != 0) {
        // this is a syntax error
        *status = U_INVALID_FORMAT_ERROR;
        syntaxError(src->source,src->parsedToken.charsOffset-1,src->parsedToken.charsOffset+src->parsedToken.charsLen,parseError);
        return NULL;
    }

    if(src->resultLen == src->listCapacity) {
        // NOTE: tokens store the address of their list header (see listHeader
        // below), so moving the array invalidates the headers recorded in
        // earlier tokens. Growing here is kept as in the original code.
        UColTokListHeader *grown = (UColTokListHeader *)uprv_realloc(src->lh, src->listCapacity*2*sizeof(UColTokListHeader));
        if(grown == NULL) {
            /* src->lh and src->listCapacity still describe the old, valid array */
            *status = U_MEMORY_ALLOCATION_ERROR;
            return NULL;
        }
        src->lh = grown;
        src->listCapacity *= 2;
    }

    /* do the reset thing */
    UColToken *sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
    /* test for NULL */
    if (sourceToken == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    sourceToken->rulesToParse = src->source;
    /* text references are packed as (length << 24) | offset */
    sourceToken->source = (src->parsedToken.charsLen << 24) | src->parsedToken.charsOffset;
    sourceToken->expansion = (src->parsedToken.extensionLen << 24) | src->parsedToken.extensionOffset;

    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);
    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);

    // keep the flags around so that we know about before
    sourceToken->flags = src->parsedToken.flags;
    sourceToken->prefix = 0;

    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
    sourceToken->strength = UCOL_TOK_RESET;
    sourceToken->next = NULL;
    sourceToken->previous = NULL;
    sourceToken->noOfCEs = 0;
    sourceToken->noOfExpCEs = 0;
    sourceToken->listHeader = &src->lh[src->resultLen];

    src->lh[src->resultLen].first = NULL;
    src->lh[src->resultLen].last = NULL;
    src->lh[src->resultLen].reset = sourceToken;

    /*
      3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
        First convert all expansions into normal form. Examples:
          If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
          d * ... into &x * c/y * d * ...
          Note: reset values can never have expansions, although they can cause the
          very next item to have one. They may be contractions, if they are found
          earlier in the list.
    */
    if(expand != NULL) {
        /* check to see if there is an expansion */
        if(src->parsedToken.charsLen > 1) {
            uint32_t resetCharsOffset;
            resetCharsOffset = (uint32_t)(expand - src->source);
            sourceToken->source = ((resetCharsOffset - src->parsedToken.charsOffset ) << 24) | src->parsedToken.charsOffset;
            *expandNext = ((src->parsedToken.charsLen + src->parsedToken.charsOffset - resetCharsOffset)<<24) | (resetCharsOffset);
        } else {
            *expandNext = 0;
        }
    }

    src->resultLen++;

    uhash_put(src->tailored, sourceToken, sourceToken, status);

    return sourceToken;
}
/*
 * Produces the reset token for a "[before n]" reset whose anchor has to be
 * derived from the UCA.
 *
 * The first code unit of the anchor (taken from sourceToken when it is given,
 * otherwise from src->parsedToken) is looked up in the UCA. From its CEs the
 * CE(s) of the element in front of it are determined, stored as the base of
 * the list header slot src->lh[src->resultLen], and a reset token is created
 * for that slot with ucol_tok_initAReset().
 *
 * @param src         parser state
 * @param sourceToken token whose source is used as the anchor, or NULL to use src->parsedToken
 * @param strength    strength requested by the [before n] option
 * @param parseError  passed through to ucol_tok_initAReset()
 * @param status      nothing is done if it already indicates a failure
 * @return the reset token to use (an existing one may be returned for
 *         implicit anchors), or NULL if status indicated a failure on entry
 */
static
inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken, uint8_t strength, UParseError *parseError, UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return NULL;
    }

    /* this is a virgin before - we need to fish the anchor from the UCA */
    uint32_t anchorOffset;
    if(sourceToken == NULL) {
        anchorOffset = src->parsedToken.charsOffset;
    } else {
        anchorOffset = (sourceToken->source) & 0xFFFFFF;
    }

    collIterate iter;
    uprv_init_collIterate(src->UCA, src->source + anchorOffset, 1, &iter);

    uint32_t baseCE = ucol_getNextCE(src->UCA, &iter, status) & 0xFFFFFF3F;
    uint32_t baseContCE = ucol_getNextCE(src->UCA, &iter, status);
    if(baseContCE == UCOL_NO_MORE_CES) {
        baseContCE = 0;
    }

    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    const uint32_t leadByte = baseCE & 0xFF000000;
    uint32_t CE, SecondCE;
    uint32_t expandNext = 0;

    if(leadByte >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && leadByte <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24)) {
        /* implicits - step back by one raw implicit value and rebuild the CE pair from it */
        uint32_t primary = (baseCE & UCOL_PRIMARYMASK) | ((baseContCE & UCOL_PRIMARYMASK) >> 16);
        uint32_t raw = uprv_uca_getRawFromImplicit(primary);
        uint32_t ch = uprv_uca_getCodePointFromRaw(raw-1);
        uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);

        CE = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
        SecondCE = ((primaryCE << 16) & UCOL_PRIMARYMASK) | UCOL_CONTINUATION_MARKER;

        /* give the anchor a constructed name in the extra area: 0xFFFE followed by the code unit */
        src->parsedToken.charsOffset = (uint32_t)(src->extraCurrent - src->source);
        src->extraCurrent[0] = 0xFFFE;
        src->extraCurrent[1] = (UChar)ch;
        src->extraCurrent += 2;
        src->parsedToken.charsLen += 1;

        /* that constructed anchor may already have been registered by an earlier rule */
        UColToken key;
        key.source = (src->parsedToken.charsLen << 24) | src->parsedToken.charsOffset;
        key.rulesToParse = src->source;

        UColToken *existing = (UColToken *)uhash_get(src->tailored, &key);
        if(existing != NULL) {
            return existing;
        }
    } else {
        ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);

        // we got the previous CE. Now we need to see if the difference between
        // the two CEs is really of the requested strength.
        // if it's a bigger difference (we asked for secondary and got primary), we
        // need to modify the CE: our baseCE is adjusted downwards in order to get
        // the CE in the right position.
        if(ucol_getCEStrengthDifference(baseCE, baseContCE, CE, SecondCE) < strength) {
            // secondary weights are stepped by 0x0200, tertiary (the only other case) by 0x02
            const uint32_t step = (strength == UCOL_SECONDARY) ? 0x0200 : 0x02;
            CE = baseCE - step;
            if(baseContCE) {
                SecondCE = baseContCE - step;
            }
        }

        // An earlier approach fetched a code point for the previous CE from the
        // inverse table, in order to be able to merge situations like
        // &x < 9 &[before 1]a < d. It was abandoned because:
        //  1. There are many code points that have the same CE
        //  2. The CE to codepoint table is broken.
        // Also, when there is no equivalent strength before an element, one has to
        // be constructed. For example, &[before 2]a << x won't result in x << a,
        // because the element before a is a primary difference.
        //
        // here is how it should be. The situation such as &[before 1]a < x, should be
        // resolved exactly as if we wrote &a > x.
        // therefore, I don't really care if the UCA value before a has been changed.
        // However, I do care if the strength between my element and the previous element
        // is bigger then I wanted. So, if CE < baseCE and I wanted &[before 2], then i'll
        // have to construct the base CE.

        // grab before
        src->parsedToken.charsOffset -= 10;
        src->parsedToken.charsLen += 10;
    }

    /* prime the header slot the new reset is going to occupy */
    UColTokListHeader *header = &src->lh[src->resultLen];
    header->baseCE = CE & 0xFFFFFF3F;
    header->baseContCE = isContinuation(SecondCE) ? SecondCE : 0;
    header->nextCE = 0;
    header->nextContCE = 0;
    header->previousCE = 0;
    header->previousContCE = 0;
    header->indirect = FALSE;

    return ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
}
/*
 * Parses the rule string held by src, token by token, and builds the token
 * lists: one list header in src->lh per reset, each with a doubly linked
 * chain of the tokens that were ordered relative to that reset. All tokens
 * are also registered in the src->tailored hash table.
 *
 * The list headers are always accessed through src->lh, never through a
 * cached copy of that pointer, because ucol_tok_initAReset() may reallocate
 * the array.
 *
 * @param src        parser state set up by ucol_tok_initTokenList()
 * @param parseError filled in when a syntax error is reported
 * @param status     error code; on entry a failure makes the function return 0 immediately
 * @return the number of token lists (src->resultLen), or 0 when an error is
 *         detected while processing the rules
 */
uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseError, UErrorCode *status) {
    UColToken *lastToken = NULL;
    const UChar *parseEnd = NULL;
    uint32_t expandNext = 0;
    UBool variableTop = FALSE;
    UBool top = FALSE;
    uint16_t specs = 0;

    src->parsedToken.strength = UCOL_TOK_UNSET;

    if(U_FAILURE(*status)) {
        return 0;
    }

    while(src->current < src->end) {
        src->parsedToken.prefixOffset = 0;

        parseEnd = ucol_tok_parseNextToken(src,
                                           (UBool)(lastToken == NULL),
                                           parseError,
                                           status);

        specs = src->parsedToken.flags;

        variableTop = ((specs & UCOL_TOK_VARIABLE_TOP) != 0);
        top = ((specs & UCOL_TOK_TOP) != 0);

        if(U_SUCCESS(*status) && parseEnd != NULL) {
            UColToken *sourceToken = NULL;
            uint32_t lastStrength = UCOL_TOK_UNSET;

            if(lastToken != NULL ) {
                lastStrength = lastToken->strength;
            }

            /* the hash key is a token whose source is packed as (length << 24 | offset) */
            UColToken key;
            key.source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
            key.rulesToParse = src->source;

            /* 4 Lookup each source in the CharsToToken map, and find a sourceToken */
            sourceToken = (UColToken *)uhash_get(src->tailored, &key);

            if(src->parsedToken.strength != UCOL_TOK_RESET) {
                if(lastToken == NULL) { /* this means that rules haven't started properly */
                    *status = U_INVALID_FORMAT_ERROR;
                    syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                    return 0;
                }

                /* 6 Otherwise (when relation != reset) */
                if(sourceToken == NULL) {
                    /* If sourceToken is null, create new one, */
                    sourceToken = (UColToken *)uprv_malloc(sizeof(UColToken));
                    /* test for NULL */
                    if (sourceToken == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return 0;
                    }
                    sourceToken->rulesToParse = src->source;
                    sourceToken->source = src->parsedToken.charsLen << 24 | src->parsedToken.charsOffset;
                    sourceToken->debugSource = *(src->source + src->parsedToken.charsOffset);

                    sourceToken->prefix = src->parsedToken.prefixLen << 24 | src->parsedToken.prefixOffset;
                    sourceToken->debugPrefix = *(src->source + src->parsedToken.prefixOffset);

                    sourceToken->polarity = UCOL_TOK_POLARITY_POSITIVE; /* TODO: this should also handle reverse */
                    sourceToken->next = NULL;
                    sourceToken->previous = NULL;
                    sourceToken->noOfCEs = 0;
                    sourceToken->noOfExpCEs = 0;
                    // keep the flags around so that we know about before
                    sourceToken->flags = src->parsedToken.flags;
                    uhash_put(src->tailored, sourceToken, sourceToken, status);
                } else {
                    /* we could have fished out a reset here */
                    if(sourceToken->strength != UCOL_TOK_RESET && lastToken != sourceToken) {
                        /* otherwise remove sourceToken from where it was. */
                        if(sourceToken->next != NULL) {
                            if(sourceToken->next->strength > sourceToken->strength) {
                                sourceToken->next->strength = sourceToken->strength;
                            }
                            sourceToken->next->previous = sourceToken->previous;
                        } else {
                            sourceToken->listHeader->last = sourceToken->previous;
                        }

                        if(sourceToken->previous != NULL) {
                            sourceToken->previous->next = sourceToken->next;
                        } else {
                            sourceToken->listHeader->first = sourceToken->next;
                        }
                        sourceToken->next = NULL;
                        sourceToken->previous = NULL;
                    }
                }

                sourceToken->strength = src->parsedToken.strength;
                sourceToken->listHeader = lastToken->listHeader;

                /*
                1. Find the strongest strength in each list, and set strongestP and strongestN
                accordingly in the headers.
                */
                if(lastStrength == UCOL_TOK_RESET
                    || sourceToken->listHeader->first == 0) {
                    /* If LAST is a reset
                       insert sourceToken in the list. */
                    if(sourceToken->listHeader->first == 0) {
                        sourceToken->listHeader->first = sourceToken;
                        sourceToken->listHeader->last = sourceToken;
                    } else { /* we need to find a place for us */
                        /* and we'll get in front of the same strength */
                        if(sourceToken->listHeader->first->strength <= sourceToken->strength) {
                            sourceToken->next = sourceToken->listHeader->first;
                            sourceToken->next->previous = sourceToken;
                            sourceToken->listHeader->first = sourceToken;
                            sourceToken->previous = NULL;
                        } else {
                            lastToken = sourceToken->listHeader->first;
                            while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                lastToken = lastToken->next;
                            }
                            if(lastToken->next != NULL) {
                                lastToken->next->previous = sourceToken;
                            } else {
                                sourceToken->listHeader->last = sourceToken;
                            }
                            sourceToken->previous = lastToken;
                            sourceToken->next = lastToken->next;
                            lastToken->next = sourceToken;
                        }
                    }
                } else {
                    /* Otherwise (when LAST is not a reset)
                       if polarity (LAST) == polarity(relation), insert sourceToken after LAST,
                       otherwise insert before.
                       when inserting after or before, search to the next position with the same
                       strength in that direction. (This is called postpone insertion). */
                    if(sourceToken != lastToken) {
                        if(lastToken->polarity == sourceToken->polarity) {
                            while(lastToken->next != NULL && lastToken->next->strength > sourceToken->strength) {
                                lastToken = lastToken->next;
                            }
                            sourceToken->previous = lastToken;
                            if(lastToken->next != NULL) {
                                lastToken->next->previous = sourceToken;
                            } else {
                                sourceToken->listHeader->last = sourceToken;
                            }

                            sourceToken->next = lastToken->next;
                            lastToken->next = sourceToken;
                        } else {
                            while(lastToken->previous != NULL && lastToken->previous->strength > sourceToken->strength) {
                                lastToken = lastToken->previous;
                            }
                            sourceToken->next = lastToken;
                            if(lastToken->previous != NULL) {
                                lastToken->previous->next = sourceToken;
                            } else {
                                sourceToken->listHeader->first = sourceToken;
                            }
                            sourceToken->previous = lastToken->previous;
                            lastToken->previous = sourceToken;
                        }
                    } else { /* repeated one thing twice in rules, stay with the stronger strength */
                        if(lastStrength < sourceToken->strength) {
                            sourceToken->strength = lastStrength;
                        }
                    }
                }

                /* if the token was a variable top, we're gonna put it in */
                if(variableTop == TRUE && src->varTop == NULL) {
                    variableTop = FALSE;
                    src->varTop = sourceToken;
                }

                // Treat the expansions.
                // There are two types of expansions: explicit (x / y) and reset based propagating expansions
                // (&abc * d * e <=> &ab * d / c * e / c)
                // if both of them are in effect for a token, they are combined.

                sourceToken->expansion = src->parsedToken.extensionLen << 24 | src->parsedToken.extensionOffset;

                if(expandNext != 0) {
                    if(sourceToken->strength == UCOL_PRIMARY) { /* primary strength kills off the implicit expansion */
                        expandNext = 0;
                    } else if(sourceToken->expansion == 0) { /* if there is no expansion, implicit is just added to the token */
                        sourceToken->expansion = expandNext;
                    } else { /* there is both explicit and implicit expansion. We need to make a combination */
                        /* the combined string (implicit part first) is built in the extra area behind the rules */
                        uprv_memcpy(src->extraCurrent, src->source + (expandNext & 0xFFFFFF), (expandNext >> 24)*sizeof(UChar));
                        uprv_memcpy(src->extraCurrent+(expandNext >> 24), src->source + src->parsedToken.extensionOffset, src->parsedToken.extensionLen*sizeof(UChar));
                        sourceToken->expansion = (uint32_t)(((expandNext >> 24) + src->parsedToken.extensionLen)<<24 | (src->extraCurrent - src->source));
                        src->extraCurrent += (expandNext >> 24) + src->parsedToken.extensionLen;
                    }
                }

                // This is just for debugging purposes
                if(sourceToken->expansion != 0) {
                    sourceToken->debugExpansion = *(src->source + src->parsedToken.extensionOffset);
                } else {
                    sourceToken->debugExpansion = 0;
                }

                // if the previous token was a reset before, the strength of this
                // token must match the strength of before. Otherwise we have an
                // undefined situation.
                // In other words, we currently have a cludge which we use to
                // represent &a >> x. This is written as &[before 2]a << x.
                if((lastToken->flags & UCOL_TOK_BEFORE) != 0) {
                    uint8_t beforeStrength = (lastToken->flags & UCOL_TOK_BEFORE) - 1;
                    if(beforeStrength != sourceToken->strength) {
                        *status = U_INVALID_FORMAT_ERROR;
                        syntaxError(src->source,0,(int32_t)(src->end-src->source),parseError);
                        return 0;
                    }
                }
            } else {
                if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
                    /* if the previous token was also a reset, */
                    /*this means that we have two consecutive resets */
                    /* and we want to remove the previous one if empty*/
                    if(src->lh[src->resultLen-1].first == NULL) {
                        src->resultLen--;
                    }
                }

                if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
                    uint32_t searchCharsLen = src->parsedToken.charsLen;
                    while(searchCharsLen > 1 && sourceToken == NULL) {
                        searchCharsLen--;
                        UColToken shorterKey;
                        shorterKey.source = searchCharsLen << 24 | src->parsedToken.charsOffset;
                        shorterKey.rulesToParse = src->source;
                        sourceToken = (UColToken *)uhash_get(src->tailored, &shorterKey);
                    }
                    if(sourceToken != NULL) {
                        /* the unmatched tail of the reset becomes an expansion for the tokens that follow */
                        expandNext = (src->parsedToken.charsLen - searchCharsLen) << 24 | (src->parsedToken.charsOffset + searchCharsLen);
                    }
                }

                if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
                    if(top == FALSE) { /* there is no indirection */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
                            /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
                            while(sourceToken->strength > strength && sourceToken->previous != NULL) {
                                sourceToken = sourceToken->previous;
                            }
                            /* here, either we hit the strength or NULL */
                            if(sourceToken->strength == strength) {
                                if(sourceToken->previous != NULL) {
                                    sourceToken = sourceToken->previous;
                                } else { /* start of list */
                                    sourceToken = sourceToken->listHeader->reset;
                                }
                            } else { /* we hit NULL */
                                /* we should be doing the else part */
                                sourceToken = sourceToken->listHeader->reset;
                                sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                            }
                        } else {
                            sourceToken = getVirginBefore(src, sourceToken, strength, parseError, status);
                        }
                    } else { /* this is both before and indirection */
                        top = FALSE;
                        src->lh[src->resultLen].previousCE = 0;
                        src->lh[src->resultLen].previousContCE = 0;
                        src->lh[src->resultLen].indirect = TRUE;
                        /* we need to do slightly more work. we need to get the baseCE using the */
                        /* inverse UCA & getPrevious. The next bound is not set, and will be decided */
                        /* in ucol_bld */
                        uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
                        uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

                        UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
                        if((baseCE & 0xFF000000) >= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
                            uint32_t primary = baseCE & UCOL_PRIMARYMASK | (baseContCE & UCOL_PRIMARYMASK) >> 16;
                            uint32_t raw = uprv_uca_getRawFromImplicit(primary);
                            uint32_t primaryCE = uprv_uca_getImplicitFromRaw(raw-1);
                            CE = primaryCE & UCOL_PRIMARYMASK | 0x0505;
                            SecondCE = (primaryCE << 16) & UCOL_PRIMARYMASK | UCOL_CONTINUATION_MARKER;
                        } else {
                            ucol_inv_getPrevCE(src, baseCE, baseContCE, &CE, &SecondCE, strength);
                        }

                        src->lh[src->resultLen].baseCE = CE;
                        src->lh[src->resultLen].baseContCE = SecondCE;
                        src->lh[src->resultLen].nextCE = 0;
                        src->lh[src->resultLen].nextContCE = 0;

                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
                    }
                }

                /*  5 If the relation is a reset:
                      If sourceToken is null
                        Create new list, create new sourceToken, make the baseCE from source, put
                        the sourceToken in ListHeader of the new list */
                if(sourceToken == NULL) {
                    /*
                      3 Consider each item: relation, source, and expansion: e.g. ...< x / y ...
                        First convert all expansions into normal form. Examples:
                          If "xy" doesn't occur earlier in the list or in the UCA, convert &xy * c *
                          d * ... into &x * c/y * d * ...
                        Note: reset values can never have expansions, although they can cause the
                        very next item to have one. They may be contractions, if they are found
                        earlier in the list.
                    */
                    if(top == FALSE) {
                        collIterate s;
                        uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;

                        uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);

                        CE = ucol_getNextCE(src->UCA, &s, status);
                        /* iterator position after the first CE: where the reset is split from its implicit expansion */
                        UChar *expand = s.pos;
                        SecondCE = ucol_getNextCE(src->UCA, &s, status);

                        src->lh[src->resultLen].baseCE = CE & 0xFFFFFF3F;
                        if(isContinuation(SecondCE)) {
                            src->lh[src->resultLen].baseContCE = SecondCE;
                        } else {
                            src->lh[src->resultLen].baseContCE = 0;
                        }
                        src->lh[src->resultLen].nextCE = 0;
                        src->lh[src->resultLen].nextContCE = 0;
                        src->lh[src->resultLen].previousCE = 0;
                        src->lh[src->resultLen].previousContCE = 0;
                        src->lh[src->resultLen].indirect = FALSE;
                        sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
                    } else { /* top == TRUE */
                        /* just use the supplied values */
                        top = FALSE;
                        src->lh[src->resultLen].previousCE = 0;
                        src->lh[src->resultLen].previousContCE = 0;
                        src->lh[src->resultLen].indirect = TRUE;
                        src->lh[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
                        src->lh[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
                        src->lh[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
                        src->lh[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;

                        sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
                    }
                } else { /* reset to something already in rules */
                    top = FALSE;
                }
            }
            /*  7 After all this, set LAST to point to sourceToken, and goto step 3. */
            lastToken = sourceToken;
        } else {
            if(U_FAILURE(*status)) {
                return 0;
            }
        }
    }

    /* a trailing reset that nothing was attached to does not count as a list */
    if(src->resultLen > 0 && src->lh[src->resultLen-1].first == NULL) {
        src->resultLen--;
    }
    return src->resultLen;
}
/*
 * Prepares a token parser for the given tailoring rules.
 *
 * The parser structure is zeroed first, so that ucol_tok_closeTokenList() can
 * be called on it no matter where this function stops. Then:
 *  - the rules are scanned for '[' options that carry a set (optimize and
 *    suppress contractions); the sets are accumulated in src->copySet and
 *    src->removeSet,
 *  - the rules are NFD-normalized into a freshly allocated buffer (src->source)
 *    that has room behind the rules for strings constructed while parsing,
 *  - the hash table of tailored tokens, the option set (copied from the UCA)
 *    and the list header array are allocated,
 *  - the indirect boundary table is filled from the UCA constants.
 *
 * @param src         parser structure to initialize
 * @param rules       rule string
 * @param rulesLength number of UChars in rules
 * @param UCA         collator providing options, image and constants
 * @param status      error code; nothing is done if it already indicates a failure
 */
void ucol_tok_initTokenList(UColTokenParser *src, const UChar *rules, const uint32_t rulesLength, const UCollator *UCA, UErrorCode *status) {
    uint32_t nSize = 0;
    uint32_t estimatedSize = (2*rulesLength+UCOL_TOK_EXTRA_RULE_SPACE_SIZE);
    /* number of UChars actually allocated for src->source */
    uint32_t sourceCapacity = estimatedSize;
    if(U_FAILURE(*status)) {
        return;
    }

    // set everything to zero, so that we can clean up gracefully
    uprv_memset(src, 0, sizeof(UColTokenParser));

    // first we need to find options that don't like to be normalized,
    // like copy and remove...
    int32_t optionNumber = -1;
    const UChar *setStart;
    uint32_t i;
    for(i = 0; i < rulesLength; i++) {
        if(rules[i] != 0x005B) { /* only an open bracket can start an option */
            continue;
        }
        optionNumber = ucol_uprv_tok_readOption(rules+i+1, rules+rulesLength, &setStart);

        USet **target = NULL;
        if(optionNumber == OPTION_OPTIMIZE) { /* copy - parts of UCA to tailoring */
            target = &src->copySet;
        } else if(optionNumber == OPTION_SUPPRESS_CONTRACTIONS) {
            target = &src->removeSet;
        }

        if(target != NULL) {
            USet *newSet = ucol_uprv_tok_readAndSetUnicodeSet(setStart, rules+rulesLength, status);
            if(U_FAILURE(*status)) {
                return;
            }
            if(*target == NULL) {
                *target = newSet;
            } else {
                /* several options of the same kind are merged into one set */
                ((UnicodeSet *)*target)->addAll(*((UnicodeSet *)newSet));
                uset_close(newSet);
            }
        }
    }

    src->source = (UChar *)uprv_malloc(estimatedSize*sizeof(UChar));
    /* test for NULL */
    if (src->source == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->source, 0, estimatedSize*sizeof(UChar));
    nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, estimatedSize, status);
    if(nSize > estimatedSize || *status == U_BUFFER_OVERFLOW_ERROR) {
        *status = U_ZERO_ERROR;
        sourceCapacity = nSize+UCOL_TOK_EXTRA_RULE_SPACE_SIZE;
        /* grow through a temporary: on failure the old buffer stays in src->source
           and is released by ucol_tok_closeTokenList() instead of being leaked */
        UChar *grown = (UChar *)uprv_realloc(src->source, sourceCapacity*sizeof(UChar));
        /* test for NULL */
        if (grown == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        src->source = grown;
        nSize = unorm_normalize(rules, rulesLength, UNORM_NFD, 0, src->source, sourceCapacity, status);
    }
    src->current = src->source;
    src->end = src->source+nSize;
    src->sourceCurrent = src->source;
    src->extraCurrent = src->end+1; // Preserve terminating zero in the rule string so that option scanning works correctly
    // the extra area ends where the allocation ends, which is not estimatedSize
    // when the buffer had to be reallocated above
    src->extraEnd = src->source+sourceCapacity;
    src->varTop = NULL;
    src->UCA = UCA;
    src->invUCA = ucol_initInverseUCA(status);
    src->parsedToken.charsLen = 0;
    src->parsedToken.charsOffset = 0;
    src->parsedToken.extensionLen = 0;
    src->parsedToken.extensionOffset = 0;
    src->parsedToken.prefixLen = 0;
    src->parsedToken.prefixOffset = 0;
    src->parsedToken.flags = 0;
    src->parsedToken.strength = UCOL_TOK_UNSET;

    if(U_FAILURE(*status)) {
        return;
    }

    src->tailored = uhash_open(uhash_hashTokens, uhash_compareTokens, status);
    if(U_FAILURE(*status)) {
        return;
    }
    /* tokens are stored as both key and value; the table frees them as values */
    uhash_setValueDeleter(src->tailored, uhash_freeBlock);

    src->opts = (UColOptionSet *)uprv_malloc(sizeof(UColOptionSet));
    /* test for NULL */
    if (src->opts == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    uprv_memcpy(src->opts, UCA->options, sizeof(UColOptionSet));

    src->listCapacity = 1024;
    src->lh = (UColTokListHeader *)uprv_malloc(src->listCapacity*sizeof(UColTokListHeader));
    //Test for NULL
    if (src->lh == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(src->lh, 0, src->listCapacity*sizeof(UColTokListHeader));
    src->resultLen = 0;

    UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);

    // UCOL_RESET_TOP_VALUE
    setIndirectBoundaries(0, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_PRIMARY_IGNORABLE
    setIndirectBoundaries(1, consts->UCA_FIRST_PRIMARY_IGNORABLE, 0);
    // UCOL_LAST_PRIMARY_IGNORABLE
    setIndirectBoundaries(2, consts->UCA_LAST_PRIMARY_IGNORABLE, 0);
    // UCOL_FIRST_SECONDARY_IGNORABLE
    setIndirectBoundaries(3, consts->UCA_FIRST_SECONDARY_IGNORABLE, 0);
    // UCOL_LAST_SECONDARY_IGNORABLE
    setIndirectBoundaries(4, consts->UCA_LAST_SECONDARY_IGNORABLE, 0);
    // UCOL_FIRST_TERTIARY_IGNORABLE
    setIndirectBoundaries(5, consts->UCA_FIRST_TERTIARY_IGNORABLE, 0);
    // UCOL_LAST_TERTIARY_IGNORABLE
    setIndirectBoundaries(6, consts->UCA_LAST_TERTIARY_IGNORABLE, 0);
    // UCOL_FIRST_VARIABLE
    setIndirectBoundaries(7, consts->UCA_FIRST_VARIABLE, 0);
    // UCOL_LAST_VARIABLE
    setIndirectBoundaries(8, consts->UCA_LAST_VARIABLE, 0);
    // UCOL_FIRST_NON_VARIABLE
    setIndirectBoundaries(9, consts->UCA_FIRST_NON_VARIABLE, 0);
    // UCOL_LAST_NON_VARIABLE
    setIndirectBoundaries(10, consts->UCA_LAST_NON_VARIABLE, consts->UCA_FIRST_IMPLICIT);
    // UCOL_FIRST_IMPLICIT
    setIndirectBoundaries(11, consts->UCA_FIRST_IMPLICIT, 0);
    // UCOL_LAST_IMPLICIT
    setIndirectBoundaries(12, consts->UCA_LAST_IMPLICIT, consts->UCA_FIRST_TRAILING);
    // UCOL_FIRST_TRAILING
    setIndirectBoundaries(13, consts->UCA_FIRST_TRAILING, 0);
    // UCOL_LAST_TRAILING
    setIndirectBoundaries(14, consts->UCA_LAST_TRAILING, 0);
    ucolIndirectBoundaries[14].limitCE = (consts->UCA_PRIMARY_SPECIAL_MIN<<24);
}
/*
 * Releases everything a token parser owns: the copy and remove sets, the
 * hash table of tailored tokens, the list header array, the normalized rule
 * buffer and the option set.
 *
 * Each member is reset to NULL after it has been released, so calling this
 * function again on the same structure does nothing. A NULL src is ignored.
 *
 * @param src parser structure initialized by ucol_tok_initTokenList(); the
 *            structure itself is not freed
 */
void ucol_tok_closeTokenList(UColTokenParser *src) {
    if(src == NULL) {
        return;
    }
    if(src->copySet != NULL) {
        uset_close(src->copySet);
        src->copySet = NULL;
    }
    if(src->removeSet != NULL) {
        uset_close(src->removeSet);
        src->removeSet = NULL;
    }
    if(src->tailored != NULL) {
        uhash_close(src->tailored);
        src->tailored = NULL;
    }
    if(src->lh != NULL) {
        uprv_free(src->lh);
        src->lh = NULL;
    }
    if(src->source != NULL) {
        uprv_free(src->source);
        src->source = NULL;
    }
    if(src->opts != NULL) {
        uprv_free(src->opts);
        src->opts = NULL;
    }
}
#endif /* #if !UCONFIG_NO_COLLATION */