ICU-1770 more support for indirects & some more tests
X-SVN-Rev: 8245
This commit is contained in:
parent
364b4d33ff
commit
98cdff5012
@ -80,11 +80,21 @@ int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) {
|
||||
}
|
||||
}
|
||||
|
||||
/* weiv: */
|
||||
/* in searching for elements, I have removed the failure (-1) result. */
|
||||
/* The reason for this is that the builder does not rely */
|
||||
/* on search mechanism telling it that it didn't find an */
|
||||
/* element. However, indirect positioning relies on being */
|
||||
/* able to find the elements around any CE, even if it is */
|
||||
/* not defined in the UCA. */
|
||||
return i;
|
||||
/*
|
||||
if((first == CE && second == SecondCE)) {
|
||||
return i;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
static uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
|
||||
@ -141,7 +151,9 @@ U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE,
|
||||
*prevContCE = contCE;
|
||||
|
||||
while((*prevCE & strengthMask[strength]) == CE
|
||||
&& (*prevContCE & strengthMask[strength])== contCE) {
|
||||
&& (*prevContCE & strengthMask[strength])== contCE
|
||||
&& iCE > 0) { /* this condition should prevent falling off the edge of the world */
|
||||
/* here, we end up in a singularity - zero */
|
||||
*prevCE = (*(CETable+3*(--iCE)));
|
||||
*prevContCE = (*(CETable+3*(iCE)+1));
|
||||
}
|
||||
|
@ -43,7 +43,7 @@ static const UDataInfo ucaDataInfo={
|
||||
0,
|
||||
|
||||
{0x55, 0x43, 0x6f, 0x6c}, /* dataFormat="UCol" */
|
||||
{1, 1, 0, 0}, /* formatVersion */
|
||||
{2, 0, 0, 0}, /* formatVersion */
|
||||
{3, 0, 0, 0} /* dataVersion = Unicode Version*/
|
||||
};
|
||||
|
||||
@ -58,7 +58,7 @@ static const UDataInfo invUcaDataInfo={
|
||||
0,
|
||||
|
||||
{0x49, 0x6E, 0x76, 0x43}, /* dataFormat="InvC" */
|
||||
{1, 1, 0, 0}, /* formatVersion */
|
||||
{2, 0, 0, 0}, /* formatVersion */
|
||||
{3, 0, 0, 0} /* dataVersion = Unicode Version*/
|
||||
};
|
||||
|
||||
@ -423,23 +423,26 @@ enum {
|
||||
};
|
||||
|
||||
#define UCOL_RESET_TOP_VALUE 0x9F000303
|
||||
#define UCOL_NEXT_TOP_VALUE 0xE8960303
|
||||
#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705
|
||||
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
|
||||
#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05
|
||||
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C5
|
||||
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
|
||||
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C0
|
||||
#define UCOL_FIRST_SECONDARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
|
||||
#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000000
|
||||
#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000500
|
||||
#define UCOL_FIRST_TERTIARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00008705
|
||||
#define UCOL_LAST_TERTIARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00008705
|
||||
#define UCOL_FIRST_VARIABLE 0x05070505
|
||||
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
|
||||
#define UCOL_LAST_VARIABLE 0x13CF0505
|
||||
#define UCOL_FIRST_NON_VARIABLE 0x16200505
|
||||
#define UCOL_LAST_NON_VARIABLE 0x767C0505
|
||||
|
||||
#define UCOL_NEXT_TOP_VALUE 0xE8960303
|
||||
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
|
||||
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
|
||||
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
|
||||
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000500
|
||||
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00000000
|
||||
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
|
||||
#define UCOL_NEXT_LAST_VARIABLE 0x16200505
|
||||
|
||||
#define PRIMARY_IMPLICIT_MIN 0xE8000000
|
||||
|
@ -208,16 +208,43 @@ typedef struct {
|
||||
uint32_t limitContCE;
|
||||
} indirectBoundaries;
|
||||
|
||||
/* these values are used for finding CE values for indirect positioning. */
|
||||
/* Indirect positioning is a mechanism for allowing resets on symbolic */
|
||||
/* values. It only works for resets and you cannot tailor indirect names. */
|
||||
/* An indirect name can define either an anchor point or a range. An */
|
||||
/* anchor point behaves in exactly the same way as a code point in reset */
|
||||
/* would, except that it cannot be tailored. A range (we currently only */
|
||||
/* know of the [top] range) will explicitly set the upper bound for */
|
||||
/* generated CEs, thus allowing for better control over how many CEs can */
|
||||
/* be squeezed into the range without a performance penalty. */
|
||||
/* In that respect, we use [top] for tailoring of locales that use CJK */
|
||||
/* characters. Other indirect values are currently a pure convenience, */
|
||||
/* they can be used to ensure that the CEs will always be positioned in */
|
||||
/* the same place relative to a point with known properties (e.g. first */
|
||||
/* primary ignorable). */
|
||||
static indirectBoundaries ucolIndirectBoundaries[] = {
|
||||
{ UCOL_RESET_TOP_VALUE, 0, UCOL_NEXT_TOP_VALUE, 0 },
|
||||
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_FIRST_PRIMARY_IGNORABLE, 0 },
|
||||
{ UCOL_LAST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_LAST_PRIMARY_IGNORABLE, 0 },
|
||||
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_FIRST_SECONDARY_IGNORABLE, 0 },
|
||||
{ UCOL_LAST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_LAST_SECONDARY_IGNORABLE, 0 },
|
||||
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_FIRST_TERTIARY_IGNORABLE, 0 },
|
||||
{ UCOL_LAST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_LAST_TERTIARY_IGNORABLE, 0 },
|
||||
{ UCOL_FIRST_VARIABLE, 0, UCOL_NEXT_FIRST_VARIABLE, 0 },
|
||||
{ UCOL_LAST_VARIABLE, 0, UCOL_NEXT_LAST_VARIABLE, 0 },
|
||||
{ UCOL_RESET_TOP_VALUE, 0,
|
||||
UCOL_NEXT_TOP_VALUE, 0 },
|
||||
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
|
||||
0, 0 },
|
||||
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_FIRST_VARIABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_LAST_VARIABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_FIRST_NON_VARIABLE, 0,
|
||||
0, 0 },
|
||||
{ UCOL_LAST_NON_VARIABLE, 0,
|
||||
0, 0 },
|
||||
};
|
||||
|
||||
#define UTOK_OPTION_COUNT 17
|
||||
@ -243,7 +270,7 @@ U_STRING_DECL(suboption_11, "primary", 7);
|
||||
U_STRING_DECL(suboption_12, "secondary", 9);
|
||||
U_STRING_DECL(suboption_13, "tertiary", 8);
|
||||
U_STRING_DECL(suboption_14, "variable", 8);
|
||||
U_STRING_DECL(suboption_15, "ignorable", 9);
|
||||
U_STRING_DECL(suboption_15, "non-ignorable", 13);
|
||||
|
||||
U_STRING_DECL(option_00, "undefined", 9);
|
||||
U_STRING_DECL(option_01, "rearrange", 9);
|
||||
@ -306,11 +333,12 @@ static const ucolTokSuboption strengthSub[5] = {
|
||||
{suboption_10, 1, UCOL_IDENTICAL},
|
||||
};
|
||||
|
||||
static const ucolTokSuboption firstLastSub[4] = {
|
||||
static const ucolTokSuboption firstLastSub[5] = {
|
||||
{suboption_11, 7, UCOL_PRIMARY},
|
||||
{suboption_12, 9, UCOL_PRIMARY},
|
||||
{suboption_13, 8, UCOL_PRIMARY},
|
||||
{suboption_14, 8, UCOL_PRIMARY},
|
||||
{suboption_15, 13, UCOL_PRIMARY},
|
||||
};
|
||||
|
||||
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
|
||||
@ -325,8 +353,8 @@ static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
|
||||
{option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
|
||||
{option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
|
||||
{option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
|
||||
{option_15, 5, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"first" */
|
||||
{option_16, 4, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"last" */
|
||||
{option_15, 5, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"first" */
|
||||
{option_16, 4, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"last" */
|
||||
{option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
|
||||
{option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
|
||||
{option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
|
||||
@ -378,7 +406,7 @@ uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, const UChar *end, U
|
||||
U_STRING_INIT(suboption_12, "secondary", 9);
|
||||
U_STRING_INIT(suboption_13, "tertiary", 8);
|
||||
U_STRING_INIT(suboption_14, "variable", 8);
|
||||
U_STRING_INIT(suboption_15, "ignorable", 9);
|
||||
U_STRING_INIT(suboption_15, "non-ignorable", 13);
|
||||
|
||||
|
||||
U_STRING_INIT(option_00, "undefined", 9);
|
||||
@ -1172,6 +1200,15 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
|
||||
sourceToken->debugExpansion = 0;
|
||||
}
|
||||
} else {
|
||||
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
|
||||
/* if the previous token was also a reset, */
|
||||
/*this means that we have two consecutive resets */
|
||||
/* and we want to remove the previous one if empty*/
|
||||
if(ListList[src->resultLen-1].first == NULL) {
|
||||
src->resultLen--;
|
||||
}
|
||||
}
|
||||
|
||||
if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
|
||||
uint32_t searchCharsLen = src->parsedToken.charsLen;
|
||||
while(searchCharsLen > 1 && sourceToken == NULL) {
|
||||
@ -1187,7 +1224,7 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
|
||||
}
|
||||
}
|
||||
|
||||
if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
|
||||
if((specs & UCOL_TOK_BEFORE) != 0 && top == FALSE) { /* we're doing before & there is no indirection */
|
||||
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
|
||||
if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
|
||||
/* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
|
||||
@ -1214,15 +1251,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
|
||||
}
|
||||
|
||||
|
||||
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
|
||||
/* if the previous token was also a reset, */
|
||||
/*this means that we have two consecutive resets */
|
||||
/* and we want to remove the previous one if empty*/
|
||||
if(ListList[src->resultLen-1].first == NULL) {
|
||||
src->resultLen--;
|
||||
}
|
||||
}
|
||||
|
||||
/* 5 If the relation is a reset:
|
||||
If sourceToken is null
|
||||
Create new list, create new sourceToken, make the baseCE from source, put
|
||||
@ -1261,13 +1289,29 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
|
||||
sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
|
||||
} else { /* top == TRUE */
|
||||
top = FALSE;
|
||||
ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
|
||||
ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
|
||||
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
|
||||
ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
|
||||
ListList[src->resultLen].previousCE = 0;
|
||||
ListList[src->resultLen].previousContCE = 0;
|
||||
ListList[src->resultLen].indirect = TRUE;
|
||||
if((specs & UCOL_TOK_BEFORE) == 0) { /* indirect without before */
|
||||
/* just use the supplied values */
|
||||
ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
|
||||
ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
|
||||
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
|
||||
ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
|
||||
} else { /* there was a before */
|
||||
/* we need to do slightly more work. we need to get the baseCE using the */
|
||||
/* inverse UCA & getPrevious. The next bound is not set, and will be decided */
|
||||
/* in ucol_bld */
|
||||
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
|
||||
uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
|
||||
uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
|
||||
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
|
||||
int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);
|
||||
ListList[src->resultLen].baseCE = CE;
|
||||
ListList[src->resultLen].baseContCE = SecondCE;
|
||||
ListList[src->resultLen].nextCE = 0;
|
||||
ListList[src->resultLen].nextContCE = 0;
|
||||
}
|
||||
|
||||
sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);
|
||||
|
||||
|
@ -3469,6 +3469,11 @@ static void TestRuleOptions() {
|
||||
"&[last tertiary ignorable]<<<w"
|
||||
"&[top]<u",
|
||||
{"\\ufffb", "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7 },
|
||||
{ "&[before 1][first tertiary ignorable]<<<k",
|
||||
{ "\\u0000", "k"}, 2}, /* you cannot go before first tertiary ignorable */
|
||||
/* - all befores here amount to zero */
|
||||
{ "&[before 3][last primary ignorable]<<<k",
|
||||
{ "k", "\\u20e3"}, 2},
|
||||
};
|
||||
uint32_t i;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user