ICU-1770 more support for indirects & some more tests

X-SVN-Rev: 8245
This commit is contained in:
Vladimir Weinstein 2002-03-26 16:46:17 +00:00
parent 364b4d33ff
commit 98cdff5012
4 changed files with 105 additions and 41 deletions

View File

@ -80,11 +80,21 @@ int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) {
} }
} }
/* weiv: */
/* in searching for elements, I have removed the failure return. */
/* The reason for this is that the builder does not rely */
/* on the search mechanism telling it that it didn't find an */
/* element. However, indirect positioning relies on being */
/* able to find the elements around any CE, even if it is */
/* not defined in the UCA. */
return i;
/*
if((first == CE && second == SecondCE)) { if((first == CE && second == SecondCE)) {
return i; return i;
} else { } else {
return -1; return -1;
} }
*/
} }
static uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = { static uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
@ -141,7 +151,9 @@ U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE,
*prevContCE = contCE; *prevContCE = contCE;
while((*prevCE & strengthMask[strength]) == CE while((*prevCE & strengthMask[strength]) == CE
&& (*prevContCE & strengthMask[strength])== contCE) { && (*prevContCE & strengthMask[strength])== contCE
&& iCE > 0) { /* this condition should prevent falling off the edge of the world */
/* here, we end up in a singularity - zero */
*prevCE = (*(CETable+3*(--iCE))); *prevCE = (*(CETable+3*(--iCE)));
*prevContCE = (*(CETable+3*(iCE)+1)); *prevContCE = (*(CETable+3*(iCE)+1));
} }

View File

@ -43,7 +43,7 @@ static const UDataInfo ucaDataInfo={
0, 0,
{0x55, 0x43, 0x6f, 0x6c}, /* dataFormat="UCol" */ {0x55, 0x43, 0x6f, 0x6c}, /* dataFormat="UCol" */
{1, 1, 0, 0}, /* formatVersion */ {2, 0, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion = Unicode Version*/ {3, 0, 0, 0} /* dataVersion = Unicode Version*/
}; };
@ -58,7 +58,7 @@ static const UDataInfo invUcaDataInfo={
0, 0,
{0x49, 0x6E, 0x76, 0x43}, /* dataFormat="InvC" */ {0x49, 0x6E, 0x76, 0x43}, /* dataFormat="InvC" */
{1, 1, 0, 0}, /* formatVersion */ {2, 0, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion = Unicode Version*/ {3, 0, 0, 0} /* dataVersion = Unicode Version*/
}; };
@ -423,23 +423,26 @@ enum {
}; };
#define UCOL_RESET_TOP_VALUE 0x9F000303 #define UCOL_RESET_TOP_VALUE 0x9F000303
#define UCOL_NEXT_TOP_VALUE 0xE8960303
#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705 #define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05 #define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C5 #define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C0
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
#define UCOL_FIRST_SECONDARY_IGNORABLE 0x00000000 #define UCOL_FIRST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705 #define UCOL_LAST_SECONDARY_IGNORABLE 0x00000500
#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_FIRST_TERTIARY_IGNORABLE 0x00000000 #define UCOL_FIRST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00008705
#define UCOL_LAST_TERTIARY_IGNORABLE 0x00000000 #define UCOL_LAST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00008705
#define UCOL_FIRST_VARIABLE 0x05070505 #define UCOL_FIRST_VARIABLE 0x05070505
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
#define UCOL_LAST_VARIABLE 0x13CF0505 #define UCOL_LAST_VARIABLE 0x13CF0505
#define UCOL_FIRST_NON_VARIABLE 0x16200505
#define UCOL_LAST_NON_VARIABLE 0x767C0505
#define UCOL_NEXT_TOP_VALUE 0xE8960303
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000500
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
#define UCOL_NEXT_LAST_VARIABLE 0x16200505 #define UCOL_NEXT_LAST_VARIABLE 0x16200505
#define PRIMARY_IMPLICIT_MIN 0xE8000000 #define PRIMARY_IMPLICIT_MIN 0xE8000000

View File

@ -208,16 +208,43 @@ typedef struct {
uint32_t limitContCE; uint32_t limitContCE;
} indirectBoundaries; } indirectBoundaries;
/* these values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic */
/* values. It only works for resets and you cannot tailor indirect names. */
/* An indirect name can define either an anchor point or a range. An */
/* anchor point behaves in exactly the same way as a code point in reset */
/* would, except that it cannot be tailored. A range (we currently only */
/* know of the [top] range) will explicitly set the upper bound for */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed into the range without performance penalty. */
/* In that respect, we use [top] for tailoring of locales that use CJK */
/* characters. Other indirect values are currently a pure convenience, */
/* they can be used to assure that the CEs will be always positioned in */
/* the same place relative to a point with known properties (e.g. first */
/* primary ignorable). */
static indirectBoundaries ucolIndirectBoundaries[] = { static indirectBoundaries ucolIndirectBoundaries[] = {
{ UCOL_RESET_TOP_VALUE, 0, UCOL_NEXT_TOP_VALUE, 0 }, { UCOL_RESET_TOP_VALUE, 0,
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_FIRST_PRIMARY_IGNORABLE, 0 }, UCOL_NEXT_TOP_VALUE, 0 },
{ UCOL_LAST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_LAST_PRIMARY_IGNORABLE, 0 }, { UCOL_FIRST_PRIMARY_IGNORABLE, 0,
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_FIRST_SECONDARY_IGNORABLE, 0 }, 0, 0 },
{ UCOL_LAST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_LAST_SECONDARY_IGNORABLE, 0 }, { UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_FIRST_TERTIARY_IGNORABLE, 0 }, 0, 0 },
{ UCOL_LAST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_LAST_TERTIARY_IGNORABLE, 0 }, { UCOL_FIRST_SECONDARY_IGNORABLE, 0,
{ UCOL_FIRST_VARIABLE, 0, UCOL_NEXT_FIRST_VARIABLE, 0 }, 0, 0 },
{ UCOL_LAST_VARIABLE, 0, UCOL_NEXT_LAST_VARIABLE, 0 }, { UCOL_LAST_SECONDARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_VARIABLE, 0,
0, 0 },
{ UCOL_FIRST_NON_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_NON_VARIABLE, 0,
0, 0 },
}; };
#define UTOK_OPTION_COUNT 17 #define UTOK_OPTION_COUNT 17
@ -243,7 +270,7 @@ U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9); U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8); U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8); U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "ignorable", 9); U_STRING_DECL(suboption_15, "non-ignorable", 13);
U_STRING_DECL(option_00, "undefined", 9); U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9); U_STRING_DECL(option_01, "rearrange", 9);
@ -306,11 +333,12 @@ static const ucolTokSuboption strengthSub[5] = {
{suboption_10, 1, UCOL_IDENTICAL}, {suboption_10, 1, UCOL_IDENTICAL},
}; };
static const ucolTokSuboption firstLastSub[4] = { static const ucolTokSuboption firstLastSub[5] = {
{suboption_11, 7, UCOL_PRIMARY}, {suboption_11, 7, UCOL_PRIMARY},
{suboption_12, 9, UCOL_PRIMARY}, {suboption_12, 9, UCOL_PRIMARY},
{suboption_13, 8, UCOL_PRIMARY}, {suboption_13, 8, UCOL_PRIMARY},
{suboption_14, 8, UCOL_PRIMARY}, {suboption_14, 8, UCOL_PRIMARY},
{suboption_15, 13, UCOL_PRIMARY},
}; };
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = { static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
@ -325,8 +353,8 @@ static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
{option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */ {option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
{option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */ {option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
{option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */ {option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
{option_15, 5, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"first" */ {option_15, 5, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"first" */
{option_16, 4, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"last" */ {option_16, 4, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"last" */
{option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */ {option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
{option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */ {option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
{option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */ {option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
@ -378,7 +406,7 @@ uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, const UChar *end, U
U_STRING_INIT(suboption_12, "secondary", 9); U_STRING_INIT(suboption_12, "secondary", 9);
U_STRING_INIT(suboption_13, "tertiary", 8); U_STRING_INIT(suboption_13, "tertiary", 8);
U_STRING_INIT(suboption_14, "variable", 8); U_STRING_INIT(suboption_14, "variable", 8);
U_STRING_INIT(suboption_15, "ignorable", 9); U_STRING_INIT(suboption_15, "non-ignorable", 13);
U_STRING_INIT(option_00, "undefined", 9); U_STRING_INIT(option_00, "undefined", 9);
@ -1172,6 +1200,15 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
sourceToken->debugExpansion = 0; sourceToken->debugExpansion = 0;
} }
} else { } else {
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
/* if the previous token was also a reset, */
/*this means that we have two consecutive resets */
/* and we want to remove the previous one if empty*/
if(ListList[src->resultLen-1].first == NULL) {
src->resultLen--;
}
}
if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */ if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
uint32_t searchCharsLen = src->parsedToken.charsLen; uint32_t searchCharsLen = src->parsedToken.charsLen;
while(searchCharsLen > 1 && sourceToken == NULL) { while(searchCharsLen > 1 && sourceToken == NULL) {
@ -1187,7 +1224,7 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
} }
} }
if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */ if((specs & UCOL_TOK_BEFORE) != 0 && top == FALSE) { /* we're doing before & there is no indirection */
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1; uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) { if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
/* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */ /* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
@ -1214,15 +1251,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
} }
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
/* if the previous token was also a reset, */
/*this means that we have two consecutive resets */
/* and we want to remove the previous one if empty*/
if(ListList[src->resultLen-1].first == NULL) {
src->resultLen--;
}
}
/* 5 If the relation is a reset: /* 5 If the relation is a reset:
If sourceToken is null If sourceToken is null
Create new list, create new sourceToken, make the baseCE from source, put Create new list, create new sourceToken, make the baseCE from source, put
@ -1261,13 +1289,29 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status); sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
} else { /* top == TRUE */ } else { /* top == TRUE */
top = FALSE; top = FALSE;
ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
ListList[src->resultLen].previousCE = 0; ListList[src->resultLen].previousCE = 0;
ListList[src->resultLen].previousContCE = 0; ListList[src->resultLen].previousContCE = 0;
ListList[src->resultLen].indirect = TRUE; ListList[src->resultLen].indirect = TRUE;
if((specs & UCOL_TOK_BEFORE) == 0) { /* indirect without before */
/* just use the supplied values */
ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
} else { /* there was a before */
/* we need to do slightly more work. we need to get the baseCE using the */
/* inverse UCA & getPrevious. The next bound is not set, and will be decided */
/* in ucol_bld */
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);
ListList[src->resultLen].baseCE = CE;
ListList[src->resultLen].baseContCE = SecondCE;
ListList[src->resultLen].nextCE = 0;
ListList[src->resultLen].nextContCE = 0;
}
sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status); sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

View File

@ -3469,6 +3469,11 @@ static void TestRuleOptions() {
"&[last tertiary ignorable]<<<w" "&[last tertiary ignorable]<<<w"
"&[top]<u", "&[top]<u",
{"\\ufffb", "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7 }, {"\\ufffb", "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7 },
{ "&[before 1][first tertiary ignorable]<<<k",
{ "\\u0000", "k"}, 2}, /* you cannot go before first tertiary ignorable */
/* - all befores here amount to zero */
{ "&[before 3][last primary ignorable]<<<k",
{ "k", "\\u20e3"}, 2},
}; };
uint32_t i; uint32_t i;