ICU-1770 more support for indirects & some more tests

X-SVN-Rev: 8245
This commit is contained in:
Vladimir Weinstein 2002-03-26 16:46:17 +00:00
parent 364b4d33ff
commit 98cdff5012
4 changed files with 105 additions and 41 deletions

View File

@ -80,11 +80,21 @@ int32_t ucol_inv_findCE(uint32_t CE, uint32_t SecondCE) {
}
}
/* weiv: */
/* The search no longer reports failure (-1) when the exact */
/* CE pair is not found. The reason for this is that the */
/* builder does not rely on the search mechanism telling it */
/* that it didn't find an element. However, indirect */
/* positioning relies on being able to find the elements */
/* around any CE, even if it is not defined in the UCA. */
return i;
/*
if((first == CE && second == SecondCE)) {
return i;
} else {
return -1;
}
*/
}
static uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
@ -141,7 +151,9 @@ U_CAPI int32_t U_EXPORT2 ucol_inv_getPrevCE(uint32_t CE, uint32_t contCE,
*prevContCE = contCE;
while((*prevCE & strengthMask[strength]) == CE
&& (*prevContCE & strengthMask[strength])== contCE) {
&& (*prevContCE & strengthMask[strength])== contCE
&& iCE > 0) { /* this condition should prevent falling off the edge of the world */
/* here, we end up in a singularity - zero */
*prevCE = (*(CETable+3*(--iCE)));
*prevContCE = (*(CETable+3*(iCE)+1));
}

View File

@ -43,7 +43,7 @@ static const UDataInfo ucaDataInfo={
0,
{0x55, 0x43, 0x6f, 0x6c}, /* dataFormat="UCol" */
{1, 1, 0, 0}, /* formatVersion */
{2, 0, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion = Unicode Version*/
};
@ -58,7 +58,7 @@ static const UDataInfo invUcaDataInfo={
0,
{0x49, 0x6E, 0x76, 0x43}, /* dataFormat="InvC" */
{1, 1, 0, 0}, /* formatVersion */
{2, 0, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion = Unicode Version*/
};
@ -423,23 +423,26 @@ enum {
};
#define UCOL_RESET_TOP_VALUE 0x9F000303
#define UCOL_NEXT_TOP_VALUE 0xE8960303
#define UCOL_FIRST_PRIMARY_IGNORABLE 0x00008705
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
#define UCOL_LAST_PRIMARY_IGNORABLE 0x0000DD05
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C5
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
#define UCOL_LAST_PRIMARY_IGNORABLE_CONT 0x0000C1C0
#define UCOL_FIRST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000000
#define UCOL_LAST_SECONDARY_IGNORABLE 0x00000500
#define UCOL_FIRST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00008705
#define UCOL_LAST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00008705
#define UCOL_FIRST_VARIABLE 0x05070505
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
#define UCOL_LAST_VARIABLE 0x13CF0505
#define UCOL_FIRST_NON_VARIABLE 0x16200505
#define UCOL_LAST_NON_VARIABLE 0x767C0505
#define UCOL_NEXT_TOP_VALUE 0xE8960303
#define UCOL_NEXT_FIRST_PRIMARY_IGNORABLE 0x00008905
#define UCOL_NEXT_LAST_PRIMARY_IGNORABLE 0x03000303
#define UCOL_NEXT_FIRST_SECONDARY_IGNORABLE 0x00008705
#define UCOL_NEXT_LAST_SECONDARY_IGNORABLE 0x00000500
#define UCOL_NEXT_FIRST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_LAST_TERTIARY_IGNORABLE 0x00000000
#define UCOL_NEXT_FIRST_VARIABLE 0x05090505
#define UCOL_NEXT_LAST_VARIABLE 0x16200505
#define PRIMARY_IMPLICIT_MIN 0xE8000000

View File

@ -208,16 +208,43 @@ typedef struct {
uint32_t limitContCE;
} indirectBoundaries;
/* These values are used for finding CE values for indirect positioning. */
/* Indirect positioning is a mechanism for allowing resets on symbolic */
/* values. It only works for resets, and you cannot tailor indirect names. */
/* An indirect name can define either an anchor point or a range. An */
/* anchor point behaves in exactly the same way as a code point in a reset */
/* would, except that it cannot be tailored. A range (we currently only */
/* know of the [top] range) will explicitly set the upper bound for */
/* generated CEs, thus allowing for better control over how many CEs can */
/* be squeezed into the range without performance penalty. */
/* In that respect, we use [top] for tailoring of locales that use CJK */
/* characters. Other indirect values are currently a pure convenience; */
/* they can be used to ensure that the CEs will always be positioned in */
/* the same place relative to a point with known properties (e.g. first */
/* primary ignorable). */
static indirectBoundaries ucolIndirectBoundaries[] = {
{ UCOL_RESET_TOP_VALUE, 0, UCOL_NEXT_TOP_VALUE, 0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_FIRST_PRIMARY_IGNORABLE, 0 },
{ UCOL_LAST_PRIMARY_IGNORABLE, 0, UCOL_NEXT_LAST_PRIMARY_IGNORABLE, 0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_FIRST_SECONDARY_IGNORABLE, 0 },
{ UCOL_LAST_SECONDARY_IGNORABLE, 0, UCOL_NEXT_LAST_SECONDARY_IGNORABLE, 0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_FIRST_TERTIARY_IGNORABLE, 0 },
{ UCOL_LAST_TERTIARY_IGNORABLE, 0, UCOL_NEXT_LAST_TERTIARY_IGNORABLE, 0 },
{ UCOL_FIRST_VARIABLE, 0, UCOL_NEXT_FIRST_VARIABLE, 0 },
{ UCOL_LAST_VARIABLE, 0, UCOL_NEXT_LAST_VARIABLE, 0 },
{ UCOL_RESET_TOP_VALUE, 0,
UCOL_NEXT_TOP_VALUE, 0 },
{ UCOL_FIRST_PRIMARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_PRIMARY_IGNORABLE, UCOL_LAST_PRIMARY_IGNORABLE_CONT,
0, 0 },
{ UCOL_FIRST_SECONDARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_SECONDARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_LAST_TERTIARY_IGNORABLE, 0,
0, 0 },
{ UCOL_FIRST_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_VARIABLE, 0,
0, 0 },
{ UCOL_FIRST_NON_VARIABLE, 0,
0, 0 },
{ UCOL_LAST_NON_VARIABLE, 0,
0, 0 },
};
#define UTOK_OPTION_COUNT 17
@ -243,7 +270,7 @@ U_STRING_DECL(suboption_11, "primary", 7);
U_STRING_DECL(suboption_12, "secondary", 9);
U_STRING_DECL(suboption_13, "tertiary", 8);
U_STRING_DECL(suboption_14, "variable", 8);
U_STRING_DECL(suboption_15, "ignorable", 9);
U_STRING_DECL(suboption_15, "non-ignorable", 13);
U_STRING_DECL(option_00, "undefined", 9);
U_STRING_DECL(option_01, "rearrange", 9);
@ -306,11 +333,12 @@ static const ucolTokSuboption strengthSub[5] = {
{suboption_10, 1, UCOL_IDENTICAL},
};
static const ucolTokSuboption firstLastSub[4] = {
static const ucolTokSuboption firstLastSub[5] = {
{suboption_11, 7, UCOL_PRIMARY},
{suboption_12, 9, UCOL_PRIMARY},
{suboption_13, 8, UCOL_PRIMARY},
{suboption_14, 8, UCOL_PRIMARY},
{suboption_15, 13, UCOL_PRIMARY},
};
static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
@ -325,8 +353,8 @@ static const ucolTokOption rulesOptions[UTOK_OPTION_COUNT] = {
{option_01, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"rearrange" */
{option_12, 6, beforeSub, 3, UCOL_ATTRIBUTE_COUNT}, /*"before" */
{option_05, 3, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"top" */
{option_15, 5, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"first" */
{option_16, 4, firstLastSub, 4, UCOL_ATTRIBUTE_COUNT}, /*"last" */
{option_15, 5, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"first" */
{option_16, 4, firstLastSub, 5, UCOL_ATTRIBUTE_COUNT}, /*"last" */
{option_00, 9, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"undefined" */
{option_09, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"scriptOrder" */
{option_10, 11, NULL, 0, UCOL_ATTRIBUTE_COUNT}, /*"charsetname" */
@ -378,7 +406,7 @@ uint8_t ucol_uprv_tok_readAndSetOption(UColTokenParser *src, const UChar *end, U
U_STRING_INIT(suboption_12, "secondary", 9);
U_STRING_INIT(suboption_13, "tertiary", 8);
U_STRING_INIT(suboption_14, "variable", 8);
U_STRING_INIT(suboption_15, "ignorable", 9);
U_STRING_INIT(suboption_15, "non-ignorable", 13);
U_STRING_INIT(option_00, "undefined", 9);
@ -1172,6 +1200,15 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
sourceToken->debugExpansion = 0;
}
} else {
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
/* if the previous token was also a reset, */
/*this means that we have two consecutive resets */
/* and we want to remove the previous one if empty*/
if(ListList[src->resultLen-1].first == NULL) {
src->resultLen--;
}
}
if(sourceToken == NULL) { /* this is a reset, but it might still be somewhere in the tailoring, in shorter form */
uint32_t searchCharsLen = src->parsedToken.charsLen;
while(searchCharsLen > 1 && sourceToken == NULL) {
@ -1187,7 +1224,7 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
}
}
if((specs & UCOL_TOK_BEFORE) != 0) { /* we're doing before */
if((specs & UCOL_TOK_BEFORE) != 0 && top == FALSE) { /* we're doing before & there is no indirection */
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
if(sourceToken != NULL && sourceToken->strength != UCOL_TOK_RESET) {
/* this is a before that is already ordered in the UCA - so we need to get the previous with good strength */
@ -1214,15 +1251,6 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
}
if(lastToken != NULL && lastStrength == UCOL_TOK_RESET) {
/* if the previous token was also a reset, */
/*this means that we have two consecutive resets */
/* and we want to remove the previous one if empty*/
if(ListList[src->resultLen-1].first == NULL) {
src->resultLen--;
}
}
/* 5 If the relation is a reset:
If sourceToken is null
Create new list, create new sourceToken, make the baseCE from source, put
@ -1261,13 +1289,29 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
sourceToken = ucol_tok_initAReset(src, expand, &expandNext, parseError, status);
} else { /* top == TRUE */
top = FALSE;
ListList[src->resultLen].previousCE = 0;
ListList[src->resultLen].previousContCE = 0;
ListList[src->resultLen].indirect = TRUE;
if((specs & UCOL_TOK_BEFORE) == 0) { /* indirect without before */
/* just use the supplied values */
ListList[src->resultLen].baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
ListList[src->resultLen].baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;
ListList[src->resultLen].nextCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitCE;
ListList[src->resultLen].nextContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].limitContCE;
ListList[src->resultLen].previousCE = 0;
ListList[src->resultLen].previousContCE = 0;
ListList[src->resultLen].indirect = TRUE;
} else { /* there was a before */
/* we need to do slightly more work. we need to get the baseCE using the */
/* inverse UCA & getPrevious. The next bound is not set, and will be decided */
/* in ucol_bld */
uint8_t strength = (specs & UCOL_TOK_BEFORE) - 1;
uint32_t baseCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startCE;
uint32_t baseContCE = ucolIndirectBoundaries[src->parsedToken.indirectIndex].startContCE;//&0xFFFFFF3F;
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
int32_t invPos = ucol_inv_getPrevCE(baseCE, baseContCE, &CE, &SecondCE, strength);
ListList[src->resultLen].baseCE = CE;
ListList[src->resultLen].baseContCE = SecondCE;
ListList[src->resultLen].nextCE = 0;
ListList[src->resultLen].nextContCE = 0;
}
sourceToken = ucol_tok_initAReset(src, 0, &expandNext, parseError, status);

View File

@ -3469,6 +3469,11 @@ static void TestRuleOptions() {
"&[last tertiary ignorable]<<<w"
"&[top]<u",
{"\\ufffb", "w", "y", "\\u20e3", "x", "\\u137c", "z", "u"}, 7 },
{ "&[before 1][first tertiary ignorable]<<<k",
{ "\\u0000", "k"}, 2}, /* you cannot go before first tertiary ignorable */
/* - all befores here amount to zero */
{ "&[before 3][last primary ignorable]<<<k",
{ "k", "\\u20e3"}, 2},
};
uint32_t i;