ICU-8262 reorg code for uloc_getDisplayName, add regression test

X-SVN-Rev: 29760
This commit is contained in:
Doug Felt 2011-04-08 20:06:36 +00:00
parent 389c986a20
commit 45f8abf19f
2 changed files with 429 additions and 2 deletions

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1997-2010, International Business Machines
* Copyright (C) 1997-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -427,8 +427,9 @@ uloc_getDisplayVariant(const char *locale,
uloc_getVariant, _kVariants, pErrorCode);
}
/* TODO:dougfelt remove */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayName(const char *locale,
uloc_getDisplayNameOld(const char *locale,
const char *displayLocale,
UChar *dest, int32_t destCapacity,
UErrorCode *pErrorCode)
@ -704,6 +705,286 @@ uloc_getDisplayName(const char *locale,
return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
/* Instead of having a separate pass for 'special' patterns, reintegrate the two
 * so we don't get bitten by preflight bugs again.  We can be reasonably efficient
 * without two separate code paths, this code isn't that performance-critical.
 *
 * This code is general enough to deal with patterns that have a prefix or swap the
 * language and remainder components, since we gave developers enough rope to do such
 * things if they futz with the pattern data.  But since we don't give them a way to
 * specify a pattern for arbitrary combinations of components, there's not much use in
 * that.  I don't think our data includes such patterns, the only variable I know of is
 * whether there is a space before the open paren, or not.  Oh, and zh uses different
 * chars than the standard open/close paren (which ja and ko use, btw).
 */
/**
 * Formats the display name of a locale: the language display name combined, via
 * the display locale's pattern (default "{0} ({1})"), with the remaining components
 * (script, country, variant, then each keyword as key=value) joined by the display
 * locale's separator (default ", ").  If only one of the two parts is present, that
 * part alone is returned, without any pattern text.
 *
 * @param locale         the locale whose name is formatted
 * @param displayLocale  the locale whose data (pattern, separator, names) is used
 * @param dest           output buffer; may be NULL only if destCapacity is 0 (preflight)
 * @param destCapacity   capacity of dest in UChars, must be >= 0
 * @param pErrorCode     in/out error code; must not be NULL.  Set to
 *                       U_ILLEGAL_ARGUMENT_ERROR for bad arguments or when the pattern
 *                       found in the data lacks {0} or {1}.  Buffer overflow reported by
 *                       the component getters is cleared here; the final status comes
 *                       from u_terminateUChars.
 * @return the value of u_terminateUChars() for the computed length (the length is
 *         accumulated whether or not the text fit), or 0 on argument/pattern errors
 */
U_CAPI int32_t U_EXPORT2
uloc_getDisplayName(const char *locale,
                    const char *displayLocale,
                    UChar *dest, int32_t destCapacity,
                    UErrorCode *pErrorCode)
{
    static const UChar defaultSeparator[3] = { 0x002c, 0x0020, 0x0000 }; /* comma + space */
    static const int32_t defaultSepLen = 2;
    static const UChar sub0[4] = { 0x007b, 0x0030, 0x007d , 0x0000 } ; /* {0} */
    static const UChar sub1[4] = { 0x007b, 0x0031, 0x007d , 0x0000 } ; /* {1} */
    static const int32_t subLen = 3;
    static const UChar defaultPattern[10] = {
        0x007b, 0x0030, 0x007d, 0x0020, 0x0028, 0x007b, 0x0031, 0x007d, 0x0029, 0x0000
    }; /* {0} ({1}) */
    static const int32_t defaultPatLen = 9;
    static const int32_t defaultSub0Pos = 0;
    static const int32_t defaultSub1Pos = 5;

    int32_t length; /* of formatted result */

    const UChar *separator;
    int32_t sepLen = 0;
    const UChar *pattern;
    int32_t patLen = 0;
    int32_t sub0Pos, sub1Pos;

    UBool haveLang = TRUE; /* assume true, set false if we find we don't have
                              a lang component in the locale */
    UBool haveRest = TRUE; /* assume true, set false if we find we don't have
                              any other component in the locale */
    UBool retry = FALSE; /* set true if we need to retry, see below */

    int32_t langi = 0; /* index of the language substitution (0 or 1), virtually always 0 */

    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
        return 0;
    }

    if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    {
        /* Look up the pattern and separator using a private status: a failed lookup
           just leaves the lengths at 0, which selects the defaults below. */
        UErrorCode status = U_ZERO_ERROR;
        UResourceBundle* locbundle=ures_open(U_ICUDATA_LANG, displayLocale, &status);
        UResourceBundle* dspbundle=ures_getByKeyWithFallback(locbundle, _kLocaleDisplayPattern,
                                                             NULL, &status);

        separator=ures_getStringByKeyWithFallback(dspbundle, _kSeparator, &sepLen, &status);
        pattern=ures_getStringByKeyWithFallback(dspbundle, _kPattern, &patLen, &status);

        ures_close(dspbundle);
        ures_close(locbundle);
    }

    /* If we couldn't find any data, then use the defaults */
    if(sepLen == 0) {
        separator = defaultSeparator;
        sepLen = defaultSepLen;
    }

    if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
        pattern=defaultPattern;
        patLen=defaultPatLen;
        sub0Pos=defaultSub0Pos;
        sub1Pos=defaultSub1Pos;
    } else { /* non-default pattern */
        const UChar *p0=u_strstr(pattern, sub0);
        const UChar *p1=u_strstr(pattern, sub1);
        if (p0==NULL || p1==NULL) {
            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        sub0Pos=(int32_t)(p0-pattern);
        sub1Pos=(int32_t)(p1-pattern);
        if (sub1Pos < sub0Pos) { /* a very odd pattern */
            /* keep sub0Pos as the earlier position; langi records that {0} comes second */
            int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
            langi=1;
        }
    }

    /* We loop here because there is one case in which after the first pass we could need to
     * reextract the data.  If there's initial padding before the first element, we put in
     * the padding and then write that element.  If it turns out there's no second element,
     * we didn't need the padding.  If we do need the data (no preflight), and the first element
     * would have fit but for the padding, we need to reextract.  In this case (only) we
     * adjust the parameters so padding is not added, and repeat.
     */
    do {
        UChar* p=dest;
        int32_t patPos=0; /* position in the pattern, used for non-substitution portions */
        int32_t langLen=0; /* length of language substitution */
        int32_t langPos=0; /* position in output of language substitution */
        int32_t restLen=0; /* length of 'everything else' substitution */
        int32_t restPos=0; /* position in output of 'everything else' substitution */
        UEnumeration* kenum = NULL; /* keyword enumeration, opened lazily below */

        /* Clear the flag on every pass; otherwise a requested retry would repeat forever. */
        retry=FALSE;

        /* prefix of pattern, extremely likely to be empty */
        if(sub0Pos) {
            if(destCapacity >= sub0Pos) {
                while (patPos < sub0Pos) {
                    *p++ = pattern[patPos++];
                }
            } else {
                patPos=sub0Pos;
            }
            length=sub0Pos;
        } else {
            length=0;
        }

        for(int32_t subi=0,resti=0;subi<2;) { /* iterate through patterns 0 and 1*/
            UBool subdone = FALSE; /* set true when ready to move to next substitution */

            /* prep p and cap for calls to get display components, pin cap to 0 since
               they complain if cap is negative */
            int32_t cap=destCapacity-length;
            if (cap <= 0) {
                cap=0;
            } else {
                p=dest+length;
            }

            if (subi == langi) { /* {0}*/
                if(haveLang) {
                    langPos=length;
                    langLen=uloc_getDisplayLanguage(locale, displayLocale, p, cap, pErrorCode);
                    length+=langLen;
                    haveLang=langLen>0;
                }
                subdone=TRUE;
            } else { /* {1} */
                if(!haveRest) {
                    subdone=TRUE;
                } else {
                    int32_t len; /* length of component (plus other stuff) we just fetched */
                    /* one component per pass through the enclosing loop, in this order */
                    switch(resti++) {
                        case 0:
                            restPos=length;
                            len=uloc_getDisplayScript(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 1:
                            len=uloc_getDisplayCountry(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 2:
                            len=uloc_getDisplayVariant(locale, displayLocale, p, cap, pErrorCode);
                            break;
                        case 3:
                            kenum = uloc_openKeywords(locale, pErrorCode);
                            /* fall through */
                        default: {
                            const char* kw=uenum_next(kenum, &len, pErrorCode);
                            if (kw == NULL) {
                                uenum_close(kenum);
                                len=0; /* mark that we didn't add a component */
                                subdone=TRUE;
                            } else {
                                /* incorporating this behavior into the loop made it even more complex,
                                   so just special case it here */
                                len = uloc_getDisplayKeyword(kw, displayLocale, p, cap, pErrorCode);
                                if(len) {
                                    if(len < cap) {
                                        p[len]=0x3d; /* '=', assume we'll need it */
                                    }
                                    len+=1;

                                    /* adjust for call to get keyword */
                                    cap-=len;
                                    if(cap <= 0) {
                                        cap=0;
                                    } else {
                                        p+=len;
                                    }
                                }
                                /* reset for call below */
                                if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
                                    *pErrorCode=U_ZERO_ERROR;
                                }
                                int32_t vlen = uloc_getDisplayKeywordValue(locale, kw, displayLocale,
                                                                           p, cap, pErrorCode);
                                if(len) {
                                    if(vlen==0) {
                                        --len; /* remove unneeded '=' */
                                    }
                                    /* restore cap and p to what they were at start */
                                    cap=destCapacity-length;
                                    if(cap <= 0) {
                                        cap=0;
                                    } else {
                                        p=dest+length;
                                    }
                                }
                                len+=vlen; /* total we added for key + '=' + value */
                            }
                        } break;
                    } /* end switch */

                    if (len>0) {
                        /* we added a component, so add separator and write it if there's room. */
                        if(len+sepLen<=cap) {
                            p+=len;
                            for(int32_t i=0;i<sepLen;++i) {
                                *p++=separator[i];
                            }
                        }
                        length+=len+sepLen;
                    } else if(subdone) {
                        /* remove separator if we added it */
                        if (length!=restPos) {
                            length-=sepLen;
                        }
                        restLen=length-restPos;
                        haveRest=restLen>0;
                    }
                }
            }

            /* overflow is expected while measuring; u_terminateUChars sets the final status */
            if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
                *pErrorCode=U_ZERO_ERROR;
            }

            if(subdone) {
                if(haveLang && haveRest) {
                    /* append internal portion of pattern, the first time,
                       or last portion of pattern the second time */
                    int32_t padLen;
                    patPos+=subLen;
                    padLen=(subi==0 ? sub1Pos : patLen)-patPos;
                    /* '<=': text ending exactly at the end of the buffer still fits
                       (only the terminating NUL does not). */
                    if(length+padLen <= destCapacity) {
                        p=dest+length;
                        for(int32_t i=0;i<padLen;++i) {
                            *p++=pattern[patPos++];
                        }
                    } else {
                        patPos+=padLen;
                    }
                    length+=padLen;
                } else if(subi==0 && !(langi==0 ? haveLang : haveRest)) {
                    /* don't have first component, reset for second component.
                       On the first pass the other flag is still assumed TRUE here, so
                       this is reached exactly when the first component is missing; the
                       explicit test keeps a retry pass (where the other flag is already
                       known to be FALSE) from discarding a first component it has. */
                    sub0Pos=0;
                    length=0;
                } else if(length>0) {
                    /* true length is the length of just the component we got. */
                    length=haveLang?langLen:restLen;
                    if(dest && sub0Pos!=0) {
                        if (sub0Pos+length<=destCapacity) {
                            /* first component not at start of result,
                               but we have full component in buffer. */
                            u_memmove(dest, dest+(haveLang?langPos:restPos), length);
                        } else {
                            /* would have fit, but didn't because of pattern prefix. */
                            sub0Pos=0; /* stops initial padding (and a second retry,
                                          so we won't end up here again) */
                            retry=TRUE;
                        }
                    }
                }

                ++subi; /* move on to next substitution */
            }
        }
    } while(retry);

    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
}
U_CAPI int32_t U_EXPORT2
uloc_getDisplayKeyword(const char* keyword,
const char* displayLocale,

View File

@ -561,6 +561,85 @@ static void TestSimpleResourceInfo() {
cleanUpDataTable();
}
/* Test/debug helper: renders UTF-16 code units as escaped ASCII text.
 * Obviously, on non-ascii platforms this is useless (it just returns -1 there),
 * but it's test/debug code.
 *
 * Control characters listed in ESCAPE_MAP are written as a backslash plus the escape
 * letter (0x0a -> \n), code units 0x20..0x7e are copied unchanged (a backslash is not
 * doubled), and everything else is written as \uXXXX with four lowercase hex digits.
 * An escape is never split: if it does not fit in the remaining space, conversion
 * stops before it.
 *
 * utext        text to convert
 * len          number of UChars to convert; if len < 0, we convert until we hit
 *              UChar 0x0000, which is not output
 * resultChars  output buffer
 * buflen       capacity of resultChars
 *
 * Will add a trailing null if there's room, but it won't be included in the result.
 * A result < 0 indicates an error (NULL pointer or negative buflen).
 * Returns the number of chars written (not those that would be written if there's
 * enough room). */
static int32_t UCharsToEscapedAscii(const UChar* utext, int32_t len, char* resultChars, int32_t buflen) {
#if U_CHARSET_FAMILY != U_ASCII_FAMILY
    return -1;
#else
    /* pairs of (escape letter, control character it stands for) */
    static const UChar ESCAPE_MAP[] = {
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };
    static const int32_t ESCAPE_MAP_LENGTH = (int32_t)(sizeof(ESCAPE_MAP)/sizeof(ESCAPE_MAP[0]));
    /* '0'..'9', 'a'..'f' spelled as code points */
    static const char HEX_DIGITS[] = {
        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
        0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66
    };
    int32_t i, j;
    int32_t resultLen = 0;
    /* every input unit yields at least one output char, so when scanning for the
       terminator buflen units are enough to hit the buffer limit */
    const int32_t limit = len<0 ? buflen : len;
    const int32_t escapeLimit1 = buflen-2; /* last start index with room for a 2-char escape */
    const int32_t escapeLimit2 = buflen-6; /* last start index with room for a 6-char escape */
    UChar uc;

    if(utext==NULL || resultChars==NULL || buflen<0) {
        return -1;
    }

    for(i=0;i<limit && resultLen<buflen;++i) {
        uc=utext[i];
        if(len<0 && uc==0) {
            break;
        }
        if(uc<0x20) {
            for(j=0;j<ESCAPE_MAP_LENGTH;j+=2) {
                if(uc==ESCAPE_MAP[j+1]) {
                    break;
                }
            }
            if(j<ESCAPE_MAP_LENGTH) {
                if(resultLen>escapeLimit1) {
                    break;
                }
                resultChars[resultLen++]='\\';
                resultChars[resultLen++]=(char)ESCAPE_MAP[j];
                continue;
            }
            /* control character without a short escape: falls through to \uXXXX */
        } else if(uc<0x7f) {
            resultChars[resultLen++] = (char)uc;
            continue;
        }

        if(resultLen>escapeLimit2) {
            break;
        }

        /* have to escape the uchar; mask each nibble with 0x0f so the index
           always stays inside the 16-entry HEX_DIGITS table */
        resultChars[resultLen++]='\\';
        resultChars[resultLen++]='u';
        resultChars[resultLen++]=HEX_DIGITS[(uc>>12)&0x0f];
        resultChars[resultLen++]=HEX_DIGITS[(uc>>8)&0x0f];
        resultChars[resultLen++]=HEX_DIGITS[(uc>>4)&0x0f];
        resultChars[resultLen++]=HEX_DIGITS[uc&0x0f];
    }

    if(resultLen<buflen) {
        resultChars[resultLen] = 0;
    }

    return resultLen;
#endif
}
/*
* Jitterbug 2439 -- markus 20030425
*
@ -634,6 +713,73 @@ static void TestDisplayNames()
}
}
}
/* test that we properly preflight and return data when there's a non-default pattern,
see ticket #8262. */
{
int32_t i, j, v;
static const char *locale="az_Cyrl";
static const char *displayLocale="ja";
static const char *expectedChars =
"\\u30a2\\u30bc\\u30eb\\u30d0\\u30a4\\u30b8\\u30e3\\u30f3\\u8a9e"
"(\\u30ad\\u30ea\\u30eb\\u6587\\u5b57)";
UErrorCode ec=U_ZERO_ERROR;
UChar result[256];
int32_t len;
int32_t preflightLen=uloc_getDisplayName(locale, displayLocale, NULL, 0, &ec);
/* inconvenient semantics when preflighting, this condition is expected... */
if(ec==U_BUFFER_OVERFLOW_ERROR) {
ec=U_ZERO_ERROR;
}
len=uloc_getDisplayName(locale, displayLocale, result, LENGTHOF(result), &ec);
if(U_FAILURE(ec)) {
log_err("uloc_getDisplayName(%s, %s...) returned error: %s",
locale, displayLocale, u_errorName(ec));
} else {
UChar *expected=CharsToUChars(expectedChars);
int32_t expectedLen=u_strlen(expected);
if(len!=expectedLen) {
log_err("uloc_getDisplayName(%s, %s...) returned string of length %d, expected length %d",
locale, displayLocale, len, expectedLen);
} else if(preflightLen!=expectedLen) {
log_err("uloc_getDisplayName(%s, %s...) returned preflight length %d, expected length %d",
locale, displayLocale, preflightLen, expectedLen);
} else if(u_strncmp(result, expected, len)) {
int32_t cap=len*6+1; /* worst case + space for trailing null */
char* resultChars=malloc(cap);
int32_t resultCharsLen=UCharsToEscapedAscii(result, len, resultChars, cap);
if(resultCharsLen<0 || resultCharsLen<cap-1) {
log_err("uloc_getDisplayName(%s, %s...) mismatch", locale, displayLocale);
} else {
log_err("uloc_getDisplayName(%s, %s...) returned '%s' but expected '%s'",
locale, displayLocale, resultChars, expectedChars);
}
free(resultChars);
resultChars=NULL;
} else {
/* test all buffer sizes */
for(i=len+1;i>=0;--i) {
len=uloc_getDisplayName(locale, displayLocale, result, i, &ec);
if(ec==U_BUFFER_OVERFLOW_ERROR) {
ec=U_ZERO_ERROR;
}
if(U_FAILURE(ec)) {
log_err("using buffer of length %d returned error %s", i, u_errorName(ec));
break;
}
if(len!=expectedLen) {
log_err("with buffer of length %d, expected length %d but got %d", i, expectedLen, len);
break;
}
/* There's no guarantee about what's in the buffer if we've overflowed, in particular,
* we don't know that it's been filled, so no point in checking. */
}
}
free(expected);
}
}
}