ICU-8262 reorg code for uloc_getDisplayName, add regression test

X-SVN-Rev: 29760
2011-04-08 20:06:36 +00:00 · 2011-04-08 20:06:36 +00:00 · 45f8abf19f
commit 45f8abf19f
parent 389c986a20
2 changed files with 429 additions and 2 deletions
--- a/icu4c/source/common/locdispnames.cpp
+++ b/icu4c/source/common/locdispnames.cpp
@ -1,7 +1,7 @@
 /*
 *******************************************************************************
 *
-*   Copyright (C) 1997-2010, International Business Machines
+*   Copyright (C) 1997-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 *******************************************************************************
@ -427,8 +427,9 @@ uloc_getDisplayVariant(const char *locale,
                uloc_getVariant, _kVariants, pErrorCode);
 }

+/* TODO:dougfelt remove */
 U_CAPI int32_t U_EXPORT2
-uloc_getDisplayName(const char *locale,
+uloc_getDisplayNameOld(const char *locale,
                    const char *displayLocale,
                    UChar *dest, int32_t destCapacity,
                    UErrorCode *pErrorCode)
@ -704,6 +705,286 @@ uloc_getDisplayName(const char *locale,
    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
 }

+
+/* Instead of having a separate pass for 'special' patterns, reintegrate the two
+ * so we don't get bitten by preflight bugs again.  We can be reasonably efficient
+ * without two separate code paths, this code isn't that performance-critical.
+ *
+ * This code is general enough to deal with patterns that have a prefix or swap the
+ * language and remainder components, since we gave developers enough rope to do such
+ * things if they futz with the pattern data.  But since we don't give them a way to
+ * specify a pattern for arbitrary combinations of components, there's not much use in
+ * that.  I don't think our data includes such patterns, the only variable I know of is
+ * whether there is a space before the open paren, or not.  Oh, and zh uses different
+ * chars than the standard open/close paren (which ja and ko use, btw).
+ *
+ * Builds the display name of 'locale', localized for 'displayLocale', by substituting
+ * the display language and the separator-joined list of the remaining components
+ * (script, country, variant, keyword[=value]...) into the locale display pattern.
+ * If only one of the two substitutions is non-empty, the result is just that one,
+ * without any of the pattern text.
+ *
+ * dest/destCapacity: output buffer; destCapacity==0 preflights (dest may then be NULL).
+ * pErrorCode: must be non-NULL and not already a failure, otherwise 0 is returned.
+ *     Set to U_ILLEGAL_ARGUMENT_ERROR if destCapacity is negative, if dest is NULL
+ *     with a positive capacity, or if a non-default pattern lacks {0} or {1}.
+ *     Overflow from the component getters is cleared as we go; the final status for
+ *     the full length is whatever u_terminateUChars reports.
+ * Returns the full length of the formatted name (the value from u_terminateUChars),
+ * or 0 on the early error returns.
+ */
+U_CAPI int32_t U_EXPORT2
+uloc_getDisplayName(const char *locale,
+                    const char *displayLocale,
+                    UChar *dest, int32_t destCapacity,
+                    UErrorCode *pErrorCode)
+{
+    static const UChar defaultSeparator[3] = { 0x002c, 0x0020, 0x0000 }; /* comma + space */
+    static const int32_t defaultSepLen = 2;
+    static const UChar sub0[4] = { 0x007b, 0x0030, 0x007d , 0x0000 } ; /* {0} */
+    static const UChar sub1[4] = { 0x007b, 0x0031, 0x007d , 0x0000 } ; /* {1} */
+    static const int32_t subLen = 3;
+    static const UChar defaultPattern[10] = {
+        0x007b, 0x0030, 0x007d, 0x0020, 0x0028, 0x007b, 0x0031, 0x007d, 0x0029, 0x0000
+    }; /* {0} ({1}) */
+    static const int32_t defaultPatLen = 9;
+    static const int32_t defaultSub0Pos = 0;
+    static const int32_t defaultSub1Pos = 5;
+
+    int32_t length; /* of formatted result */
+
+    const UChar *separator;
+    int32_t sepLen = 0;
+    const UChar *pattern;
+    int32_t patLen = 0;
+    int32_t sub0Pos, sub1Pos;
+
+    UBool haveLang = TRUE; /* assume true, set false if we find we don't have
+                              a lang component in the locale */
+    UBool haveRest = TRUE; /* assume true, set false if we find we don't have
+                              any other component in the locale */
+    UBool retry = FALSE; /* set true if we need to retry, see below */
+
+    int32_t langi = 0; /* index of the language substitution (0 or 1), virtually always 0 */
+
+    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
+        return 0;
+    }
+
+    if(destCapacity<0 || (destCapacity>0 && dest==NULL)) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return 0;
+    }
+
+    {
+        /* Lookup failures are deliberately kept in a local status: missing data
+           just means we fall back to the default separator and pattern below. */
+        UErrorCode status = U_ZERO_ERROR;
+        UResourceBundle* locbundle=ures_open(U_ICUDATA_LANG, displayLocale, &status);
+        UResourceBundle* dspbundle=ures_getByKeyWithFallback(locbundle, _kLocaleDisplayPattern,
+                                                             NULL, &status);
+
+        separator=ures_getStringByKeyWithFallback(dspbundle, _kSeparator, &sepLen, &status);
+        pattern=ures_getStringByKeyWithFallback(dspbundle, _kPattern, &patLen, &status);
+
+        ures_close(dspbundle);
+        ures_close(locbundle);
+    }
+
+    /* If we couldn't find any data, then use the defaults */
+    if(sepLen == 0) {
+       separator = defaultSeparator;
+       sepLen = defaultSepLen;
+    }
+
+    if(patLen==0 || (patLen==defaultPatLen && !u_strncmp(pattern, defaultPattern, patLen))) {
+        pattern=defaultPattern;
+        patLen=defaultPatLen;
+        sub0Pos=defaultSub0Pos;
+        sub1Pos=defaultSub1Pos;
+    } else { /* non-default pattern */
+        UChar *p0=u_strstr(pattern, sub0);
+        UChar *p1=u_strstr(pattern, sub1);
+        if (p0==NULL || p1==NULL) {
+            *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+            return 0;
+        }
+        sub0Pos=(int32_t)(p0-pattern);
+        sub1Pos=(int32_t)(p1-pattern);
+        if (sub1Pos < sub0Pos) { /* a very odd pattern */
+            /* keep sub0Pos as the position of whichever substitution comes first,
+               and remember that the language is the second one */
+            int32_t t=sub0Pos; sub0Pos=sub1Pos; sub1Pos=t;
+            langi=1;
+        }
+    }
+
+    /* We loop here because there is one case in which after the first pass we could need to
+     * reextract the data.  If there's initial padding before the first element, we put in
+     * the padding and then write that element.  If it turns out there's no second element,
+     * we didn't need the padding.  If we do need the data (no preflight), and the first element
+     * would have fit but for the padding, we need to reextract.  In this case (only) we
+     * adjust the parameters so padding is not added, and repeat.
+     */
+    do {
+        UChar* p=dest;
+        int32_t patPos=0; /* position in the pattern, used for non-substitution portions */
+        int32_t langLen=0; /* length of language substitution */
+        int32_t langPos=0; /* position in output of language substitution */
+        int32_t restLen=0; /* length of 'everything else' substitution */
+        int32_t restPos=0; /* position in output of 'everything else' substitution */
+        UEnumeration* kenum = NULL; /* keyword enumeration, opened when we reach the keywords */
+
+        /* Each pass starts from a clean slate.  Without clearing retry the loop would
+           never terminate once a retry was requested, and without restoring the
+           have* flags the retry pass would take the "don't have first component"
+           branch below and throw away the very component we are re-extracting. */
+        retry=FALSE;
+        haveLang=TRUE;
+        haveRest=TRUE;
+
+        /* prefix of pattern, extremely likely to be empty */
+        if(sub0Pos) {
+            if(destCapacity >= sub0Pos) {
+                while (patPos < sub0Pos) {
+                    *p++ = pattern[patPos++];
+                }
+            } else {
+                patPos=sub0Pos;
+            }
+            length=sub0Pos;
+        } else {
+            length=0;
+        }
+
+        for(int32_t subi=0,resti=0;subi<2;) { /* iterate through patterns 0 and 1*/
+            UBool subdone = FALSE; /* set true when ready to move to next substitution */
+
+            /* prep p and cap for calls to get display components, pin cap to 0 since
+               they complain if cap is negative */
+            int32_t cap=destCapacity-length;
+            if (cap <= 0) {
+                cap=0;
+            } else {
+                p=dest+length;
+            }
+
+            if (subi == langi) { /* {0}*/
+                if(haveLang) {
+                    langPos=length;
+                    langLen=uloc_getDisplayLanguage(locale, displayLocale, p, cap, pErrorCode);
+                    length+=langLen;
+                    haveLang=langLen>0;
+                }
+                subdone=TRUE;
+            } else { /* {1} */
+                if(!haveRest) {
+                    subdone=TRUE;
+                } else {
+                    int32_t len; /* length of component (plus other stuff) we just fetched */
+                    /* one component per trip through the enclosing loop: script, country,
+                       variant, then each keyword in turn until the enumeration runs dry */
+                    switch(resti++) {
+                        case 0:
+                            restPos=length;
+                            len=uloc_getDisplayScript(locale, displayLocale, p, cap, pErrorCode);
+                            break;
+                        case 1:
+                            len=uloc_getDisplayCountry(locale, displayLocale, p, cap, pErrorCode);
+                            break;
+                        case 2:
+                            len=uloc_getDisplayVariant(locale, displayLocale, p, cap, pErrorCode);
+                            break;
+                        case 3:
+                            kenum = uloc_openKeywords(locale, pErrorCode);
+                            /* fall through */
+                        default: {
+                            const char* kw=uenum_next(kenum, &len, pErrorCode);
+                            if (kw == NULL) {
+                                uenum_close(kenum);
+                                len=0; /* mark that we didn't add a component */
+                                subdone=TRUE;
+                            } else {
+                                /* incorporating this behavior into the loop made it even more complex,
+                                   so just special case it here */
+                                len = uloc_getDisplayKeyword(kw, displayLocale, p, cap, pErrorCode);
+                                if(len) {
+                                    if(len < cap) {
+                                        p[len]=0x3d; /* '=', assume we'll need it */
+                                    }
+                                    len+=1;
+
+                                    /* adjust for call to get keyword */
+                                    cap-=len;
+                                    if(cap <= 0) {
+                                        cap=0;
+                                    } else {
+                                        p+=len;
+                                    }
+                                }
+                                /* reset for call below */
+                                if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
+                                    *pErrorCode=U_ZERO_ERROR;
+                                }
+                                int32_t vlen = uloc_getDisplayKeywordValue(locale, kw, displayLocale,
+                                                                           p, cap, pErrorCode);
+                                if(len) {
+                                    if(vlen==0) {
+                                        --len; /* remove unneeded '=' */
+                                    }
+                                    /* restore cap and p to what they were at start */
+                                    cap=destCapacity-length;
+                                    if(cap <= 0) {
+                                        cap=0;
+                                    } else {
+                                        p=dest+length;
+                                    }
+                                }
+                                len+=vlen; /* total we added for key + '=' + value */
+                            }
+                        } break;
+                    } /* end switch */
+
+                    if (len>0) {
+                        /* we added a component, so add separator and write it if there's room. */
+                        if(len+sepLen<=cap) {
+                            p+=len;
+                            for(int32_t i=0;i<sepLen;++i) {
+                                *p++=separator[i];
+                            }
+                        }
+                        length+=len+sepLen;
+                    } else if(subdone) {
+                        /* remove separator if we added it */
+                        if (length!=restPos) {
+                            length-=sepLen;
+                        }
+                        restLen=length-restPos;
+                        haveRest=restLen>0;
+                    }
+                }
+            }
+
+            /* overflow just means we are (now) preflighting; the final status is
+               decided by u_terminateUChars from the total length */
+            if(*pErrorCode == U_BUFFER_OVERFLOW_ERROR) {
+                *pErrorCode=U_ZERO_ERROR;
+            }
+
+            if(subdone) {
+                if(haveLang && haveRest) {
+                    /* append internal portion of pattern, the first time,
+                       or last portion of pattern the second time */
+                    int32_t padLen;
+                    patPos+=subLen;
+                    padLen=(subi==0 ? sub1Pos : patLen)-patPos;
+                    /* '<=': the padding occupies dest[length..length+padLen-1], so it
+                       still fits when the result fills the buffer exactly (no room
+                       for the terminator, but the text itself must be complete) */
+                    if(length+padLen <= destCapacity) {
+                        p=dest+length;
+                        for(int32_t i=0;i<padLen;++i) {
+                            *p++=pattern[patPos++];
+                        }
+                    } else {
+                        patPos+=padLen;
+                    }
+                    length+=padLen;
+                } else if(subi==0) {
+                    /* don't have first component, reset for second component */
+                    sub0Pos=0;
+                    length=0;
+                } else if(length>0) {
+                    /* true length is the length of just the component we got. */
+                    length=haveLang?langLen:restLen;
+                    if(dest && sub0Pos!=0) {
+                        if (sub0Pos+length<=destCapacity) {
+                            /* first component not at start of result,
+                               but we have full component in buffer. */
+                            u_memmove(dest, dest+(haveLang?langPos:restPos), length);
+                        } else {
+                            /* would have fit, but didn't because of pattern prefix. */
+                            sub0Pos=0; /* stops initial padding (and a second retry,
+                                          so we won't end up here again) */
+                            retry=TRUE;
+                        }
+                    }
+                }
+
+                ++subi; /* move on to next substitution */
+            }
+        }
+    } while(retry);
+
+    return u_terminateUChars(dest, destCapacity, length, pErrorCode);
+}
+
 U_CAPI int32_t U_EXPORT2
 uloc_getDisplayKeyword(const char* keyword,
                       const char* displayLocale,
--- a/icu4c/source/test/cintltst/cloctst.c
+++ b/icu4c/source/test/cintltst/cloctst.c
@ -561,6 +561,85 @@ static void TestSimpleResourceInfo() {
    cleanUpDataTable();
 }

+/* obviously, on non-ascii platforms this is useless, but it's test/debug code */
+/* Converts UChars to an ASCII string, escaping everything that is not printable ASCII.
+ * Control characters listed in ESCAPE_MAP become a two-char backslash escape, any other
+ * UChar below 0x20 or at/above 0x7f becomes a six-char \uXXXX escape (lowercase hex).
+ *
+ * utext:       the text to convert, must not be NULL.
+ * len:         number of UChars to convert; if len < 0, we convert until we hit
+ *              UChar 0x0000, which is not output.
+ * resultChars: output buffer, must not be NULL.
+ * buflen:      capacity of resultChars, must not be negative.
+ *
+ * Conversion stops early, without error, when the next char (or its whole escape)
+ * no longer fits.  Will add trailing null if there's room but it won't be included
+ * in result.  result < 0 indicates an error (bad arguments, or a non-ASCII platform).
+ * Returns the number of chars written (not those that would be written if there's enough room). */
+static int32_t UCharsToEscapedAscii(const UChar* utext, int32_t len, char* resultChars, int32_t buflen) {
+#if U_CHARSET_FAMILY != U_ASCII_FAMILY
+    return -1;
+#else
+    /* pairs of (escape letter, control character it stands for) */
+    static const UChar ESCAPE_MAP[] = {
+        /*a*/ 0x61, 0x07,
+        /*b*/ 0x62, 0x08,
+        /*e*/ 0x65, 0x1b,
+        /*f*/ 0x66, 0x0c,
+        /*n*/ 0x6E, 0x0a,
+        /*r*/ 0x72, 0x0d,
+        /*t*/ 0x74, 0x09,
+        /*v*/ 0x76, 0x0b
+    };
+    static const int32_t ESCAPE_MAP_LENGTH = sizeof(ESCAPE_MAP)/sizeof(ESCAPE_MAP[0]);
+    static const char HEX_DIGITS[] = {
+        0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+        0x38, 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66
+    };
+    int32_t i, j;
+    int32_t resultLen = 0;
+    const int32_t limit = len<0 ? buflen : len; /* buflen is long enough to hit the buffer limit */
+    const int32_t escapeLimit1 = buflen-2; /* last start index with room for "\x" */
+    const int32_t escapeLimit2 = buflen-6; /* last start index with room for "\uXXXX" */
+    UChar uc;
+
+    if(utext==NULL || resultChars==NULL || buflen<0) {
+        return -1;
+    }
+
+    for(i=0;i<limit && resultLen<buflen;++i) {
+        uc=utext[i];
+        if(len<0 && uc==0) {
+            break;
+        }
+        if(uc<0x20) {
+            for(j=0;j<ESCAPE_MAP_LENGTH;j+=2) {
+                if(uc==ESCAPE_MAP[j+1]) {
+                    break;
+                }
+            }
+            if(j<ESCAPE_MAP_LENGTH) {
+                if(resultLen>escapeLimit1) {
+                    break;
+                }
+                resultChars[resultLen++]='\\';
+                resultChars[resultLen++]=(char)ESCAPE_MAP[j];
+                continue;
+            }
+            /* control char without a short escape: falls through to \uXXXX below */
+        } else if(uc<0x7f) {
+            /* printable ASCII, the loop condition guarantees room for one char */
+            resultChars[resultLen++] = (char)uc;
+            continue;
+        }
+
+        if(resultLen>escapeLimit2) {
+            break;
+        }
+
+        /* have to escape the uchar; mask each nibble with 0xf so the index
+           always stays inside the 16-entry HEX_DIGITS table */
+        resultChars[resultLen++]='\\';
+        resultChars[resultLen++]='u';
+        resultChars[resultLen++]=HEX_DIGITS[(uc>>12)&0xf];
+        resultChars[resultLen++]=HEX_DIGITS[(uc>>8)&0xf];
+        resultChars[resultLen++]=HEX_DIGITS[(uc>>4)&0xf];
+        resultChars[resultLen++]=HEX_DIGITS[uc&0xf];
+    }
+
+    if(resultLen<buflen) {
+        resultChars[resultLen] = 0;
+    }
+
+    return resultLen;
+#endif
+}
+
 /*
 * Jitterbug 2439 -- markus 20030425
 *
@ -634,6 +713,73 @@ static void TestDisplayNames()
            }
        }
    }
+
+    /* test that we properly preflight and return data when there's a non-default pattern,
+       see ticket #8262. */
+    {
+        int32_t i, j, v;
+        static const char *locale="az_Cyrl";
+        static const char *displayLocale="ja";
+        static const char *expectedChars =
+                "\\u30a2\\u30bc\\u30eb\\u30d0\\u30a4\\u30b8\\u30e3\\u30f3\\u8a9e"
+                "(\\u30ad\\u30ea\\u30eb\\u6587\\u5b57)";
+        UErrorCode ec=U_ZERO_ERROR;
+        UChar result[256];
+        int32_t len;
+        int32_t preflightLen=uloc_getDisplayName(locale, displayLocale, NULL, 0, &ec);
+        /* inconvenient semantics when preflighting, this condition is expected... */
+        if(ec==U_BUFFER_OVERFLOW_ERROR) {
+            ec=U_ZERO_ERROR;
+        }
+        len=uloc_getDisplayName(locale, displayLocale, result, LENGTHOF(result), &ec);
+        if(U_FAILURE(ec)) {
+            log_err("uloc_getDisplayName(%s, %s...) returned error: %s",
+                    locale, displayLocale, u_errorName(ec));
+        } else {
+            UChar *expected=CharsToUChars(expectedChars);
+            int32_t expectedLen=u_strlen(expected);
+
+            if(len!=expectedLen) {
+                log_err("uloc_getDisplayName(%s, %s...) returned string of length %d, expected length %d",
+                        locale, displayLocale, len, expectedLen);
+            } else if(preflightLen!=expectedLen) {
+                log_err("uloc_getDisplayName(%s, %s...) returned preflight length %d, expected length %d",
+                        locale, displayLocale, preflightLen, expectedLen);
+            } else if(u_strncmp(result, expected, len)) {
+                int32_t cap=len*6+1;  /* worst case + space for trailing null */
+                char* resultChars=malloc(cap);
+                int32_t resultCharsLen=UCharsToEscapedAscii(result, len, resultChars, cap);
+                if(resultCharsLen<0 || resultCharsLen<cap-1) {
+                    log_err("uloc_getDisplayName(%s, %s...) mismatch", locale, displayLocale);
+                } else {
+                    log_err("uloc_getDisplayName(%s, %s...) returned '%s' but expected '%s'",
+                            locale, displayLocale, resultChars, expectedChars);
+                }
+                free(resultChars);
+                resultChars=NULL;
+            } else {
+                /* test all buffer sizes */
+                for(i=len+1;i>=0;--i) {
+                    len=uloc_getDisplayName(locale, displayLocale, result, i, &ec);
+                    if(ec==U_BUFFER_OVERFLOW_ERROR) {
+                        ec=U_ZERO_ERROR;
+                    }
+                    if(U_FAILURE(ec)) {
+                        log_err("using buffer of length %d returned error %s", i, u_errorName(ec));
+                        break;
+                    }
+                    if(len!=expectedLen) {
+                        log_err("with buffer of length %d, expected length %d but got %d", i, expectedLen, len);
+                        break;
+                    }
+                    /* There's no guarantee about what's in the buffer if we've overflowed, in particular,
+                     * we don't know that it's been filled, so no point in checking. */
+                }
+            }
+
+            free(expected);
+        }
+    }
 }