From acc3f65a872acb2de8c49fac80e029fc3ecc894a Mon Sep 17 00:00:00 2001 From: Fredrik Roubert Date: Wed, 19 Sep 2018 17:52:37 -0700 Subject: [PATCH] ICU-13417 Replace fixed buffers in uloc_tag.cpp with CharString. This gets rid of those fixed buffers that caused ICU-13417 to be filed in the first place, those that prevent handling language tags with very large amounts of keywords. A number of fixed buffers will still remain in uloc_tag.cpp (and elsewhere in the locale handling code) for the time being, but this change is a necessary first step in cleaning up this code and will alleviate the most pressing problem encountered by ICU4C users. An off-by-one error in _getKeywords() caused uloc_canonicalize() to not write out the final keyword value in case the result would fill up the buffer exactly, resulting in U_STRING_NOT_TERMINATED_WARNING. --- icu4c/source/common/uloc.cpp | 2 +- icu4c/source/common/uloc_tag.cpp | 227 +++++++++++++++++++------ icu4c/source/test/cintltst/cloctst.c | 37 ++++ icu4c/source/test/cintltst/cloctst.h | 1 + icu4c/source/test/intltest/loctest.cpp | 21 +++ icu4c/source/test/intltest/loctest.h | 2 + 6 files changed, 233 insertions(+), 57 deletions(-) diff --git a/icu4c/source/common/uloc.cpp b/icu4c/source/common/uloc.cpp index f7073fec31..81b6e0f68a 100644 --- a/icu4c/source/common/uloc.cpp +++ b/icu4c/source/common/uloc.cpp @@ -798,7 +798,7 @@ _getKeywords(const char *localeID, } keywordsLen += keywordList[i].keywordLen + 1; if(valuesToo) { - if(keywordsLen + keywordList[i].valueLen < keywordCapacity) { + if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) { uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen); } keywordsLen += keywordList[i].valueLen; diff --git a/icu4c/source/common/uloc_tag.cpp b/icu4c/source/common/uloc_tag.cpp index b0647e97a2..2d6a9213c3 100644 --- a/icu4c/source/common/uloc_tag.cpp +++ b/icu4c/source/common/uloc_tag.cpp @@ -12,11 +12,13 @@ #include "unicode/putil.h" #include "unicode/uloc.h" #include "ustr_imp.h" +#include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "putilimp.h" #include "uinvchar.h" #include "ulocimp.h" +#include "uvector.h" #include "uassert.h" @@ -172,6 +174,46 @@ static const char* ultag_getGrandfathered(const ULanguageTag* langtag); #endif +namespace { + +// Helper class to memory manage CharString objects. +// Only ever stack-allocated, does not need to inherit UMemory. +class CharStringPool { +public: + CharStringPool() : status(U_ZERO_ERROR), pool(&deleter, nullptr, status) {} + ~CharStringPool() = default; + + CharStringPool(const CharStringPool&) = delete; + CharStringPool& operator=(const CharStringPool&) = delete; + + icu::CharString* create() { + if (U_FAILURE(status)) { + return nullptr; + } + icu::CharString* const obj = new icu::CharString; + if (obj == nullptr) { + status = U_MEMORY_ALLOCATION_ERROR; + return nullptr; + } + pool.addElement(obj, status); + if (U_FAILURE(status)) { + delete obj; + return nullptr; + } + return obj; + } + +private: + static void U_CALLCONV deleter(void* obj) { + delete static_cast(obj); + } + + UErrorCode status; + icu::UVector pool; +}; + +} // namespace + /* * ------------------------------------------------- * @@ -900,7 +942,6 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac static int32_t _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) { - char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 }; int32_t attrBufLength = 0; UEnumeration *keywordEnum = NULL; @@ -920,22 +961,48 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac AttributeListEntry *firstAttr = NULL; AttributeListEntry *attr; char *attrValue; - char extBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY]; - char *pExtBuf = extBuf; - int32_t extBufCapacity = sizeof(extBuf); + CharStringPool extBufPool; const char *bcpKey=nullptr, *bcpValue=nullptr; UErrorCode tmpStatus = U_ZERO_ERROR; int32_t keylen; UBool isBcpUExt; while (TRUE) { + icu::CharString buf; key = uenum_next(keywordEnum, NULL, status); if (key == NULL) { break; } - len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus); - /* buf must be null-terminated */ - if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + char* buffer; + int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY; + + for (;;) { + buffer = buf.getAppendBuffer( + /*minCapacity=*/resultCapacity, + /*desiredCapacityHint=*/resultCapacity, + resultCapacity, + tmpStatus); + + if (U_FAILURE(tmpStatus)) { + break; + } + + len = uloc_getKeywordValue( + localeID, key, buffer, resultCapacity, &tmpStatus); + + if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) { + break; + } + + resultCapacity = len; + tmpStatus = U_ZERO_ERROR; + } + + if (U_FAILURE(tmpStatus)) { + if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) { + *status = U_MEMORY_ALLOCATION_ERROR; + break; + } if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -945,6 +1012,11 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac continue; } + buf.append(buffer, len, tmpStatus); + if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString. + } + keylen = (int32_t)uprv_strlen(key); isBcpUExt = (keylen > 1); @@ -1007,7 +1079,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } /* we've checked buf is null-terminated above */ - bcpValue = uloc_toUnicodeLocaleType(key, buf); + bcpValue = uloc_toUnicodeLocaleType(key, buf.data()); if (bcpValue == NULL) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; @@ -1015,33 +1087,44 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } continue; } - if (bcpValue == buf) { - /* + if (bcpValue == buf.data()) { + /* When uloc_toUnicodeLocaleType(key, buf) returns the input value as is, the value is well-formed, but has no known mapping. This implementation normalizes the - the value to lower case + value to lower case */ - int32_t bcpValueLen = static_cast(uprv_strlen(bcpValue)); - if (bcpValueLen < extBufCapacity) { - uprv_strcpy(pExtBuf, bcpValue); - T_CString_toLowerCase(pExtBuf); - - bcpValue = pExtBuf; - - pExtBuf += (bcpValueLen + 1); - extBufCapacity -= (bcpValueLen + 1); - } else { - if (strict) { - *status = U_ILLEGAL_ARGUMENT_ERROR; - break; - } - continue; + icu::CharString* extBuf = extBufPool.create(); + if (extBuf == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; + break; } + int32_t bcpValueLen = static_cast(uprv_strlen(bcpValue)); + int32_t resultCapacity; + char* pExtBuf = extBuf->getAppendBuffer( + /*minCapacity=*/bcpValueLen, + /*desiredCapacityHint=*/bcpValueLen, + resultCapacity, + tmpStatus); + if (U_FAILURE(tmpStatus)) { + *status = tmpStatus; + break; + } + + uprv_strcpy(pExtBuf, bcpValue); + T_CString_toLowerCase(pExtBuf); + + extBuf->append(pExtBuf, bcpValueLen, tmpStatus); + if (U_FAILURE(tmpStatus)) { + *status = tmpStatus; + break; + } + + bcpValue = extBuf->data(); } } else { if (*key == PRIVATEUSE) { - if (!_isPrivateuseValueSubtags(buf, len)) { + if (!_isPrivateuseValueSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -1049,7 +1132,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac continue; } } else { - if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf, len)) { + if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) { if (strict) { *status = U_ILLEGAL_ARGUMENT_ERROR; break; @@ -1058,20 +1141,17 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac } } bcpKey = key; - if ((len + 1) < extBufCapacity) { - uprv_memcpy(pExtBuf, buf, len); - bcpValue = pExtBuf; - - pExtBuf += len; - - *pExtBuf = 0; - pExtBuf++; - - extBufCapacity -= (len + 1); - } else { - *status = U_ILLEGAL_ARGUMENT_ERROR; + icu::CharString* extBuf = extBufPool.create(); + if (extBuf == nullptr) { + *status = U_MEMORY_ALLOCATION_ERROR; break; } + extBuf->append(buf.data(), len, tmpStatus); + if (U_FAILURE(tmpStatus)) { + *status = tmpStatus; + break; + } + bcpValue = extBuf->data(); } /* create ExtensionListEntry */ @@ -2337,31 +2417,66 @@ uloc_toLanguageTag(const char* localeID, int32_t langtagCapacity, UBool strict, UErrorCode* status) { - /* char canonical[ULOC_FULLNAME_CAPACITY]; */ /* See #6822 */ - char canonical[256]; - int32_t reslen = 0; + icu::CharString canonical; + int32_t reslen; UErrorCode tmpStatus = U_ZERO_ERROR; UBool hadPosix = FALSE; const char* pKeywordStart; /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */ - canonical[0] = 0; - if (uprv_strlen(localeID) > 0) { - uloc_canonicalize(localeID, canonical, sizeof(canonical), &tmpStatus); - if (tmpStatus != U_ZERO_ERROR) { + int32_t resultCapacity = uprv_strlen(localeID); + if (resultCapacity > 0) { + char* buffer; + + for (;;) { + buffer = canonical.getAppendBuffer( + /*minCapacity=*/resultCapacity, + /*desiredCapacityHint=*/resultCapacity, + resultCapacity, + tmpStatus); + + if (U_FAILURE(tmpStatus)) { + *status = tmpStatus; + return 0; + } + + reslen = + uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus); + + if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) { + break; + } + + resultCapacity = reslen; + tmpStatus = U_ZERO_ERROR; + } + + if (U_FAILURE(tmpStatus)) { *status = U_ILLEGAL_ARGUMENT_ERROR; return 0; } + + canonical.append(buffer, reslen, tmpStatus); + if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { + tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString. + } + + if (U_FAILURE(tmpStatus)) { + *status = tmpStatus; + return 0; + } } + reslen = 0; + /* For handling special case - private use only tag */ - pKeywordStart = locale_getKeywordsStart(canonical); - if (pKeywordStart == canonical) { + pKeywordStart = locale_getKeywordsStart(canonical.data()); + if (pKeywordStart == canonical.data()) { UEnumeration *kwdEnum; int kwdCnt = 0; UBool done = FALSE; - kwdEnum = uloc_openKeywords((const char*)canonical, &tmpStatus); + kwdEnum = uloc_openKeywords(canonical.data(), &tmpStatus); if (kwdEnum != NULL) { kwdCnt = uenum_count(kwdEnum, &tmpStatus); if (kwdCnt == 1) { @@ -2399,12 +2514,12 @@ uloc_toLanguageTag(const char* localeID, } } - reslen += _appendLanguageToLanguageTag(canonical, langtag, langtagCapacity, strict, status); - reslen += _appendScriptToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status); - reslen += _appendRegionToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status); - reslen += _appendVariantsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status); - reslen += _appendKeywordsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); - reslen += _appendPrivateuseToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); + reslen += _appendLanguageToLanguageTag(canonical.data(), langtag, langtagCapacity, strict, status); + reslen += _appendScriptToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status); + reslen += _appendRegionToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status); + reslen += _appendVariantsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status); + reslen += _appendKeywordsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); + reslen += _appendPrivateuseToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status); return reslen; } diff --git a/icu4c/source/test/cintltst/cloctst.c b/icu4c/source/test/cintltst/cloctst.c index 4454c67274..1d1805196f 100644 --- a/icu4c/source/test/cintltst/cloctst.c +++ b/icu4c/source/test/cintltst/cloctst.c @@ -226,6 +226,7 @@ void addLocaleTest(TestNode** root) TESTCASE(TestKeywordVariants); TESTCASE(TestKeywordVariantParsing); TESTCASE(TestCanonicalization); + TESTCASE(TestCanonicalizationBuffer); TESTCASE(TestKeywordSet); TESTCASE(TestKeywordSetError); TESTCASE(TestDisplayKeywords); @@ -2251,6 +2252,42 @@ static void TestCanonicalization(void) } } +static void TestCanonicalizationBuffer(void) +{ + UErrorCode status = U_ZERO_ERROR; + char buffer[256]; + + // ULOC_FULLNAME_CAPACITY == 157 (uloc.h) + static const char name[] = + "zh@x" + "=foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-barz" + ; + static const size_t len = sizeof name - 1; // Without NUL terminator. + + int32_t reslen = uloc_canonicalize(name, buffer, len, &status); + + if (U_FAILURE(status)) { + log_err("FAIL: uloc_canonicalize(%s) => %s, expected !U_FAILURE()\n", + name, u_errorName(status)); + return; + } + + if (reslen != len) { + log_err("FAIL: uloc_canonicalize(%s) => \"%i\", expected \"%u\"\n", + name, reslen, len); + return; + } + + if (uprv_strncmp(name, buffer, len) != 0) { + log_err("FAIL: uloc_canonicalize(%s) => \"%.*s\", expected \"%s\"\n", + name, reslen, buffer, name); + return; + } +} + static void TestDisplayKeywords(void) { int32_t i; diff --git a/icu4c/source/test/cintltst/cloctst.h b/icu4c/source/test/cintltst/cloctst.h index be1896a0c3..a2ce892ec2 100644 --- a/icu4c/source/test/cintltst/cloctst.h +++ b/icu4c/source/test/cintltst/cloctst.h @@ -84,6 +84,7 @@ static void TestDisplayNames(void); static void doTestDisplayNames(const char* inLocale, int32_t compareIndex); static void TestCanonicalization(void); + static void TestCanonicalizationBuffer(void); static void TestDisplayKeywords(void); diff --git a/icu4c/source/test/intltest/loctest.cpp b/icu4c/source/test/intltest/loctest.cpp index d3fc4e286c..e375c0c5a5 100644 --- a/icu4c/source/test/intltest/loctest.cpp +++ b/icu4c/source/test/intltest/loctest.cpp @@ -252,6 +252,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c TESTCASE_AUTO(TestToLanguageTag); TESTCASE_AUTO(TestMoveAssign); TESTCASE_AUTO(TestMoveCtor); + TESTCASE_AUTO(TestBug13417VeryLongLanguageTag); TESTCASE_AUTO_END; } @@ -3125,3 +3126,23 @@ void LocaleTest::TestMoveCtor() { assertEquals("variant", l7.getVariant(), l8.getVariant()); assertEquals("bogus", l7.isBogus(), l8.isBogus()); } + +void LocaleTest::TestBug13417VeryLongLanguageTag() { + IcuTestErrorCode status(*this, "TestBug13417VeryLongLanguageTag()"); + + static const char tag[] = + "zh-x" + "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz" + "-foo-bar-baz-fxx" + ; + + Locale l = Locale::forLanguageTag(tag, status); + status.errIfFailureAndReset("\"%s\"", tag); + assertTrue("!l.isBogus()", !l.isBogus()); + + std::string result = l.toLanguageTag(status); + status.errIfFailureAndReset("\"%s\"", l.getName()); + assertEquals("equals", tag, result.c_str()); +} diff --git a/icu4c/source/test/intltest/loctest.h b/icu4c/source/test/intltest/loctest.h index d165cae893..2a83be51a0 100644 --- a/icu4c/source/test/intltest/loctest.h +++ b/icu4c/source/test/intltest/loctest.h @@ -124,6 +124,8 @@ public: void TestMoveAssign(); void TestMoveCtor(); + void TestBug13417VeryLongLanguageTag(); + private: void _checklocs(const char* label, const char* req,