ICU-13417 Replace fixed buffers in uloc_tag.cpp with CharString.

This gets rid of those fixed buffers that caused ICU-13417 to be filed
in the first place, those that prevent handling language tags with very
large amounts of keywords.

A number of fixed buffers will still remain in uloc_tag.cpp (and
elsewhere in the locale handling code) for the time being, but this
change is a necessary first step in cleaning up this code and will
alleviate the most pressing problem encountered by ICU4C users.

An off-by-one error in _getKeywords() caused uloc_canonicalize() to not
write out the final keyword value in case the result would fill up the
buffer exactly, resulting in U_STRING_NOT_TERMINATED_WARNING.
This commit is contained in:
Fredrik Roubert 2018-09-19 17:52:37 -07:00 committed by Shane Carr
parent 24b490dc02
commit acc3f65a87
No known key found for this signature in database
GPG Key ID: FCED3B24AAB18B5C
6 changed files with 233 additions and 57 deletions

View File

@ -798,7 +798,7 @@ _getKeywords(const char *localeID,
}
keywordsLen += keywordList[i].keywordLen + 1;
if(valuesToo) {
if(keywordsLen + keywordList[i].valueLen < keywordCapacity) {
if(keywordsLen + keywordList[i].valueLen <= keywordCapacity) {
uprv_strncpy(keywords+keywordsLen, keywordList[i].valueStart, keywordList[i].valueLen);
}
keywordsLen += keywordList[i].valueLen;

View File

@ -12,11 +12,13 @@
#include "unicode/putil.h"
#include "unicode/uloc.h"
#include "ustr_imp.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "putilimp.h"
#include "uinvchar.h"
#include "ulocimp.h"
#include "uvector.h"
#include "uassert.h"
@ -172,6 +174,46 @@ static const char*
ultag_getGrandfathered(const ULanguageTag* langtag);
#endif
namespace {
// Helper class to memory manage CharString objects.
// Only ever stack-allocated, does not need to inherit UMemory.
class CharStringPool {
public:
CharStringPool() : status(U_ZERO_ERROR), pool(&deleter, nullptr, status) {}
~CharStringPool() = default;
CharStringPool(const CharStringPool&) = delete;
CharStringPool& operator=(const CharStringPool&) = delete;
icu::CharString* create() {
if (U_FAILURE(status)) {
return nullptr;
}
icu::CharString* const obj = new icu::CharString;
if (obj == nullptr) {
status = U_MEMORY_ALLOCATION_ERROR;
return nullptr;
}
pool.addElement(obj, status);
if (U_FAILURE(status)) {
delete obj;
return nullptr;
}
return obj;
}
private:
static void U_CALLCONV deleter(void* obj) {
delete static_cast<icu::CharString*>(obj);
}
UErrorCode status;
icu::UVector pool;
};
} // namespace
/*
* -------------------------------------------------
*
@ -900,7 +942,6 @@ _appendVariantsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
static int32_t
_appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capacity, UBool strict, UBool hadPosix, UErrorCode* status) {
char buf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
char attrBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY] = { 0 };
int32_t attrBufLength = 0;
UEnumeration *keywordEnum = NULL;
@ -920,22 +961,48 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
AttributeListEntry *firstAttr = NULL;
AttributeListEntry *attr;
char *attrValue;
char extBuf[ULOC_KEYWORD_AND_VALUES_CAPACITY];
char *pExtBuf = extBuf;
int32_t extBufCapacity = sizeof(extBuf);
CharStringPool extBufPool;
const char *bcpKey=nullptr, *bcpValue=nullptr;
UErrorCode tmpStatus = U_ZERO_ERROR;
int32_t keylen;
UBool isBcpUExt;
while (TRUE) {
icu::CharString buf;
key = uenum_next(keywordEnum, NULL, status);
if (key == NULL) {
break;
}
len = uloc_getKeywordValue(localeID, key, buf, sizeof(buf), &tmpStatus);
/* buf must be null-terminated */
if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
char* buffer;
int32_t resultCapacity = ULOC_KEYWORD_AND_VALUES_CAPACITY;
for (;;) {
buffer = buf.getAppendBuffer(
/*minCapacity=*/resultCapacity,
/*desiredCapacityHint=*/resultCapacity,
resultCapacity,
tmpStatus);
if (U_FAILURE(tmpStatus)) {
break;
}
len = uloc_getKeywordValue(
localeID, key, buffer, resultCapacity, &tmpStatus);
if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
break;
}
resultCapacity = len;
tmpStatus = U_ZERO_ERROR;
}
if (U_FAILURE(tmpStatus)) {
if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) {
*status = U_MEMORY_ALLOCATION_ERROR;
break;
}
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
@ -945,6 +1012,11 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
continue;
}
buf.append(buffer, len, tmpStatus);
if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
}
keylen = (int32_t)uprv_strlen(key);
isBcpUExt = (keylen > 1);
@ -1007,7 +1079,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
}
/* we've checked buf is null-terminated above */
bcpValue = uloc_toUnicodeLocaleType(key, buf);
bcpValue = uloc_toUnicodeLocaleType(key, buf.data());
if (bcpValue == NULL) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
@ -1015,33 +1087,44 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
}
continue;
}
if (bcpValue == buf) {
if (bcpValue == buf.data()) {
/*
When uloc_toUnicodeLocaleType(key, buf) returns the
input value as is, the value is well-formed, but has
no known mapping. This implementation normalizes the
the value to lower case
value to lower case
*/
icu::CharString* extBuf = extBufPool.create();
if (extBuf == nullptr) {
*status = U_MEMORY_ALLOCATION_ERROR;
break;
}
int32_t bcpValueLen = static_cast<int32_t>(uprv_strlen(bcpValue));
if (bcpValueLen < extBufCapacity) {
int32_t resultCapacity;
char* pExtBuf = extBuf->getAppendBuffer(
/*minCapacity=*/bcpValueLen,
/*desiredCapacityHint=*/bcpValueLen,
resultCapacity,
tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
break;
}
uprv_strcpy(pExtBuf, bcpValue);
T_CString_toLowerCase(pExtBuf);
bcpValue = pExtBuf;
pExtBuf += (bcpValueLen + 1);
extBufCapacity -= (bcpValueLen + 1);
} else {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
extBuf->append(pExtBuf, bcpValueLen, tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
break;
}
continue;
}
bcpValue = extBuf->data();
}
} else {
if (*key == PRIVATEUSE) {
if (!_isPrivateuseValueSubtags(buf, len)) {
if (!_isPrivateuseValueSubtags(buf.data(), len)) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
@ -1049,7 +1132,7 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
continue;
}
} else {
if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf, len)) {
if (!_isExtensionSingleton(key, keylen) || !_isExtensionSubtags(buf.data(), len)) {
if (strict) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
break;
@ -1058,20 +1141,17 @@ _appendKeywordsToLanguageTag(const char* localeID, char* appendAt, int32_t capac
}
}
bcpKey = key;
if ((len + 1) < extBufCapacity) {
uprv_memcpy(pExtBuf, buf, len);
bcpValue = pExtBuf;
pExtBuf += len;
*pExtBuf = 0;
pExtBuf++;
extBufCapacity -= (len + 1);
} else {
*status = U_ILLEGAL_ARGUMENT_ERROR;
icu::CharString* extBuf = extBufPool.create();
if (extBuf == nullptr) {
*status = U_MEMORY_ALLOCATION_ERROR;
break;
}
extBuf->append(buf.data(), len, tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
break;
}
bcpValue = extBuf->data();
}
/* create ExtensionListEntry */
@ -2337,31 +2417,66 @@ uloc_toLanguageTag(const char* localeID,
int32_t langtagCapacity,
UBool strict,
UErrorCode* status) {
/* char canonical[ULOC_FULLNAME_CAPACITY]; */ /* See #6822 */
char canonical[256];
int32_t reslen = 0;
icu::CharString canonical;
int32_t reslen;
UErrorCode tmpStatus = U_ZERO_ERROR;
UBool hadPosix = FALSE;
const char* pKeywordStart;
/* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */
canonical[0] = 0;
if (uprv_strlen(localeID) > 0) {
uloc_canonicalize(localeID, canonical, sizeof(canonical), &tmpStatus);
if (tmpStatus != U_ZERO_ERROR) {
int32_t resultCapacity = uprv_strlen(localeID);
if (resultCapacity > 0) {
char* buffer;
for (;;) {
buffer = canonical.getAppendBuffer(
/*minCapacity=*/resultCapacity,
/*desiredCapacityHint=*/resultCapacity,
resultCapacity,
tmpStatus);
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return 0;
}
reslen =
uloc_canonicalize(localeID, buffer, resultCapacity, &tmpStatus);
if (tmpStatus != U_BUFFER_OVERFLOW_ERROR) {
break;
}
resultCapacity = reslen;
tmpStatus = U_ZERO_ERROR;
}
if (U_FAILURE(tmpStatus)) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
canonical.append(buffer, reslen, tmpStatus);
if (tmpStatus == U_STRING_NOT_TERMINATED_WARNING) {
tmpStatus = U_ZERO_ERROR; // Terminators provided by CharString.
}
if (U_FAILURE(tmpStatus)) {
*status = tmpStatus;
return 0;
}
}
reslen = 0;
/* For handling special case - private use only tag */
pKeywordStart = locale_getKeywordsStart(canonical);
if (pKeywordStart == canonical) {
pKeywordStart = locale_getKeywordsStart(canonical.data());
if (pKeywordStart == canonical.data()) {
UEnumeration *kwdEnum;
int kwdCnt = 0;
UBool done = FALSE;
kwdEnum = uloc_openKeywords((const char*)canonical, &tmpStatus);
kwdEnum = uloc_openKeywords(canonical.data(), &tmpStatus);
if (kwdEnum != NULL) {
kwdCnt = uenum_count(kwdEnum, &tmpStatus);
if (kwdCnt == 1) {
@ -2399,12 +2514,12 @@ uloc_toLanguageTag(const char* localeID,
}
}
reslen += _appendLanguageToLanguageTag(canonical, langtag, langtagCapacity, strict, status);
reslen += _appendScriptToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendRegionToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendVariantsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
reslen += _appendKeywordsToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
reslen += _appendPrivateuseToLanguageTag(canonical, langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
reslen += _appendLanguageToLanguageTag(canonical.data(), langtag, langtagCapacity, strict, status);
reslen += _appendScriptToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendRegionToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, status);
reslen += _appendVariantsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, &hadPosix, status);
reslen += _appendKeywordsToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
reslen += _appendPrivateuseToLanguageTag(canonical.data(), langtag + reslen, langtagCapacity - reslen, strict, hadPosix, status);
return reslen;
}

View File

@ -226,6 +226,7 @@ void addLocaleTest(TestNode** root)
TESTCASE(TestKeywordVariants);
TESTCASE(TestKeywordVariantParsing);
TESTCASE(TestCanonicalization);
TESTCASE(TestCanonicalizationBuffer);
TESTCASE(TestKeywordSet);
TESTCASE(TestKeywordSetError);
TESTCASE(TestDisplayKeywords);
@ -2251,6 +2252,42 @@ static void TestCanonicalization(void)
}
}
static void TestCanonicalizationBuffer(void)
{
UErrorCode status = U_ZERO_ERROR;
char buffer[256];
// ULOC_FULLNAME_CAPACITY == 157 (uloc.h)
static const char name[] =
"zh@x"
"=foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-barz"
;
static const size_t len = sizeof name - 1; // Without NUL terminator.
int32_t reslen = uloc_canonicalize(name, buffer, len, &status);
if (U_FAILURE(status)) {
log_err("FAIL: uloc_canonicalize(%s) => %s, expected !U_FAILURE()\n",
name, u_errorName(status));
return;
}
if (reslen != len) {
log_err("FAIL: uloc_canonicalize(%s) => \"%i\", expected \"%u\"\n",
name, reslen, len);
return;
}
if (uprv_strncmp(name, buffer, len) != 0) {
log_err("FAIL: uloc_canonicalize(%s) => \"%.*s\", expected \"%s\"\n",
name, reslen, buffer, name);
return;
}
}
static void TestDisplayKeywords(void)
{
int32_t i;

View File

@ -84,6 +84,7 @@ static void TestDisplayNames(void);
static void doTestDisplayNames(const char* inLocale, int32_t compareIndex);
static void TestCanonicalization(void);
static void TestCanonicalizationBuffer(void);
static void TestDisplayKeywords(void);

View File

@ -252,6 +252,7 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
TESTCASE_AUTO(TestToLanguageTag);
TESTCASE_AUTO(TestMoveAssign);
TESTCASE_AUTO(TestMoveCtor);
TESTCASE_AUTO(TestBug13417VeryLongLanguageTag);
TESTCASE_AUTO_END;
}
@ -3125,3 +3126,23 @@ void LocaleTest::TestMoveCtor() {
assertEquals("variant", l7.getVariant(), l8.getVariant());
assertEquals("bogus", l7.isBogus(), l8.isBogus());
}
void LocaleTest::TestBug13417VeryLongLanguageTag() {
IcuTestErrorCode status(*this, "TestBug13417VeryLongLanguageTag()");
static const char tag[] =
"zh-x"
"-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-bar-baz-foo-bar-baz-foo-bar-baz-foo-bar-baz"
"-foo-bar-baz-fxx"
;
Locale l = Locale::forLanguageTag(tag, status);
status.errIfFailureAndReset("\"%s\"", tag);
assertTrue("!l.isBogus()", !l.isBogus());
std::string result = l.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", l.getName());
assertEquals("equals", tag, result.c_str());
}

View File

@ -124,6 +124,8 @@ public:
void TestMoveAssign();
void TestMoveCtor();
void TestBug13417VeryLongLanguageTag();
private:
void _checklocs(const char* label,
const char* req,