ICU-13417 Add the Locale::(for|to)LanguageTag() functions.

They are C++ wrappers of uloc_forLanguageTag() and uloc_toLanguageTag()
respectively, that take care of dynamic memory management.
This commit is contained in:
Fredrik Roubert 2018-09-12 20:41:53 -07:00 committed by Shane Carr
parent 3e8fb05f7c
commit 5663412172
No known key found for this signature in database
GPG Key ID: FCED3B24AAB18B5C
4 changed files with 318 additions and 0 deletions

View File

@ -32,8 +32,10 @@
*/
#include "unicode/bytestream.h"
#include "unicode/locid.h"
#include "unicode/strenum.h"
#include "unicode/stringpiece.h"
#include "unicode/uloc.h"
#include "putilimp.h"
#include "mutex.h"
@ -711,6 +713,161 @@ Locale::setDefault( const Locale& newLocale,
locale_set_default_internal(localeID, status);
}
Locale U_EXPORT2
Locale::forLanguageTag(StringPiece tag, UErrorCode& status)
{
    // Starts out bogus; this is the value handed back on every error path.
    Locale locale(Locale::eBOGUS);

    if (U_FAILURE(status)) {
        return locale;
    }

    // uloc_forLanguageTag() is handed a const char*, so work on a copy of the
    // tag held in a CharString rather than on the StringPiece itself.
    // TODO: Remove the need for a const char* to a NUL terminated buffer.
    const CharString terminatedTag(tag, status);
    if (U_FAILURE(status)) {
        return locale;
    }

    // Why not simply construct a Locale from the tag? The ordinary Locale
    // constructor only falls back to uloc_forLanguageTag() when it manages
    // to detect that its input is BCP-47. That detection works for tags that
    // use BCP-47 extensions, but fails for grandfathered tags such as
    // "en-GB-oed", which can equally well be read as ICU locale IDs.
    // Hence uloc_forLanguageTag() is called explicitly here, followed by
    // Locale::init().

    // Initial capacity guess: a simple tag is exactly as long as its ICU
    // locale ID ("en-US" vs. "en_US").
    CharString id;
    int32_t capacity = tag.size();
    char* out;
    int32_t consumed;
    int32_t idLength;

    for (;;) {
        out = id.getAppendBuffer(
                /*minCapacity=*/capacity,
                /*desiredCapacityHint=*/capacity,
                capacity,
                status);
        if (U_FAILURE(status)) {
            return locale;
        }

        idLength = uloc_forLanguageTag(
                terminatedTag.data(), out, capacity, &consumed, &status);

        if (status == U_BUFFER_OVERFLOW_ERROR) {
            // Tags with extensions produce a longer locale ID. The call
            // above reported the exact length needed, so one retry with
            // that capacity is expected to be enough.
            capacity = idLength;
            status = U_ZERO_ERROR;
            continue;
        }
        break;
    }

    if (U_FAILURE(status)) {
        return locale;
    }

    // Anything short of consuming the whole tag is treated as an error.
    if (consumed != tag.size()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return locale;
    }

    id.append(out, idLength, status);
    if (status == U_STRING_NOT_TERMINATED_WARNING) {
        status = U_ZERO_ERROR;  // Terminators provided by CharString.
    }
    if (U_FAILURE(status)) {
        return locale;
    }

    locale.init(id.data(), /*canonicalize=*/FALSE);
    if (locale.isBogus()) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
    return locale;
}
void
Locale::toLanguageTag(ByteSink& sink, UErrorCode& status) const
{
    if (U_FAILURE(status)) {
        return;
    }

    // A bogus Locale has no language tag.
    if (fIsBogus) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Initial capacity guess: a simple locale ID is exactly as long as its
    // BCP-47 form ("en_US" vs. "en-US"). An empty name still needs room for
    // the three characters of "und".
    LocalMemory<char> fallback;
    const int32_t nameLength = static_cast<int32_t>(uprv_strlen(fullName));
    int32_t wanted = (nameLength != 0) ? nameLength : 3;  // "und"

    char* out;
    int32_t available;
    int32_t tagLength;

    for (;;) {
        // Scratch memory is passed to the sink as the buffer it may hand
        // back from GetAppendBuffer().
        if (fallback.allocateInsteadAndReset(wanted) == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }

        out = sink.GetAppendBuffer(
                /*min_capacity=*/wanted,
                /*desired_capacity_hint=*/wanted,
                fallback.getAlias(),
                wanted,
                &available);

        tagLength = uloc_toLanguageTag(
                fullName, out, available, /*strict=*/FALSE, &status);

        if (status == U_BUFFER_OVERFLOW_ERROR) {
            // In a few edge cases the BCP-47 form is longer than the ICU
            // locale ID. The best known one is "C", which becomes
            // "en-US-u-va-posix" (16 times longer) and needs several calls
            // to uloc_toLanguageTag() before the capacity is large enough.
            // https://unicode-org.atlassian.net/browse/ICU-20132
            wanted = tagLength;
            status = U_ZERO_ERROR;
            continue;
        }
        break;
    }

    if (U_FAILURE(status)) {
        return;
    }

    sink.Append(out, tagLength);

    if (status == U_STRING_NOT_TERMINATED_WARNING) {
        status = U_ZERO_ERROR;  // Terminators not used.
    }
}
Locale U_EXPORT2
Locale::createFromName (const char *name)
{

View File

@ -31,6 +31,8 @@
#ifndef LOCID_H
#define LOCID_H
#include "unicode/bytestream.h"
#include "unicode/stringpiece.h"
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/putil.h"
@ -362,6 +364,55 @@ public:
UErrorCode& success);
#endif /* U_HIDE_SYSTEM_API */
#ifndef U_HIDE_DRAFT_API
/**
 * Returns a Locale for the specified BCP47 language tag string.
 * If the specified language tag contains any ill-formed subtags,
 * the first such subtag and all following subtags are ignored.
 * <p>
 * This implements the 'Language-Tag' production of BCP47, and so
 * supports grandfathered (regular and irregular) as well as private
 * use language tags. Private use tags are represented as 'x-whatever',
 * and grandfathered tags are converted to their canonical replacements
 * where they exist. Note that a few grandfathered tags have no modern
 * replacement, these will be converted using the fallback described in
 * the first paragraph, so some information might be lost.
 * <p>
 * <b>Note</b>: the implementation sets status to
 * U_ILLEGAL_ARGUMENT_ERROR and returns a bogus Locale when
 * uloc_forLanguageTag() does not consume the entire tag, or when the
 * parsed locale ID produces a bogus Locale.
 * NOTE(review): this looks stricter than the "are ignored" wording in
 * the first paragraph -- confirm which contract is intended.
 * <p>
 * The tag is copied into an internal buffer before parsing; the unit
 * test passes StringPiece values that are not NUL terminated.
 * @param tag     the input BCP47 language tag.
 * @param status  error information if creating the Locale failed.
 *                If it already indicates a failure on entry, a bogus
 *                Locale is returned and status is left unchanged.
 * @return        the Locale for the specified BCP47 language tag, or a
 *                bogus Locale on failure.
 * @draft ICU 63
 */
static Locale U_EXPORT2 forLanguageTag(StringPiece tag, UErrorCode& status);
/**
 * Returns a well-formed language tag for this Locale.
 * <p>
 * <b>Note</b>: Any locale fields which do not satisfy the BCP47 syntax
 * requirement will be silently omitted from the result.
 *
 * If this function fails, partial output may have been written to the sink.
 *
 * @param sink    the output sink receiving the BCP47 language
 *                tag for this Locale.
 * @param status  error information if creating the language tag failed.
 *                Set to U_ILLEGAL_ARGUMENT_ERROR if this Locale is bogus,
 *                and to U_MEMORY_ALLOCATION_ERROR if the internal scratch
 *                buffer cannot be allocated. If it already indicates a
 *                failure on entry, the function returns immediately.
 * @draft ICU 63
 */
void toLanguageTag(ByteSink& sink, UErrorCode& status) const;
/**
 * Returns a well-formed language tag for this Locale.
 * <p>
 * <b>Note</b>: Any locale fields which do not satisfy the BCP47 syntax
 * requirement will be silently omitted from the result.
 * <p>
 * This is a convenience wrapper: it default-constructs a StringClass,
 * wraps it in a StringByteSink<StringClass> and forwards to
 * toLanguageTag(ByteSink&, UErrorCode&).
 *
 * @tparam StringClass the string type to return; it must be usable with
 *                     StringByteSink (the unit test uses std::string).
 * @param status  error information if creating the language tag failed.
 * @return        the BCP47 language tag for this Locale.
 * @draft ICU 63
 */
template<typename StringClass>
inline StringClass toLanguageTag(UErrorCode& status) const;
#endif // U_HIDE_DRAFT_API
/**
* Creates a locale which has had minimal canonicalization
* as per uloc_getName().
@ -775,6 +826,17 @@ Locale::operator!=(const Locale& other) const
return !operator==(other);
}
#ifndef U_HIDE_DRAFT_API
template<typename StringClass> inline StringClass
Locale::toLanguageTag(UErrorCode& status) const
{
    // Collect the tag in a default-constructed string by routing the
    // ByteSink overload through a sink that writes into that string.
    StringClass tag;
    StringByteSink<StringClass> tagSink(&tag);
    toLanguageTag(tagSink, status);
    return tag;
}
#endif // U_HIDE_DRAFT_API
inline const char *
Locale::getCountry() const
{

View File

@ -15,6 +15,7 @@
#include "unicode/brkiter.h"
#include "unicode/coll.h"
#include "unicode/ustring.h"
#include "unicode/std_string.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
@ -233,6 +234,8 @@ void LocaleTest::runIndexedTest( int32_t index, UBool exec, const char* &name, c
TESTCASE_AUTO(TestIsRightToLeft);
TESTCASE_AUTO(TestBug13277);
TESTCASE_AUTO(TestBug13554);
TESTCASE_AUTO(TestForLanguageTag);
TESTCASE_AUTO(TestToLanguageTag);
TESTCASE_AUTO_END;
}
@ -2748,4 +2751,97 @@ void LocaleTest::TestBug13554() {
}
}
void LocaleTest::TestForLanguageTag() {
IcuTestErrorCode status(*this, "TestForLanguageTag()");
static const char tag_en[] = "en-US";
static const char tag_oed[] = "en-GB-oed";
static const char tag_af[] = "af-t-ar-i0-handwrit-u-ca-coptic-x-foo";
static const char tag_ill[] = "!";
static const char tag_no_nul[] = { 'e', 'n', '-', 'G', 'B' };
static const Locale loc_en("en_US");
static const Locale loc_oed("en_GB@x=oed");
static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo");
static const Locale loc_null("");
static const Locale loc_gb("en_GB");
Locale result_en = Locale::forLanguageTag(tag_en, status);
status.errIfFailureAndReset("\"%s\"", tag_en);
assertEquals(tag_en, loc_en.getName(), result_en.getName());
Locale result_oed = Locale::forLanguageTag(tag_oed, status);
status.errIfFailureAndReset("\"%s\"", tag_oed);
assertEquals(tag_oed, loc_oed.getName(), result_oed.getName());
Locale result_af = Locale::forLanguageTag(tag_af, status);
status.errIfFailureAndReset("\"%s\"", tag_af);
assertEquals(tag_af, loc_af.getName(), result_af.getName());
Locale result_ill = Locale::forLanguageTag(tag_ill, status);
assertEquals(tag_ill, U_ILLEGAL_ARGUMENT_ERROR, status.reset());
assertTrue(result_ill.getName(), result_ill.isBogus());
Locale result_null = Locale::forLanguageTag(nullptr, status);
status.errIfFailureAndReset("nullptr");
assertEquals("nullptr", loc_null.getName(), result_null.getName());
StringPiece sp_substr(tag_oed, 5); // "en-GB", no NUL.
Locale result_substr = Locale::forLanguageTag(sp_substr, status);
status.errIfFailureAndReset("\"%.*s\"", sp_substr.size(), sp_substr.data());
assertEquals(CharString(sp_substr, status).data(),
loc_gb.getName(), result_substr.getName());
StringPiece sp_no_nul(tag_no_nul, sizeof tag_no_nul); // "en-GB", no NUL.
Locale result_no_nul = Locale::forLanguageTag(sp_no_nul, status);
status.errIfFailureAndReset("\"%.*s\"", sp_no_nul.size(), sp_no_nul.data());
assertEquals(CharString(sp_no_nul, status).data(),
loc_gb.getName(), result_no_nul.getName());
}
void LocaleTest::TestToLanguageTag() {
IcuTestErrorCode status(*this, "TestToLanguageTag()");
static const Locale loc_c("C");
static const Locale loc_en("en_US");
static const Locale loc_af("af@calendar=coptic;t=ar-i0-handwrit;x=foo");
static const Locale loc_empty("");
static const Locale loc_ill("!");
static const char tag_c[] = "en-US-u-va-posix";
static const char tag_en[] = "en-US";
static const char tag_af[] = "af-t-ar-i0-handwrit-u-ca-coptic-x-foo";
static const char tag_und[] = "und";
std::string result;
StringByteSink<std::string> sink(&result);
loc_c.toLanguageTag(sink, status);
status.errIfFailureAndReset("\"%s\"", loc_c.getName());
assertEquals(loc_c.getName(), tag_c, result.c_str());
std::string result_c = loc_c.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", loc_c.getName());
assertEquals(loc_c.getName(), tag_c, result_c.c_str());
std::string result_en = loc_en.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", loc_en.getName());
assertEquals(loc_en.getName(), tag_en, result_en.c_str());
std::string result_af = loc_af.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", loc_af.getName());
assertEquals(loc_af.getName(), tag_af, result_af.c_str());
std::string result_empty = loc_empty.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", loc_empty.getName());
assertEquals(loc_empty.getName(), tag_und, result_empty.c_str());
std::string result_ill = loc_ill.toLanguageTag<std::string>(status);
status.errIfFailureAndReset("\"%s\"", loc_ill.getName());
assertEquals(loc_ill.getName(), tag_und, result_ill.c_str());
Locale loc_bogus;
loc_bogus.setToBogus();
std::string result_bogus = loc_bogus.toLanguageTag<std::string>(status);
assertEquals("bogus", U_ILLEGAL_ARGUMENT_ERROR, status.reset());
assertTrue(result_bogus.c_str(), result_bogus.empty());
}

View File

@ -108,6 +108,9 @@ public:
void TestBug13277();
void TestBug13554();
// Exercises Locale::forLanguageTag() with well-formed, ill-formed, null and
// non-NUL-terminated tags.
void TestForLanguageTag();
// Exercises both Locale::toLanguageTag() overloads, including the empty and
// bogus Locale cases.
void TestToLanguageTag();
private:
void _checklocs(const char* label,
const char* req,