ICU-10751 uscript_getCode(locale) use likely subtags not LocaleScript locale data

X-SVN-Rev: 36280
This commit is contained in:
Markus Scherer 2014-08-28 23:51:38 +00:00
parent c955c14a6b
commit e353b8e897
3 changed files with 183 additions and 65 deletions

View File

@ -423,15 +423,16 @@ typedef enum UScriptCode {
} UScriptCode; } UScriptCode;
/** /**
* Gets script codes associated with the given locale or ISO 15924 abbreviation or name. * Gets the script codes associated with the given locale or ISO 15924 abbreviation or name.
* Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym". * Fills in USCRIPT_MALAYALAM given "Malayalam" OR "Mlym".
* Fills in USCRIPT_LATIN given "en" OR "en_US" * Fills in USCRIPT_LATIN given "en" OR "en_US"
* If required capacity is greater than capacity of the destination buffer then the error code * If the required capacity is greater than the capacity of the destination buffer,
* is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned * then the error code is set to U_BUFFER_OVERFLOW_ERROR and the required capacity is returned.
* *
* <p>Note: To search by short or long script alias only, use * <p>Note: To search by short or long script alias only, use
* u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. This does * u_getPropertyValueEnum(UCHAR_SCRIPT, alias) instead. That does
* a fast lookup with no access of the locale data. * a fast lookup with no access of the locale data.
*
* @param nameOrAbbrOrLocale name of the script, as given in * @param nameOrAbbrOrLocale name of the script, as given in
* PropertyValueAliases.txt, or ISO 15924 code or locale * PropertyValueAliases.txt, or ISO 15924 code or locale
* @param fillIn the UScriptCode buffer to fill in the script code * @param fillIn the UScriptCode buffer to fill in the script code

View File

@ -1,6 +1,6 @@
/* /*
********************************************************************** **********************************************************************
* Copyright (C) 1997-2011, International Business Machines * Copyright (C) 1997-2014, International Business Machines
* Corporation and others. All Rights Reserved. * Corporation and others. All Rights Reserved.
********************************************************************** **********************************************************************
* *
@ -13,85 +13,126 @@
****************************************************************************** ******************************************************************************
*/ */
#include "unicode/uscript.h"
#include "unicode/ures.h"
#include "unicode/uchar.h" #include "unicode/uchar.h"
#include "unicode/putil.h" #include "unicode/uscript.h"
#include "uprops.h" #include "unicode/uloc.h"
#include "cmemory.h" #include "cmemory.h"
#include "cstring.h" #include "cstring.h"
static const char kLocaleScript[] = "LocaleScript"; static const UScriptCode JAPANESE[3] = { USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN };
static const UScriptCode KOREAN[2] = { USCRIPT_HANGUL, USCRIPT_HAN };
static const UScriptCode HAN_BOPO[2] = { USCRIPT_HAN, USCRIPT_BOPOMOFO };
/*
 * Copies a fixed list of script codes into the caller's buffer.
 *
 * src/length:    the codes to copy and how many there are
 * dest/capacity: the output buffer and its size in elements
 * err:           in/out error code; nothing happens if it already indicates a failure
 *
 * Returns 0 if *err was already a failure, otherwise length.
 * When the list does not fit, *err is set to U_BUFFER_OVERFLOW_ERROR,
 * nothing is written to dest, and length is still returned
 * so that the caller learns the required capacity.
 */
static int32_t
setCodes(const UScriptCode *src, int32_t length,
         UScriptCode *dest, int32_t capacity, UErrorCode *err) {
    int32_t copied = 0;
    if(U_FAILURE(*err)) {
        return 0;
    }
    if(capacity < length) {
        *err = U_BUFFER_OVERFLOW_ERROR;
        return length;
    }
    while(copied < length) {
        dest[copied] = src[copied];
        ++copied;
    }
    return length;
}
/*
 * Stores a single script code into the caller's buffer.
 *
 * script:           the code to store
 * scripts/capacity: the output buffer and its size in elements
 * err:              in/out error code; nothing happens if it already indicates a failure
 *
 * Returns 0 if *err was already a failure, otherwise 1 (the required capacity).
 * With a capacity below 1 nothing is written and *err is set to
 * U_BUFFER_OVERFLOW_ERROR.
 */
static int32_t
setOneCode(UScriptCode script, UScriptCode *scripts, int32_t capacity, UErrorCode *err) {
    if(U_FAILURE(*err)) {
        return 0;
    }
    if(capacity >= 1) {
        *scripts = script;
    } else {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }
    return 1;
}
/*
 * Looks up the script code(s) for a locale ID from its language and script subtags.
 *
 * locale:           the locale ID to inspect
 * scripts/capacity: the output buffer and its size in elements
 * err:              in/out error code; only buffer overflow from filling
 *                   the output is reported through it
 *
 * Returns the number of script codes for the locale, or 0 if none could be
 * determined (including when *err already indicated a failure on entry).
 * Failures while parsing the locale ID are tracked in a local error code
 * and simply yield 0; they are not propagated to *err.
 */
static int32_t
getCodesFromLocale(const char *locale,
                   UScriptCode *scripts, int32_t capacity, UErrorCode *err) {
    UErrorCode internalErrorCode = U_ZERO_ERROR;
    char lang[8];
    char script[8];
    int32_t scriptLength;
    if(U_FAILURE(*err)) { return 0; }
    /*
     * Multi-script languages, equivalent to the LocaleScript data
     * that we used to load from locale resource bundles.
     */
    /* Only the text of the language subtag is needed, not its length. */
    (void)uloc_getLanguage(locale, lang, UPRV_LENGTHOF(lang), &internalErrorCode);
    if(U_FAILURE(internalErrorCode) || internalErrorCode == U_STRING_NOT_TERMINATED_WARNING) {
        /* An unterminated subtag cannot be compared with strcmp. */
        return 0;
    }
    if(0 == uprv_strcmp(lang, "ja")) {
        return setCodes(JAPANESE, UPRV_LENGTHOF(JAPANESE), scripts, capacity, err);
    }
    if(0 == uprv_strcmp(lang, "ko")) {
        return setCodes(KOREAN, UPRV_LENGTHOF(KOREAN), scripts, capacity, err);
    }
    scriptLength = uloc_getScript(locale, script, UPRV_LENGTHOF(script), &internalErrorCode);
    if(U_FAILURE(internalErrorCode) || internalErrorCode == U_STRING_NOT_TERMINATED_WARNING) {
        return 0;
    }
    /* Checked before the generic script lookup so that zh_Hant yields two codes. */
    if(0 == uprv_strcmp(lang, "zh") && 0 == uprv_strcmp(script, "Hant")) {
        return setCodes(HAN_BOPO, UPRV_LENGTHOF(HAN_BOPO), scripts, capacity, err);
    }
    /* Explicit script code. */
    if(scriptLength != 0) {
        UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script);
        if(scriptCode != USCRIPT_INVALID_CODE) {
            /* Report the Han variants as plain Han. */
            if(scriptCode == USCRIPT_SIMPLIFIED_HAN || scriptCode == USCRIPT_TRADITIONAL_HAN) {
                scriptCode = USCRIPT_HAN;
            }
            return setOneCode(scriptCode, scripts, capacity, err);
        }
    }
    return 0;
}
/* TODO: this is a bad API and should be deprecated, ticket #11141 */
U_CAPI int32_t U_EXPORT2 U_CAPI int32_t U_EXPORT2
uscript_getCode(const char* nameOrAbbrOrLocale, uscript_getCode(const char* nameOrAbbrOrLocale,
UScriptCode* fillIn, UScriptCode* fillIn,
int32_t capacity, int32_t capacity,
UErrorCode* err){ UErrorCode* err){
if(U_FAILURE(*err)) {
UScriptCode code = USCRIPT_INVALID_CODE; return 0;
int32_t numFilled=0;
int32_t len=0;
/* check arguments */
if(err==NULL ||U_FAILURE(*err)){
return numFilled;
} }
if(nameOrAbbrOrLocale==NULL || fillIn == NULL || capacity<0){ if(nameOrAbbrOrLocale==NULL ||
(fillIn == NULL ? capacity != 0 : capacity < 0)) {
*err = U_ILLEGAL_ARGUMENT_ERROR; *err = U_ILLEGAL_ARGUMENT_ERROR;
return numFilled; return 0;
} }
UBool triedCode = FALSE;
if(uprv_strchr(nameOrAbbrOrLocale, '-')==NULL && uprv_strchr(nameOrAbbrOrLocale, '_')==NULL ){ if(uprv_strchr(nameOrAbbrOrLocale, '-')==NULL && uprv_strchr(nameOrAbbrOrLocale, '_')==NULL ){
/* try long and abbreviated script names first */ /* try long and abbreviated script names first */
code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, nameOrAbbrOrLocale); UScriptCode code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, nameOrAbbrOrLocale);
if(code!=USCRIPT_INVALID_CODE) {
return setOneCode(code, fillIn, capacity, err);
} }
if(code==(UScriptCode)UCHAR_INVALID_CODE){ triedCode = TRUE;
/* Do not propagate error codes from just not finding a locale bundle. */ }
UErrorCode localErrorCode = U_ZERO_ERROR; char likely[ULOC_FULLNAME_CAPACITY];
UResourceBundle* resB = ures_open(NULL,nameOrAbbrOrLocale,&localErrorCode); UErrorCode internalErrorCode = U_ZERO_ERROR;
if(U_SUCCESS(localErrorCode)&& localErrorCode != U_USING_DEFAULT_WARNING){ int32_t length = getCodesFromLocale(nameOrAbbrOrLocale, fillIn, capacity, err);
UResourceBundle* resD = ures_getByKey(resB,kLocaleScript,NULL,&localErrorCode); if(U_FAILURE(*err) || length != 0) {
if(U_SUCCESS(localErrorCode) ){ return length;
len =0; }
while(ures_hasNext(resD)){ (void)uloc_addLikelySubtags(nameOrAbbrOrLocale,
const UChar* name = ures_getNextString(resD,&len,NULL,&localErrorCode); likely, UPRV_LENGTHOF(likely), &internalErrorCode);
if(U_SUCCESS(localErrorCode)){ if(U_SUCCESS(internalErrorCode) && internalErrorCode != U_STRING_NOT_TERMINATED_WARNING) {
char cName[50] = {'\0'}; length = getCodesFromLocale(likely, fillIn, capacity, err);
u_UCharsToChars(name,cName,len); if(U_FAILURE(*err) || length != 0) {
code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, cName); return length;
/* got the script code now fill in the buffer */
if(numFilled<capacity){
*(fillIn)++=code;
numFilled++;
}else{
ures_close(resD);
ures_close(resB);
*err=U_BUFFER_OVERFLOW_ERROR;
return len;
} }
} }
} if(!triedCode) {
}
ures_close(resD);
}
ures_close(resB);
code = USCRIPT_INVALID_CODE;
}
if(code==(UScriptCode)UCHAR_INVALID_CODE){
/* still not found .. try long and abbreviated script names again */ /* still not found .. try long and abbreviated script names again */
code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, nameOrAbbrOrLocale); UScriptCode code = (UScriptCode) u_getPropertyValueEnum(UCHAR_SCRIPT, nameOrAbbrOrLocale);
} if(code!=USCRIPT_INVALID_CODE) {
if(code!=(UScriptCode)UCHAR_INVALID_CODE){ return setOneCode(code, fillIn, capacity, err);
/* we found it */
if(numFilled<capacity){
*(fillIn)++=code;
numFilled++;
}else{
*err=U_BUFFER_OVERFLOW_ERROR;
return len;
} }
} }
return numFilled; return 0;
} }

View File

@ -11,6 +11,38 @@
#include "cucdapi.h" #include "cucdapi.h"
#include "cmemory.h" #include "cmemory.h"
/*
 * Builds a printable form of a script code list for test messages:
 * the short names of the codes separated by single spaces,
 * or "(no scripts)" when length is 0.
 *
 * scripts/length: the codes to format
 * s:              output buffer; no bounds checking is done here,
 *                 so the caller must provide enough room
 */
static void scriptsToString(const UScriptCode scripts[], int32_t length, char s[]) {
    int32_t idx = 0;
    if(length == 0) {
        strcpy(s, "(no scripts)");
        return;
    }
    *s = 0;
    while(idx < length) {
        if(idx != 0) {
            strcat(s, " ");
        }
        strcat(s, uscript_getShortName(scripts[idx]));
        ++idx;
    }
}
/*
 * Test helper: logs an error unless two script code lists format identically.
 *
 * msg:               description of the check, used as a prefix in log output
 * scripts1/length1:  the expected codes
 * scripts2/length2:  the codes actually obtained
 * errorCode:         result of the call under test; a failure is logged
 *                    with its error name and the lists are not compared
 */
static void assertEqualScripts(const char *msg,
                               const UScriptCode scripts1[], int32_t length1,
                               const UScriptCode scripts2[], int32_t length2,
                               UErrorCode errorCode) {
    char expected[80];
    char actual[80];
    if(!U_FAILURE(errorCode)) {
        scriptsToString(scripts1, length1, expected);
        scriptsToString(scripts2, length2, actual);
        if(strcmp(expected, actual) != 0) {
            log_err("Failed: %s: expected %s but got %s\n", msg, expected, actual);
        }
    } else {
        log_err("Failed: %s - %s\n", msg, u_errorName(errorCode));
    }
}
void TestUScriptCodeAPI(){ void TestUScriptCodeAPI(){
int i =0; int i =0;
int numErrors =0; int numErrors =0;
@ -112,6 +144,50 @@ void TestUScriptCodeAPI(){
} }
} }
{
static const UScriptCode LATIN[1] = { USCRIPT_LATIN };
static const UScriptCode CYRILLIC[1] = { USCRIPT_CYRILLIC };
static const UScriptCode DEVANAGARI[1] = { USCRIPT_DEVANAGARI };
static const UScriptCode HAN[1] = { USCRIPT_HAN };
static const UScriptCode JAPANESE[3] = { USCRIPT_KATAKANA, USCRIPT_HIRAGANA, USCRIPT_HAN };
static const UScriptCode KOREAN[2] = { USCRIPT_HANGUL, USCRIPT_HAN };
static const UScriptCode HAN_BOPO[2] = { USCRIPT_HAN, USCRIPT_BOPOMOFO };
UScriptCode scripts[5];
UErrorCode err;
int32_t num;
// Should work regardless of whether we have locale data for the language.
err = U_ZERO_ERROR;
num = uscript_getCode("tg", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("tg script: Cyrl", CYRILLIC, 1, scripts, num, err); // Tajik
err = U_ZERO_ERROR;
num = uscript_getCode("xsr", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("xsr script: Deva", DEVANAGARI, 1, scripts, num, err); // Sherpa
// Multi-script languages.
err = U_ZERO_ERROR;
num = uscript_getCode("ja", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("ja scripts: Kana Hira Hani",
JAPANESE, UPRV_LENGTHOF(JAPANESE), scripts, num, err);
err = U_ZERO_ERROR;
num = uscript_getCode("ko", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("ko scripts: Hang Hani",
KOREAN, UPRV_LENGTHOF(KOREAN), scripts, num, err);
err = U_ZERO_ERROR;
num = uscript_getCode("zh", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("zh script: Hani", HAN, 1, scripts, num, err);
err = U_ZERO_ERROR;
num = uscript_getCode("zh-Hant", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("zh-Hant scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err);
err = U_ZERO_ERROR;
num = uscript_getCode("zh-TW", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("zh-TW scripts: Hani Bopo", HAN_BOPO, 2, scripts, num, err);
// Ambiguous API, but this probably wants to return Latin rather than Rongorongo (Roro).
err = U_ZERO_ERROR;
num = uscript_getCode("ro-RO", scripts, UPRV_LENGTHOF(scripts), &err);
assertEqualScripts("ro-RO script: Latn", LATIN, 1, scripts, num, err);
}
{ {
UScriptCode testAbbr[]={ UScriptCode testAbbr[]={