ICU-3361 fix ucnv_getUnicodeSet(ISO-2022-xx)
X-SVN-Rev: 14000
This commit is contained in:
parent
315f0b4633
commit
789228165c
@@ -488,9 +488,11 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
|
|||||||
(myLocale[2]=='_' || myLocale[2]=='\0')){
|
(myLocale[2]=='_' || myLocale[2]=='\0')){
|
||||||
|
|
||||||
/* open the required converters and cache them */
|
/* open the required converters and cache them */
|
||||||
myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode);
|
myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode);
|
||||||
myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode);
|
if(version==1) {
|
||||||
myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode);
|
myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode);
|
||||||
|
}
|
||||||
|
myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode);
|
||||||
|
|
||||||
|
|
||||||
/*initialize the state variables*/
|
/*initialize the state variables*/
|
||||||
@@ -2793,12 +2795,14 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||||||
if (U_FAILURE(*pErrorCode)) {
|
if (U_FAILURE(*pErrorCode)) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
#ifdef U_ENABLE_GENERIC_ISO_2022
|
||||||
if (cnv->sharedData == &_ISO2022Data) {
|
if (cnv->sharedData == &_ISO2022Data) {
|
||||||
/* We use UTF-8 in this case */
|
/* We use UTF-8 in this case */
|
||||||
uset_addRange(set, 0, 0xd7FF);
|
uset_addRange(set, 0, 0xd7FF);
|
||||||
uset_addRange(set, 0xE000, 0x10FFFF);
|
uset_addRange(set, 0xE000, 0x10FFFF);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
cnvData = (UConverterDataISO2022*)cnv->extraInfo;
|
cnvData = (UConverterDataISO2022*)cnv->extraInfo;
|
||||||
if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) {
|
if (cnv->sharedData == &_ISO2022KRData && cnvData->currentConverter != NULL) {
|
||||||
@@ -2811,28 +2815,29 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||||||
case 'j':
|
case 'j':
|
||||||
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
|
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
|
||||||
/* include Latin-1 for some variants of JP */
|
/* include Latin-1 for some variants of JP */
|
||||||
cnvSet = uset_open(0, 0xff);
|
uset_addRange(set, 0, 0xff);
|
||||||
} else {
|
} else {
|
||||||
/* include ASCII for JP */
|
/* include ASCII for JP */
|
||||||
cnvSet = uset_open(0, 0x7f);
|
uset_addRange(set, 0, 0x7f);
|
||||||
}
|
}
|
||||||
/* include half-width Katakana for JP */
|
/* include half-width Katakana for JP */
|
||||||
uset_addRange(cnvSet, 0xff61, 0xff9f);
|
uset_addRange(set, 0xff61, 0xff9f);
|
||||||
break;
|
break;
|
||||||
case 'c':
|
case 'c':
|
||||||
case 'z':
|
case 'z':
|
||||||
/* include ASCII for CN */
|
/* include ASCII for CN */
|
||||||
cnvSet = uset_open(0, 0x7f);
|
uset_addRange(set, 0, 0x7f);
|
||||||
break;
|
break;
|
||||||
case 'k':
|
case 'k':
|
||||||
/* there is only one converter for KR, and it is not in the myConverterArray[] */
|
/* there is only one converter for KR, and it is not in the myConverterArray[] */
|
||||||
ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
|
ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
|
||||||
return;
|
return;
|
||||||
default:
|
default:
|
||||||
cnvSet = uset_open(1, 0);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* open a helper set because ucnv_getUnicodeSet() first empties its result set */
|
||||||
|
cnvSet = uset_open(1, 0);
|
||||||
if (!cnvSet) {
|
if (!cnvSet) {
|
||||||
*pErrorCode =U_MEMORY_ALLOCATION_ERROR;
|
*pErrorCode =U_MEMORY_ALLOCATION_ERROR;
|
||||||
return;
|
return;
|
||||||
@@ -2847,8 +2852,19 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
|||||||
*/
|
*/
|
||||||
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
||||||
if(cnvData->myConverterArray[i]!=NULL) {
|
if(cnvData->myConverterArray[i]!=NULL) {
|
||||||
ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
|
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
|
||||||
uset_addAll(set, cnvSet /* pErrorCode */);
|
cnvData->version==0 && i==CNS_11643
|
||||||
|
) {
|
||||||
|
/* special handling for non-EXT ISO-2022-CN: add only code points for CNS planes 1 and 2 */
|
||||||
|
_MBCSGetUnicodeSetForBytes(
|
||||||
|
cnvData->myConverterArray[i],
|
||||||
|
set, UCNV_ROUNDTRIP_SET,
|
||||||
|
0, 0x81, 0x82,
|
||||||
|
pErrorCode);
|
||||||
|
} else {
|
||||||
|
ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);
|
||||||
|
uset_addAll(set, cnvSet /* pErrorCode */);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
uset_close(cnvSet);
|
uset_close(cnvSet);
|
||||||
|
@@ -426,6 +426,98 @@ _MBCSSizeofFromUBytes(UConverterMBCSTable *mbcsTable) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* similar to _MBCSGetNextUChar() but recursive */
|
||||||
|
static void
|
||||||
|
_getUnicodeSetForBytes(const UConverter *cnv,
|
||||||
|
const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
|
||||||
|
USet *set,
|
||||||
|
UConverterUnicodeSet which,
|
||||||
|
uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
|
||||||
|
|
||||||
|
UErrorCode *pErrorCode) {
|
||||||
|
int32_t b, entry;
|
||||||
|
|
||||||
|
for(b=lowByte; b<=highByte; ++b) {
|
||||||
|
entry=stateTable[state][b];
|
||||||
|
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||||
|
_getUnicodeSetForBytes(
|
||||||
|
cnv, stateTable, unicodeCodeUnits,
|
||||||
|
set, which,
|
||||||
|
(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
|
||||||
|
offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
|
||||||
|
0, 0xff,
|
||||||
|
pErrorCode);
|
||||||
|
} else {
|
||||||
|
UChar32 c;
|
||||||
|
int32_t rowOffset=offset;
|
||||||
|
uint8_t action;
|
||||||
|
|
||||||
|
c=U_SENTINEL;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* An if-else-if chain provides more reliable performance for
|
||||||
|
* the most common cases compared to a switch.
|
||||||
|
*/
|
||||||
|
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||||
|
if(action==MBCS_STATE_VALID_DIRECT_16) {
|
||||||
|
/* output BMP code point */
|
||||||
|
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||||
|
} else if(action==MBCS_STATE_VALID_16) {
|
||||||
|
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||||
|
c=unicodeCodeUnits[offset];
|
||||||
|
if(c<0xfffe) {
|
||||||
|
/* output BMP code point */
|
||||||
|
} else {
|
||||||
|
c=U_SENTINEL;
|
||||||
|
}
|
||||||
|
} else if(action==MBCS_STATE_VALID_16_PAIR) {
|
||||||
|
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||||
|
c=unicodeCodeUnits[offset++];
|
||||||
|
if(c<0xd800) {
|
||||||
|
/* output BMP code point below 0xd800 */
|
||||||
|
} else if(c<=0xdbff) {
|
||||||
|
/* output roundtrip or fallback supplementary code point */
|
||||||
|
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
|
||||||
|
} else if(c==0xe000) {
|
||||||
|
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
|
||||||
|
c=unicodeCodeUnits[offset];
|
||||||
|
} else {
|
||||||
|
c=U_SENTINEL;
|
||||||
|
}
|
||||||
|
} else if(action==MBCS_STATE_VALID_DIRECT_20) {
|
||||||
|
/* output supplementary code point */
|
||||||
|
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||||
|
}
|
||||||
|
|
||||||
|
if(c>=0) {
|
||||||
|
uset_add(set, c);
|
||||||
|
}
|
||||||
|
offset=rowOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||||
|
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||||
|
* In the future, if we add support for reverse-fallback sets, this function
|
||||||
|
* needs to be updated, and called for each initial state.
|
||||||
|
* Does not currently handle extensions.
|
||||||
|
* Does not empty the set first.
|
||||||
|
*/
|
||||||
|
U_CFUNC void
|
||||||
|
_MBCSGetUnicodeSetForBytes(const UConverter *cnv,
|
||||||
|
USet *set,
|
||||||
|
UConverterUnicodeSet which,
|
||||||
|
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||||
|
UErrorCode *pErrorCode) {
|
||||||
|
_getUnicodeSetForBytes(
|
||||||
|
cnv, cnv->sharedData->mbcs.stateTable, cnv->sharedData->mbcs.unicodeCodeUnits,
|
||||||
|
set, which,
|
||||||
|
state, 0, lowByte, highByte,
|
||||||
|
pErrorCode);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
_MBCSGetUnicodeSet(const UConverter *cnv,
|
_MBCSGetUnicodeSet(const UConverter *cnv,
|
||||||
USet *set,
|
USet *set,
|
||||||
|
@@ -352,5 +352,19 @@ U_CFUNC void
|
|||||||
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||||
UErrorCode *pErrorCode);
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||||
|
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||||
|
* In the future, if we add support for reverse-fallback sets, this function
|
||||||
|
* needs to be updated, and called for each initial state.
|
||||||
|
* Does not currently handle extensions.
|
||||||
|
* Does not empty the set first.
|
||||||
|
*/
|
||||||
|
U_CFUNC void
|
||||||
|
_MBCSGetUnicodeSetForBytes(const UConverter *cnv,
|
||||||
|
USet *set,
|
||||||
|
UConverterUnicodeSet which,
|
||||||
|
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||||
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
36
icu4c/source/test/testdata/conversion.txt
vendored
36
icu4c/source/test/testdata/conversion.txt
vendored
@@ -503,6 +503,42 @@ conversion {
|
|||||||
// which - numeric UConverterUnicodeSet value
|
// which - numeric UConverterUnicodeSet value
|
||||||
Headers { "charset", "map", "mapnot", "which" }
|
Headers { "charset", "map", "mapnot", "which" }
|
||||||
Cases {
|
Cases {
|
||||||
|
// ISO-2022-KR
|
||||||
|
{
|
||||||
|
"ISO-2022-KR",
|
||||||
|
"[\x00-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
|
||||||
|
"[\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
|
||||||
|
:int{0}
|
||||||
|
}
|
||||||
|
|
||||||
|
// versions of ISO-2022-JP
|
||||||
|
{
|
||||||
|
"ISO-2022-JP",
|
||||||
|
"[\x00-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||||
|
"[\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
|
||||||
|
:int{0}
|
||||||
|
}
|
||||||
|
{
|
||||||
|
"ISO-2022-JP-2",
|
||||||
|
"[\x00-\u0113\u0385-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||||
|
"[\uffe7-\U0010ffff]",
|
||||||
|
:int{0}
|
||||||
|
}
|
||||||
|
|
||||||
|
// versions of ISO-2022-CN
|
||||||
|
{
|
||||||
|
"ISO-2022-CN",
|
||||||
|
"[\x00-\x7f\u4e00\u4e01\u9f98\ufe6b]",
|
||||||
|
"[\u4e29\uffe6-\U0010ffff]",
|
||||||
|
:int{0}
|
||||||
|
}
|
||||||
|
{
|
||||||
|
"ISO-2022-CN-EXT",
|
||||||
|
"[\x00-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
|
||||||
|
"[\U00020001\U00020002\U0002a6d7-\U0010ffff]",
|
||||||
|
:int{0}
|
||||||
|
}
|
||||||
|
|
||||||
// DBCS-only
|
// DBCS-only
|
||||||
{
|
{
|
||||||
"ibm-971",
|
"ibm-971",
|
||||||
|
Loading…
Reference in New Issue
Block a user