ICU-5797 use Shift-JIS table for ISO 2022-JP, and hardcode JIS X 0201 mappings
X-SVN-Rev: 22772
This commit is contained in:
parent
e6ca6a5162
commit
cc36611b2f
@ -472,8 +472,7 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
|
||||
if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
|
||||
myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode);
|
||||
}
|
||||
myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode);
|
||||
myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode);
|
||||
myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode);
|
||||
if(jpCharsetMasks[version]&CSM(JISX212)) {
|
||||
myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode);
|
||||
}
|
||||
@ -1040,14 +1039,6 @@ MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData,
|
||||
length=3;
|
||||
}
|
||||
}
|
||||
/*
|
||||
* TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space.
|
||||
* Pass in parameter for type of output bytes, for validation and shifting:
|
||||
* - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20?
|
||||
* (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.)
|
||||
* - A1-FE: Subtract 80 after range check.
|
||||
* - SJIS: Shift DBCS result to 21-7E x 21-7E.
|
||||
*/
|
||||
/* is this code point assigned, or do we use fallbacks? */
|
||||
if((stage2Entry&(1<<(16+(c&0xf))))!=0) {
|
||||
/* assigned */
|
||||
@ -1105,6 +1096,23 @@ MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData,
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check that the result is a 2-byte value with each byte in the range A1..FE
|
||||
* (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte
|
||||
* to move it to the ISO 2022 range 21..7E.
|
||||
* Return 0 if out of range.
|
||||
*/
|
||||
static U_INLINE uint32_t
|
||||
_2022FromGR94DBCS(uint32_t value) {
|
||||
if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
||||
(uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
|
||||
) {
|
||||
return value - 0x8080; /* shift down to 21..7e byte range */
|
||||
} else {
|
||||
return 0; /* not valid for ISO 2022 */
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef U_ENABLE_GENERIC_ISO_2022
|
||||
|
||||
/**********************************************************************************
|
||||
@ -1233,7 +1241,7 @@ toUnicodeCallback(UConverter *cnv,
|
||||
}
|
||||
else{
|
||||
cnv->toUBytes[0] =(char) sourceChar;
|
||||
cnv->toULength = 2;
|
||||
cnv->toULength = 1;
|
||||
}
|
||||
|
||||
if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){
|
||||
@ -1344,6 +1352,181 @@ static const int32_t escSeqCharsLen[] ={
|
||||
* TODO: Implement a priority technique where the users are allowed to set the priority of code pages
|
||||
*/
|
||||
|
||||
/* Map 00..7F to Unicode according to JIS X 0201. */
|
||||
static U_INLINE uint32_t
|
||||
jisx201ToU(uint32_t value) {
|
||||
if(value < 0x5c) {
|
||||
return value;
|
||||
} else if(value == 0x5c) {
|
||||
return 0xa5;
|
||||
} else if(value == 0x7e) {
|
||||
return 0x203e;
|
||||
} else /* value <= 0x7f */ {
|
||||
return value;
|
||||
}
|
||||
}
|
||||
|
||||
/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */
|
||||
static U_INLINE uint32_t
|
||||
jisx201FromU(uint32_t value) {
|
||||
if(value<=0x7f) {
|
||||
if(value!=0x5c && value!=0x7e) {
|
||||
return value;
|
||||
}
|
||||
} else if(value==0xa5) {
|
||||
return 0x5c;
|
||||
} else if(value==0x203e) {
|
||||
return 0x7e;
|
||||
}
|
||||
return 0xfffe;
|
||||
}
|
||||
|
||||
/*
|
||||
* Take a valid Shift-JIS byte pair, check that it is in the range corresponding
|
||||
* to JIS X 0208, and convert it to a pair of 21..7E bytes.
|
||||
* Return 0 if the byte pair is out of range.
|
||||
*/
|
||||
static U_INLINE uint32_t
|
||||
_2022FromSJIS(uint32_t value) {
|
||||
uint8_t trail;
|
||||
|
||||
if(value > 0xEFFC) {
|
||||
return 0; /* beyond JIS X 0208 */
|
||||
}
|
||||
|
||||
trail = (uint8_t)value;
|
||||
|
||||
value &= 0xff00; /* lead byte */
|
||||
if(value <= 0x9f00) {
|
||||
value -= 0x7000;
|
||||
} else /* 0xe000 <= value <= 0xef00 */ {
|
||||
value -= 0xb000;
|
||||
}
|
||||
value <<= 1;
|
||||
|
||||
if(trail <= 0x9e) {
|
||||
value -= 0x100;
|
||||
if(trail <= 0x7e) {
|
||||
value |= trail - 0x1f;
|
||||
} else {
|
||||
value |= trail - 0x20;
|
||||
}
|
||||
} else /* trail <= 0xfc */ {
|
||||
value |= trail - 0x7e;
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS.
|
||||
* If either byte is outside 21..7E make sure that the result is not valid
|
||||
* for Shift-JIS so that the converter catches it.
|
||||
* Some invalid byte values already turn into equally invalid Shift-JIS
|
||||
* byte values and need not be tested explicitly.
|
||||
*/
|
||||
static U_INLINE void
|
||||
_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) {
|
||||
if(c1&1) {
|
||||
++c1;
|
||||
if(c2 <= 0x5f) {
|
||||
c2 += 0x1f;
|
||||
} else if(c2 <= 0x7e) {
|
||||
c2 += 0x20;
|
||||
} else {
|
||||
c2 = 0; /* invalid */
|
||||
}
|
||||
} else {
|
||||
if((uint8_t)(c2-0x21) <= (0x7e-0x21)) {
|
||||
c2 += 0x7e;
|
||||
} else {
|
||||
c2 = 0; /* invalid */
|
||||
}
|
||||
}
|
||||
c1 >>= 1;
|
||||
if(c1 <= 0x2f) {
|
||||
c1 += 0x70;
|
||||
} else if(c1 <= 0x3f) {
|
||||
c1 += 0xb0;
|
||||
} else {
|
||||
c1 = 0; /* invalid */
|
||||
}
|
||||
bytes[0] = (char)c1;
|
||||
bytes[1] = (char)c2;
|
||||
}
|
||||
|
||||
/*
|
||||
* JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS)
|
||||
* Katakana.
|
||||
* Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks
|
||||
* because Shift-JIS roundtrips half-width Katakana to single bytes.
|
||||
* These were the only fallbacks in ICU's jisx-208.ucm file.
|
||||
*/
|
||||
static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = {
|
||||
0x2123, /* U+FF61 */
|
||||
0x2156,
|
||||
0x2157,
|
||||
0x2122,
|
||||
0x2126,
|
||||
0x2572,
|
||||
0x2521,
|
||||
0x2523,
|
||||
0x2525,
|
||||
0x2527,
|
||||
0x2529,
|
||||
0x2563,
|
||||
0x2565,
|
||||
0x2567,
|
||||
0x2543,
|
||||
0x213C, /* U+FF70 */
|
||||
0x2522,
|
||||
0x2524,
|
||||
0x2526,
|
||||
0x2528,
|
||||
0x252A,
|
||||
0x252B,
|
||||
0x252D,
|
||||
0x252F,
|
||||
0x2531,
|
||||
0x2533,
|
||||
0x2535,
|
||||
0x2537,
|
||||
0x2539,
|
||||
0x253B,
|
||||
0x253D,
|
||||
0x253F, /* U+FF80 */
|
||||
0x2541,
|
||||
0x2544,
|
||||
0x2546,
|
||||
0x2548,
|
||||
0x254A,
|
||||
0x254B,
|
||||
0x254C,
|
||||
0x254D,
|
||||
0x254E,
|
||||
0x254F,
|
||||
0x2552,
|
||||
0x2555,
|
||||
0x2558,
|
||||
0x255B,
|
||||
0x255E,
|
||||
0x255F, /* U+FF90 */
|
||||
0x2560,
|
||||
0x2561,
|
||||
0x2562,
|
||||
0x2564,
|
||||
0x2566,
|
||||
0x2568,
|
||||
0x2569,
|
||||
0x256A,
|
||||
0x256B,
|
||||
0x256C,
|
||||
0x256D,
|
||||
0x256F,
|
||||
0x2573,
|
||||
0x212B,
|
||||
0x212C /* U+FF9F */
|
||||
};
|
||||
|
||||
static void
|
||||
UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) {
|
||||
UConverter *cnv = args->converter;
|
||||
@ -1499,7 +1682,7 @@ getTrail:
|
||||
}
|
||||
break;
|
||||
case HWKANA_7BIT:
|
||||
if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) {
|
||||
if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
|
||||
if(converterData->version==3) {
|
||||
/* JIS7: use G1 (SO) */
|
||||
/* Shift U+FF61..U+FF9F to bytes 21..5F. */
|
||||
@ -1526,13 +1709,34 @@ getTrail:
|
||||
break;
|
||||
case JISX201:
|
||||
/* G0 SBCS */
|
||||
len2 = MBCS_SINGLE_FROM_UCHAR32(
|
||||
value = jisx201FromU(sourceChar);
|
||||
if(value <= 0x7f) {
|
||||
targetValue = value;
|
||||
len = 1;
|
||||
cs = cs0;
|
||||
g = 0;
|
||||
useFallback = FALSE;
|
||||
}
|
||||
break;
|
||||
case JISX208:
|
||||
/* G0 DBCS from Shift-JIS table */
|
||||
len2 = MBCS_FROM_UCHAR32_ISO2022(
|
||||
converterData->myConverterArray[cs0],
|
||||
sourceChar, &value,
|
||||
useFallback);
|
||||
if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) {
|
||||
targetValue = value;
|
||||
len = len2;
|
||||
useFallback, MBCS_OUTPUT_2);
|
||||
if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */
|
||||
value = _2022FromSJIS(value);
|
||||
if(value != 0) {
|
||||
targetValue = value;
|
||||
len = len2;
|
||||
cs = cs0;
|
||||
g = 0;
|
||||
useFallback = FALSE;
|
||||
}
|
||||
} else if(len == 0 && useFallback &&
|
||||
(uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) {
|
||||
targetValue = hwkana_fb[sourceChar - HWKANA_START];
|
||||
len = -2;
|
||||
cs = cs0;
|
||||
g = 0;
|
||||
useFallback = FALSE;
|
||||
@ -1564,17 +1768,10 @@ getTrail:
|
||||
* Check for valid bytes for the encoding scheme.
|
||||
* This is necessary because the sub-converter (windows-949)
|
||||
* has a broader encoding scheme than is valid for 2022.
|
||||
*
|
||||
* Check that the result is a 2-byte value with each byte in the range A1..FE
|
||||
* (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte
|
||||
* to move it to the ISO 2022 range 21..7E.
|
||||
*/
|
||||
if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) &&
|
||||
(uint8_t)(value - 0xa1) <= (0xfe - 0xa1)
|
||||
) {
|
||||
value -= 0x8080; /* shift down to 21..7e byte range */
|
||||
} else {
|
||||
break; /* not valid for ISO 2022 */
|
||||
value = _2022FromGR94DBCS(value);
|
||||
if(value == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
targetValue = value;
|
||||
@ -1750,7 +1947,7 @@ getTrail:
|
||||
static void
|
||||
UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
UErrorCode* err){
|
||||
char tempBuf[3];
|
||||
char tempBuf[2];
|
||||
const char *mySource = (char *) args->source;
|
||||
UChar *myTarget = args->target;
|
||||
const char *mySourceLimit = args->sourceLimit;
|
||||
@ -1868,10 +2065,7 @@ escape:
|
||||
break;
|
||||
case JISX201:
|
||||
if(mySourceChar <= 0x7f) {
|
||||
targetUniChar =
|
||||
_MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(
|
||||
myData->myConverterArray[cs],
|
||||
mySourceChar);
|
||||
targetUniChar = jisx201ToU(mySourceChar);
|
||||
}
|
||||
break;
|
||||
case HWKANA_7BIT:
|
||||
@ -1885,8 +2079,13 @@ escape:
|
||||
if(mySource < mySourceLimit) {
|
||||
char trailByte;
|
||||
getTrailByte:
|
||||
tempBuf[0] = (char) (mySourceChar);
|
||||
tempBuf[1] = trailByte = *mySource++;
|
||||
trailByte = *mySource++;
|
||||
if(cs == JISX208) {
|
||||
_2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf);
|
||||
} else {
|
||||
tempBuf[0] = (char)mySourceChar;
|
||||
tempBuf[1] = trailByte;
|
||||
}
|
||||
mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte);
|
||||
targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE);
|
||||
} else {
|
||||
@ -3190,6 +3389,9 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
||||
/* open a set and initialize it with code points that are algorithmically round-tripped */
|
||||
switch(cnvData->locale[0]){
|
||||
case 'j':
|
||||
/* include JIS X 0201 which is hardcoded */
|
||||
sa->add(sa->set, 0xa5);
|
||||
sa->add(sa->set, 0x203e);
|
||||
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
|
||||
/* include Latin-1 for some variants of JP */
|
||||
sa->addRange(sa->set, 0, 0xff);
|
||||
@ -3198,6 +3400,11 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
||||
sa->addRange(sa->set, 0, 0x7f);
|
||||
}
|
||||
if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) {
|
||||
/*
|
||||
* TODO(markus): If and when ucnv_getUnicodeSet() supports fallbacks,
|
||||
* we need to include half-width Katakana for all JP variants because
|
||||
* JIS X 0208 has hardcoded fallbacks for them.
|
||||
*/
|
||||
/* include half-width Katakana for JP */
|
||||
sa->addRange(sa->set, HWKANA_START, HWKANA_END);
|
||||
}
|
||||
@ -3217,15 +3424,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Version-specific for CN:
|
||||
* CN version 0 does not map CNS planes 3..7 although
|
||||
* they are all available in the CNS conversion table;
|
||||
* CN version 1 does map them all.
|
||||
* The two versions create different Unicode sets.
|
||||
*/
|
||||
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
||||
if(cnvData->myConverterArray[i]!=NULL) {
|
||||
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
||||
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
|
||||
cnvData->version==0 && i==CNS_11643
|
||||
) {
|
||||
@ -3235,9 +3434,33 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
|
||||
sa, UCNV_ROUNDTRIP_SET,
|
||||
0, 0x81, 0x82,
|
||||
pErrorCode);
|
||||
} else {
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode);
|
||||
}
|
||||
#endif
|
||||
|
||||
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
|
||||
UConverterSetFilter filter;
|
||||
if(cnvData->myConverterArray[i]!=NULL) {
|
||||
if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') &&
|
||||
cnvData->version==0 && i==CNS_11643
|
||||
) {
|
||||
/*
|
||||
* Version-specific for CN:
|
||||
* CN version 0 does not map CNS planes 3..7 although
|
||||
* they are all available in the CNS conversion table;
|
||||
* CN version 1 (-EXT) does map them all.
|
||||
* The two versions create different Unicode sets.
|
||||
*/
|
||||
filter=UCNV_SET_FILTER_2022_CN;
|
||||
} else if(cnvData->locale[0]=='j' && i==JISX208) {
|
||||
/*
|
||||
* Only add code points that map to Shift-JIS codes
|
||||
* corresponding to JIS X 0208.
|
||||
*/
|
||||
filter=UCNV_SET_FILTER_SJIS;
|
||||
} else {
|
||||
filter=UCNV_SET_FILTER_NONE;
|
||||
}
|
||||
ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -362,6 +362,8 @@ gb18030Ranges[13][4]={
|
||||
|
||||
/* Miscellaneous ------------------------------------------------------------ */
|
||||
|
||||
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
||||
|
||||
/* similar to ucnv_MBCSGetNextUChar() but recursive */
|
||||
static void
|
||||
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
@ -454,11 +456,14 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UConverterSetFilter filter,
|
||||
UErrorCode *pErrorCode) {
|
||||
const UConverterMBCSTable *mbcsTable;
|
||||
const uint16_t *table;
|
||||
|
||||
@ -512,12 +517,26 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
c+=1024; /* empty stage 2 block */
|
||||
}
|
||||
}
|
||||
} else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) {
|
||||
/* ignore single-byte results */
|
||||
} else {
|
||||
const uint32_t *stage2;
|
||||
const uint16_t *stage3, *results;
|
||||
const uint8_t *stage3, *bytes;
|
||||
uint32_t st3Multiplier;
|
||||
uint32_t value;
|
||||
|
||||
results=(const uint16_t *)mbcsTable->fromUnicodeBytes;
|
||||
bytes=mbcsTable->fromUnicodeBytes;
|
||||
|
||||
switch(mbcsTable->outputType) {
|
||||
case MBCS_OUTPUT_3:
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
st3Multiplier=3;
|
||||
break;
|
||||
case MBCS_OUTPUT_4:
|
||||
st3Multiplier=4;
|
||||
break;
|
||||
default:
|
||||
st3Multiplier=2;
|
||||
break;
|
||||
}
|
||||
|
||||
for(st1=0; st1<maxStage1; ++st1) {
|
||||
st2=table[st1];
|
||||
@ -526,7 +545,7 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
for(st2=0; st2<64; ++st2) {
|
||||
if((st3=stage2[st2])!=0) {
|
||||
/* read the stage 3 block */
|
||||
stage3=results+16*(uint32_t)(uint16_t)st3;
|
||||
stage3=bytes+st3Multiplier*16*(uint32_t)(uint16_t)st3;
|
||||
|
||||
/* get the roundtrip flags for the stage 3 block */
|
||||
st3>>=16;
|
||||
@ -536,48 +555,50 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
* Once we get a set for fallback mappings, we have to check
|
||||
* non-roundtrip stage 3 results for whether they are 0.
|
||||
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
|
||||
*
|
||||
* Ignore single-byte results (<0x100).
|
||||
*/
|
||||
do {
|
||||
if((st3&1)!=0 && *stage3>=0x100) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
++stage3;
|
||||
} while((++c&0xf)!=0);
|
||||
} else {
|
||||
c+=16; /* empty stage 3 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
c+=1024; /* empty stage 2 block */
|
||||
}
|
||||
}
|
||||
} else {
|
||||
const uint32_t *stage2;
|
||||
|
||||
for(st1=0; st1<maxStage1; ++st1) {
|
||||
st2=table[st1];
|
||||
if(st2>(maxStage1>>1)) {
|
||||
stage2=(const uint32_t *)table+st2;
|
||||
for(st2=0; st2<64; ++st2) {
|
||||
if((st3=stage2[st2])!=0) {
|
||||
/* get the roundtrip flags for the stage 3 block */
|
||||
st3>>=16;
|
||||
|
||||
/*
|
||||
* Add code points for which the roundtrip flag is set.
|
||||
* Once we get a set for fallback mappings, we have to check
|
||||
* non-roundtrip stage 3 results for whether they are 0.
|
||||
* See ucnv_MBCSFromUnicodeWithOffsets() for details.
|
||||
*/
|
||||
do {
|
||||
if(st3&1) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
} while((++c&0xf)!=0);
|
||||
switch(filter) {
|
||||
case UCNV_SET_FILTER_NONE:
|
||||
do {
|
||||
if(st3&1) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
} while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_DBCS_ONLY:
|
||||
/* Ignore single-byte results (<0x100). */
|
||||
do {
|
||||
if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2; /* +=st3Multiplier */
|
||||
} while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_2022_CN:
|
||||
/* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */
|
||||
do {
|
||||
if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=3; /* +=st3Multiplier */
|
||||
} while((++c&0xf)!=0);
|
||||
break;
|
||||
case UCNV_SET_FILTER_SJIS:
|
||||
/* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. */
|
||||
do {
|
||||
if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) {
|
||||
sa->add(sa->set, c);
|
||||
}
|
||||
st3>>=1;
|
||||
stage3+=2; /* +=st3Multiplier */
|
||||
} while((++c&0xf)!=0);
|
||||
break;
|
||||
default:
|
||||
*pErrorCode=U_INTERNAL_PROGRAM_ERROR;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
c+=16; /* empty stage 3 block */
|
||||
}
|
||||
@ -591,6 +612,19 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode);
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode) {
|
||||
ucnv_MBCSGetFilteredUnicodeSetForUnicode(
|
||||
sharedData, sa, which,
|
||||
sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ?
|
||||
UCNV_SET_FILTER_DBCS_ONLY :
|
||||
UCNV_SET_FILTER_NONE,
|
||||
pErrorCode);
|
||||
}
|
||||
|
||||
static void
|
||||
ucnv_MBCSGetUnicodeSet(const UConverter *cnv,
|
||||
const USetAdder *sa,
|
||||
|
@ -456,6 +456,7 @@ U_CFUNC void
|
||||
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
||||
/*
|
||||
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||
@ -470,6 +471,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||
UErrorCode *pErrorCode);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||
@ -481,9 +483,30 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/*
 * Selects which roundtrip mappings ucnv_MBCSGetFilteredUnicodeSetForUnicode()
 * adds to the set, based on the bytes that a code point maps to.
 */
typedef enum UConverterSetFilter {
|
||||
UCNV_SET_FILTER_NONE, /* no filtering: add every roundtrip code point */
|
||||
UCNV_SET_FILTER_DBCS_ONLY, /* ignore single-byte results (<0x100) */
|
||||
UCNV_SET_FILTER_2022_CN, /* only results whose first byte is 0x81 or 0x82 (CNS 11643 planes 1 & 2) */
|
||||
UCNV_SET_FILTER_SJIS, /* only Shift-JIS results 0x8140..0xeffc (corresponding to JIS X 0208) */
|
||||
UCNV_SET_FILTER_COUNT /* NOTE(review): presumably the number of filter values, not itself a filter */
|
||||
} UConverterSetFilter;
|
||||
|
||||
/*
|
||||
* Same as ucnv_MBCSGetUnicodeSetForUnicode() but
|
||||
* the set can be filtered by encoding scheme.
|
||||
* Used by stateful converters which share regular conversion tables
|
||||
* but only use a subset of their mappings.
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
UConverterSetFilter filter,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
|
24
icu4c/source/test/testdata/conversion.txt
vendored
24
icu4c/source/test/testdata/conversion.txt
vendored
@ -48,6 +48,15 @@ conversion:table(nofallback) {
|
||||
toUnicode {
|
||||
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
|
||||
Cases {
|
||||
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
||||
// using the Shift-JIS table for JIS X 0208 (ticket #5797)
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
:bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 },
|
||||
"}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e",
|
||||
:intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 },
|
||||
:int{1}, :int{1}, "", "?", :bin{""}
|
||||
}
|
||||
// improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets()
|
||||
{
|
||||
"ISO-8859-3",
|
||||
@ -495,6 +504,15 @@ conversion:table(nofallback) {
|
||||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and
|
||||
// using the Shift-JIS table for JIS X 0208 (ticket #5797)
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
"\u203e\xa5\u4e00\ufa10\u6f3e\u0391",
|
||||
:bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 },
|
||||
:intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 },
|
||||
:int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e
|
||||
}
|
||||
// Verify that mappings that would result in byte values outside 20..7F (for SBCS)
|
||||
// or 21..7E (for DBCS) are not used.
|
||||
// ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46):
|
||||
@ -1293,13 +1311,13 @@ conversion:table(nofallback) {
|
||||
// versions of ISO-2022-JP
|
||||
{
|
||||
"ISO-2022-JP",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]",
|
||||
"[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"ISO-2022-JP-2",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||
"[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]",
|
||||
"[\x0e\x0f\x1b\uffe7-\U0010ffff]",
|
||||
:int{0}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user