From cc36611b2f3fa54193f29e21b386b610a8739080 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 11 Oct 2007 21:31:32 +0000 Subject: [PATCH] ICU-5797 use Shift-JIS table for ISO 2022-JP, and hardcode JIS X 0201 mappings X-SVN-Rev: 22772 --- icu4c/source/common/ucnv2022.c | 313 ++++++++++++++++++---- icu4c/source/common/ucnvmbcs.c | 134 +++++---- icu4c/source/common/ucnvmbcs.h | 29 +- icu4c/source/test/testdata/conversion.txt | 24 +- 4 files changed, 399 insertions(+), 101 deletions(-) diff --git a/icu4c/source/common/ucnv2022.c b/icu4c/source/common/ucnv2022.c index dff4ba8340..8c60355748 100644 --- a/icu4c/source/common/ucnv2022.c +++ b/icu4c/source/common/ucnv2022.c @@ -472,8 +472,7 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti if(jpCharsetMasks[version]&CSM(ISO8859_7)) { myConverterData->myConverterArray[ISO8859_7]= ucnv_loadSharedData("ISO8859_7", NULL, errorCode); } - myConverterData->myConverterArray[JISX201] = ucnv_loadSharedData("JISX0201", NULL, errorCode); - myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("jisx-208", NULL, errorCode); + myConverterData->myConverterArray[JISX208] = ucnv_loadSharedData("Shift-JIS", NULL, errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) { myConverterData->myConverterArray[JISX212] = ucnv_loadSharedData("jisx-212", NULL, errorCode); } @@ -1040,14 +1039,6 @@ MBCS_FROM_UCHAR32_ISO2022(UConverterSharedData* sharedData, length=3; } } - /* - * TODO(markus): Use Shift-JIS table for JIS X 0208, to save mapping table space. - * Pass in parameter for type of output bytes, for validation and shifting: - * - Direct: Pass bytes through, but forbid control codes 00-1F (except SI/SO/ESC) and space 20? - * (Need to allow some (TAB/LF/CR) or most of them for ASCII and maybe JIS X 0201.) - * - A1-FE: Subtract 80 after range check. - * - SJIS: Shift DBCS result to 21-7E x 21-7E. - */ /* is this code point assigned, or do we use fallbacks? 
*/ if((stage2Entry&(1<<(16+(c&0xf))))!=0) { /* assigned */ @@ -1105,6 +1096,23 @@ MBCS_SINGLE_FROM_UCHAR32(UConverterSharedData* sharedData, } } +/* + * Check that the result is a 2-byte value with each byte in the range A1..FE + * (strict EUC DBCS) before accepting it and subtracting 0x80 from each byte + * to move it to the ISO 2022 range 21..7E. + * Return 0 if out of range. + */ +static U_INLINE uint32_t +_2022FromGR94DBCS(uint32_t value) { + if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && + (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) + ) { + return value - 0x8080; /* shift down to 21..7e byte range */ + } else { + return 0; /* not valid for ISO 2022 */ + } +} + #ifdef U_ENABLE_GENERIC_ISO_2022 /********************************************************************************** @@ -1233,7 +1241,7 @@ toUnicodeCallback(UConverter *cnv, } else{ cnv->toUBytes[0] =(char) sourceChar; - cnv->toULength = 2; + cnv->toULength = 1; } if(targetUniChar == (missingCharMarker-1/*0xfffe*/)){ @@ -1344,6 +1352,181 @@ static const int32_t escSeqCharsLen[] ={ * TODO: Implement a priority technique where the users are allowed to set the priority of code pages */ +/* Map 00..7F to Unicode according to JIS X 0201. */ +static U_INLINE uint32_t +jisx201ToU(uint32_t value) { + if(value < 0x5c) { + return value; + } else if(value == 0x5c) { + return 0xa5; + } else if(value == 0x7e) { + return 0x203e; + } else /* value <= 0x7f */ { + return value; + } +} + +/* Map Unicode to 00..7F according to JIS X 0201. Return U+FFFE if unmappable. */ +static U_INLINE uint32_t +jisx201FromU(uint32_t value) { + if(value<=0x7f) { + if(value!=0x5c && value!=0x7e) { + return value; + } + } else if(value==0xa5) { + return 0x5c; + } else if(value==0x203e) { + return 0x7e; + } + return 0xfffe; +} + +/* + * Take a valid Shift-JIS byte pair, check that it is in the range corresponding + * to JIS X 0208, and convert it to a pair of 21..7E bytes. + * Return 0 if the byte pair is out of range. 
+ */ +static U_INLINE uint32_t +_2022FromSJIS(uint32_t value) { + uint8_t trail; + + if(value > 0xEFFC) { + return 0; /* beyond JIS X 0208 */ + } + + trail = (uint8_t)value; + + value &= 0xff00; /* lead byte */ + if(value <= 0x9f00) { + value -= 0x7000; + } else /* 0xe000 <= value <= 0xef00 */ { + value -= 0xb000; + } + value <<= 1; + + if(trail <= 0x9e) { + value -= 0x100; + if(trail <= 0x7e) { + value |= trail - 0x1f; + } else { + value |= trail - 0x20; + } + } else /* trail <= 0xfc */ { + value |= trail - 0x7e; + } + return value; +} + +/* + * Convert a pair of JIS X 0208 21..7E bytes to Shift-JIS. + * If either byte is outside 21..7E make sure that the result is not valid + * for Shift-JIS so that the converter catches it. + * Some invalid byte values already turn into equally invalid Shift-JIS + * byte values and need not be tested explicitly. + */ +static U_INLINE void +_2022ToSJIS(uint8_t c1, uint8_t c2, char bytes[2]) { + if(c1&1) { + ++c1; + if(c2 <= 0x5f) { + c2 += 0x1f; + } else if(c2 <= 0x7e) { + c2 += 0x20; + } else { + c2 = 0; /* invalid */ + } + } else { + if((uint8_t)(c2-0x21) <= (0x7e-0x21)) { + c2 += 0x7e; + } else { + c2 = 0; /* invalid */ + } + } + c1 >>= 1; + if(c1 <= 0x2f) { + c1 += 0x70; + } else if(c1 <= 0x3f) { + c1 += 0xb0; + } else { + c1 = 0; /* invalid */ + } + bytes[0] = (char)c1; + bytes[1] = (char)c2; +} + +/* + * JIS X 0208 has fallbacks from Unicode half-width Katakana to full-width (DBCS) + * Katakana. + * Now that we use a Shift-JIS table for JIS X 0208 we need to hardcode these fallbacks + * because Shift-JIS roundtrips half-width Katakana to single bytes. + * These were the only fallbacks in ICU's jisx-208.ucm file. 
+ */ +static const uint16_t hwkana_fb[HWKANA_END - HWKANA_START + 1] = { + 0x2123, /* U+FF61 */ + 0x2156, + 0x2157, + 0x2122, + 0x2126, + 0x2572, + 0x2521, + 0x2523, + 0x2525, + 0x2527, + 0x2529, + 0x2563, + 0x2565, + 0x2567, + 0x2543, + 0x213C, /* U+FF70 */ + 0x2522, + 0x2524, + 0x2526, + 0x2528, + 0x252A, + 0x252B, + 0x252D, + 0x252F, + 0x2531, + 0x2533, + 0x2535, + 0x2537, + 0x2539, + 0x253B, + 0x253D, + 0x253F, /* U+FF80 */ + 0x2541, + 0x2544, + 0x2546, + 0x2548, + 0x254A, + 0x254B, + 0x254C, + 0x254D, + 0x254E, + 0x254F, + 0x2552, + 0x2555, + 0x2558, + 0x255B, + 0x255E, + 0x255F, /* U+FF90 */ + 0x2560, + 0x2561, + 0x2562, + 0x2564, + 0x2566, + 0x2568, + 0x2569, + 0x256A, + 0x256B, + 0x256C, + 0x256D, + 0x256F, + 0x2573, + 0x212B, + 0x212C /* U+FF9F */ +}; + static void UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args, UErrorCode* err) { UConverter *cnv = args->converter; @@ -1499,7 +1682,7 @@ getTrail: } break; case HWKANA_7BIT: - if((uint32_t)(HWKANA_END-sourceChar)<=(HWKANA_END-HWKANA_START)) { + if((uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { if(converterData->version==3) { /* JIS7: use G1 (SO) */ /* Shift U+FF61..U+FF9F to bytes 21..5F. 
*/ @@ -1526,13 +1709,34 @@ getTrail: break; case JISX201: /* G0 SBCS */ - len2 = MBCS_SINGLE_FROM_UCHAR32( + value = jisx201FromU(sourceChar); + if(value <= 0x7f) { + targetValue = value; + len = 1; + cs = cs0; + g = 0; + useFallback = FALSE; + } + break; + case JISX208: + /* G0 DBCS from Shift-JIS table */ + len2 = MBCS_FROM_UCHAR32_ISO2022( converterData->myConverterArray[cs0], sourceChar, &value, - useFallback); - if(len2 != 0 && !(len2 < 0 && len != 0) && value <= 0x7f) { - targetValue = value; - len = len2; + useFallback, MBCS_OUTPUT_2); + if(len2 == 2 || (len2 == -2 && len == 0)) { /* only accept DBCS: abs(len)==2 */ + value = _2022FromSJIS(value); + if(value != 0) { + targetValue = value; + len = len2; + cs = cs0; + g = 0; + useFallback = FALSE; + } + } else if(len == 0 && useFallback && + (uint32_t)(sourceChar - HWKANA_START) <= (HWKANA_END - HWKANA_START)) { + targetValue = hwkana_fb[sourceChar - HWKANA_START]; + len = -2; cs = cs0; g = 0; useFallback = FALSE; @@ -1564,17 +1768,10 @@ getTrail: * Check for valid bytes for the encoding scheme. * This is necessary because the sub-converter (windows-949) * has a broader encoding scheme than is valid for 2022. - * - * Check that the result is a 2-byte value with each byte in the range A1..FE - * (strict EUC-KR DBCS) before accepting it and subtracting 0x80 from each byte - * to move it to the ISO 2022 range 21..7E. 
*/ - if( (uint16_t)(value - 0xa1a1) <= (0xfefe - 0xa1a1) && - (uint8_t)(value - 0xa1) <= (0xfe - 0xa1) - ) { - value -= 0x8080; /* shift down to 21..7e byte range */ - } else { - break; /* not valid for ISO 2022 */ + value = _2022FromGR94DBCS(value); + if(value == 0) { + break; } } targetValue = value; @@ -1750,7 +1947,7 @@ getTrail: static void UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args, UErrorCode* err){ - char tempBuf[3]; + char tempBuf[2]; const char *mySource = (char *) args->source; UChar *myTarget = args->target; const char *mySourceLimit = args->sourceLimit; @@ -1868,10 +2065,7 @@ escape: break; case JISX201: if(mySourceChar <= 0x7f) { - targetUniChar = - _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP( - myData->myConverterArray[cs], - mySourceChar); + targetUniChar = jisx201ToU(mySourceChar); } break; case HWKANA_7BIT: @@ -1885,8 +2079,13 @@ escape: if(mySource < mySourceLimit) { char trailByte; getTrailByte: - tempBuf[0] = (char) (mySourceChar); - tempBuf[1] = trailByte = *mySource++; + trailByte = *mySource++; + if(cs == JISX208) { + _2022ToSJIS((uint8_t)mySourceChar, (uint8_t)trailByte, tempBuf); + } else { + tempBuf[0] = (char)mySourceChar; + tempBuf[1] = trailByte; + } mySourceChar = (mySourceChar << 8) | (uint8_t)(trailByte); targetUniChar = ucnv_MBCSSimpleGetNextUChar(myData->myConverterArray[cs], tempBuf, 2, FALSE); } else { @@ -3190,6 +3389,9 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, /* open a set and initialize it with code points that are algorithmically round-tripped */ switch(cnvData->locale[0]){ case 'j': + /* include JIS X 0201 which is hardcoded */ + sa->add(sa->set, 0xa5); + sa->add(sa->set, 0x203e); if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) { /* include Latin-1 for some variants of JP */ sa->addRange(sa->set, 0, 0xff); @@ -3198,6 +3400,11 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, sa->addRange(sa->set, 0, 0x7f); } if(jpCharsetMasks[cnvData->version]&CSM(HWKANA_7BIT)) { + /* + * TODO(markus): If 
and when ucnv_getUnicodeSet() supports fallbacks, + * we need to include half-width Katakana for all JP variants because + * JIS X 0208 has hardcoded fallbacks for them. + */ /* include half-width Katakana for JP */ sa->addRange(sa->set, HWKANA_START, HWKANA_END); } @@ -3217,15 +3424,7 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, break; } - /* - * Version-specific for CN: - * CN version 0 does not map CNS planes 3..7 although - * they are all available in the CNS conversion table; - * CN version 1 does map them all. - * The two versions create different Unicode sets. - */ - for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { - if(cnvData->myConverterArray[i]!=NULL) { +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && cnvData->version==0 && i==CNS_11643 ) { @@ -3235,9 +3434,33 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv, sa, UCNV_ROUNDTRIP_SET, 0, 0x81, 0x82, pErrorCode); - } else { - ucnv_MBCSGetUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, pErrorCode); } +#endif + + for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { + UConverterSetFilter filter; + if(cnvData->myConverterArray[i]!=NULL) { + if( (cnvData->locale[0]=='c' || cnvData->locale[0]=='z') && + cnvData->version==0 && i==CNS_11643 + ) { + /* + * Version-specific for CN: + * CN version 0 does not map CNS planes 3..7 although + * they are all available in the CNS conversion table; + * CN version 1 (-EXT) does map them all. + * The two versions create different Unicode sets. + */ + filter=UCNV_SET_FILTER_2022_CN; + } else if(cnvData->locale[0]=='j' && i==JISX208) { + /* + * Only add code points that map to Shift-JIS codes + * corresponding to JIS X 0208. 
+ */ + filter=UCNV_SET_FILTER_SJIS; + } else { + filter=UCNV_SET_FILTER_NONE; + } + ucnv_MBCSGetFilteredUnicodeSetForUnicode(cnvData->myConverterArray[i], sa, which, filter, pErrorCode); } } diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 37a7d4b5c5..73851c6276 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -362,6 +362,8 @@ gb18030Ranges[13][4]={ /* Miscellaneous ------------------------------------------------------------ */ +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ + /* similar to ucnv_MBCSGetNextUChar() but recursive */ static void _getUnicodeSetForBytes(const UConverterSharedData *sharedData, @@ -454,11 +456,14 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, pErrorCode); } +#endif + U_CFUNC void -ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - UErrorCode *pErrorCode) { +ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UConverterSetFilter filter, + UErrorCode *pErrorCode) { const UConverterMBCSTable *mbcsTable; const uint16_t *table; @@ -512,12 +517,26 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, c+=1024; /* empty stage 2 block */ } } - } else if(mbcsTable->outputType==MBCS_OUTPUT_DBCS_ONLY) { - /* ignore single-byte results */ + } else { const uint32_t *stage2; - const uint16_t *stage3, *results; + const uint8_t *stage3, *bytes; + uint32_t st3Multiplier; + uint32_t value; - results=(const uint16_t *)mbcsTable->fromUnicodeBytes; + bytes=mbcsTable->fromUnicodeBytes; + + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: + st3Multiplier=3; + break; + case MBCS_OUTPUT_4: + st3Multiplier=4; + break; + default: + st3Multiplier=2; + break; + } for(st1=0; 
st1>=16; @@ -536,48 +555,50 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, * Once we get a set for fallback mappings, we have to check * non-roundtrip stage 3 results for whether they are 0. * See ucnv_MBCSFromUnicodeWithOffsets() for details. - * - * Ignore single-byte results (<0x100). */ - do { - if((st3&1)!=0 && *stage3>=0x100) { - sa->add(sa->set, c); - } - st3>>=1; - ++stage3; - } while((++c&0xf)!=0); - } else { - c+=16; /* empty stage 3 block */ - } - } - } else { - c+=1024; /* empty stage 2 block */ - } - } - } else { - const uint32_t *stage2; - - for(st1=0; st1<maxStage1; ++st1) { - st2=table[st1]; - if(st2>(maxStage1>>1)) { - stage2=(const uint32_t *)table+st2; - for(st2=0; st2<64; ++st2) { - if((st3=stage2[st2])!=0) { - /* get the roundtrip flags for the stage 3 block */ - st3>>=16; - - /* - * Add code points for which the roundtrip flag is set. - * Once we get a set for fallback mappings, we have to check - * non-roundtrip stage 3 results for whether they are 0. - * See ucnv_MBCSFromUnicodeWithOffsets() for details. - */ - do { - if(st3&1) { - sa->add(sa->set, c); - } - st3>>=1; - } while((++c&0xf)!=0); + switch(filter) { + case UCNV_SET_FILTER_NONE: + do { + if(st3&1) { + sa->add(sa->set, c); + } + st3>>=1; + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_DBCS_ONLY: + /* Ignore single-byte results (<0x100). */ + do { + if((st3&1)!=0 && *((const uint16_t *)stage3)>=0x100) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_2022_CN: + /* Only add code points that map to CNS 11643 planes 1 & 2 for non-EXT ISO-2022-CN. */ + do { + if((st3&1)!=0 && ((value=*stage3)==0x81 || value==0x82)) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=3; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + case UCNV_SET_FILTER_SJIS: + /* Only add code points that map to Shift-JIS codes corresponding to JIS X 0208. 
*/ + do { + if((st3&1)!=0 && (value=*((const uint16_t *)stage3))>=0x8140 && value<=0xeffc) { + sa->add(sa->set, c); + } + st3>>=1; + stage3+=2; /* +=st3Multiplier */ + } while((++c&0xf)!=0); + break; + default: + *pErrorCode=U_INTERNAL_PROGRAM_ERROR; + return; + } } else { c+=16; /* empty stage 3 block */ } @@ -591,6 +612,19 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, ucnv_extGetUnicodeSet(sharedData, sa, which, pErrorCode); } +U_CFUNC void +ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode) { + ucnv_MBCSGetFilteredUnicodeSetForUnicode( + sharedData, sa, which, + sharedData->mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? + UCNV_SET_FILTER_DBCS_ONLY : + UCNV_SET_FILTER_NONE, + pErrorCode); +} + static void ucnv_MBCSGetUnicodeSet(const UConverter *cnv, const USetAdder *sa, diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h index 32439cbbeb..89a1f14c9d 100644 --- a/icu4c/source/common/ucnvmbcs.h +++ b/icu4c/source/common/ucnvmbcs.h @@ -456,6 +456,7 @@ U_CFUNC void ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); +#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ /* * Internal function returning a UnicodeSet for toUnicode() conversion. * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. @@ -470,6 +471,7 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, UConverterUnicodeSet which, uint8_t state, int32_t lowByte, int32_t highByte, UErrorCode *pErrorCode); +#endif /* * Internal function returning a UnicodeSet for toUnicode() conversion. 
@@ -481,9 +483,30 @@ ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, */ U_CFUNC void ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - UErrorCode *pErrorCode); + const USetAdder *sa, + UConverterUnicodeSet which, + UErrorCode *pErrorCode); + +typedef enum UConverterSetFilter { + UCNV_SET_FILTER_NONE, + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_COUNT +} UConverterSetFilter; + +/* + * Same as ucnv_MBCSGetUnicodeSetForUnicode() but + * the set can be filtered by encoding scheme. + * Used by stateful converters which share regular conversion tables + * but only use a subset of their mappings. + */ +U_CFUNC void +ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, + const USetAdder *sa, + UConverterUnicodeSet which, + UConverterSetFilter filter, + UErrorCode *pErrorCode); #endif diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 5e792b9be8..86dcf80106 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -48,6 +48,15 @@ conversion:table(nofallback) { toUnicode { Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } Cases { + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { + "ISO-2022-JP", + :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, + "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", + :intvector{ 3,4,5,9,11,13,15,17,19,21,23,25,27 }, + :int{1}, :int{1}, "", "?", :bin{""} + } // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() { "ISO-8859-3", @@ -495,6 +504,15 @@ conversion:table(nofallback) { fromUnicode { Headers { "charset", "unicode", 
"bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } Cases { + // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and + // using the Shift-JIS table for JIS X 0208 (ticket #5797) + { + "ISO-2022-JP", + "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", + :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, + :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, + :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e + } // Verify that mappings that would result in byte values outside 20..7F (for SBCS) // or 21..7E (for DBCS) are not used. // ibm-9005_X110-2007.ucm (ISO 8859-7, .F=1b2e46): @@ -1293,13 +1311,13 @@ conversion:table(nofallback) { // versions of ISO-2022-JP { "ISO-2022-JP", - "[\x00-\x0d\x10-\x1a\x1c-\x7f\u0391-\u03a1\uff61-\uff9f\u4e00\u4e01\uffe5]", - "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\uffe6-\U0010ffff]", + "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u203e\uff61-\uff9f\u4e00\u4e01\uffe5]", + "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\ufa0e-\ufa2d\uffe6-\U0010ffff]", :int{0} } { "ISO-2022-JP-2", - "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\uff61-\uff9f\u4e00-\u4e05\uffe6]", + "[\x00-\x0d\x10-\x1a\x1c-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\uff61-\uff9f\u4e00-\u4e05\uffe6]", "[\x0e\x0f\x1b\uffe7-\U0010ffff]", :int{0} }