ICU-3361 use more algorithmic conversion for ISO-2022-JP

X-SVN-Rev: 13997
This commit is contained in:
Markus Scherer 2003-12-04 16:19:50 +00:00
parent eea914cba9
commit 7479b1d7bb

View File

@ -116,15 +116,27 @@ typedef enum {
CNS_11643_7 CNS_11643_7
} StateEnum; } StateEnum;
/*
* CSM(cs) ("charset mask") yields a value with only bit number cs set.
* The table below passes charset constants such as ASCII or JISX201 as cs.
* NOTE(review): the result is stored in uint16_t masks, so cs presumably has
* to stay below 16 - confirm against the charset enum.
*/
#define CSM(cs) ((uint16_t)1<<(cs))
/*
* Each of these charset masks contains a bit for a charset in exact correspondence
* to whether that charset is listed in the same version's row of nextStateToUnicodeJP[].
*
* The array is indexed by the ISO-2022-JP converter version; a charset is
* available in a version when (jpCharsetMasks[version] & CSM(charset)) != 0.
* The masks are tested when deciding which sub-converters to open and
* whether Latin-1 belongs to the converter's Unicode set.
*
* NOTE(review): there are only 5 rows (versions 0..4). The index is taken
* from the converter options; confirm that it is range-checked before use.
*/
static const uint16_t jpCharsetMasks[5]={
/* version 0: ASCII, JIS X 0201, JIS X 0208, 7-bit halfwidth Katakana */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT),
/* version 1: version 0 plus JIS X 0212 */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212),
/* versions 2, 3 and 4 (identical masks): version 1 plus GB 2312, KS C 5601, ISO 8859-1 and ISO 8859-7 */
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7),
CSM(ASCII)|CSM(JISX201)|CSM(JISX208)|CSM(HWKANA_7BIT)|CSM(JISX212)|CSM(GB2312)|CSM(KSC5601)|CSM(ISO8859_1)|CSM(ISO8859_7)
};
/*
* Cnv2022Type: conversion type of a charset state. The per-state table
* myConverterType[] holds these values, and the ISO-2022-JP from/to-Unicode
* loops switch on them to choose how a character is converted.
* HWKANA is the value added on the new side of this change; its switch cases
* convert halfwidth Katakana (U+FF61..U+FF9F) arithmetically.
*/
typedef enum { typedef enum {
ASCII1=0, ASCII1=0,
LATIN1, LATIN1,
SBCS, SBCS,
DBCS, DBCS,
MBCS MBCS,
HWKANA
}Cnv2022Type; }Cnv2022Type;
typedef struct ISO2022State { typedef struct ISO2022State {
@ -401,6 +413,8 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022)); cnv->extraInfo = uprv_malloc (sizeof (UConverterDataISO2022));
if(cnv->extraInfo != NULL) { if(cnv->extraInfo != NULL) {
UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo; UConverterDataISO2022 *myConverterData=(UConverterDataISO2022 *) cnv->extraInfo;
uint32_t version;
uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022)); uprv_memset(myConverterData, 0, sizeof(UConverterDataISO2022));
myConverterData->currentConverter = NULL; myConverterData->currentConverter = NULL;
myConverterData->fromUnicodeConverter = NULL; myConverterData->fromUnicodeConverter = NULL;
@ -412,21 +426,26 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
uprv_strncpy(myLocale, locale, sizeof(myLocale)); uprv_strncpy(myLocale, locale, sizeof(myLocale));
} }
myConverterData->version= 0; myConverterData->version= 0;
version = options & UCNV_OPTIONS_VERSION_MASK;
myConverterData->myConverterArray[0] =NULL; myConverterData->myConverterArray[0] =NULL;
if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') && if(myLocale[0]=='j' && (myLocale[1]=='a'|| myLocale[1]=='p') &&
(myLocale[2]=='_' || myLocale[2]=='\0')){ (myLocale[2]=='_' || myLocale[2]=='\0')){
int len=0; int len=0;
/* open the required converters and cache them */ /* open the required converters and cache them */
myConverterData->myConverterArray[0]= ucnv_open("ASCII", errorCode ); if(jpCharsetMasks[version]&CSM(ISO8859_7)) {
myConverterData->myConverterArray[1]= ucnv_open("ISO8859_1", errorCode); myConverterData->myConverterArray[ISO8859_7]= ucnv_open("ISO8859_7", errorCode);
myConverterData->myConverterArray[2]= ucnv_open("ISO8859_7", errorCode); }
myConverterData->myConverterArray[3]= ucnv_open("jisx-201", errorCode); myConverterData->myConverterArray[JISX201] = ucnv_open("jisx-201", errorCode);
myConverterData->myConverterArray[4]= ucnv_open("jisx-208", errorCode); myConverterData->myConverterArray[JISX208] = ucnv_open("jisx-208", errorCode);
myConverterData->myConverterArray[5]= ucnv_open("jisx-212", errorCode); if(jpCharsetMasks[version]&CSM(JISX212)) {
myConverterData->myConverterArray[6]= ucnv_open("ibm-5478", errorCode); /* gb_2312_80-1 */ myConverterData->myConverterArray[JISX212] = ucnv_open("jisx-212", errorCode);
myConverterData->myConverterArray[7]= ucnv_open("ksc_5601", errorCode); }
myConverterData->myConverterArray[8]= ucnv_open("jisx-201", errorCode); if(jpCharsetMasks[version]&CSM(GB2312)) {
myConverterData->myConverterArray[9]= NULL; myConverterData->myConverterArray[GB2312] = ucnv_open("ibm-5478", errorCode); /* gb_2312_80-1 */
}
if(jpCharsetMasks[version]&CSM(KSC5601)) {
myConverterData->myConverterArray[KSC5601] = ucnv_open("ksc_5601", errorCode);
}
/* initialize the state variables */ /* initialize the state variables */
setInitialStateToUnicodeJPCN(cnv, myConverterData); setInitialStateToUnicodeJPCN(cnv, myConverterData);
@ -436,7 +455,7 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData); cnv->sharedData=(UConverterSharedData*)(&_ISO2022JPData);
uprv_strcpy(myConverterData->locale,"ja"); uprv_strcpy(myConverterData->locale,"ja");
myConverterData->version =options & UCNV_OPTIONS_VERSION_MASK; myConverterData->version = version;
uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version="); uprv_strcpy(myConverterData->name,"ISO_2022,locale=ja,version=");
len = uprv_strlen(myConverterData->name); len = uprv_strlen(myConverterData->name);
myConverterData->name[len]=(char)(myConverterData->version+(int)'0'); myConverterData->name[len]=(char)(myConverterData->version+(int)'0');
@ -469,11 +488,9 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
(myLocale[2]=='_' || myLocale[2]=='\0')){ (myLocale[2]=='_' || myLocale[2]=='\0')){
/* open the required converters and cache them */ /* open the required converters and cache them */
myConverterData->myConverterArray[0] = NULL;
myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode); myConverterData->myConverterArray[GB2312_1] = ucnv_open("ibm-5478",errorCode);
myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode); myConverterData->myConverterArray[ISO_IR_165] = ucnv_open("iso-ir-165",errorCode);
myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode); myConverterData->myConverterArray[CNS_11643] = ucnv_open("cns-11643-1992",errorCode);
myConverterData->myConverterArray[4] = NULL;
/*initialize the state variables*/ /*initialize the state variables*/
@ -518,17 +535,19 @@ _ISO2022Open(UConverter *cnv, const char *name, const char *locale,uint32_t opti
static void static void
_ISO2022Close(UConverter *converter) { _ISO2022Close(UConverter *converter) {
UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo); UConverterDataISO2022* myData =(UConverterDataISO2022 *) (converter->extraInfo);
UConverter **array = myData->myConverterArray; UConverter **array = myData->myConverterArray;
int32_t i;
if (converter->extraInfo != NULL) { if (converter->extraInfo != NULL) {
/*close the array of converter pointers and free the memory*/ /*close the array of converter pointers and free the memory*/
while(*array!=NULL){ for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
if(*array==myData->currentConverter){ if(array[i]!=NULL) {
myData->currentConverter=NULL; if(array[i]==myData->currentConverter) {
myData->currentConverter=NULL;
}
ucnv_close(array[i]);
} }
ucnv_close(*array++);
} }
ucnv_close(myData->currentConverter); /* if not closed above */ ucnv_close(myData->currentConverter); /* if not closed above */
@ -1271,8 +1290,7 @@ static const Cnv2022Type myConverterType[MAX_VALID_CP_JP]={
DBCS, DBCS,
DBCS, DBCS,
DBCS, DBCS,
SBCS, HWKANA
}; };
static const StateEnum nextStateArray[5][MAX_VALID_CP_JP]= { static const StateEnum nextStateArray[5][MAX_VALID_CP_JP]= {
@ -1295,7 +1313,7 @@ static const char escSeqChars[MAX_VALID_CP_JP][6] ={
}; };
static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={ static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={
3, /* length of <ESC>(B ASCII */ 3, /* length of <ESC>(B ASCII */
3, /* length of <ESC>.A ISO-8859-1 */ 3, /* length of <ESC>.A ISO-8859-1 */
3, /* length of <ESC>.F ISO-8859-7 */ 3, /* length of <ESC>.F ISO-8859-7 */
3, /* length of <ESC>(J JISX-201 */ 3, /* length of <ESC>(J JISX-201 */
@ -1368,8 +1386,12 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
goto getTrail; goto getTrail;
} }
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState]; *currentConverter = convArray[*currentState];
sharedData= (*currentConverter)->sharedData; if(*currentConverter != NULL) {
sharedData = (*currentConverter)->sharedData;
} else {
sharedData = NULL;
}
while( source < sourceLimit){ while( source < sourceLimit){
@ -1395,7 +1417,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
} }
break; break;
case ASCII1: case ASCII1:
if(sourceChar < 0x7f){ if(sourceChar <= 0x7f){
targetByteUnit = sourceChar; targetByteUnit = sourceChar;
} }
break; break;
@ -1407,21 +1429,9 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
* If mySourceChar is unassigned, then _MBCSSingleFromUChar32() returns -1 * If mySourceChar is unassigned, then _MBCSSingleFromUChar32() returns -1
* which becomes the same as missingCharMarker with the cast to uint16_t. * which becomes the same as missingCharMarker with the cast to uint16_t.
*/ */
/* Check if the sourceChar is in the HW Kana range*/ if(targetByteUnit>0x7f && converterData->version!=4) {
if(0xFF9F-sourceChar<=(0xFF9F-0xFF61)){ /* use bytes >0x7f only for JIS8 */
if( converterData->version==3){ targetByteUnit = missingCharMarker;
/*we get a1-df from _MBCSSingleFromUChar32 so subtract 0x80*/
targetByteUnit-=0x80;
*currentState = HWKANA_7BIT;
}
else if( converterData->version==4){
*currentState = JISX201;
}
else{
targetByteUnit=missingCharMarker;
}
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
*currentType = (Cnv2022Type) myConverterType[*currentState];
} }
break; break;
@ -1431,15 +1441,37 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
} }
break; break;
case HWKANA:
/* Check if the sourceChar is in the HW Kana range*/
if(0xFF9F-sourceChar<=(0xFF9F-0xFF61)){
if( converterData->version==4){
/* 8-bit Katakana */
targetByteUnit = (uint32_t)(sourceChar - (0xff61 - 0xa1));
*currentState = JISX201;
}
else{
/* 7-bit Katakana */
targetByteUnit = (uint32_t)(sourceChar - (0xff61 - 0x21));
*currentState = HWKANA_7BIT;
}
*currentConverter = convArray[*currentState];
*currentType = (Cnv2022Type) myConverterType[*currentState];
}
break;
default: default:
/*not expected */ /*not expected */
break; break;
} }
if(targetByteUnit==missingCharMarker){ if(targetByteUnit==missingCharMarker){
*currentState = nextStateArray[converterData->version][*currentState]; *currentState = nextStateArray[converterData->version][*currentState];
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState]; *currentConverter = convArray[*currentState];
*currentType = (Cnv2022Type) myConverterType[*currentState]; *currentType = (Cnv2022Type) myConverterType[*currentState];
sharedData= (*currentConverter)->sharedData; if(*currentConverter != NULL) {
sharedData = (*currentConverter)->sharedData;
} else {
sharedData = NULL;
}
} }
else else
/*got the mapping so break from while loop*/ /*got the mapping so break from while loop*/
@ -1654,12 +1686,12 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
case ASCII1: case ASCII1:
if( mySourceChar < 0x7F){ if( mySourceChar <= 0x7F){
targetUniChar = (UChar) mySourceChar; targetUniChar = (UChar) mySourceChar;
} }
else if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) { else if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
/* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar); targetUniChar = mySourceChar + (0xff61 - 0xa1);
} }
break; break;
@ -1667,19 +1699,21 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
case SBCS: case SBCS:
if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) { if((uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4) {
/* 8-bit halfwidth katakana in any single-byte mode for JIS8 */ /* 8-bit halfwidth katakana in any single-byte mode for JIS8 */
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar); targetUniChar = mySourceChar + (0xff61 - 0xa1);
}
else if(*currentState==HWKANA_7BIT){
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->myConverterArray[JISX201]->sharedData, mySourceChar+0x80);
} }
else { else {
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->currentConverter->sharedData, mySourceChar); targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(myData->currentConverter->sharedData, mySourceChar);
} }
break;
case HWKANA:
if((uint8_t)(mySourceChar - 0x21) <= (0x5f - 0x21)) {
/* 7-bit halfwidth Katakana */
targetUniChar = mySourceChar + (0xff61 - 0x21);
}
break; break;
case LATIN1: case LATIN1:
targetUniChar = (UChar) mySourceChar; targetUniChar = (UChar) mySourceChar;
break; break;
@ -1695,8 +1729,8 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
} }
if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){ if(targetUniChar < (missingCharMarker-1/*0xfffe*/)){
if(args->offsets){ if(args->offsets){
args->offsets[myTarget - args->target]= mySource - args->source - 2 args->offsets[myTarget - args->target]= mySource - args->source -
+(myConverterType[*currentState] <= SBCS); (mySourceChar <= 0xff ? 1 : 2);
} }
*(myTarget++)=(UChar)targetUniChar; *(myTarget++)=(UChar)targetUniChar;
@ -2772,12 +2806,45 @@ _ISO_2022_GetUnicodeSet(const UConverter *cnv,
return; return;
} }
cnvSet = uset_open(0, 0); /* open a set and initialize it with code points that are algorithmically round-tripped */
switch(cnvData->locale[0]){
case 'j':
if(jpCharsetMasks[cnvData->version]&CSM(ISO8859_1)) {
/* include Latin-1 for some variants of JP */
cnvSet = uset_open(0, 0xff);
} else {
/* include ASCII for JP */
cnvSet = uset_open(0, 0x7f);
}
/* include half-width Katakana for JP */
uset_addRange(cnvSet, 0xff61, 0xff9f);
break;
case 'c':
case 'z':
/* include ASCII for CN */
cnvSet = uset_open(0, 0x7f);
break;
case 'k':
/* there is only one converter for KR, and it is not in the myConverterArray[] */
ucnv_getUnicodeSet(cnvData->currentConverter, set, which, pErrorCode);
return;
default:
cnvSet = uset_open(1, 0);
break;
}
if (!cnvSet) { if (!cnvSet) {
*pErrorCode =U_MEMORY_ALLOCATION_ERROR; *pErrorCode =U_MEMORY_ALLOCATION_ERROR;
return; return;
} }
/*
* TODO: need to make this version-specific for CN.
* CN version 0 does not map CNS planes 3..7 although
* they are all available in the CNS conversion table;
* CN version 1 does map them all.
* The two versions need to create different Unicode sets.
*/
for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) { for (i=0; i<UCNV_2022_MAX_CONVERTERS; i++) {
if(cnvData->myConverterArray[i]!=NULL) { if(cnvData->myConverterArray[i]!=NULL) {
ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode); ucnv_getUnicodeSet(cnvData->myConverterArray[i], cnvSet, which, pErrorCode);