ICU-947 ISO-2022 performance improvements

X-SVN-Rev: 4620
This commit is contained in:
Ram Viswanadha 2001-05-07 23:54:01 +00:00
parent 3edc7c1739
commit 0a875cdc0b
3 changed files with 292 additions and 227 deletions

View File

@ -34,14 +34,15 @@
#include "ucnv_cnv.h"
#include "unicode/ustring.h"
#include "unicode/ucnv_cb.h"
#include "ucnvmbcs.h"
#include "cstring.h"
#define CONCAT_ESCAPE_EX(args, target, targetLimit,offsets,strToAppend,len, err){\
#define CONCAT_ESCAPE_EX(args,source, target, targetLimit,offsets,strToAppend,len, err){\
while(len-->0){\
if(target < targetLimit){\
*(target++) = (unsigned char) *(strToAppend++);\
if(offsets){\
*(offsets++) = source - args->source -1;\
*(offsets++) = source - args->source -1;\
}\
}\
else{\
@ -51,6 +52,72 @@
}\
}
/*
 * Inline lookup of the codepage bytes for one Unicode code point.
 *
 * sharedData  - pointer whose ->table->mbcs holds fromUnicodeTable,
 *               fromUnicodeBytes and unicodeMask
 * c           - code point to convert; evaluated several times, so it must be
 *               free of side effects
 * value       - lvalue that receives the output bytes packed into one integer;
 *               written only when length ends up non-zero
 * useFallback - passed through to FROM_U_USE_FALLBACK()
 * length      - lvalue that receives the number of output bytes (1..3), or 0
 *               when there is no usable mapping
 * outputType  - MBCS_OUTPUT_2 or MBCS_OUTPUT_3; any other value yields length 0
 *
 * The macro declares the locals table, stage2Entry, myValue and p in its own
 * block, so the arguments must not be expressions built on those names.
 */
#define MBCS_FROM_UCHAR32_ISO2022(sharedData,c, value, useFallback, length, outputType) {\
    const uint16_t *table=(sharedData)->table->mbcs.fromUnicodeTable;\
    uint32_t stage2Entry=0;\
    uint32_t myValue=0;\
    /* start from "no mapping"; only a successful lookup below raises it */\
    length=0;\
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */\
    if((c)<0x10000 || ((sharedData)->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {\
        stage2Entry=MBCS_STAGE_2_FROM_U(table, (c));\
        /* get the bytes and the length for the output */\
        if((outputType)==MBCS_OUTPUT_2){\
            myValue=MBCS_VALUE_2_FROM_STAGE_2((sharedData)->table->mbcs.fromUnicodeBytes, stage2Entry, (c));\
            if(myValue<=0xff) {\
                length=1;\
            } else {\
                length=2;\
            }\
        }else if((outputType)==MBCS_OUTPUT_3){\
            const uint8_t *p=MBCS_POINTER_3_FROM_STAGE_2((sharedData)->table->mbcs.fromUnicodeBytes, stage2Entry, (c));\
            myValue=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2];\
            if(myValue<=0xff) {\
                length=1;\
            } else if(myValue<=0xffff) {\
                length=2;\
            } else {\
                length=3;\
            }\
        }\
        /* is this code point assigned, or do we use fallbacks? */\
        /* the flag test shifts an unsigned 1: bit 31 is reachable when (c&0xf)==15 */\
        if( length!=0 &&\
            ( (stage2Entry&((uint32_t)1<<(16+((c)&0xf))))!=0 ||\
              (FROM_U_USE_FALLBACK(useFallback, (c)) && (myValue!=0 || (c)==0)) )\
        ) {\
            /*\
             * We allow a 0 byte output if the Unicode code point is\
             * U+0000 and also if the "assigned" bit is set for this entry.\
             * There is no way with this data structure for fallback output\
             * for other than U+0000 to be a zero byte.\
             */\
            /* assigned */\
            value=myValue;\
        } else {\
            length=0;\
        }\
    }\
}
/*
 * Inline single-byte lookup for one Unicode code point.
 *
 * sharedData  - pointer whose ->table->mbcs holds fromUnicodeTable,
 *               fromUnicodeBytes and unicodeMask
 * c           - code point to convert; evaluated more than once, so it must be
 *               free of side effects
 * retval      - lvalue that receives the output byte (0..0xff), or
 *               (uint16_t)-1 == 0xffff when there is no usable mapping
 * useFallback - non-zero to accept result words >= 0x800 instead of >= 0xc00
 *
 * The macro declares the locals table and value in its own block, so the
 * arguments must not be expressions built on those names.
 */
#define MBCS_SINGLE_FROM_UCHAR32(sharedData, c,retval, useFallback) { \
    const uint16_t *table; \
    int32_t value;\
    /* BMP-only codepages are stored without stage 1 entries for supplementary code points */\
    if((c)>=0x10000 && !((sharedData)->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {\
        /* must not fall into the lookup below: stage 1 has no entry for this c */\
        value= -1;\
    } else {\
        /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */\
        table=(sharedData)->table->mbcs.fromUnicodeTable;\
        /* get the byte for the output */\
        value=MBCS_SINGLE_RESULT_FROM_U(table, (uint16_t *)(sharedData)->table->mbcs.fromUnicodeBytes, (c));\
        /* is this code point assigned, or do we use fallbacks? */\
        if((useFallback) ? value>=0x800 : value>=0xc00) {\
            /* the output byte is the low 8 bits of the result word */\
            value &=0xff;\
        } else {\
            value= -1;\
        }\
    }\
    retval=(uint16_t) value;\
}
/* single-shift 2: the two bytes ESC (0x1B) 'N' (0x4E) */
#define UCNV_SS2 "\x1B\x4E"
/* single-shift 3: the two bytes ESC (0x1B) 'O' (0x4F) */
#define UCNV_SS3 "\x1B\x4F"
/* number of bytes in UCNV_SS2, not counting the string terminator */
#define UCNV_SS2_LEN 2
@ -1341,7 +1408,6 @@ static const int32_t escSeqCharsLen[MAX_VALID_CP_JP] ={
3 /* length of <ESC>(I HWKANA_7BIT */
};
/*
* The iteration over various code pages works this way:
* i) Get the currentState from myConverterData->currentState
@ -1368,11 +1434,14 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
const UChar* source = args->source;
const UChar* sourceLimit = args->sourceLimit;
int32_t* offsets = args->offsets;
uint32_t targetUniChar = missingCharMarker;
int32_t offset = 0;
uint32_t targetByteUnit = missingCharMarker;
UChar32 sourceChar =0x0000;
const char* escSeq = NULL;
int len =0; /*length of escSeq chars*/
UConverterCallbackReason reason;
UConverterSharedData* sharedData=NULL;
UBool useFallback = args->converter->useFallback;
/* state variables*/
StateEnum* currentState = &converterData->fromUnicodeCurrentState;
@ -1380,7 +1449,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UConverter** currentConverter = &converterData->fromUnicodeConverter;
Cnv2022Type* currentType = &converterData->currentType;
UConverter** convArray = converterData->myConverterArray;
/* arguments check*/
if ((args->converter == NULL) || (targetLimit < target) || (sourceLimit < source)){
*err = U_ILLEGAL_ARGUMENT_ERROR;
@ -1397,15 +1466,15 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
goto getTrail;
}
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
sharedData= (*currentConverter)->sharedData;
while( source < sourceLimit){
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
targetUniChar = missingCharMarker;
targetByteUnit = missingCharMarker;
if(target < targetLimit){
sourceChar = *(source++);
if(sourceChar > SPACE) {
do{
@ -1413,22 +1482,26 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
/* most common case*/
case DBCS:
{
uint32_t value;
if(2 == _MBCSFromUChar32((*currentConverter)->sharedData,
sourceChar, &value, args->converter->useFallback)) {
targetUniChar = (uint16_t)value;
uint32_t value=0;
int length=0;
/*if(2 == _MBCSFromUChar32(sharedData,sourceChar, &value, useFallback)) {
targetByteUnit = (uint16_t)value;
}*/
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,value,useFallback,length,MBCS_OUTPUT_2);
if(length==2){
targetByteUnit = value;
}
}
break;
case ASCII1:
if(sourceChar < 0x7f){
targetUniChar = sourceChar;
targetByteUnit = sourceChar;
}
break;
case SBCS:
targetUniChar = (uint16_t)_MBCSSingleFromUChar32((*currentConverter)->sharedData,
sourceChar, args->converter->useFallback);
MBCS_SINGLE_FROM_UCHAR32(sharedData,sourceChar,targetByteUnit,useFallback);
// targetByteUnit=(uint16_t)_MBCSSingleFromUChar32(sharedData,sourceChar,useFallback);
/*
* If mySourceChar is unassigned, then _MBCSSingleFromUChar32() returns -1
* which becomes the same as missingCharMarker with the cast to uint16_t.
@ -1437,14 +1510,14 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
if(0xFF9F-sourceChar<=(0xFF9F-0xFF61)){
if( converterData->version==3){
/*we get a1-df from _MBCSSingleFromUChar32 so subtract 0x80*/
targetUniChar-=0x80;
targetByteUnit-=0x80;
*currentState = HWKANA_7BIT;
}
else if( converterData->version==4){
*currentState = JISX201;
}
else{
targetUniChar=missingCharMarker;
targetByteUnit=missingCharMarker;
}
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
*currentType = (Cnv2022Type) myConverterType[*currentState];
@ -1453,7 +1526,7 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
case LATIN1:
if(sourceChar <= 0x00FF){
targetUniChar = sourceChar;
targetByteUnit = sourceChar;
}
break;
@ -1461,10 +1534,11 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
/*not expected */
break;
}
if(targetUniChar==missingCharMarker){
if(targetByteUnit==missingCharMarker){
*currentState = nextStateArray[converterData->version][*currentState];
*currentConverter = convArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
*currentType = (Cnv2022Type) myConverterType[*currentState];
sharedData= (*currentConverter)->sharedData;
}
else
/*got the mapping so break from while loop*/
@ -1474,63 +1548,63 @@ UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
}
else{
targetUniChar = sourceChar;
targetByteUnit = sourceChar;
*currentState = ASCII;
*currentType = (Cnv2022Type) myConverterType[*currentState];
}
if(targetUniChar != missingCharMarker){
if(targetByteUnit != missingCharMarker){
if( *currentState != initIterState){
escSeq = escSeqChars[(int)*currentState];
len = escSeqCharsLen[(int)*currentState];
CONCAT_ESCAPE_EX(args, target, targetLimit, offsets, escSeq,len,err);
CONCAT_ESCAPE_EX(args,source, target,targetLimit, offsets, escSeq,len,err);
/* Append SSN for shifting to G2 */
if(*currentState==ISO8859_1 || *currentState==ISO8859_7){
escSeq = UCNV_SS2;
len = UCNV_SS2_LEN;
CONCAT_ESCAPE_EX(args, target, targetLimit,offsets, escSeq,len,err);
CONCAT_ESCAPE_EX(args, source, target, targetLimit,offsets, escSeq,len,err);
}
}
initIterState = *currentState;
/* write the targetUniChar to target */
if(targetUniChar <= 0x00FF){
offset = source - args->source -1;
/* write the targetByteUnit to target */
if(targetByteUnit <= 0x00FF){
if( target <targetLimit){
*(target++) = (unsigned char) targetUniChar;
*(target++) = (unsigned char) targetByteUnit;
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) = offset;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetUniChar;
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetByteUnit;
*err = U_BUFFER_OVERFLOW_ERROR;
}
}else{
if(target < targetLimit){
*(target++) =(unsigned char) (targetUniChar>>8);
*(target++) =(unsigned char) (targetByteUnit>>8);
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) = offset;
}
if(target < targetLimit){
*(target++) =(unsigned char) (targetUniChar);
*(target++) =(unsigned char) (targetByteUnit);
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) = offset;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
*err = U_BUFFER_OVERFLOW_ERROR;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar>>8);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit>>8);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
}
else{
@ -1608,7 +1682,6 @@ getTrail:
args->target = (char*)target;
}
/*************** to unicode *******************/
/****************************************************************************
@ -1715,9 +1788,8 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
mySourceChar= (unsigned char) *mySource++;
/* Consume the escape sequences and ascertain the state */
switch(mySourceChar){
case UCNV_SI:
if(myData->version==3 && *toUnicodeStatus==0x00){
if(mySourceChar==UCNV_SI){
if(myData->version==3 && *toUnicodeStatus==0x00){
if(myData->toUnicodeSaveState!=INVALID_STATE){
*currentState = (StateEnum) myData->toUnicodeSaveState;
continue;
@ -1726,53 +1798,59 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
*err =U_ILLEGAL_CHAR_FOUND;
goto CALLBACK;
}
}
else
else{
goto CALLBACK;
case UCNV_SO:
}
}else if(mySourceChar==UCNV_SO){
if(myData->version==3 && *toUnicodeStatus==0x00){
myData->toUnicodeSaveState= (int) *currentState;
*currentState = HWKANA_7BIT;
continue;
}
else
else{
goto CALLBACK;
default:
if(myData->key==0){
if(mySourceChar<=SPACE){
if(*toUnicodeStatus== 0x00){
*currentState = ASCII;
}
}else if(mySourceChar==ESC_2022 || myData->key!=0){
if(*toUnicodeStatus== 0x00){
mySource--;
changeState_2022(args->converter,&(mySource),
args->sourceLimit, args->flush,ISO_2022_JP,&plane, err);
/*Invalid or illegal escape sequence */
if(U_SUCCESS(*err)){
continue;
}
else
goto CALLBACK;
}
break;
}
case ESC_2022:
if(*toUnicodeStatus== 0x00){
mySource--;
changeState_2022(args->converter,&(mySource),
args->sourceLimit, args->flush,ISO_2022_JP,&plane, err);
/*Invalid or illegal escape sequence */
if(U_SUCCESS(*err)){
continue;
}
else{
args->target = myTarget;
args->source = mySource;
return;
}
}
else
else{
args->target = myTarget;
args->source = mySource;
return;
}
}
else{
goto CALLBACK;
}
}
switch(myConverterType[*currentState]){
case DBCS:
if(*toUnicodeStatus== 0x00){
*toUnicodeStatus= (UChar) mySourceChar;
continue;
}
else{
const char *pBuf;
tempBuf[0] = (char) args->converter->toUnicodeStatus;
tempBuf[1] = (char) mySourceChar;
mySourceChar+= (args->converter->toUnicodeStatus)<<8;
*toUnicodeStatus= 0;
pBuf = tempBuf;
targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData, &pBuf, tempBuf+2, args->converter->useFallback);
}
break;
case ASCII1:
if( mySourceChar < 0x7F){
@ -1799,23 +1877,6 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
break;
case DBCS:
if(*toUnicodeStatus== 0x00){
*toUnicodeStatus= (UChar) mySourceChar;
continue;
}
else{
const char *pBuf;
tempBuf[0] = (char) args->converter->toUnicodeStatus;
tempBuf[1] = (char) mySourceChar;
mySourceChar+= (args->converter->toUnicodeStatus)<<8;
*toUnicodeStatus= 0;
pBuf = tempBuf;
targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData, &pBuf, tempBuf+2, args->converter->useFallback);
}
break;
case LATIN1:
targetUniChar = (UChar) mySourceChar;
@ -1921,6 +1982,8 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
UBool oldIsTargetByteDBCS = isTargetByteDBCS;
UConverterDataISO2022 *converterData=(UConverterDataISO2022*)args->converter->extraInfo;
UConverterCallbackReason reason;
UConverterSharedData* sharedData = converterData->fromUnicodeConverter->sharedData;
UBool useFallback = args->converter->useFallback;
int32_t length =0;
/*Arguments Check*/
@ -1951,8 +2014,9 @@ UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
if(target < (unsigned char*) args->targetLimit){
sourceChar = *source++;
length= _MBCSFromUChar32(converterData->fromUnicodeConverter->sharedData,
sourceChar,&targetByteUnit,args->converter->useFallback);
/* length= _MBCSFromUChar32(converterData->fromUnicodeConverter->sharedData,
sourceChar,&targetByteUnit,args->converter->useFallback);*/
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,targetByteUnit,useFallback,length,MBCS_OUTPUT_2);
/* only DBCS or SBCS characters are expected*/
/* DB characters with high bit set to 1 are expected */
if(length > 2 || length==0 ||(((targetByteUnit & 0x8080) != 0x8080)&& length==2)){
@ -2147,6 +2211,8 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
UChar mySourceChar = 0x0000;
UConverterDataISO2022* myData=(UConverterDataISO2022*)(args->converter->extraInfo);
int plane =0; /*dummy variable */
UConverterSharedData* sharedData = myData->fromUnicodeConverter->sharedData;
UBool useFallback = args->converter->useFallback;
/*Arguments Check*/
if (U_FAILURE(*err)){
@ -2169,44 +2235,31 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
mySourceChar= (unsigned char) *mySource++;
switch(mySourceChar){
case UCNV_SI:
myData->currentType = SBCS;
/*consume the source */
continue;
case UCNV_SO:
myData->currentType =DBCS;
/*consume the source */
continue;
default:
/* If we are in the process of consuming an escape sequence
* we fall through execute the the statements of next switch
* tag else we break;
*/
if(myData->key==0){
break;
}
case ESC_2022:
{
/* Already doing some conversion and found escape Sequence*/
if(args->converter->mode == UCNV_SO){
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
}
else{
mySource--;
changeState_2022(args->converter,&(mySource),
args->sourceLimit, args->flush,ISO_2022_KR,&plane, err);
}
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
return;
}
continue;
if(mySourceChar==UCNV_SI){
myData->currentType = SBCS;
/*consume the source */
continue;
}else if(mySourceChar==UCNV_SO){
myData->currentType = DBCS;
/*consume the source */
continue;
}else if(mySourceChar==ESC_2022 || myData->key!=0){
/* Already doing some conversion and found escape Sequence*/
if(args->converter->mode == UCNV_SO){
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
}
}
else{
mySource--;
changeState_2022(args->converter,&(mySource),
args->sourceLimit, args->flush,ISO_2022_KR,&plane, err);
}
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
return;
}
continue;
}
if(myData->currentType==DBCS){
if(args->converter->toUnicodeStatus == 0x00){
@ -2218,28 +2271,24 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
tempBuf[1] = (char) (mySourceChar+0x80);
mySourceChar = (UChar)(mySourceChar + (args->converter->toUnicodeStatus<<8));
args->converter->toUnicodeStatus =0x00;
pBuf = &tempBuf[0];
tempLimit = &tempBuf[2]+1;
targetUniChar = _MBCSSimpleGetNextUChar(myData->fromUnicodeConverter->sharedData,
&pBuf,tempLimit,args->converter->useFallback);
pBuf = tempBuf;
targetUniChar = _MBCSSimpleGetNextUChar(sharedData,
&pBuf,(pBuf+2),useFallback);
}
}
else{
if(args->converter->fromUnicodeStatus == 0x00){
tempBuf[0] = (char) mySourceChar;
pBuf = &tempBuf[0];
tempLimit = &tempBuf[1];
targetUniChar = _MBCSSimpleGetNextUChar(myData->currentConverter->sharedData,
&pBuf,tempLimit,args->converter->useFallback);
targetUniChar = _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, mySourceChar);
}
}
if(targetUniChar < 0xfffe){
if(targetUniChar != missingCharMarker){
if(args->offsets)
args->offsets[myTarget - args->target]= mySource - args->source - 1-(myData->currentType==DBCS);
*(myTarget++)=(UChar)targetUniChar;
}
else if(targetUniChar>=0xfffe){
else {
/* Call the callback function*/
toUnicodeCallback(args,mySourceChar,&mySource,targetUniChar,&myTarget,err);
@ -2411,8 +2460,8 @@ typedef enum {
static Cnv2022Type myConverterTypeCN[4]={
ASCII1,
MBCS,
MBCS,
DBCS,
DBCS,
MBCS
};
@ -2430,13 +2479,16 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
const UChar* source = args->source;
const UChar* sourceLimit = args->sourceLimit;
int32_t* offsets = args->offsets;
uint32_t targetUniChar = missingCharMarker;
int32_t offset =0;
uint32_t targetByteUnit = missingCharMarker;
uint32_t sourceChar =0x0000;
const char* escSeq = NULL;
int len =0; /*length of escSeq chars*/
uint32_t targetValue=0;
uint8_t planeVal=0;
UConverterCallbackReason reason;
UConverterSharedData* sharedData=NULL;
UBool useFallback = args->converter->useFallback;
/* state variables*/
StateEnumCN* currentState = (StateEnumCN*)&converterData->fromUnicodeCurrentState;
@ -2456,18 +2508,18 @@ UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterFromUnicodeArgs* args
if(U_FAILURE(*err)){
return;
}
/* set up the state */
initIterState = *currentState;
*currentConverter =converterData->myConverterArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
sharedData=(*currentConverter)->sharedData;
/* check if the last codepoint of previous buffer was a lead surrogate*/
if(args->converter->fromUSurrogateLead!=0 && target< targetLimit) {
goto getTrail;
}
while( source < sourceLimit){
*currentConverter =converterData->myConverterArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
targetUniChar =missingCharMarker;
targetByteUnit =missingCharMarker;
lPlane =0;
if(target < targetLimit){
@ -2484,6 +2536,7 @@ getTrail:
UChar trail=(UChar) *source;
if(UTF_IS_SECOND_SURROGATE(trail)) {
source++;
/*(((args->converter->fromUSurrogateLead)<<10L)+(trail)-((0xd800<<10L)+0xdc00-0x10000))*/
sourceChar=UTF16_GET_PAIR_VALUE(args->converter->fromUSurrogateLead, trail);
args->converter->fromUSurrogateLead=0x00;
/* convert this surrogate code point */
@ -2510,7 +2563,7 @@ getTrail:
/* do the conversion */
if(sourceChar < 0x007f ){
targetUniChar = sourceChar;
targetByteUnit = sourceChar;
if(*currentState!= ASCII_1){
*currentState = ASCII_1;
*isEscapeAppended = FALSE;
@ -2521,57 +2574,53 @@ getTrail:
do{
if(myConverterTypeCN[*currentState] == MBCS){
len= _MBCSFromUChar32((*currentConverter)->sharedData,sourceChar,
&targetValue,args->converter->useFallback);
switch(len){
case 0:
targetUniChar = missingCharMarker;
break;
case 2:
if(( converterData->version) == 0 && *currentState ==ISO_IR_165){
targetUniChar = missingCharMarker;
}else{
targetUniChar = (UChar32) targetValue;
}
break;
case 3:
targetUniChar = (UChar32) targetValue;
/*len= _MBCSFromUChar32((*currentConverter)->sharedData,sourceChar,
&targetValue,args->converter->useFallback);*/
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,targetValue,useFallback,len,MBCS_OUTPUT_3);
if(len==3){
targetByteUnit = (UChar32) targetValue;
planeVal = (uint8_t) ((targetValue)>>16);
if(planeVal >0x80 && planeVal<0x89){
lPlane = (int)(planeVal - 0x80);
targetUniChar -= (planeVal<<16);
targetByteUnit -= (planeVal<<16);
}else {
lPlane =-1;
targetByteUnit=missingCharMarker;
}
if(converterData->version == 0 && lPlane >2){
targetUniChar = missingCharMarker;
targetByteUnit = missingCharMarker;
}
break;
default:
reason =UCNV_ILLEGAL;
*err =U_INVALID_CHAR_FOUND;
break;
}
}else if(myConverterTypeCN[*currentState] == DBCS){
MBCS_FROM_UCHAR32_ISO2022(sharedData,sourceChar,targetValue,useFallback,len,MBCS_OUTPUT_2);
if(len==2){
if(( converterData->version) == 0 && *currentState ==ISO_IR_165){
targetByteUnit = missingCharMarker;
}else{
targetByteUnit = (UChar32) targetValue;
}
}
}else{
if(sourceChar < 0x7f){
targetUniChar = sourceChar;
targetByteUnit = sourceChar;
}
}
if(targetUniChar==missingCharMarker){
if(targetByteUnit==missingCharMarker){
*currentState=(StateEnumCN)((*currentState<3)? *currentState+1:0);
*currentConverter =converterData->myConverterArray[(*currentConverter==NULL) ? 0 : (int)*currentState];
targetUniChar =missingCharMarker;
targetByteUnit =missingCharMarker;
*isEscapeAppended = FALSE;
*isShiftAppended = FALSE;
sharedData=(*currentConverter)->sharedData;
}
else
break;
}while(initIterState != *currentState);
}
if(targetUniChar != missingCharMarker){
if(targetByteUnit != missingCharMarker){
args->converter->fromUnicodeStatus=(UBool) (*currentState > ASCII_1);
/* Append the escape sequence */
@ -2580,64 +2629,57 @@ getTrail:
temp =(*currentState==CNS_11643) ? ((int)*currentState+lPlane-1):(int)*currentState ;
escSeq = escSeqCharsCN[temp];
len =escSeqCharsLenCN[temp];
CONCAT_ESCAPE_EX(args, target, targetLimit, offsets, escSeq,len,err);
CONCAT_ESCAPE_EX(args,source, target, targetLimit, offsets, escSeq,len,err);
*plane=lPlane;
*isEscapeAppended=TRUE;
}
/* Append Shift Sequences */
switch(*currentState){
case ASCII1:
break;
case GB2312_1:
/*falls through */
case ISO_IR_165:
if(!*isShiftAppended){
len =shiftSeqCharsLenCN[*currentState];
escSeq = shiftSeqCharsCN[*currentState];
CONCAT_ESCAPE_EX(args, target, targetLimit, offsets, escSeq,len,err);
*isShiftAppended=TRUE;
}
break;
default:
len =strlen(shiftSeqCharsCN[*currentState+*plane]);
escSeq = shiftSeqCharsCN[*currentState+*plane];
CONCAT_ESCAPE_EX(args, target, targetLimit, offsets, escSeq,len,err);
break;
if(*currentState == GB2312_1 || *currentState==ISO_IR_165){
if(!*isShiftAppended){
len =shiftSeqCharsLenCN[*currentState];
escSeq = shiftSeqCharsCN[*currentState];
CONCAT_ESCAPE_EX(args,source, target, targetLimit, offsets, escSeq,len,err);
*isShiftAppended=TRUE;
}
}else if(*currentState!=ASCII1){
len =shiftSeqCharsLenCN[*currentState+*plane];
escSeq = shiftSeqCharsCN[*currentState+*plane];
CONCAT_ESCAPE_EX(args,source, target, targetLimit, offsets, escSeq,len,err);
}
initIterState = *currentState;
/* write the targetUniChar to target */
if(targetUniChar <= 0x00FF){
/* write the targetByteUnit to target */
if(targetByteUnit <= 0x00FF){
if( target <targetLimit){
*(target++) = (unsigned char) targetUniChar;
*(target++) = (unsigned char) targetByteUnit;
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) =source-args->source-1;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetUniChar;
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) targetByteUnit;
*err = U_BUFFER_OVERFLOW_ERROR;
}
}else{
if(target < targetLimit){
*(target++) =(unsigned char) (targetUniChar>>8);
*(target++) =(unsigned char) (targetByteUnit>>8);
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) = source-args->source-1;
}
if(target < targetLimit){
*(target++) =(unsigned char) (targetUniChar);
*(target++) =(unsigned char) (targetByteUnit);
if(offsets){
*(offsets++) = source - args->source-1;
*(offsets++) = source-args->source-1;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
*err = U_BUFFER_OVERFLOW_ERROR;
}
}else{
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar>>8);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetUniChar);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit>>8);
args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = (unsigned char) (targetByteUnit);
*err = U_BUFFER_OVERFLOW_ERROR;
}
}
@ -2672,9 +2714,7 @@ callback:
*flush is TRUE, we can deduce that the input stream is truncated
*/
if (args->converter->fromUSurrogateLead !=0 && (source == sourceLimit) && args->flush){
*err = U_TRUNCATED_CHAR_FOUND;
}
/* Reset the state of converter if we consumed
* the source and flush is true
@ -2941,6 +2981,7 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
mySourceChar= (unsigned char) *mySource++;
switch(mySourceChar){
case UCNV_SI:
if(args->converter->toUnicodeStatus != 0x00){
@ -3013,13 +3054,10 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
myData->plane=plane;
if(plane>0){
myData->currentType = MBCS;
}else{
myData->currentType=DBCS;
}
else if(myData->currentConverter &&
uprv_stricmp("latin_1",
myData->currentConverter->sharedData->staticData->name)==0){
myData->currentType=ASCII1;
}
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
@ -3031,12 +3069,32 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
}
if(targetUniChar < 0xfffe){
if(args->offsets){
args->offsets[myTarget - args->target]= mySource - args->source - 2
+(myData->currentType==ASCII);
}
*(myTarget++)=(UChar)targetUniChar;
}
else if(targetUniChar > 0xfffe){
/* disassemble the surrogate pair and write to output*/
if(args->offsets){
args->offsets[myTarget - args->target]= mySource - args->source - 2
+(myData->currentType==ASCII);
}
targetUniChar-=0x0010000;
*(myTarget++) =(UChar)(0xd800+(UChar)(targetUniChar>>10));
if(myTarget< args->targetLimit){
if(args->offsets){
args->offsets[myTarget - args->target]= mySource - args->source - 2
+(myData->currentType==ASCII);
}
*(myTarget)++ = (UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
}else{
args->converter->UCharErrorBuffer[args->converter->UCharErrorBufferLength++]=
(UChar)(0xdc00+(UChar)(targetUniChar&0x3ff));
}
}
else{
/* Call the callback function*/
toUnicodeCallback(args,mySourceChar,&mySource,targetUniChar,&myTarget,err);

View File

@ -28,6 +28,12 @@
* - byte sequences must not have leading zero bytes
* - except for SBCS codepages: no fallback mapping from Unicode to a zero byte
* - limitation to up to 4 bytes per character
*
* Change history:
*
* 5/6/2001 Ram Moved MBCS_SINGLE_RESULT_FROM_U,MBCS_STAGE_2_FROM_U,
* MBCS_VALUE_2_FROM_STAGE_2, MBCS_VALUE_4_FROM_STAGE_2
* macros to ucnvmbcs.h file
*/
#include "unicode/utypes.h"
@ -269,17 +275,6 @@
* adding new ones without crashing an unaware converter
*/
/* single-byte fromUnicode: get the 16-bit result word */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) (results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ]
/* multi-byte fromUnicode: get the 32-bit stage 2 entry */
#define MBCS_STAGE_2_FROM_U(table, c) ((uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ]
#define MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c) ((uint16_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)]
#define MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c) ((uint32_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)]
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
/* prototypes --------------------------------------------------------------- */
U_CFUNC void

View File

@ -70,6 +70,18 @@ enum {
#define MBCS_ENTRY_FINAL_VALUE(entry) ((entry)&0xfffff)
#define MBCS_ENTRY_FINAL_VALUE_16(entry) (uint16_t)(entry)
/*
 * single-byte fromUnicode: get the 16-bit result word
 * (table) is indexed with c>>10, then with bits 9..4 of c; (results) is
 * indexed with that entry plus the low 4 bits of c.
 * Every argument is evaluated more than once.
 */
#define MBCS_SINGLE_RESULT_FROM_U(table, results, c) (results)[ (table)[ (table)[(c)>>10] +(((c)>>4)&0x3f) ] +((c)&0xf) ]
/*
 * multi-byte fromUnicode: get the 32-bit stage 2 entry
 * The stage 1 entry (table)[c>>10] plus bits 9..4 of c is used as an index
 * into the same memory viewed as an array of uint32_t.
 */
#define MBCS_STAGE_2_FROM_U(table, c) ((uint32_t *)(table))[ (table)[(c)>>10] +(((c)>>4)&0x3f) ]
/*
 * multi-byte fromUnicode: get the 16-bit stage 3 value.
 * The low 16 bits of stage2Entry select a block of 16 entries in (bytes),
 * and the low 4 bits of c select the entry within it.
 */
#define MBCS_VALUE_2_FROM_STAGE_2(bytes, stage2Entry, c) ((uint16_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)]
/* same indexing as MBCS_VALUE_2_FROM_STAGE_2, but (bytes) is read as 32-bit entries */
#define MBCS_VALUE_4_FROM_STAGE_2(bytes, stage2Entry, c) ((uint32_t *)(bytes))[16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf)]
/*
 * same indexing again, for 3-byte entries: yields a pointer to the first of
 * the 3 bytes instead of a value
 */
#define MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c) ((bytes)+(16*(uint32_t)(uint16_t)(stage2Entry)+((c)&0xf))*3)
/**
* MBCS output types for conversions from Unicode.
* These per-converter types determine the storage method in stage 3 of the lookup table,