ICU-6175 Invoke toUnicode error handler for empty segments in ISO-2022-x & HZ with new UConverter.toUCallbackReason=UCNV_IRREGULAR
X-SVN-Rev: 23571
This commit is contained in:
parent
ccd1b36465
commit
867af878ad
@ -1529,11 +1529,14 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
|
||||
cnv->toULength=0;
|
||||
|
||||
/* call the callback function */
|
||||
if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
|
||||
cnv->toUCallbackReason = UCNV_UNASSIGNED;
|
||||
}
|
||||
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
|
||||
cnv->invalidCharBuffer, errorInputLength,
|
||||
(*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
|
||||
UCNV_UNASSIGNED : UCNV_ILLEGAL,
|
||||
cnv->toUCallbackReason,
|
||||
err);
|
||||
cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
|
||||
|
||||
/*
|
||||
* loop back to the offset handling
|
||||
|
@ -201,6 +201,7 @@ typedef struct{
|
||||
#ifdef U_ENABLE_GENERIC_ISO_2022
|
||||
UBool isFirstBuffer;
|
||||
#endif
|
||||
UBool isEmptySegment;
|
||||
char name[30];
|
||||
char locale[3];
|
||||
}UConverterDataISO2022;
|
||||
@ -609,6 +610,7 @@ _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
|
||||
if(choice<=UCNV_RESET_TO_UNICODE) {
|
||||
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
|
||||
myConverterData->key = 0;
|
||||
myConverterData->isEmptySegment = FALSE;
|
||||
}
|
||||
if(choice!=UCNV_RESET_TO_UNICODE) {
|
||||
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
|
||||
@ -814,6 +816,7 @@ DONE:
|
||||
if(chosenConverterName == NULL) {
|
||||
/* SS2 or SS3 */
|
||||
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
|
||||
_this->toUCallbackReason = UCNV_UNASSIGNED;
|
||||
return;
|
||||
}
|
||||
|
||||
@ -935,6 +938,8 @@ DONE:
|
||||
}
|
||||
if(U_SUCCESS(*err)) {
|
||||
_this->toULength = 0;
|
||||
} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
|
||||
_this->toUCallbackReason = UCNV_UNASSIGNED;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1986,6 +1991,7 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
continue;
|
||||
} else {
|
||||
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
||||
myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
||||
break;
|
||||
}
|
||||
|
||||
@ -1997,21 +2003,39 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
continue;
|
||||
} else {
|
||||
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
|
||||
myData->isEmptySegment = FALSE; /* reset this, we have a different error */
|
||||
break;
|
||||
}
|
||||
|
||||
case ESC_2022:
|
||||
mySource--;
|
||||
escape:
|
||||
changeState_2022(args->converter,&(mySource),
|
||||
mySourceLimit, ISO_2022_JP,err);
|
||||
{
|
||||
const char * mySourceBefore = mySource;
|
||||
int8_t toULengthBefore = args->converter->toULength;
|
||||
|
||||
changeState_2022(args->converter,&(mySource),
|
||||
mySourceLimit, ISO_2022_JP,err);
|
||||
|
||||
/* If in ISO-2022-JP only and we successfully completed an escape sequence, but previous segment was empty, create an error */
|
||||
if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
||||
args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
||||
}
|
||||
}
|
||||
|
||||
/* invalid or illegal escape sequence */
|
||||
if(U_FAILURE(*err)){
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
||||
return;
|
||||
}
|
||||
/* If we successfully completed an escape sequence, we begin a new segment, empty so far */
|
||||
if(myData->key==0) {
|
||||
myData->isEmptySegment = TRUE;
|
||||
}
|
||||
continue;
|
||||
|
||||
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
|
||||
@ -2028,6 +2052,7 @@ escape:
|
||||
/* falls through */
|
||||
default:
|
||||
/* convert one or two bytes */
|
||||
myData->isEmptySegment = FALSE;
|
||||
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
|
||||
if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
|
||||
!IS_JP_DBCS(cs)
|
||||
@ -2524,15 +2549,27 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
|
||||
if(mySourceChar==UCNV_SI){
|
||||
myData->toU2022State.g = 0;
|
||||
if (myData->isEmptySegment) {
|
||||
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
||||
args->converter->toUBytes[0] = mySourceChar;
|
||||
args->converter->toULength = 1;
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
return;
|
||||
}
|
||||
/*consume the source */
|
||||
continue;
|
||||
}else if(mySourceChar==UCNV_SO){
|
||||
myData->toU2022State.g = 1;
|
||||
myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
||||
/*consume the source */
|
||||
continue;
|
||||
}else if(mySourceChar==ESC_2022){
|
||||
mySource--;
|
||||
escape:
|
||||
myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
|
||||
changeState_2022(args->converter,&(mySource),
|
||||
mySourceLimit, ISO_2022_KR, err);
|
||||
if(U_FAILURE(*err)){
|
||||
@ -2543,6 +2580,7 @@ escape:
|
||||
continue;
|
||||
}
|
||||
|
||||
myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
|
||||
if(myData->toU2022State.g == 1) {
|
||||
if(mySource < mySourceLimit) {
|
||||
char trailByte;
|
||||
@ -3075,27 +3113,52 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
switch(mySourceChar){
|
||||
case UCNV_SI:
|
||||
pToU2022State->g=0;
|
||||
if (myData->isEmptySegment) {
|
||||
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
||||
args->converter->toUBytes[0] = mySourceChar;
|
||||
args->converter->toULength = 1;
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
return;
|
||||
}
|
||||
continue;
|
||||
|
||||
case UCNV_SO:
|
||||
if(pToU2022State->cs[1] != 0) {
|
||||
pToU2022State->g=1;
|
||||
myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
|
||||
continue;
|
||||
} else {
|
||||
/* illegal to have SO before a matching designator */
|
||||
myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
|
||||
break;
|
||||
}
|
||||
|
||||
case ESC_2022:
|
||||
mySource--;
|
||||
escape:
|
||||
changeState_2022(args->converter,&(mySource),
|
||||
mySourceLimit, ISO_2022_CN,err);
|
||||
{
|
||||
const char * mySourceBefore = mySource;
|
||||
int8_t toULengthBefore = args->converter->toULength;
|
||||
|
||||
changeState_2022(args->converter,&(mySource),
|
||||
mySourceLimit, ISO_2022_CN,err);
|
||||
|
||||
/* After SO there must be at least one character before a designator (designator error handled separately) */
|
||||
if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
||||
args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
|
||||
}
|
||||
}
|
||||
|
||||
/* invalid or illegal escape sequence */
|
||||
if(U_FAILURE(*err)){
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
|
||||
return;
|
||||
}
|
||||
continue;
|
||||
@ -3109,6 +3172,7 @@ escape:
|
||||
/* falls through */
|
||||
default:
|
||||
/* convert one or two bytes */
|
||||
myData->isEmptySegment = FALSE;
|
||||
if(pToU2022State->g != 0) {
|
||||
if(mySource < mySourceLimit) {
|
||||
UConverterSharedData *cnv;
|
||||
|
@ -948,6 +948,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
|
||||
myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
|
||||
myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
|
||||
uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
|
||||
myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
|
||||
|
||||
if(mySharedConverterData->impl->open != NULL) {
|
||||
mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2006, International Business Machines
|
||||
* Copyright (C) 1999-2006,2008 International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
@ -226,6 +226,9 @@ struct UConverter {
|
||||
char preToU[UCNV_EXT_MAX_BYTES];
|
||||
int8_t preFromULength, preToULength; /* negative: replay */
|
||||
int8_t preToUFirstLength; /* length of first character */
|
||||
|
||||
/* new fields for ICU 4.0 */
|
||||
UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
|
||||
};
|
||||
|
||||
U_CDECL_END /* end of UConverter */
|
||||
|
@ -59,6 +59,7 @@ typedef struct{
|
||||
UBool isEscapeAppended;
|
||||
UBool isStateDBCS;
|
||||
UBool isTargetUCharDBCS;
|
||||
UBool isEmptySegment;
|
||||
}UConverterDataHZ;
|
||||
|
||||
|
||||
@ -98,6 +99,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
|
||||
cnv->mode=0;
|
||||
if(cnv->extraInfo != NULL){
|
||||
((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
|
||||
((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
|
||||
}
|
||||
}
|
||||
if(choice!=UCNV_RESET_TO_UNICODE) {
|
||||
@ -130,6 +132,10 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
|
||||
* from-GB code '~}' ($7E7D) is outside the defined GB range.)
|
||||
*
|
||||
* Source: RFC 1842
|
||||
*
|
||||
* Note that the formal syntax in RFC 1842 is invalid. I assume that the
|
||||
* intended definition of single-byte-segment is as follows (pedberg):
|
||||
* single-byte-segment = single-byte-seq 1*single-byte-char
|
||||
*/
|
||||
|
||||
|
||||
@ -170,12 +176,23 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
|
||||
}
|
||||
*(myTarget++)=(UChar)mySourceChar;
|
||||
myData->isEmptySegment = FALSE;
|
||||
continue;
|
||||
case UCNV_OPEN_BRACE:
|
||||
myData->isStateDBCS = TRUE;
|
||||
continue;
|
||||
case UCNV_CLOSE_BRACE:
|
||||
myData->isStateDBCS = FALSE;
|
||||
myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
|
||||
if (myData->isEmptySegment) {
|
||||
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
|
||||
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
|
||||
args->converter->toUCallbackReason = UCNV_IRREGULAR;
|
||||
args->converter->toUBytes[0] = UCNV_TILDE;
|
||||
args->converter->toUBytes[1] = mySourceChar;
|
||||
args->converter->toULength = 2;
|
||||
args->target = myTarget;
|
||||
args->source = mySource;
|
||||
return;
|
||||
}
|
||||
myData->isEmptySegment = TRUE;
|
||||
continue;
|
||||
default:
|
||||
/* if the first byte is equal to TILDE and the trail byte
|
||||
@ -183,6 +200,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
*/
|
||||
mySourceChar = 0x7e00 | mySourceChar;
|
||||
targetUniChar = 0xffff;
|
||||
myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
|
||||
break;
|
||||
}
|
||||
} else if(myData->isStateDBCS) {
|
||||
@ -193,6 +211,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
} else {
|
||||
/* add another bit to distinguish a 0 byte from not having seen a lead byte */
|
||||
args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
|
||||
myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
|
||||
}
|
||||
continue;
|
||||
}
|
||||
@ -220,8 +239,10 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
|
||||
continue;
|
||||
} else if(mySourceChar <= 0x7f) {
|
||||
targetUniChar = (UChar)mySourceChar; /* ASCII */
|
||||
myData->isEmptySegment = FALSE; /* the segment has something valid */
|
||||
} else {
|
||||
targetUniChar = 0xffff;
|
||||
myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
|
||||
}
|
||||
}
|
||||
if(targetUniChar < 0xfffe){
|
||||
|
Loading…
Reference in New Issue
Block a user