ICU-6175 Invoke toUnicode error handler for empty segments in ISO-2022-x & HZ with new UConverter.toUCallbackReason=UCNV_IRREGULAR

X-SVN-Rev: 23571
This commit is contained in:
Peter Edberg 2008-03-12 23:20:11 +00:00
parent ccd1b36465
commit 867af878ad
5 changed files with 102 additions and 10 deletions

View File

@ -1529,11 +1529,14 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
cnv->toULength=0;
/* call the callback function */
if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
cnv->toUCallbackReason = UCNV_UNASSIGNED;
}
cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
cnv->invalidCharBuffer, errorInputLength,
(*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
UCNV_UNASSIGNED : UCNV_ILLEGAL,
cnv->toUCallbackReason,
err);
cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */
/*
* loop back to the offset handling

View File

@ -201,6 +201,7 @@ typedef struct{
#ifdef U_ENABLE_GENERIC_ISO_2022
UBool isFirstBuffer;
#endif
UBool isEmptySegment;
char name[30];
char locale[3];
}UConverterDataISO2022;
@ -609,6 +610,7 @@ _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
if(choice<=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
myConverterData->key = 0;
myConverterData->isEmptySegment = FALSE;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
@ -814,6 +816,7 @@ DONE:
if(chosenConverterName == NULL) {
/* SS2 or SS3 */
*err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
_this->toUCallbackReason = UCNV_UNASSIGNED;
return;
}
@ -935,6 +938,8 @@ DONE:
}
if(U_SUCCESS(*err)) {
_this->toULength = 0;
} else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
_this->toUCallbackReason = UCNV_UNASSIGNED;
}
}
@ -1986,6 +1991,7 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
@ -1997,21 +2003,39 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
continue;
} else {
/* only JIS7 uses SI/SO, not ISO-2022-JP-x */
myData->isEmptySegment = FALSE; /* reset this, we have a different error */
break;
}
case ESC_2022:
mySource--;
escape:
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_JP,err);
{
const char * mySourceBefore = mySource;
int8_t toULengthBefore = args->converter->toULength;
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_JP,err);
/* If in ISO-2022-JP only and we successfully completed an escape sequence, but the previous segment was empty, create an error */
if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
}
}
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
/* If we successfully completed an escape sequence, we begin a new segment, empty so far */
if(myData->key==0) {
myData->isEmptySegment = TRUE;
}
continue;
/* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
@ -2028,6 +2052,7 @@ escape:
/* falls through */
default:
/* convert one or two bytes */
myData->isEmptySegment = FALSE;
cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
!IS_JP_DBCS(cs)
@ -2524,15 +2549,27 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
if(mySourceChar==UCNV_SI){
myData->toU2022State.g = 0;
if (myData->isEmptySegment) {
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = mySourceChar;
args->converter->toULength = 1;
args->target = myTarget;
args->source = mySource;
return;
}
/*consume the source */
continue;
}else if(mySourceChar==UCNV_SO){
myData->toU2022State.g = 1;
myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
/*consume the source */
continue;
}else if(mySourceChar==ESC_2022){
mySource--;
escape:
myData->isEmptySegment = FALSE; /* Any invalid ESC sequences will be detected separately, so just reset this */
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_KR, err);
if(U_FAILURE(*err)){
@ -2543,6 +2580,7 @@ escape:
continue;
}
myData->isEmptySegment = FALSE; /* Any invalid char errors will be detected separately, so just reset this */
if(myData->toU2022State.g == 1) {
if(mySource < mySourceLimit) {
char trailByte;
@ -3075,27 +3113,52 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
switch(mySourceChar){
case UCNV_SI:
pToU2022State->g=0;
if (myData->isEmptySegment) {
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = mySourceChar;
args->converter->toULength = 1;
args->target = myTarget;
args->source = mySource;
return;
}
continue;
case UCNV_SO:
if(pToU2022State->cs[1] != 0) {
pToU2022State->g=1;
myData->isEmptySegment = TRUE; /* Begin a new segment, empty so far */
continue;
} else {
/* illegal to have SO before a matching designator */
myData->isEmptySegment = FALSE; /* Handling a different error, reset this to avoid future spurious errs */
break;
}
case ESC_2022:
mySource--;
escape:
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_CN,err);
{
const char * mySourceBefore = mySource;
int8_t toULengthBefore = args->converter->toULength;
changeState_2022(args->converter,&(mySource),
mySourceLimit, ISO_2022_CN,err);
/* After SO there must be at least one character before a designator (designator error handled separately) */
if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
}
}
/* invalid or illegal escape sequence */
if(U_FAILURE(*err)){
args->target = myTarget;
args->source = mySource;
myData->isEmptySegment = FALSE; /* Reset to avoid future spurious errors */
return;
}
continue;
@ -3109,6 +3172,7 @@ escape:
/* falls through */
default:
/* convert one or two bytes */
myData->isEmptySegment = FALSE;
if(pToU2022State->g != 0) {
if(mySource < mySourceLimit) {
UConverterSharedData *cnv;

View File

@ -948,6 +948,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */
if(mySharedConverterData->impl->open != NULL) {
mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2006, International Business Machines
* Copyright (C) 1999-2006,2008 International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -226,6 +226,9 @@ struct UConverter {
char preToU[UCNV_EXT_MAX_BYTES];
int8_t preFromULength, preToULength; /* negative: replay */
int8_t preToUFirstLength; /* length of first character */
/* new fields for ICU 4.0 */
UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
};
U_CDECL_END /* end of UConverter */

View File

@ -59,6 +59,7 @@ typedef struct{
UBool isEscapeAppended;
UBool isStateDBCS;
UBool isTargetUCharDBCS;
UBool isEmptySegment;
}UConverterDataHZ;
@ -98,6 +99,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
cnv->mode=0;
if(cnv->extraInfo != NULL){
((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
}
}
if(choice!=UCNV_RESET_TO_UNICODE) {
@ -130,6 +132,10 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
* from-GB code '~}' ($7E7D) is outside the defined GB range.)
*
* Source: RFC 1842
*
* Note that the formal syntax in RFC 1842 is invalid. I assume that the
* intended definition of single-byte-segment is as follows (pedberg):
* single-byte-segment = single-byte-seq 1*single-byte-char
*/
@ -170,12 +176,23 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
}
*(myTarget++)=(UChar)mySourceChar;
myData->isEmptySegment = FALSE;
continue;
case UCNV_OPEN_BRACE:
myData->isStateDBCS = TRUE;
continue;
case UCNV_CLOSE_BRACE:
myData->isStateDBCS = FALSE;
myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
if (myData->isEmptySegment) {
myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
*err = U_ILLEGAL_ESCAPE_SEQUENCE;
args->converter->toUCallbackReason = UCNV_IRREGULAR;
args->converter->toUBytes[0] = UCNV_TILDE;
args->converter->toUBytes[1] = mySourceChar;
args->converter->toULength = 2;
args->target = myTarget;
args->source = mySource;
return;
}
myData->isEmptySegment = TRUE;
continue;
default:
/* if the first byte is equal to TILDE and the trail byte
@ -183,6 +200,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
*/
mySourceChar = 0x7e00 | mySourceChar;
targetUniChar = 0xffff;
myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
break;
}
} else if(myData->isStateDBCS) {
@ -193,6 +211,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
} else {
/* add another bit to distinguish a 0 byte from not having seen a lead byte */
args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
}
continue;
}
@ -220,8 +239,10 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
continue;
} else if(mySourceChar <= 0x7f) {
targetUniChar = (UChar)mySourceChar; /* ASCII */
myData->isEmptySegment = FALSE; /* the segment has something valid */
} else {
targetUniChar = 0xffff;
myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
}
}
if(targetUniChar < 0xfffe){