ICU-6175 Invoke toUnicode error handler for empty segments in ISO-2022-x & HZ with new UConverter.toUCallbackReason=UCNV_IRREGULAR

X-SVN-Rev: 23571
2008-03-12 23:20:11 +00:00 · 2008-03-12 23:20:11 +00:00 · 867af878ad
commit 867af878ad
parent ccd1b36465
5 changed files with 102 additions and 10 deletions
--- a/icu4c/source/common/ucnv.c
+++ b/icu4c/source/common/ucnv.c
@ -1529,11 +1529,14 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
            cnv->toULength=0;

            /* call the callback function */
+            if(cnv->toUCallbackReason==UCNV_ILLEGAL && *err==U_INVALID_CHAR_FOUND) {
+                cnv->toUCallbackReason = UCNV_UNASSIGNED;
+            }
            cnv->fromCharErrorBehaviour(cnv->toUContext, pArgs,
                cnv->invalidCharBuffer, errorInputLength,
-                (*err==U_INVALID_CHAR_FOUND || *err==U_UNSUPPORTED_ESCAPE_SEQUENCE) ?
-                    UCNV_UNASSIGNED : UCNV_ILLEGAL,
+                cnv->toUCallbackReason,
                err);
+            cnv->toUCallbackReason = UCNV_ILLEGAL; /* reset to default value */

            /*
             * loop back to the offset handling
--- a/icu4c/source/common/ucnv2022.c
+++ b/icu4c/source/common/ucnv2022.c
@ -201,6 +201,7 @@ typedef struct{
 #ifdef U_ENABLE_GENERIC_ISO_2022
    UBool isFirstBuffer;
 #endif
+    UBool isEmptySegment;
    char name[30];
    char locale[3];
 }UConverterDataISO2022;
@ -609,6 +610,7 @@ _ISO2022Reset(UConverter *converter, UConverterResetChoice choice) {
    if(choice<=UCNV_RESET_TO_UNICODE) {
        uprv_memset(&myConverterData->toU2022State, 0, sizeof(ISO2022State));
        myConverterData->key = 0;
+        myConverterData->isEmptySegment = FALSE;
    }
    if(choice!=UCNV_RESET_TO_UNICODE) {
        uprv_memset(&myConverterData->fromU2022State, 0, sizeof(ISO2022State));
@ -814,6 +816,7 @@ DONE:
            if(chosenConverterName == NULL) {
                /* SS2 or SS3 */
                *err = U_UNSUPPORTED_ESCAPE_SEQUENCE;
+                _this->toUCallbackReason = UCNV_UNASSIGNED;
                return;
            }

@ -935,6 +938,8 @@ DONE:
    }
    if(U_SUCCESS(*err)) {
        _this->toULength = 0;
+    } else if(*err==U_UNSUPPORTED_ESCAPE_SEQUENCE) {
+        _this->toUCallbackReason = UCNV_UNASSIGNED;
    }
 }

@ -1986,6 +1991,7 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
                    break;
                }

@ -1997,21 +2003,39 @@ UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                    continue;
                } else {
                    /* only JIS7 uses SI/SO, not ISO-2022-JP-x */
+                    myData->isEmptySegment = FALSE;	/* reset this, we have a different error */
                    break;
                }

            case ESC_2022:
                mySource--;
 escape:
-                changeState_2022(args->converter,&(mySource),
-                    mySourceLimit, ISO_2022_JP,err);
+                {
+                    const char * mySourceBefore = mySource;
+                    int8_t toULengthBefore = args->converter->toULength;
+
+                    changeState_2022(args->converter,&(mySource),
+                        mySourceLimit, ISO_2022_JP,err);
+
+                    /* If in ISO-2022-JP only and we successully completed an escape sequence, but previous segment was empty, create an error */
+                    if(myData->version==0 && myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
+                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
+                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+                    }
+                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
+                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
+                /* If we successfully completed an escape sequence, we begin a new segment, empty so far */
+                if(myData->key==0) {
+                    myData->isEmptySegment = TRUE;
+                }
                continue;

            /* ISO-2022-JP does not use single-byte (C1) SS2 and SS3 */
@ -2028,6 +2052,7 @@ escape:
                /* falls through */
            default:
                /* convert one or two bytes */
+                myData->isEmptySegment = FALSE;
                cs = (StateEnum)pToU2022State->cs[pToU2022State->g];
                if( (uint8_t)(mySourceChar - 0xa1) <= (0xdf - 0xa1) && myData->version==4 &&
                    !IS_JP_DBCS(cs)
@ -2524,15 +2549,27 @@ UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,

            if(mySourceChar==UCNV_SI){
                myData->toU2022State.g = 0;
+                if (myData->isEmptySegment) {
+                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
+                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
+                    args->converter->toUBytes[0] = mySourceChar;
+                    args->converter->toULength = 1;
+                    args->target = myTarget;
+                    args->source = mySource;
+                    return;
+                }
                /*consume the source */
                continue;
            }else if(mySourceChar==UCNV_SO){
                myData->toU2022State.g = 1;
+                myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                /*consume the source */
                continue;
            }else if(mySourceChar==ESC_2022){
                mySource--;
 escape:
+                myData->isEmptySegment = FALSE;	/* Any invalid ESC sequences will be detected separately, so just reset this */
                changeState_2022(args->converter,&(mySource),
                                mySourceLimit, ISO_2022_KR, err);
                if(U_FAILURE(*err)){
@ -2543,6 +2580,7 @@ escape:
                continue;
            }

+            myData->isEmptySegment = FALSE;	/* Any invalid char errors will be detected separately, so just reset this */
            if(myData->toU2022State.g == 1) {
                if(mySource < mySourceLimit) {
                    char trailByte;
@ -3075,27 +3113,52 @@ UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
            switch(mySourceChar){
            case UCNV_SI:
                pToU2022State->g=0;
+                if (myData->isEmptySegment) {
+                    myData->isEmptySegment = FALSE;	/* we are handling it, reset to avoid future spurious errors */
+                    *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                    args->converter->toUCallbackReason = UCNV_IRREGULAR;
+                    args->converter->toUBytes[0] = mySourceChar;
+                    args->converter->toULength = 1;
+                    args->target = myTarget;
+                    args->source = mySource;
+                    return;
+                }
                continue;

            case UCNV_SO:
                if(pToU2022State->cs[1] != 0) {
                    pToU2022State->g=1;
+                    myData->isEmptySegment = TRUE;	/* Begin a new segment, empty so far */
                    continue;
                } else {
                    /* illegal to have SO before a matching designator */
+                    myData->isEmptySegment = FALSE;	/* Handling a different error, reset this to avoid future spurious errs */
                    break;
                }

            case ESC_2022:
                mySource--;
 escape:
-                changeState_2022(args->converter,&(mySource),
-                    mySourceLimit, ISO_2022_CN,err);
+                {
+                    const char * mySourceBefore = mySource;
+                    int8_t toULengthBefore = args->converter->toULength;
+
+                    changeState_2022(args->converter,&(mySource),
+                        mySourceLimit, ISO_2022_CN,err);
+
+                    /* After SO there must be at least one character before a designator (designator error handled separately) */
+                    if(myData->key==0 && U_SUCCESS(*err) && myData->isEmptySegment) {
+                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
+                        args->converter->toULength = toULengthBefore + (mySource - mySourceBefore);
+                    }
+                }

                /* invalid or illegal escape sequence */
                if(U_FAILURE(*err)){
                    args->target = myTarget;
                    args->source = mySource;
+                    myData->isEmptySegment = FALSE;	/* Reset to avoid future spurious errors */
                    return;
                }
                continue;
@ -3109,6 +3172,7 @@ escape:
                /* falls through */
            default:
                /* convert one or two bytes */
+                myData->isEmptySegment = FALSE;
                if(pToU2022State->g != 0) {
                    if(mySource < mySourceLimit) {
                        UConverterSharedData *cnv;
--- a/icu4c/source/common/ucnv_bld.c
+++ b/icu4c/source/common/ucnv_bld.c
@ -948,6 +948,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
    myUConverter->subCharLen = mySharedConverterData->staticData->subCharLen;
    myUConverter->subChars = (uint8_t *)myUConverter->subUChars;
    uprv_memcpy(myUConverter->subChars, mySharedConverterData->staticData->subChar, myUConverter->subCharLen);
+    myUConverter->toUCallbackReason = UCNV_ILLEGAL; /* default reason to invoke (*fromCharErrorBehaviour) */

    if(mySharedConverterData->impl->open != NULL) {
        mySharedConverterData->impl->open(myUConverter, realName, locale, options, err);
--- a/icu4c/source/common/ucnv_bld.h
+++ b/icu4c/source/common/ucnv_bld.h
@ -1,6 +1,6 @@
 /*
 **********************************************************************
-*   Copyright (C) 1999-2006, International Business Machines
+*   Copyright (C) 1999-2006,2008 International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
@ -226,6 +226,9 @@ struct UConverter {
    char preToU[UCNV_EXT_MAX_BYTES];
    int8_t preFromULength, preToULength;    /* negative: replay */
    int8_t preToUFirstLength;               /* length of first character */
+
+    /* new fields for ICU 4.0 */
+    UConverterCallbackReason toUCallbackReason; /* (*fromCharErrorBehaviour) reason, set when error is detected */
 };

 U_CDECL_END /* end of UConverter */
--- a/icu4c/source/common/ucnvhz.c
+++ b/icu4c/source/common/ucnvhz.c
@ -59,6 +59,7 @@ typedef struct{
    UBool isEscapeAppended;
    UBool isStateDBCS;
    UBool isTargetUCharDBCS;
+    UBool isEmptySegment;
 }UConverterDataHZ;


@ -98,6 +99,7 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
        cnv->mode=0;
        if(cnv->extraInfo != NULL){
            ((UConverterDataHZ*)cnv->extraInfo)->isStateDBCS = FALSE;
+            ((UConverterDataHZ*)cnv->extraInfo)->isEmptySegment = FALSE;
        }
    }
    if(choice!=UCNV_RESET_TO_UNICODE) {
@ -130,6 +132,10 @@ _HZReset(UConverter *cnv, UConverterResetChoice choice){
 *   from-GB code '~}' ($7E7D) is outside the defined GB range.)
 *
 *   Source: RFC 1842
+*
+*   Note that the formal syntax in RFC 1842 is invalid. I assume that the
+*   intended definition of single-byte-segment is as follows (pedberg):
+*   single-byte-segment = single-byte-seq 1*single-byte-char
 */


@ -170,12 +176,23 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                        args->offsets[myTarget - args->target]=(int32_t)(mySource - args->source - 2);
                    }
                    *(myTarget++)=(UChar)mySourceChar;
+                    myData->isEmptySegment = FALSE;
                    continue;
                case UCNV_OPEN_BRACE:
-                    myData->isStateDBCS = TRUE;
-                    continue;
                case UCNV_CLOSE_BRACE:
-                    myData->isStateDBCS = FALSE;
+                    myData->isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
+                    if (myData->isEmptySegment) {
+                        myData->isEmptySegment = FALSE; /* we are handling it, reset to avoid future spurious errors */
+                        *err = U_ILLEGAL_ESCAPE_SEQUENCE;
+                        args->converter->toUCallbackReason = UCNV_IRREGULAR;
+                        args->converter->toUBytes[0] = UCNV_TILDE;
+                        args->converter->toUBytes[1] = mySourceChar;
+                        args->converter->toULength = 2;
+                        args->target = myTarget;
+                        args->source = mySource;
+                        return;
+                    }
+                    myData->isEmptySegment = TRUE;
                    continue;
                default:
                     /* if the first byte is equal to TILDE and the trail byte
@ -183,6 +200,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                     */
                    mySourceChar = 0x7e00 | mySourceChar;
                    targetUniChar = 0xffff;
+                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                    break;
                }
            } else if(myData->isStateDBCS) {
@ -193,6 +211,7 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                    } else {
                        /* add another bit to distinguish a 0 byte from not having seen a lead byte */
                        args->converter->toUnicodeStatus = (uint32_t) (mySourceChar | 0x100);
+                        myData->isEmptySegment = FALSE; /* the segment has something, either valid or will produce a different error, so reset this */
                    }
                    continue;
                }
@ -220,8 +239,10 @@ UConverter_toUnicode_HZ_OFFSETS_LOGIC(UConverterToUnicodeArgs *args,
                    continue;
                } else if(mySourceChar <= 0x7f) {
                    targetUniChar = (UChar)mySourceChar;  /* ASCII */
+                    myData->isEmptySegment = FALSE; /* the segment has something valid */
                } else {
                    targetUniChar = 0xffff;
+                    myData->isEmptySegment = FALSE; /* different error here, reset this to avoid spurious future error */
                }
            }
            if(targetUniChar < 0xfffe){