ICU-484 reimplement LATIN_1 to work correctly with offsets and UTF-16

X-SVN-Rev: 3285
2000-12-20 01:22:02 +00:00 · 2000-12-20 01:22:02 +00:00 · 8d9bdf7a1e
commit 8d9bdf7a1e
parent 4bd8af1a79
1 changed files with 250 additions and 142 deletions
--- a/icu4c/source/common/ucnvlat1.c
+++ b/icu4c/source/common/ucnvlat1.c
@@ -10,168 +10,276 @@
 *
 *   created on: 2000feb07
 *   created by: Markus W. Scherer
 *   Change history:
 *
 *   06/29/2000  helena      Major rewrite of the callback APIs.
 */
 #include "unicode/utypes.h"
-#include "ucmp16.h"
+#include "unicode/ucnv.h"
 #include "ucmp8.h"
 #include "unicode/ucnv_err.h"
 #include "ucnv_bld.h"
 #include "unicode/ucnv.h"
 #include "ucnv_cnv.h"
 /* ISO 8859-1 --------------------------------------------------------------- */
-U_CFUNC void  T_UConverter_toUnicode_LATIN_1 (UConverterToUnicodeArgs * args,
+/* This is a table-less and callback-less version of _MBCSSingleToBMPWithOffsets(). */
 /*
  * Converts ISO 8859-1 bytes to UTF-16: each byte read from pArgs->source is
  * stored as one UChar in pArgs->target. The source is read through a
  * const uint8_t pointer, so every byte value 0x00..0xff is stored unchanged.
  *
  * The number of units converted is the smaller of the remaining source
  * length (pArgs->sourceLimit - pArgs->source) and the remaining target
  * capacity (pArgs->targetLimit - pArgs->target). When the source is longer
  * than the target capacity, *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR;
  * otherwise *pErrorCode is not written.
  *
  * On return, pArgs->source and pArgs->target point past the converted
  * units. If pArgs->offsets is not NULL, one offset per converted unit
  * (0, 1, 2, ...) is written and pArgs->offsets is advanced past them.
  */
-                                      UErrorCode * err)
+U_CFUNC void
-{
+_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
-  unsigned char *mySource = (unsigned char *)  args->source;
+                            UErrorCode *pErrorCode) {
-  UChar *myTarget = args->target;
+    const uint8_t *source;
-  int32_t sourceLength = args->sourceLimit - (char *) mySource;
+    UChar *target;
-  int32_t readLen = 0;
+    int32_t targetCapacity, length;
-  int32_t i = 0;
+    int32_t *offsets;
-  /*Since there is no risk of encountering illegal Chars
+    /* set up the local pointers */
-   *we need to pad our latin1 chars to create Unicode codepoints
+    source=(const uint8_t *)pArgs->source;
-   *we need to go as far a min(targetLen, sourceLen)
+    target=pArgs->target;
-   *in case we don't have enough buffer space
+    targetCapacity=pArgs->targetLimit-pArgs->target;
-   *we set the error flag accordingly
+
-   */
+    /*
-  if ((args->targetLimit - args->target) < sourceLength)
+     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
-    {
+     * for the minimum of the sourceLength and targetCapacity
-      readLen = args->targetLimit - args->target;
+     */
-      *err = U_BUFFER_OVERFLOW_ERROR;
+    length=(const uint8_t *)pArgs->sourceLimit-source;
-    }
+    if(length<=targetCapacity) {
-  else
+        targetCapacity=length;
-    {
+    } else {
-      readLen = args->sourceLimit - (char *) mySource;
+        /* target will be full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
        length=targetCapacity;
    }
-  for (i = 0; i < readLen; i++) myTarget[i] = (UChar) mySource[i];
+    /* conversion loop */
    while(targetCapacity>0) {
        *target++=*source++;
        --targetCapacity;
    }
-  args->target += i;
+    /* write back the updated pointers */
-  args->source += i;
+    pArgs->source=(const char *)source;
-  return;
+    pArgs->target=target;
    /* set offsets */
    offsets=pArgs->offsets;
    if(offsets!=NULL) {
        int32_t sourceIndex=0;
        while(length>0) {
            *offsets++=sourceIndex++;
            --length;
        }
        pArgs->offsets=offsets;
    }
 }
-U_CFUNC void   T_UConverter_fromUnicode_LATIN_1 (UConverterFromUnicodeArgs * args,
+/* This is a table-less and callback-less version of _MBCSSingleGetNextUChar(). */
 /*
  * Returns the next source byte as a code point. The byte is read through a
  * const uint8_t pointer, so the result is in the range 0x00..0xff, and
  * pArgs->source is advanced by one byte.
  *
  * If pArgs->source is not below pArgs->sourceLimit (no input left),
  * *pErrorCode is set to U_INDEX_OUTOFBOUNDS_ERROR, 0xffff is returned and
  * pArgs->source is left where it was.
  */
-                                         UErrorCode * err)
+U_CFUNC UChar32
-{
+_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
-  const UChar *mySource = args->source;
+                    UErrorCode *pErrorCode) {
-  unsigned char *myTarget = (unsigned char *) args->target;
+    const uint8_t *source=(const uint8_t *)pArgs->source;
-  int32_t mySourceIndex = 0;
+    if(source<(const uint8_t *)pArgs->sourceLimit) {
-  int32_t myTargetIndex = 0;
+        pArgs->source=(const char *)(source+1);
-  int32_t targetLength = args->targetLimit - (char *) myTarget;
+        return *source;
-  int32_t sourceLength = args->sourceLimit - mySource;
+    }
  UConverterCallbackReason reason;
-  /*writing the char to the output stream */
+    /* no output because of empty input */
-  while (mySourceIndex < sourceLength)
+    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    {
+    return 0xffff;
-      if (myTargetIndex < targetLength)
+}
-        {
+
-          if (mySource[mySourceIndex] < 0x0100)
+/* This is a table-less version of _MBCSSingleFromBMPWithOffsets(). */
 /*
  * Converts UTF-16 code units from pArgs->source into single bytes in
  * pArgs->target. A code unit that is <= max (0xff) is stored directly as
  * one byte. Anything else is passed to cnv->fromUCharErrorBehaviour with
  *   - UCNV_UNASSIGNED / U_INVALID_CHAR_FOUND for a non-surrogate code unit
  *     above max, or for a matched surrogate pair;
  *   - UCNV_ILLEGAL / U_ILLEGAL_CHAR_FOUND for an unmatched lead or an
  *     unmatched trail surrogate.
  * The loop stops after a callback if *pErrorCode is a failure, or with
  * U_BUFFER_OVERFLOW_ERROR if cnv->charErrorBufferLength is greater than 0.
  *
  * State across calls: the function starts from cnv->fromUSurrogateLead and,
  * when a lead surrogate is the last unit of the input, leaves the loop with
  * that unit still pending. At the end the pending value is stored back into
  * cnv->fromUSurrogateLead, unless pArgs->flush is set and all input was
  * consumed; in that case the state is reset to 0 and a pending unit is
  * reported as U_TRUNCATED_CHAR_FOUND (only if no error is already set).
  *
  * U_BUFFER_OVERFLOW_ERROR is also set after the loop when no error is
  * pending, input remains and the target pointer has reached
  * pArgs->targetLimit.
  *
  * Offsets: when pArgs->offsets is not NULL, source indexes are written for
  * the units consumed since the start or since the last callback.
  * sourceIndex starts at -1 when a unit was carried over from the previous
  * buffer. Offsets for callback output are delegated to
  * ucnv_updateCallbackOffsets(), whose behavior is not visible here.
  *
  * On return, pArgs->source, pArgs->target and pArgs->offsets are updated.
  */
-            {
+U_CFUNC void
-              /*writes the char to the output stream */
+_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
-              myTarget[myTargetIndex++] = (char) mySource[mySourceIndex++];
+                              UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit, *lastSource;
    uint8_t *target;
    int32_t targetCapacity, length;
    int32_t *offsets;
    UChar32 c, max;
    int32_t sourceIndex;
    UConverterCallbackReason reason;
    int32_t i;
    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=pArgs->targetLimit-pArgs->target;
    offsets=pArgs->offsets;
    max=0xff; /* ### 0x7f for US-ASCII */
    /* get the converter state from UConverter */
    c=cnv->fromUSurrogateLead;
    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    lastSource=source;
    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
     * for the minimum of the sourceLength and targetCapacity
     */
    length=sourceLimit-source;
    if(length<targetCapacity) {
        targetCapacity=length;
    }
    /* conversion loop */
    /*
     * a unit carried over in cnv->fromUSurrogateLead skips the normal fetch
     * and resumes at the trail-unit check inside the loop
     */
    if(c!=0 && targetCapacity>0) {
        goto getTrail;
    }
    while(targetCapacity>0) {
        /*
         * Get a correct Unicode code point:
         * a single UChar for a BMP code point or
         * a matched surrogate pair for a "surrogate code point".
         */
        c=*source++;
        if(c<=max) {
            /* convert the Unicode code point */
            *target++=(uint8_t)c;
            --targetCapacity;
            /* normal end of conversion: prepare for a new character */
            c=0;
        } else {
            if(!UTF_IS_SURROGATE(c)) {
                /* callback(unassigned) */
                reason=UCNV_UNASSIGNED;
                *pErrorCode=U_INVALID_CHAR_FOUND;
            } else if(UTF_IS_SURROGATE_FIRST(c)) {
 getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(UTF_IS_SECOND_SURROGATE(trail)) {
                        ++source;
                        c=UTF16_GET_PAIR_VALUE(c, trail);
                        /* this codepage does not map supplementary code points */
                        /* callback(unassigned) */
                        reason=UCNV_UNASSIGNED;
                        *pErrorCode=U_INVALID_CHAR_FOUND;
                    } else {
                        /* this is an unmatched lead code unit (1st surrogate) */
                        /* callback(illegal) */
                        reason=UCNV_ILLEGAL;
                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
                    }
                } else {
                    /* no more input */
                    break;
                }
            } else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                reason=UCNV_ILLEGAL;
                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            }
          /*
           * NOTE(review): the unmarked lines that follow, up to the callback
           * preparation, use args, err, mySource, mySourceIndex, myTargetIndex
           * and sourceLength. Those names are declared only on minus-prefixed
           * lines of the old T_UConverter_fromUnicode_LATIN_1, so these are
           * presumably removed lines whose minus marker was lost in this
           * rendering -- confirm against the repository before treating them
           * as live code.
           */
          else
            {
              *err = U_INVALID_CHAR_FOUND;
              reason = UCNV_UNASSIGNED;
              args->converter->invalidUCharBuffer[0] = (UChar)mySource[mySourceIndex];
              args->converter->invalidUCharLength = 1;
              if (UTF_IS_LEAD(mySource[mySourceIndex++]))
              {
                  if (mySourceIndex < sourceLength)
                  {
                      if (UTF_IS_TRAIL(mySource[mySourceIndex]))
                      {
                          args->converter->invalidUCharBuffer[1] = (UChar)mySource[mySourceIndex];
                          args->converter->invalidUCharLength++;
                          mySourceIndex++;
                      }
                      else 
                      {
                          reason = UCNV_ILLEGAL;
                      }                          
                  }
                  else if (args->flush == TRUE)
                  {
                      reason = UCNV_ILLEGAL;
                      *err = U_TRUNCATED_CHAR_FOUND;
                  } 
                  else 
                  {
                      args->converter->fromUSurrogateLead = args->converter->invalidUCharBuffer[0];
                      /* do not call the callback */
                  }
              }
              if (args->converter->fromUSurrogateLead == 0) 
              {
                  const UChar *saveSource = args->source;
                  char *saveTarget = args->target;
                  int32_t *saveOffset = args->offsets;
    /* Needed explicit cast for myTarget on MVS to make compiler happy - JJD */
                  args->target = (char*)myTarget + myTargetIndex;;
                  args->source = mySource + mySourceIndex;                  
-                  FromU_CALLBACK_MACRO(args->converter->fromUContext,
+            /* call the callback function with all the preparations and post-processing */
-                                     args,
+            /* get the number of code units for c to correctly advance sourceIndex after the callback call */
-                                     args->converter->invalidUCharBuffer,
+            length=UTF_CHAR_LENGTH(c);
-                                     args->converter->invalidUCharLength,
+
-                                     (UChar32) (args->converter->invalidUCharLength == 2 ? 
+            /* set offsets since the start or the last callback */
-                                         UTF16_GET_PAIR_VALUE(args->converter->invalidUCharBuffer[0], 
+            if(offsets!=NULL) {
-                                                              args->converter->invalidUCharBuffer[1]) 
+                int32_t count=(int32_t)(source-lastSource);
-                                                : args->converter->invalidUCharBuffer[0]),
+
-                                     reason,
+                /* do not set the offset for the callback-causing character */
-                                     err);
+                count-=length;
-                  args->source = saveSource;
+
-                  args->target = saveTarget;
+                while(count>0) {
-                  args->offsets = saveOffset;
+                    *offsets++=sourceIndex++;
-                  if (U_FAILURE (*err)) 
+                    --count;
-                  {
+                }
-                      break;
+                /* offset and sourceIndex are now set for the current character */
-                  }
+            }
-                  args->converter->invalidUCharLength = 0;
+
-              }
+            /* update the arguments structure */
            pArgs->source=source;
            pArgs->target=(char *)target;
            pArgs->offsets=offsets;
            /* set the converter state in UConverter to deal with the next character */
            cnv->fromUSurrogateLead=0;
            /* write the code point as code units */
            i=0;
            UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
            cnv->invalidUCharLength=(int8_t)i;
            /* i==length */
            /* call the callback function */
            cnv->fromUCharErrorBehaviour(cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
            /* get the converter state from UConverter */
            c=cnv->fromUSurrogateLead;
            /* update target and deal with offsets if necessary */
            offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
            target=(uint8_t *)pArgs->target;
            /* update the source pointer and index */
            sourceIndex+=length+(pArgs->source-source);
            source=lastSource=pArgs->source;
            targetCapacity=(uint8_t *)pArgs->targetLimit-target;
            length=sourceLimit-source;
            if(length<targetCapacity) {
                targetCapacity=length;
            }
            /*
             * If the callback overflowed the target, then we need to
             * stop here with an overflow indication.
             */
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                break;
            } else if(U_FAILURE(*pErrorCode)) {
                /* break on error */
                c=0;
                break;
            } else if(cnv->charErrorBufferLength>0) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
            }
        }
-      else
+    }
-        {
+
-          *err = U_BUFFER_OVERFLOW_ERROR;
+    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
-          break;
+        /* target is full */
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    /* set offsets since the start or the last callback */
    if(offsets!=NULL) {
        /*
         * NOTE(review): when the loop ended on a lead surrogate fetched from
         * this buffer with no more input, source has already moved past that
         * unit, so this count appears to include a unit for which no byte
         * was written -- confirm whether the extra offset is intended.
         */
        size_t count=source-lastSource;
        while(count>0) {
            *offsets++=sourceIndex++;
            --count;
        }
    }
-  args->target += myTargetIndex;
+    if(pArgs->flush && source>=sourceLimit) {
-  args->source += mySourceIndex;;
+        /* reset the state for the next conversion */
-
+        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-  return;
+            /* a Unicode code point remains incomplete (only a first surrogate) */
-}
+            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
-
+        }
-U_CFUNC UChar32 T_UConverter_getNextUChar_LATIN_1(UConverterToUnicodeArgs* args,
+        cnv->fromUSurrogateLead=0;
-                                                UErrorCode* err)
+    } else {
-{
+        /* set the converter state back into UConverter */
-  
+        cnv->fromUSurrogateLead=(UChar)c;
  /*
   * NOTE(review): the unmarked lines below use args and err, which the new
   * function does not declare (it uses pArgs and pErrorCode); presumably
   * these are removed lines of the old T_UConverter_getNextUChar_LATIN_1
   * whose minus marker was lost in this rendering -- confirm.
   */
  /* Empties the internal buffers if need be
   * In this case since ErrorFunctors are never called 
   * (LATIN_1 is a subset of Unicode)
   */
  if (args->source+1 > args->sourceLimit) 
    {
      *err = U_INDEX_OUTOFBOUNDS_ERROR;
      return 0xffff;
    }
-  /* make sure that we zero-extend, not sign-extend, the byte */
+    /* write back the updated pointers */
-  return  (UChar)(uint8_t)*(args->source++);
+    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
 }
 static const UConverterImpl _Latin1Impl={
@@ -184,11 +292,11 @@ static const UConverterImpl _Latin1Impl={
    NULL,
    NULL,
-    T_UConverter_toUnicode_LATIN_1,
+    _Latin1ToUnicodeWithOffsets,
-    NULL,
+    _Latin1ToUnicodeWithOffsets,
-    T_UConverter_fromUnicode_LATIN_1,
+    _Latin1FromUnicodeWithOffsets,
-    NULL,
+    _Latin1FromUnicodeWithOffsets,
-    T_UConverter_getNextUChar_LATIN_1,
+    _Latin1GetNextUChar,
    NULL,
    NULL