From b7c791ad7543fdd25217c24694a98514ed6247c0 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Tue, 19 Dec 2000 00:29:27 +0000
Subject: [PATCH] ICU-507 use common implementation for getNextUChar() where
 appropriate

X-SVN-Rev: 3261
---
 icu4c/source/common/ucnv.c     |  12 +++-
 icu4c/source/common/ucnv2022.c | 126 +--------------------------------
 icu4c/source/common/ucnv_cnv.c |  48 +++++++++++++
 icu4c/source/common/ucnv_cnv.h |  26 +++++++
 icu4c/source/common/ucnvhz.c   |  33 +--------
 icu4c/source/common/ucnvmbcs.c |  63 +----------------
 6 files changed, 88 insertions(+), 220 deletions(-)

diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c
index ab9a4e348f..b205a230b0 100644
--- a/icu4c/source/common/ucnv.c
+++ b/icu4c/source/common/ucnv.c
@@ -986,8 +986,8 @@ UChar32 ucnv_getNextUChar(UConverter * converter,
       UTF_NEXT_CHAR(converter->UCharErrorBuffer, i, sizeof(converter->UCharErrorBuffer), myUChar);
       /*In this memmove we update the internal buffer by
        *popping the first character.
-         *Note that in the call itself we decrement
-         *UCharErrorBufferLength
+       *Note that in the call itself we decrement
+       *UCharErrorBufferLength
        */
       uprv_memmove (converter->UCharErrorBuffer,
                    converter->UCharErrorBuffer + i,
@@ -1005,7 +1005,13 @@ UChar32 ucnv_getNextUChar(UConverter * converter,
   args.target = NULL;
   args.targetLimit = NULL;
   args.size = sizeof(args);
-  ch = converter->sharedData->impl->getNextUChar(&args, err);
+  if (converter->sharedData->impl->getNextUChar != NULL)
+  {
+	ch = converter->sharedData->impl->getNextUChar(&args, err);
+  } else {
+	/* default implementation */
+	ch = ucnv_getNextUCharFromToUImpl(&args, converter->sharedData->impl->toUnicode, FALSE, err);
+  }
   *source = args.source;
   return ch;
 }
diff --git a/icu4c/source/common/ucnv2022.c b/icu4c/source/common/ucnv2022.c
index 2d552f1e2c..196da651d4 100644
--- a/icu4c/source/common/ucnv2022.c
+++ b/icu4c/source/common/ucnv2022.c
@@ -152,9 +152,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_JP(UConverterToUnicodeArgs* args,
 U_CFUNC void UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 
                                                             UErrorCode* err);
 
-U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_JP (UConverterToUnicodeArgs * args,
-                                                     UErrorCode * err);
-
 /***************** ISO-2022-KR ********************************/
 U_CFUNC void UConverter_fromUnicode_ISO_2022_KR(UConverterFromUnicodeArgs* args, 
                                                 UErrorCode* err);
@@ -168,9 +165,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_KR(UConverterToUnicodeArgs* args,
 U_CFUNC void UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 
                                                             UErrorCode* err);
 
-U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_KR (UConverterToUnicodeArgs * args,
-                                                     UErrorCode * err);
-
 /***************** ISO-2022-CN ********************************/
 U_CFUNC void UConverter_fromUnicode_ISO_2022_CN(UConverterFromUnicodeArgs* args, 
                                                 UErrorCode* err);
@@ -184,9 +178,6 @@ U_CFUNC void UConverter_toUnicode_ISO_2022_CN(UConverterToUnicodeArgs* args,
 U_CFUNC void UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC(UConverterToUnicodeArgs* args, 
                                                             UErrorCode* err);
 
-U_CFUNC UChar32 UConverter_getNextUChar_ISO_2022_CN (UConverterToUnicodeArgs * args,
-                                                     UErrorCode * err);
-
 #define ESC_2022 0x1B /*ESC*/
 
 typedef enum 
@@ -416,7 +407,7 @@ static const UConverterImpl _ISO2022JPImpl={
     UConverter_toUnicode_ISO_2022_JP_OFFSETS_LOGIC,
     UConverter_fromUnicode_ISO_2022_JP,
     UConverter_fromUnicode_ISO_2022_JP_OFFSETS_LOGIC,
-    UConverter_getNextUChar_ISO_2022_JP,
+    NULL,
     
     NULL,
     _ISO2022getName
@@ -447,7 +438,7 @@ static const UConverterImpl _ISO2022KRImpl={
     UConverter_toUnicode_ISO_2022_KR_OFFSETS_LOGIC,
     UConverter_fromUnicode_ISO_2022_KR,
     UConverter_fromUnicode_ISO_2022_KR_OFFSETS_LOGIC,
-    UConverter_getNextUChar_ISO_2022_KR,
+    NULL,
     
     NULL,
     _ISO2022getName
@@ -479,7 +470,7 @@ static const UConverterImpl _ISO2022CNImpl={
     UConverter_toUnicode_ISO_2022_CN_OFFSETS_LOGIC,
     UConverter_fromUnicode_ISO_2022_CN,
     UConverter_fromUnicode_ISO_2022_CN_OFFSETS_LOGIC,
-    UConverter_getNextUChar_ISO_2022_CN,
+    NULL,
     
     NULL,
     _ISO2022getName
@@ -1899,43 +1890,6 @@ static void concatChar(UConverterFromUnicodeArgs* args, int32_t *targetIndex, in
 
 /*************** to unicode *******************/
 
-/*
-* This is a simple, interim implementation of GetNextUChar()
-* that allows to concentrate on testing one single implementation
-* of the ToUnicode conversion before it gets copied to
-* multiple version that are then optimized for their needs
-* (with vs. without offsets and getNextUChar).
-*/
-
-U_CFUNC UChar32
-UConverter_getNextUChar_ISO_2022_JP(UConverterToUnicodeArgs *pArgs,
-                                    UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
-    
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-    
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-        UConverter_toUnicode_ISO_2022_JP(pArgs, pErrorCode);
-        if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-            return 0xffff;
-        } else if(pArgs->target!=buffer) {
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-            }
-            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer);
-        }
-    }
-    
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
-}
-
 /****************************************************************************
  * Recognized escape sequences are
  * <ESC>(B  ASCII      
@@ -3243,43 +3197,6 @@ END_LOOP:
     args->source = mySource;
 }
 
-/*
-* This is a simple, interim implementation of GetNextUChar()
-* that allows to concentrate on testing one single implementation
-* of the ToUnicode conversion before it gets copied to
-* multiple version that are then optimized for their needs
-* (with vs. without offsets and getNextUChar).
-*/
-
-U_CFUNC UChar32
-UConverter_getNextUChar_ISO_2022_KR(UConverterToUnicodeArgs *pArgs,
-                                    UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
-    
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-    
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-        UConverter_toUnicode_ISO_2022_KR(pArgs, pErrorCode);
-        if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-            return 0xffff;
-        } else if(pArgs->target!=buffer) {
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-            }
-            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer);
-        }
-    }
-    
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
-}
-
 /*************************** END ISO2022-KR *********************************/
 
 
@@ -4779,40 +4696,3 @@ END_LOOP:
     args->target = myTarget;
     args->source = mySource;
 }
-
-/*
-* This is a simple, interim implementation of GetNextUChar()
-* that allows to concentrate on testing one single implementation
-* of the ToUnicode conversion before it gets copied to
-* multiple version that are then optimized for their needs
-* (with vs. without offsets and getNextUChar).
-*/
-
-U_CFUNC UChar32
-UConverter_getNextUChar_ISO_2022_CN(UConverterToUnicodeArgs *pArgs,
-                                    UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
-    
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-    
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-        UConverter_toUnicode_ISO_2022_CN(pArgs, pErrorCode);
-        if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-            return 0xffff;
-        } else if(pArgs->target!=buffer) {
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-            }
-            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer);
-        }
-    }
-    
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
-}
diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c
index 35c445738d..adda252c41 100644
--- a/icu4c/source/common/ucnv_cnv.c
+++ b/icu4c/source/common/ucnv_cnv.c
@@ -193,3 +193,51 @@ ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex
         return NULL;
     }
 }
+
+/*
+ * Generic ucnv_getNextUChar() implementation: feeds the converter's toUnicode()
+ * function one source byte at a time until it yields output. See ucnv_cnv.h.
+ */
+U_CFUNC UChar32
+ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
+                             T_ToUnicodeFunction toU,
+                             UBool collectPairs,
+                             UErrorCode *pErrorCode) {
+    UChar buffer[UTF_MAX_CHAR_LENGTH];
+    const char *realLimit=pArgs->sourceLimit;
+
+    pArgs->target=buffer;
+    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
+
+    while(pArgs->source<realLimit) {
+        /* feed in one byte at a time to make sure to get only one character out */
+        pArgs->sourceLimit=pArgs->source+1;
+        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
+
+        /* convert this byte and check the result */
+        toU(pArgs, pErrorCode);
+        if(U_SUCCESS(*pErrorCode)) {
+            int32_t length=(int32_t)(pArgs->target-buffer);
+
+            /* this test is UTF-16 specific */
+            if(/* some output and
+                  (source consumed or don't collect surrogate pairs or not a surrogate or a surrogate pair) */
+               length>0 &&
+               (pArgs->flush || !collectPairs || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2)
+            ) {
+                return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length);
+            }
+            /* else continue with the loop: no output yet, or a lead surrogate waiting for its trail */
+        } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+            *pErrorCode=U_ZERO_ERROR; /* the local buffer is full: clear the error and return its contents */
+            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, UTF_MAX_CHAR_LENGTH);
+        } else {
+            /* any other U_FAILURE(): leave the error code set for the caller */
+            return 0xffff;
+        }
+    }
+
+    /* no output because of empty input or only state changes and skipping callbacks */
+    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+    return 0xffff;
+}
diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h
index 40df55fa56..b55663fa41 100644
--- a/icu4c/source/common/ucnv_cnv.h
+++ b/icu4c/source/common/ucnv_cnv.h
@@ -260,4 +260,30 @@ ucnv_updateCallbackOffsets(int32_t *offsets, int32_t length, int32_t sourceIndex
 #define FROM_U_USE_FALLBACK(useFallback, c) ((useFallback) || (uint32_t)((c)-0xe000)<0x1900 || (uint32_t)((c)-0xf0000)<0x20000)
 #define UCNV_FROM_U_USE_FALLBACK(cnv, c) FROM_U_USE_FALLBACK((cnv)->useFallback, c)
 
+/**
+ * This is a simple implementation of ucnv_getNextUChar() that uses the
+ * converter's toUnicode() function, feeding it one source byte at a time.
+ *
+ * \par
+ * A surrogate pair from a single byte sequence is always combined to a
+ * supplementary code point. A surrogate pair from consecutive byte sequences
+ * is only combined if collectPairs is set. This is necessary for SCSU
+ * but not allowed for most legacy codepages.
+ *
+ * @param pArgs The argument structure supplied by ucnv_getNextUChar();
+ *              its target, targetLimit, sourceLimit and flush are overwritten
+ * @param toU   A function pointer to the converter's toUnicode() function
+ * @param collectPairs whether separate surrogate results from consecutive
+ *                     byte sequences should be combined into one code point
+ * @param pErrorCode An ICU error code parameter; set to
+ *                   U_INDEX_OUTOFBOUNDS_ERROR if the input yields no output
+ * @return The Unicode code point as a result of a conversion of a minimal
+ *         number of input bytes, or 0xffff on failure
+ */
+U_CFUNC UChar32
+ucnv_getNextUCharFromToUImpl(UConverterToUnicodeArgs *pArgs,
+							 T_ToUnicodeFunction toU,
+							 UBool collectPairs,
+							 UErrorCode *pErrorCode);
+
+
 #endif /* UCNV_CNV */
diff --git a/icu4c/source/common/ucnvhz.c b/icu4c/source/common/ucnvhz.c
index f631e0313c..9679bc4c51 100644
--- a/icu4c/source/common/ucnvhz.c
+++ b/icu4c/source/common/ucnvhz.c
@@ -60,9 +60,6 @@ U_CFUNC void UConverter_fromUnicode_HZ(UConverterFromUnicodeArgs *args,
 U_CFUNC void UConverter_fromUnicode_HZ_OFFSETS_LOGIC (UConverterFromUnicodeArgs *args,
                                                               UErrorCode *err);
 
-U_CFUNC UChar32 UConverter_getNextUChar_HZ (UConverterToUnicodeArgs *pArgs,
-                                                    UErrorCode *pErrorCode);   
-
 static UConverterImpl _HZImpl={
     UCNV_HZ,
     
@@ -77,7 +74,7 @@ static UConverterImpl _HZImpl={
     UConverter_toUnicode_HZ_OFFSETS_LOGIC,
     UConverter_fromUnicode_HZ,
     UConverter_fromUnicode_HZ_OFFSETS_LOGIC,
-    UConverter_getNextUChar_HZ,
+    NULL,
     
     NULL,
     NULL
@@ -998,31 +995,3 @@ CALLBACK:
     
     return;
 }
-
-U_CFUNC UChar32 UConverter_getNextUChar_HZ (UConverterToUnicodeArgs * pArgs,
-                                            UErrorCode *pErrorCode){
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
-    
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-    
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-        UConverter_toUnicode_HZ(pArgs, pErrorCode);
-        if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-            return 0xffff;
-        } else if(pArgs->target!=buffer) {
-            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                *pErrorCode=U_ZERO_ERROR;
-            }
-            return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, pArgs->target-buffer);
-        }
-    }
-    
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
-}
diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c
index 0011fb6ce3..103d489ce2 100644
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@@ -908,73 +908,12 @@ endloop:
 }
 
 /*
- * This is a simple, interim implementation of GetNextUChar()
- * that allows to concentrate on testing one single implementation
- * of the ToUnicode conversion before it gets copied to
- * multiple version that are then optimized for their needs
- * (with vs. without offsets and getNextUChar).
  * ### TODO: implement this directly similar to ToUnicode()
  */
 U_CFUNC UChar32
 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
                   UErrorCode *pErrorCode) {
-    UChar buffer[UTF_MAX_CHAR_LENGTH];
-    const char *realLimit=pArgs->sourceLimit;
-
-    pArgs->target=buffer;
-    pArgs->targetLimit=buffer+UTF_MAX_CHAR_LENGTH;
-
-    while(pArgs->source<realLimit) {
-        /* feed in one byte at a time to make sure to get only one character out */
-        pArgs->sourceLimit=pArgs->source+1;
-        pArgs->flush= (UBool)(pArgs->sourceLimit==realLimit);
-        _MBCSToUnicode(pArgs, pErrorCode);
-        if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) {
-            return 0xffff;
-        } else {
-            int32_t length=pArgs->target-buffer;
-#if 0
-            /*
-             *     markus 2000-oct-26
-             *
-             * This version of the exit condition is commented out because of
-             * a clarification of the semantics of ucnv_getNextUChar() (see updated javadoc):
-             *
-             * Codepages that provide direct encodings of supplementary Unicode code points (U+10000 and up)
-             * should return single surrogates without combining them into pairs if single surrogates
-             * are encoded. This group of codepages includes UTF-8, UTF-32, and GB 18030.
-             *
-             * Codepages that provide direct encodings only of single surrogates
-             * must attempt to match pairs of them into supplementary code points.
-             * Single surrogates are returned only if they are not part of matched pairs.
-             * This group of codepages includes SCSU, LMBCS, and UTF-16.
-             *
-             * Currently, there is no MBCS codepage in the second group. SCSU, LMBCS, and UTF-16
-             * are implemented with separate code.
-             *
-             * Therefore, this feature is removed here.
-             * It might need to be added back in later when some MBCS codepages are created that
-             * fall into the second group. In this case, a flag in the .cnv file will be necessary
-             * to indicate this. makeconv would need to set this flag based on whether the codepage
-             * contains only mappings for single surrogates but
-             * not directly for any supplementary code points.
-             */
-            if(/* some output and (source consumed or not a surrogate or a surrogate pair [UTF-16 specific]) */
-               length>0 &&
-               (pArgs->flush || !UTF_IS_FIRST_SURROGATE(buffer[0]) || length==2)
-#endif
-            if(length>0) {
-                if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
-                    *pErrorCode=U_ZERO_ERROR;
-                }
-                return ucnv_getUChar32KeepOverflow(pArgs->converter, buffer, length);
-            }
-        }
-    }
-
-    /* no output because of empty input or only state changes and skipping callbacks */
-    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
-    return 0xffff;
+	return ucnv_getNextUCharFromToUImpl(pArgs, _MBCSToUnicode, FALSE, pErrorCode);
 }
 
 /*