ICU-484 add unicodeMask to UConverterStaticData for optimized implementations

X-SVN-Rev: 3280
2000-12-19 23:07:50 +00:00 · 2000-12-19 23:07:50 +00:00 · 998f792a5a
commit 998f792a5a
parent b2b6812d1e
11 changed files with 657 additions and 84 deletions
--- a/icu4c/source/common/ucnv2022.c
+++ b/icu4c/source/common/ucnv2022.c
@ -376,7 +376,8 @@ const UConverterStaticData _ISO2022StaticData={
    1, 
    FALSE, 
    FALSE,
-    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };
            
            
--- a/icu4c/source/common/ucnv_bld.h
+++ b/icu4c/source/common/ucnv_bld.h
@ -64,6 +64,10 @@ typedef union UConverterTable UConverterTable;
 struct UConverterImpl;
 typedef struct UConverterImpl UConverterImpl;

+/** values for the unicodeMask */
+#define UCNV_HAS_SUPPLEMENTARY 1
+#define UCNV_HAS_SURROGATES    2
+
 typedef struct UConverterStaticData {   /* +offset: size */
    uint32_t structSize;                /* +0: 4 Size of this structure */
    
@ -83,7 +87,8 @@ typedef struct UConverterStaticData {   /* +offset: size */
    
    uint8_t hasToUnicodeFallback;   /* +77: 1 UBool needs to be changed to UBool to be consistent across platform */
    uint8_t hasFromUnicodeFallback; /* +78: 1 */
-    uint8_t reserved[21];           /* +79: 21 to round out the structure */
+    uint8_t unicodeMask;            /* +79: 1  bit 0: has supplementary  bit 1: has single surrogates */
+    uint8_t reserved[20];           /* +80: 20 to round out the structure */
                                    /* total size: 100 */
 } UConverterStaticData;

--- a/icu4c/source/common/ucnv_lmb.c
+++ b/icu4c/source/common/ucnv_lmb.c
@ -518,7 +518,7 @@ const UConverterStaticData _LMBCSStaticData##n={\
  sizeof(UConverterStaticData),\
 "LMBCS-"  #n,\
    0, UCNV_IBM, UCNV_LMBCS_##n, 1, 1,\
-    { 0x3f, 0, 0, 0 },1,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
+    { 0x3f, 0, 0, 0 },1,FALSE,FALSE,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} \
 };\
 const UConverterSharedData _LMBCSData##n={\
    sizeof(UConverterSharedData), ~((uint32_t) 0),\
--- a/icu4c/source/common/ucnv_utf.c
+++ b/icu4c/source/common/ucnv_utf.c
@ -784,7 +784,9 @@ const UConverterStaticData _UTF8StaticData={
  sizeof(UConverterStaticData),
 "UTF8",
    1208, UCNV_IBM, UCNV_UTF8, 1, 4,
-    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


@ -964,7 +966,9 @@ const UConverterStaticData _UTF16BEStaticData={
  sizeof(UConverterStaticData),
 "UTF16_BigEndian",
    1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
-    { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


@ -1154,7 +1158,9 @@ const UConverterStaticData _UTF16LEStaticData={
    sizeof(UConverterStaticData),
    "UTF16_LittleEndian",
    1200, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
-    { 0xfd, 0xff, 0, 0 },2,0,0,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    { 0xfd, 0xff, 0, 0 },2,0,0,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


@ -1382,7 +1388,8 @@ const UConverterStaticData _UTF32BEStaticData = {
    0,  /* Should be the UTF-32 CCSID */
    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
    { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
-    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

 const UConverterSharedData _UTF32BEData = {
@ -1610,7 +1617,8 @@ const UConverterStaticData _UTF32LEStaticData = {
    0,  /* Should be the UTF-32 CCSID */
    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
    { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
-    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


--- a/icu4c/source/common/ucnvhz.c
+++ b/icu4c/source/common/ucnvhz.c
@ -85,7 +85,8 @@ const UConverterStaticData _HZStaticData={
        "HZ",
        2023, UCNV_IBM, UCNV_HZ, 1, 4,
    { 0x1a, 0, 0, 0 },1, FALSE, FALSE,
-    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} /* reserved */
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };
            
            
--- a/icu4c/source/common/ucnvlat1.c
+++ b/icu4c/source/common/ucnvlat1.c
@ -195,10 +195,12 @@ static const UConverterImpl _Latin1Impl={
 };

 const UConverterStaticData _Latin1StaticData={
-  sizeof(UConverterStaticData),
-  "LATIN_1",
+    sizeof(UConverterStaticData),
+    "LATIN_1",
    819, UCNV_IBM, UCNV_LATIN_1, 1, 1,
-  { 0x1a, 0, 0, 0 },1,FALSE, FALSE,{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    { 0x1a, 0, 0, 0 },1,FALSE, FALSE,
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@ -33,6 +33,7 @@
 #include "unicode/utypes.h"
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_cb.h"
+#include "unicode/udata.h"
 #include "ucnv_bld.h"
 #include "ucnvmbcs.h"
 #include "ucnv_cnv.h"
@ -173,6 +174,10 @@ U_CFUNC void
 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode);

+U_CFUNC void
+_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
+                            UErrorCode *pErrorCode);
+
 U_CFUNC UChar32
 _MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode);
@ -185,6 +190,10 @@ U_CFUNC void
 _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                                  UErrorCode *pErrorCode);

+U_CFUNC void
+_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
+                              UErrorCode *pErrorCode);
+
 static void
 fromUCallback(UConverter *cnv,
              void *context, UConverterFromUnicodeArgs *pArgs,
@ -238,6 +247,7 @@ U_CFUNC void
 _MBCSLoad(UConverterSharedData *sharedData,
          const uint8_t *raw,
          UErrorCode *pErrorCode) {
+    UDataInfo info;
    UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
    _MBCSHeader *header=(_MBCSHeader *)raw;

@ -255,6 +265,20 @@ _MBCSLoad(UConverterSharedData *sharedData,
    mbcsTable->fromUnicodeTable=(const uint16_t *)(raw+header->offsetFromUTable);
    mbcsTable->fromUnicodeBytes=(const uint8_t *)(raw+header->offsetFromUBytes);
    mbcsTable->outputType=(uint8_t)header->flags;
+
+    /*
+     * converter versions 6.1 and up contain a unicodeMask that is
+     * used here to select the most efficient function implementations
+     */
+    info.size=sizeof(UDataInfo);
+    udata_getInfo((UDataMemory *)sharedData->dataMemory, &info);
+    if(info.formatVersion[0]>6 || info.formatVersion[0]==6 && info.formatVersion[1]>=1) {
+        /* mask off possible future extensions to be safe */
+        mbcsTable->unicodeMask=sharedData->staticData->unicodeMask&3;
+    } else {
+        /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */
+        mbcsTable->unicodeMask=UCNV_HAS_SUPPLEMENTARY|UCNV_HAS_SURROGATES;
+    }
 }

 U_CFUNC void
@ -338,7 +362,11 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    /* use optimized function if possible */
    cnv=pArgs->converter;
    if(cnv->sharedData->table->mbcs.countStates==1) {
-        _MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
+        if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+            _MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
+        } else {
+            _MBCSSingleToUnicodeWithOffsets(pArgs, pErrorCode);
+        }
        return;
    }

@ -669,16 +697,16 @@ callback:
                 */
                if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                    break;
-                } else if(cnv->UCharErrorBufferLength>0) {
-                    /* target is full */
-                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                    break;
                } else if(U_FAILURE(*pErrorCode)) {
                    /* break on error */
                    offset=0;
                    state=0;
                    byteIndex=0;
                    break;
+                } else if(cnv->UCharErrorBufferLength>0) {
+                    /* target is full */
+                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                    break;
                }

                /*
@ -717,7 +745,7 @@ endloop:
    pArgs->offsets=offsets;
 }

-/* This version of _MBCSToUnicode() is optimized for single-byte, single-state codepages. */
+/* This version of _MBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
 U_CFUNC void
 _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
                                UErrorCode *pErrorCode) {
@ -875,13 +903,13 @@ callback:
             */
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                break;
+            } else if(U_FAILURE(*pErrorCode)) {
+                /* break on error */
+                break;
            } else if(cnv->UCharErrorBufferLength>0) {
                /* target is full */
                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                break;
-            } else if(U_FAILURE(*pErrorCode)) {
-                /* break on error */
-                break;
            }

            /*
@ -903,6 +931,175 @@ endloop:
    pArgs->offsets=offsets;
 }

+/*
+ * This version of _MBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages
+ * that only map to and from the BMP.
+ * In addition to single-byte optimizations, the offset calculations
+ * become much easier.
+ *
+ * Reads bytes from pArgs->source and writes one UChar per mapped byte to pArgs->target.
+ * Each byte is looked up in row 0 of the converter's state table; illegal and
+ * unassigned entries, and fallback entries when UCNV_TO_U_USE_FALLBACK(cnv) is false,
+ * are routed through toUCallback().
+ * When pArgs->offsets is not NULL, offsets are written in batches: once before each
+ * callback and once more at the end of the function.
+ * On return, pArgs->source, pArgs->target and pArgs->offsets are updated, and
+ * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR if input remains but the target is full.
+ */
+U_CFUNC void
+_MBCSSingleToBMPWithOffsets(UConverterToUnicodeArgs *pArgs,
+                            UErrorCode *pErrorCode) {
+    UConverter *cnv;
+    const uint8_t *source, *sourceLimit, *lastSource; /* lastSource: first byte whose offset has not been written yet */
+    UChar *target;
+    int32_t targetCapacity, length; /* targetCapacity doubles as the loop counter, see below */
+    int32_t *offsets;
+
+    const int32_t (*stateTable)[256];
+
+    int32_t sourceIndex;
+
+    int32_t entry;
+    uint8_t b;
+    UConverterCallbackReason reason;
+
+    /* set up the local pointers */
+    cnv=pArgs->converter;
+    source=(const uint8_t *)pArgs->source;
+    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+    target=pArgs->target;
+    targetCapacity=pArgs->targetLimit-pArgs->target;
+    offsets=pArgs->offsets;
+
+    stateTable=cnv->sharedData->table->mbcs.stateTable;
+
+    /* no character state is carried over from a previous buffer here, so sourceIndex always starts at 0 */
+    sourceIndex=0;
+    lastSource=source;
+
+    /*
+     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
+     * for the minimum of the sourceLength and targetCapacity
+     */
+    length=sourceLimit-source;
+    if(length<targetCapacity) {
+        targetCapacity=length;
+    }
+
+    /* conversion loop */
+    while(targetCapacity>0) {
+        b=*source++;
+        entry=stateTable[0][b];
+        /* entry<0 */
+        /*
+         * bit 31 is set, bits:
+         * 30..27 action code
+         *        (do not mask out bit 31 for speed, include it in action values)
+         * 26..7  depend on the action code
+         *  6..0  next state
+         */
+
+        /* switch per action code; the "16|" in each case label is bit 31 after the shift by 27 */
+        switch((uint32_t)entry>>27U) {
+        case 16|MBCS_STATE_ILLEGAL:
+            /* bits 26..7 are not used, 0 */
+            /* callback(illegal) */
+            reason=UCNV_ILLEGAL;
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+            break;
+        case 16|MBCS_STATE_UNASSIGNED:
+            /* bits 26..7 are not used, 0 */
+            /* callback(unassigned) */
+            reason=UCNV_UNASSIGNED;
+            *pErrorCode=U_INVALID_CHAR_FOUND;
+            break;
+        case 16|MBCS_STATE_FALLBACK_DIRECT_16:
+            /* bits 26..23 are not used, 0 */
+            /* bits 22..7 contain the Unicode BMP code point */
+            if(!UCNV_TO_U_USE_FALLBACK(cnv)) {
+                /* callback(unassigned) */
+                reason=UCNV_UNASSIGNED;
+                *pErrorCode=U_INVALID_CHAR_FOUND;
+                break;
+            }
+            /* fall through to the MBCS_STATE_VALID_DIRECT_16 branch */
+        case 16|MBCS_STATE_VALID_DIRECT_16:
+            /* bits 26..23 are not used, 0 */
+            /* bits 22..7 contain the Unicode BMP code point */
+            /* output BMP code point; the UChar cast drops bits 23 and up of the shifted entry */
+            *target++=(UChar)(entry>>7);
+            --targetCapacity;
+            continue;
+        default:
+            /* reserved, must never occur */
+            /* bits 26..7 are not used, 0
+             * NOTE(review): on this path the byte has already been consumed but
+             * targetCapacity is not decremented, so the combined 1:1 counter no longer
+             * bounds the number of bytes read; a table containing reserved entries could
+             * let the loop read past sourceLimit -- confirm that such tables cannot occur
+             */
+            continue;
+        }
+
+        /* only the "break" paths above (illegal/unassigned) reach this point */
+        /* call the callback function with all the preparations and post-processing */
+        /* set offsets since the start or the last callback */
+        if(offsets!=NULL) {
+            int32_t count=(int32_t)(source-lastSource);
+
+            /* predecrement: do not set the offset for the callback-causing character */
+            while(--count>0) {
+                *offsets++=sourceIndex++;
+            }
+            /* offset and sourceIndex are now set for the current character */
+        }
+
+        /* update the arguments structure */
+        pArgs->source=(const char *)source;
+        pArgs->target=target;
+        pArgs->offsets=offsets;
+
+        /* copy the current bytes to invalidCharBuffer */
+        cnv->invalidCharBuffer[0]=b;
+        cnv->invalidCharLength=1;
+
+        /* call the callback function */
+        toUCallback(cnv, cnv->toUContext, pArgs, (const char *)&b, 1, reason, pErrorCode);
+
+        /* update target and deal with offsets if necessary */
+        offsets=ucnv_updateCallbackOffsets(offsets, pArgs->target-target, sourceIndex);
+        target=pArgs->target;
+
+        /* update the source pointer and index: 1 for the callback-causing byte plus whatever the callback moved pArgs->source by */
+        sourceIndex+=1+((const uint8_t *)pArgs->source-source);
+        source=lastSource=(const uint8_t *)pArgs->source;
+        targetCapacity=pArgs->targetLimit-target; /* recompute the combined counter, the callback may have moved both pointers */
+        length=sourceLimit-source;
+        if(length<targetCapacity) {
+            targetCapacity=length;
+        }
+
+        /*
+         * If the callback overflowed the target, then we need to
+         * stop here with an overflow indication.
+         * A failure code set by the callback is tested before UCharErrorBufferLength,
+         * so it is not overwritten with U_BUFFER_OVERFLOW_ERROR.
+         */
+        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+            break;
+        } else if(U_FAILURE(*pErrorCode)) {
+            /* break on error */
+            break;
+        } else if(cnv->UCharErrorBufferLength>0) {
+            /* target is full */
+            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            break;
+        }
+    }
+
+    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=pArgs->targetLimit) {
+        /* target is full */
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+    }
+
+    /* set offsets since the start or the last callback: one offset per byte consumed since lastSource */
+    if(offsets!=NULL) {
+        size_t count=source-lastSource;
+        while(count>0) {
+            *offsets++=sourceIndex++;
+            --count;
+        }
+    }
+
+    /* write back the updated pointers */
+    pArgs->source=(const char *)source;
+    pArgs->target=target;
+    pArgs->offsets=offsets;
+}
+
 U_CFUNC UChar32
 _MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
@ -1521,7 +1718,11 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    cnv=pArgs->converter;
    outputType=cnv->sharedData->table->mbcs.outputType;
    if(outputType==MBCS_OUTPUT_1) {
-        _MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
+        if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
+            _MBCSSingleFromBMPWithOffsets(pArgs, pErrorCode);
+        } else {
+            _MBCSSingleFromUnicodeWithOffsets(pArgs, pErrorCode);
+        }
        return;
    }

@ -1917,14 +2118,14 @@ callback:
             */
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                break;
-            } else if(cnv->charErrorBufferLength>0) {
-                /* target is full */
-                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                break;
            } else if(U_FAILURE(*pErrorCode)) {
                /* break on error */
                c=0;
                break;
+            } else if(cnv->charErrorBufferLength>0) {
+                /* target is full */
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                break;
            }

            /*
@ -1942,7 +2143,7 @@ callback:
    if(pArgs->flush && source>=sourceLimit) {
        /* reset the state for the next conversion */
        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
+            /* a Unicode code point remains incomplete (only a first surrogate) */
            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
        }
        cnv->fromUSurrogateLead=0;
@ -1969,7 +2170,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

    const uint16_t *table;
    const uint8_t *bytes;
-    uint8_t outputType;

    UChar32 c;

@ -1977,7 +2177,7 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

    UConverterCallbackReason reason;
    uint32_t i;
-    uint32_t value;
+    uint8_t value;

    /* set up the local pointers */
    cnv=pArgs->converter;
@ -1989,7 +2189,6 @@ _MBCSSingleFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

    table=cnv->sharedData->table->mbcs.fromUnicodeTable;
    bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
-    outputType=cnv->sharedData->table->mbcs.outputType;

    /* get the converter state from UConverter */
    c=cnv->fromUSurrogateLead;
@ -2064,7 +2263,21 @@ getTrail:
                value=*p;

                /* is the codepage value really an "unassigned" indicator? */
-                if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
+                if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
+                    /* assigned, write the output character bytes from value and length */
+                    /* length==1 */
+                    /* this is easy because we know that there is enough space */
+                    *target++=value;
+                    if(offsets!=NULL) {
+                        *offsets++=sourceIndex;
+                    }
+                    --targetCapacity;
+
+                    /* normal end of conversion: prepare for a new character */
+                    c=0;
+                    sourceIndex=nextSourceIndex;
+                    continue;
+                } else { /* unassigned */
                    /*
                     * We allow a 0 byte output if the Unicode code point is
                     * U+0000 and also if the "assigned" bit is set for this entry.
@ -2074,29 +2287,13 @@ getTrail:
                    /* callback(unassigned) */
                    reason=UCNV_UNASSIGNED;
                    *pErrorCode=U_INVALID_CHAR_FOUND;
-                    goto callback;
                }
            } else {
                /* callback(unassigned) */
                reason=UCNV_UNASSIGNED;
                *pErrorCode=U_INVALID_CHAR_FOUND;
-                goto callback;
            }

-            /* write the output character bytes from value and length */
-            /* length==1 */
-            /* this is easy because we know that there is enough space */
-            *target++=(uint8_t)value;
-            if(offsets!=NULL) {
-                *offsets++=sourceIndex;
-            }
-            --targetCapacity;
-
-            /* normal end of conversion: prepare for a new character */
-            c=0;
-            sourceIndex=nextSourceIndex;
-            continue;
-
 callback:
            /* call the callback function with all the preparations and post-processing */
            /* update the arguments structure */
@ -2133,14 +2330,14 @@ callback:
             */
            if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
                break;
-            } else if(cnv->charErrorBufferLength>0) {
-                /* target is full */
-                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
-                break;
            } else if(U_FAILURE(*pErrorCode)) {
                /* break on error */
                c=0;
                break;
+            } else if(cnv->charErrorBufferLength>0) {
+                /* target is full */
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                break;
            }

            /*
@ -2158,7 +2355,242 @@ callback:
    if(pArgs->flush && source>=sourceLimit) {
        /* reset the state for the next conversion */
        if(c!=0 && U_SUCCESS(*pErrorCode)) {
-            /* a character byte sequence remains incomplete */
+            /* a Unicode code point remains incomplete (only a first surrogate) */
+            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
+        }
+        cnv->fromUSurrogateLead=0;
+    } else {
+        /* set the converter state back into UConverter */
+        cnv->fromUSurrogateLead=(UChar)c;
+    }
+
+    /* write back the updated pointers */
+    pArgs->source=source;
+    pArgs->target=(char *)target;
+    pArgs->offsets=offsets;
+}
+
+/*
+ * This version of _MBCSSingleFromUnicodeWithOffsets() is optimized for single-byte codepages
+ * that map only to and from the BMP.
+ * In addition to single-byte/state optimizations, the offset calculations
+ * become much easier.
+ */
+U_CFUNC void
+_MBCSSingleFromBMPWithOffsets(UConverterFromUnicodeArgs *pArgs,
+                              UErrorCode *pErrorCode) {
+    UConverter *cnv;
+    const UChar *source, *sourceLimit, *lastSource;
+    uint8_t *target;
+    int32_t targetCapacity, length;
+    int32_t *offsets;
+
+    const uint16_t *table;
+    const uint8_t *bytes;
+
+    UChar32 c;
+
+    int32_t sourceIndex;
+
+    UConverterCallbackReason reason;
+    uint32_t i;
+    uint8_t value;
+
+    /* set up the local pointers */
+    cnv=pArgs->converter;
+    source=pArgs->source;
+    sourceLimit=pArgs->sourceLimit;
+    target=(uint8_t *)pArgs->target;
+    targetCapacity=pArgs->targetLimit-pArgs->target;
+    offsets=pArgs->offsets;
+
+    table=cnv->sharedData->table->mbcs.fromUnicodeTable;
+    bytes=cnv->sharedData->table->mbcs.fromUnicodeBytes;
+
+    /* get the converter state from UConverter */
+    c=cnv->fromUSurrogateLead;
+
+    /* sourceIndex=-1 if the current character began in the previous buffer */
+    sourceIndex= c==0 ? 0 : -1;
+    lastSource=source;
+
+    /*
+     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter
+     * for the minimum of the sourceLength and targetCapacity
+     */
+    length=sourceLimit-source;
+    if(length<targetCapacity) {
+        targetCapacity=length;
+    }
+
+    /* conversion loop */
+    if(c!=0 && targetCapacity>0) {
+        goto getTrail;
+    }
+
+    while(targetCapacity>0) {
+        /*
+         * Get a correct Unicode code point:
+         * a single UChar for a BMP code point or
+         * a matched surrogate pair for a "surrogate code point".
+         */
+        c=*source++;
+        if(!UTF_IS_SURROGATE(c)) {
+            /* convert the Unicode code point in c into codepage bytes */
+            i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
+
+            /* is this code point assigned, or do we use fallbacks? */
+            if((table[i++]&(1<<(c&0xf)))!=0 || UCNV_FROM_U_USE_FALLBACK(cnv, c)) {
+                const uint8_t *p=bytes;
+
+                /* MBCS_OUTPUT_1 */
+                p+=(16*(uint32_t)table[i]+(c&0xf));
+                value=*p;
+
+                /* is the codepage value really an "unassigned" indicator? */
+                if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
+                    /* assigned, write the output character bytes from value and length */
+                    /* length==1 */
+                    /* this is easy because we know that there is enough space */
+                    *target++=value;
+                    --targetCapacity;
+
+                    /* normal end of conversion: prepare for a new character */
+                    c=0;
+                    continue;
+                } else { /* unassigned */
+                    /*
+                     * We allow a 0 byte output if the Unicode code point is
+                     * U+0000 and also if the "assigned" bit is set for this entry.
+                     * There is no way with this data structure for fallback output
+                     * for other than U+0000 to be a zero byte.
+                     */
+                    /* callback(unassigned) */
+                    reason=UCNV_UNASSIGNED;
+                    *pErrorCode=U_INVALID_CHAR_FOUND;
+                }
+            } else {
+                /* callback(unassigned) */
+                reason=UCNV_UNASSIGNED;
+                *pErrorCode=U_INVALID_CHAR_FOUND;
+            }
+        } else {
+            if(UTF_IS_SURROGATE_FIRST(c)) {
+getTrail:
+                if(source<sourceLimit) {
+                    /* test the following code unit */
+                    UChar trail=*source;
+                    if(UTF_IS_SECOND_SURROGATE(trail)) {
+                        ++source;
+                        c=UTF16_GET_PAIR_VALUE(c, trail);
+                        /* this codepage does not map supplementary code points */
+                        /* callback(unassigned) */
+                        reason=UCNV_UNASSIGNED;
+                        *pErrorCode=U_INVALID_CHAR_FOUND;
+                    } else {
+                        /* this is an unmatched lead code unit (1st surrogate) */
+                        /* callback(illegal) */
+                        reason=UCNV_ILLEGAL;
+                        *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    }
+                } else {
+                    /* no more input */
+                    break;
+                }
+            } else {
+                /* this is an unmatched trail code unit (2nd surrogate) */
+                /* callback(illegal) */
+                reason=UCNV_ILLEGAL;
+                *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+            }
+        }
+
+        /* call the callback function with all the preparations and post-processing */
+        /* get the number of code units for c to correctly advance sourceIndex after the callback call */
+        length=UTF_CHAR_LENGTH(c);
+
+        /* set offsets since the start or the last callback */
+        if(offsets!=NULL) {
+            int32_t count=(int32_t)(source-lastSource);
+
+            /* do not set the offset for the callback-causing character */
+            count-=length;
+
+            while(count>0) {
+                *offsets++=sourceIndex++;
+                --count;
+            }
+            /* offset and sourceIndex are now set for the current character */
+        }
+
+        /* update the arguments structure */
+        pArgs->source=source;
+        pArgs->target=(char *)target;
+        pArgs->offsets=offsets;
+
+        /* set the converter state in UConverter to deal with the next character */
+        cnv->fromUSurrogateLead=0;
+
+        /* write the code point as code units */
+        i=0;
+        UTF_APPEND_CHAR_UNSAFE(cnv->invalidUCharBuffer, i, c);
+        cnv->invalidUCharLength=(int8_t)i;
+        /* i==length */
+
+        /* call the callback function */
+        fromUCallback(cnv, cnv->fromUContext, pArgs, cnv->invalidUCharBuffer, i, c, reason, pErrorCode);
+
+        /* get the converter state from UConverter */
+        c=cnv->fromUSurrogateLead;
+
+        /* update target and deal with offsets if necessary */
+        offsets=ucnv_updateCallbackOffsets(offsets, ((uint8_t *)pArgs->target)-target, sourceIndex);
+        target=(uint8_t *)pArgs->target;
+
+        /* update the source pointer and index */
+        sourceIndex+=length+(pArgs->source-source);
+        source=lastSource=pArgs->source;
+        targetCapacity=(uint8_t *)pArgs->targetLimit-target;
+        length=sourceLimit-source;
+        if(length<targetCapacity) {
+            targetCapacity=length;
+        }
+
+        /*
+         * If the callback overflowed the target, then we need to
+         * stop here with an overflow indication.
+         */
+        if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
+            break;
+        } else if(U_FAILURE(*pErrorCode)) {
+            /* break on error */
+            c=0;
+            break;
+        } else if(cnv->charErrorBufferLength>0) {
+            /* target is full */
+            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            break;
+        }
+    }
+
+    if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
+        /* target is full */
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+    }
+
+    /* set offsets since the start or the last callback */
+    if(offsets!=NULL) {
+        size_t count=source-lastSource;
+        while(count>0) {
+            *offsets++=sourceIndex++;
+            --count;
+        }
+    }
+
+    if(pArgs->flush && source>=sourceLimit) {
+        /* reset the state for the next conversion */
+        if(c!=0 && U_SUCCESS(*pErrorCode)) {
+            /* a Unicode code point remains incomplete (only a first surrogate) */
            *pErrorCode=U_TRUNCATED_CHAR_FOUND;
        }
        cnv->fromUSurrogateLead=0;
@ -2295,21 +2727,54 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
        }

        /* is the codepage value really an "unassigned" indicator? */
-        if(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0) {
-            /*
-             * We allow a 0 byte output if the Unicode code point is
-             * U+0000 and also if the "assigned" bit is set for this entry.
-             * There is no way with this data structure for fallback output
-             * for other than U+0000 to be a zero byte.
-             */
-            return 0;
-        } else {
+        /*
+         * We allow a 0 byte output if the Unicode code point is
+         * U+0000 and also if the "assigned" bit is set for this entry.
+         * There is no way with this data structure for fallback output
+         * for other than U+0000 to be a zero byte.
+         */
+        if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
+            /* assigned */
            *pValue=value;
            return length;
        }
-    } else {
-        return 0;
    }
+    return 0;
+}
+
+U_CFUNC int32_t
+_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
+                       UChar32 c,
+                       UBool useFallback) {
+    const uint16_t *table=sharedData->table->mbcs.fromUnicodeTable;
+    uint32_t i;
+    int32_t value;
+
+    /*
+     * Looks up the single codepage byte for the code point c in the
+     * fromUnicode tables of sharedData (MBCS_OUTPUT_1 layout: one byte per entry).
+     * Returns the byte value (read from a uint8_t, so 0..0xff) if c is assigned,
+     * or if FROM_U_USE_FALLBACK(useFallback, c) allows a fallback result;
+     * returns -1 if there is no mapping.
+     * NOTE(review): the table is indexed with c>>10 without a range check here;
+     * presumably callers pass only valid code points -- confirm.
+     *
+     * convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets)
+     */
+    i=0x440+2*((uint32_t)table[c>>10]+((c>>4)&0x3f));
+
+    /* is this code point assigned, or do we use fallbacks? (i++ leaves table[i-1] as the flags word tested here) */
+    if((table[i++]&(1<<(c&0xf)))!=0 || FROM_U_USE_FALLBACK(useFallback, c)) {
+        const uint8_t *p=sharedData->table->mbcs.fromUnicodeBytes;
+
+        /* get the byte for the output */
+        /* MBCS_OUTPUT_1 */
+        p+=(16*(uint32_t)table[i]+(c&0xf));
+        value=*p;
+
+        /* is the codepage value really an "unassigned" indicator? */
+        /*
+         * We allow a 0 byte output if the Unicode code point is
+         * U+0000 and also if the "assigned" bit is set for this entry.
+         * There is no way with this data structure for fallback output
+         * for other than U+0000 to be a zero byte.
+         */
+        if(!(value==0 && c!=0 && (table[i-1]&(1<<(c&0xf)))==0)) {
+            /* assigned */
+            return value;
+        }
+    }
+    return -1; /* unassigned: -1 cannot collide with a byte value, unlike the 0 returned by _MBCSFromUChar32 */
 }

 /* miscellaneous ------------------------------------------------------------ */
@ -2369,8 +2834,6 @@ const UConverterSharedData _MBCSData={

 /* GB 18030 special handling ------------------------------------------------ */

-/* ### IMPORTANT: THIS IS ALPHA-VERSION SUPPORT CODE FOR GB 18030 AND MAY CHANGE WITHOUT NOTICE */
-
 /* definition of LINEAR macros and gb18030Ranges see near the beginning of the file */

 /* the callback functions handle GB 18030 specially */
--- a/icu4c/source/common/ucnvmbcs.h
+++ b/icu4c/source/common/ucnvmbcs.h
@ -21,6 +21,10 @@

 /* MBCS converter data and state -------------------------------------------- */

+/**
+ * MBCS action codes for conversions to Unicode.
+ * These values are in bits 30..27 of the state table entries.
+ */
 enum {
    MBCS_STATE_ILLEGAL,
    MBCS_STATE_CHANGE_ONLY,
@ -36,6 +40,11 @@ enum {
    MBCS_STATE_VALID_16_PAIR
 };

+/**
+ * MBCS output types for conversions from Unicode.
+ * These per-converter types determine the storage method in stage 3 of the lookup table,
+ * mostly how many bytes are stored per entry.
+ */
 enum {
    MBCS_OUTPUT_1,
    MBCS_OUTPUT_2,
@ -46,11 +55,19 @@ enum {
    MBCS_OUTPUT_4_EUC
 };

+/**
+ * Fallbacks to Unicode are stored outside the normal state table and code point structures
+ * in a vector of items of this type. They are sorted by offset.
+ */
 typedef struct {
    uint32_t offset;
    UChar32 codePoint;
 } _MBCSToUFallback;

+/**
+ * This is the MBCS part of the UConverterTable union (a runtime data structure).
+ * It keeps all the per-converter data and points into the loaded mapping tables.
+ */
 typedef struct UConverterMBCSTable {
    /* toUnicode */
    uint8_t countStates;
@ -63,10 +80,10 @@ typedef struct UConverterMBCSTable {
    /* fromUnicode */
    const uint16_t *fromUnicodeTable;
    const uint8_t *fromUnicodeBytes;
-    uint8_t outputType;
+    uint8_t outputType, unicodeMask;
 } UConverterMBCSTable;

-/*
+/**
 * MBCS data structure as part of a .cnv file:
 *
 * uint32_t [8]; -- 8 values:
@ -105,20 +122,78 @@ typedef struct {
             reserved;
 } _MBCSHeader;

+/** Forward declaration to enable the following function declarations. */
 struct UConverterSharedData;
+
+/** Forward declaration to enable the following function declarations. */
 typedef struct UConverterSharedData UConverterSharedData;

+/**
+ * This is a simple version of _MBCSGetNextUChar() that is used
+ * by other converter implementations.
+ * It does not use state from the converter, nor error codes.
+ *
+ * Return value:
+ * U+fffe   unassigned
+ * U+ffff   illegal
+ * otherwise the Unicode code point
+ */
 U_CFUNC UChar32
 _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
                        const char **pSource, const char *sourceLimit,
                        UBool useFallback);

+/** This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. */
+U_CFUNC UChar32
+_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
+                              uint8_t b, UBool useFallback);
+
+/**
+ * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte.
+ * It works for single-byte, single-state codepages that only map
+ * to and from BMP code points, and it always returns fallback values.
+ * b is converted to uint8_t so that a (possibly signed) char never yields a negative index.
+ */
+#define _MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(sharedData, b) \
+    (UChar)(((sharedData)->table->mbcs.stateTable[0][(uint8_t)(b)])>>7)
+
+/**
+ * This is an internal function that allows other converter implementations
+ * to check whether a byte is a lead byte.
+ */
 U_CFUNC UBool
 _MBCSIsLeadByte(UConverterSharedData *sharedData, char byte);

+/** This is a macro version of _MBCSIsLeadByte(). */
+#define _MBCS_IS_LEAD_BYTE(sharedData, byte) \
+    (UBool)((sharedData)->table->mbcs.stateTable[0][(uint8_t)(byte)]>=0)
+
+/**
+ * This is another simple conversion function for internal use by other
+ * conversion implementations.
+ * It does not use the converter state nor call callbacks.
+ * It converts one single Unicode code point into codepage bytes, encoded
+ * as one 32-bit value. The function returns the number of bytes in *pValue:
+ * 1..4 the number of bytes in *pValue
+ * 0    unassigned (*pValue undefined)
+ * -1   illegal (currently not used, *pValue undefined)
+ *
+ * *pValue will contain the resulting bytes with the last byte in bits 7..0,
+ * the second to last byte in bits 15..8, etc.
+ * Currently, the function assumes but does not check that 0<=c<=0x10ffff.
+ */
 U_CFUNC int32_t
 _MBCSFromUChar32(UConverterSharedData *sharedData,
                 UChar32 c, uint32_t *pValue,
                 UBool useFallback);

+/**
+ * This version of _MBCSFromUChar32() is optimized for single-byte codepages.
+ * It returns the codepage byte for the code point, or -1 if it is unassigned.
+ */
+U_CFUNC int32_t
+_MBCSSingleFromUChar32(UConverterSharedData *sharedData,
+                       UChar32 c,
+                       UBool useFallback);
+
 #endif
--- a/icu4c/source/common/ucnvscsu.c
+++ b/icu4c/source/common/ucnvscsu.c
@ -1328,11 +1328,12 @@ static const UConverterStaticData _SCSUStaticData={
    1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */
    { 0x0e, 0xff, 0xfd, 0 }, 3, /* ### the subchar really must be written by an SCSU function! */
    FALSE, FALSE,
-    {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}
+    0,
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

 const UConverterSharedData _SCSUData={
-    sizeof(UConverterSharedData), 1,
+    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_SCSUStaticData, FALSE, &_SCSUImpl,
    0
 };
--- a/icu4c/source/tools/makeconv/makeconv.c
+++ b/icu4c/source/tools/makeconv/makeconv.c
@ -173,7 +173,7 @@ static UDataInfo dataInfo={
    0,

    0x63, 0x6e, 0x76, 0x74,     /* dataFormat="cnvt" */
-    6, 0, 0, 0,                 /* formatVersion */
+    6, 1, 0, 0,                 /* formatVersion */
    0, 0, 0, 0                  /* dataVersion (calculated at runtime) */
 };

@ -648,7 +648,7 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
    int32_t mbcsLength;
    char codepointBytes[20];
    UBool isOK = TRUE;
-    uint8_t precisionMask = 0;
+    uint8_t precisionMask = 0, unicodeMask = 0;
    char endOfLine;

    if(cnvData->startMappings!=NULL)
@ -684,6 +684,13 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
                /* End of line could be \0 or | (if fallback) */
                endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
            } while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
+
+            if(unicodeValue>=0x10000) {
+                unicodeMask|=UCNV_HAS_SUPPLEMENTARY;    /* there are supplementary code points */
+            } else if(UTF_IS_SURROGATE(unicodeValue)) {
+                unicodeMask|=UCNV_HAS_SURROGATES;       /* there are single surrogates */
+            }
+
            if((uint32_t)unicodeValue > 0x10ffff)
            {
                fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
@ -730,6 +737,12 @@ void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, U
        }
    }

+    if(unicodeMask == 3)
+    {
+        fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
+    }
+    staticData->unicodeMask = unicodeMask;
+
    if(cnvData->finishMappings!=NULL)
    {
        cnvData->finishMappings(cnvData, staticData);
--- a/icu4c/source/tools/makeconv/ucnvstat.c
+++ b/icu4c/source/tools/makeconv/ucnvstat.c
@ -17,36 +17,40 @@


 static const UConverterStaticData _SBCSStaticData={
-  sizeof(UConverterStaticData),
-  "SBCS",
+    sizeof(UConverterStaticData),
+    "SBCS",
    0, UCNV_IBM, UCNV_SBCS, 1, 1,
    { 0, 0, 0, 0 }, 1, FALSE, FALSE,
-  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
+    0, /* unicodeMask */
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };


 static const UConverterStaticData _DBCSStaticData={
-  sizeof(UConverterStaticData),
-  "DBCS",
+    sizeof(UConverterStaticData),
+    "DBCS",
    0, UCNV_IBM, UCNV_DBCS, 2, 2,
    { 0, 0, 0, 0 },1, FALSE, FALSE, /* subchar */
-  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
+    0, /* unicodeMask */
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

 static const UConverterStaticData _MBCSStaticData={
-  sizeof(UConverterStaticData),
-  "MBCS",
+    sizeof(UConverterStaticData),
+    "MBCS",
    0, UCNV_IBM, UCNV_MBCS, 1, 1,
    { 0, 0, 0, 0 }, 1, FALSE, FALSE,
-  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
+    0, /* unicodeMask */
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

 static const UConverterStaticData _EBCDICStatefulStaticData={
-  sizeof(UConverterStaticData),
- "EBCDICStateful",
+    sizeof(UConverterStaticData),
+    "EBCDICStateful",
    0, UCNV_IBM, UCNV_EBCDIC_STATEFUL, 1, 1,
-  { 0, 0, 0, 0 },1, FALSE, FALSE,
-  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved bytes */
+    { 0, 0, 0, 0 },1, FALSE, FALSE,
+    0, /* unicodeMask */
+    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
 };

 /* NULLs for algorithmic types, their tables live in ucnv_bld.c */