ICU-6955 Remove duplicate code in CharsetMBCS; add test case for SCSU.

X-SVN-Rev: 26139
2009-06-22 19:37:57 +00:00 · 2009-06-22 19:37:57 +00:00 · 9dcb89e824
commit 9dcb89e824
parent 42da1f8d6b
4 changed files with 22 additions and 452 deletions
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@ -864,6 +864,13 @@ conversion:table(nofallback) {
          :intvector{},
          :int{1}, :int{0}, "", ".", :bin{""}
        }
+        {
+          "SCSU",
+          :bin{ 0f6441b413a733f2 },
+          "\u6441\ub413\ua733",
+          :intvector{},
+          :int{1}, :int{0}, "illegal", ".", :bin{ f2 }
+        }
      }
    }

--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetMBCS.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetMBCS.java
@ -2772,7 +2772,6 @@ class CharsetMBCS extends CharsetICU {
        }

        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
-
            CoderResult[] cr = { CoderResult.UNDERFLOW };
            // if (!source.hasRemaining() && fromUChar32 == 0)
            // return cr[0];
@ -3861,455 +3860,8 @@ class CharsetMBCS extends CharsetICU {
        }
        
        CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
-            CoderResult[] cr = { CoderResult.UNDERFLOW };
-            
-            char[] table;
-            int p;
-            ByteBuffer bytes;
-            short outputType;
-            
-            SideEffects x = new SideEffects(0, 0, 0, 0, 0, 0);
-            
-            int targetCapacity = target.limit() - target.position();
-            
-            int stage2Entry = 0;
-            //int asciiRoundtrips;
-            long value;
-            int length = 0;
-            int uniMask;
-            
-            boolean doLoop = true;
-            boolean gotoGetTrail = false;
-            
-            if (preFromUFirstCP >= 0) {
-                /*
-                 * pass sourceIndex=-1 because we continue from an earlier buffer
-                 * in the future, this may change with continuous offsets.
-                 */
-                cr[0] = continueMatchFromU(source, target, offsets, flush, -1);
-                if (cr[0].isError() || preFromULength < 0) {
-                    return cr[0];
-                }
-            }
-            
-            /* use optimized function if possible */
-            outputType = sharedData.mbcs.outputType;
-            uniMask = sharedData.mbcs.unicodeMask;
-            if (outputType == MBCS_OUTPUT_1 && ((uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
-                if ((uniMask&UConverterConstants.HAS_SURROGATES) == 0) {
-                    cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
-                } else {
-                    cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
-                }
-                return cr[0];
-            }/* else if (outputType == MBCS_OUTPUT_2 && mbcs.sharedData.mbcs.utf8Friendly) {
-                cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
-                return cr[0];
-            }*/
-            
-            table = sharedData.mbcs.fromUnicodeTable;
-            /* if (mbcs.sharedData.mbcs.utf8Friendly) {
-                mbcsIndex = mbcs.sharedData.mbcs.mbcsIndex;
-            } else {
-                mbcsIndex = null;
-            } */
-            
-            if ((options&UConverterConstants.OPTION_SWAP_LFNL) != 0) {
-                bytes = ByteBuffer.wrap(sharedData.mbcs.swapLFNLFromUnicodeBytes);
-            } else {
-                bytes = ByteBuffer.wrap(sharedData.mbcs.fromUnicodeBytes);
-            }
-            //asciiRoundtrips = mbcs.sharedData.mbcs.asciiRoundtrips;
-            
-            /* get the converter state from UConverter */
-            x.c = fromUChar32;
-            if (outputType == MBCS_OUTPUT_2_SISO) {
-                x.prevLength = fromUnicodeStatus;
-                if (x.prevLength == 0) {
-                    /* set the real value */
-                    x.prevLength = 1;
-                }
-            } else {
-                /* prevent fromUnicodeStatus from being set to something non-0 */
-                x.prevLength = 0;
-            }
-            
-            /* sourceIndex = -1 if the current character began in the previous buffer */
-            x.prevSourceIndex = -1;
-            x.sourceIndex = x.c==0 ? 0 : -1;
-            x.nextSourceIndex = 0;
-            
-            /* conversion loop */
-            if (x.c != 0 && targetCapacity > 0) {
-                gotoGetTrail = true; // set gotoGetTrail flag and go to gotoGetTrail label
-            }
-            
-            while (gotoGetTrail || source.hasRemaining()) {
-                /*
-                 * This following test is to see if available input would overflow the output.
-                 * It does not catch output of more than one byte that
-                 * overflows as a result of a multi-byte character or callback output
-                 * from the last source character.
-                 * Therefore, those situations also test for overflows and will
-                 * then break the loop, too.
-                 */
-                if (gotoGetTrail || targetCapacity > 0) {
-                    /*
-                     * Get a correct Unicode code point:
-                     * a single UChar for a BMP code point or 
-                     * a matched surrogate pair for a "supplementary code point."
-                     */
-                    if (!gotoGetTrail) {
-                        x.c = source.get();
-                        ++x.nextSourceIndex;
-                        /* This is commented out because of the fact that IS_ASCII_ROUNDTRIP is not
-                         * being used in ICU4J.
-                         */
-                        /*if (x.c <= 0x7f && IS_ASCII_ROUNDTRIP(c, asciiRoundtrips)) {
-                            target.put((byte)x.c);
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                                x.prevSourceIndex = x.sourceIndex;
-                                x.sourceIndex = x.nextSourceIndex;
-                            }
-                            targetCapacity--;
-                            x.c = 0;
-                            continue;
-                        }*/
-                    }
-                  /* Code to use utf8friendly code was removed since it is not needed in Java. */
-                    /* This also tests if the codepage maps single surrogates.
-                     * If it does, then surrogates are not paired but mapped separately.
-                     * Note that in this case unmatched surrogates are not detected.
-                     */
-                    if (gotoGetTrail || (UTF16.isSurrogate((char)x.c) && (uniMask&UConverterConstants.HAS_SURROGATES) == 0)) {
-                        if (gotoGetTrail || (UTF16.isLeadSurrogate((char)x.c))) {
-// getTrail label
-                            gotoGetTrail = false; // reset gotoGetTrail flag
-                            
-                            x.sourceArrayIndex = source.position();
-                            
-                            doLoop = getTrail(source, target, uniMask, x, flush, cr);
-                            if (x.doread && doLoop) {
-                                continue;
-                            } else if (!x.doread && !doLoop) {
-                                break;
-                            } else if (!doLoop) {
-                                break;
-                            }
-                        } else {
-                            /* this is an unmatched trail code unit (2nd surrogate) */
-                            /* callback(illegal) */
-                            cr[0] = CoderResult.malformedForLength(1);
-                            break;
-                        }
-                    }
-                    
-                    /* convert the Unicode point in c into codepage bytes */
-                    /*
-                     * The basic lookup is a triple-stage compact array (trie) lookup.
-                     * 
-                     * Single-byte codepages are handled with a different data structure
-                     * by _MBCSSingle... functions.
-                     * 
-                     * The result consists of a 32-bit value from stage 2 and
-                     * a pointer to as many bytes as are stored per character.
-                     * The pointer points to the character's bytes in stage 3.
-                     * Bits 15..0 of the stage 2 entry contain the stage 3 index
-                     * for that pointer, while bits 31..16 are flags for which of
-                     * the 16 characters in the block are roundtrip-assigned.
-                     * 
-                     * For 2-byte and 4 byte codepages, the bytes are stored as uint16_t
-                     * respectively as uint32_t, in the platform encoding.
-                     * For 3-byte codepages, the bytes are always stored in big-endian order.
-                     * 
-                     * For EUC encodings that use only either 0x8e or 0x8f as the first
-                     * byte of their longest byte sequences, the first two bytes in 
-                     * this third stage indicate with their 7th bits whether these bytes
-                     * are to be writeen directly or actually need to be preceeded by
-                     * one of the two Single-Shift codes. With this, the third stage
-                     * stores one byte fewer per character than the actual maximum length of
-                     * EUC byte sequences.
-                     * 
-                     * Other than that, leading zero bytes are removed and the other
-                     * bytes output. A single zero byte may be ouput if the "assigned"
-                     * bit in stage 2 was on.
-                     * The data structure does not support zero byte output as a fallback,
-                     * and also does not allow output of leading zeros.
-                     */
-                    stage2Entry = MBCS_STAGE_2_FROM_U(table, x.c);
-                    
-                    /* get the bytes and the length for the output */
-                    switch (outputType) {
-                    case MBCS_OUTPUT_2:
-                        value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        if (value <= 0xff) {
-                            length = 1;
-                        } else {
-                            length = 2;
-                        }
-                        break;
-                    case MBCS_OUTPUT_2_SISO:
-                        /* 1/2-byte stateful with Shift-In/Shift-Out */
-                        /*
-                         * Save the old state in the converter object
-                         * right here, then change the local pervLength state variable if necessary.
-                         * Then, if this character turns out to be unassigned or a fallback that
-                         * is not taken, the callback code must not save the new state in the converter
-                         * because the new state is for a character that is not output.
-                         * However, the callback must still restore the state from the converter
-                         * in case the callback function changed it for its output.
-                         */
-                        fromUnicodeStatus = x.prevLength; /* save the old state */
-                        value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        if (value <= 0xff) {
-                            if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) {
-                                /* no mapping, leave value == 0 */
-                                length = 0;
-                            } else if (x.prevLength <= 1) {
-                                length = 1;
-                            } else {
-                                /* change from double-byte mode to single-byte */
-                                value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SI<<8);
-                                length = 2;
-                                x.prevLength = 1;
-                            }
-                        } else {
-                            if (x.prevLength == 2) {
-                                length = 2;
-                            } else {
-                                /* change from single-byte mode to double-byte */
-                                value |= UConverterConstants.UNSIGNED_INT_MASK & (UConverterConstants.SO<<16);
-                                length = 3;
-                                x.prevLength = 2;
-                            }
-                        }
-                        break;
-                    case MBCS_OUTPUT_DBCS_ONLY:
-                        /* table with single-byte results, but only DBCS mappings used */
-                        value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        if (value <= 0xff) {
-                            /* no mapping or SBCS result, not taken for DBCS-only */
-                            value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
-                            length = 0;
-                        } else {
-                            length = 2;
-                        }
-                        break;
-                    case MBCS_OUTPUT_3:
-                        p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
-                        if (value <= 0xff) {
-                            length = 1;
-                        } else if (value <= 0xffff) {
-                            length = 2;
-                        } else {
-                            length = 3;
-                        }
-                        break;
-                    case MBCS_OUTPUT_4:
-                        value = MBCS_VALUE_4_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        if (value <= 0xff) {
-                            length = 1;
-                        } else if (value <= 0xffff) {
-                            length = 2;
-                        } else if (value <= 0xffffff) {
-                            length = 3;
-                        } else {
-                            length = 4;
-                        }
-                        break;
-                    case MBCS_OUTPUT_3_EUC:
-                        value = MBCS_VALUE_2_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        /* EUC 16-bit fixed-length representation */
-                        if (value <= 0xff) {
-                            length = 1;
-                        } else if ((value&0x8000) == 0) {
-                            value |= 0x8e8000;
-                            length = 3;
-                        } else if ((value&0x80) == 0) {
-                            value |= 0x8f0080;
-                            length = 3;
-                        } else {
-                            length = 2;
-                        }
-                        break;
-                    case MBCS_OUTPUT_4_EUC:
-                        p = MBCS_POINTER_3_FROM_STAGE_2(bytes.array(), stage2Entry, x.c);
-                        value = UConverterConstants.UNSIGNED_INT_MASK&((int)bytes.get(p)<<16 | (int)bytes.get(p+1)<<8 | bytes.get(p+2));
-                        /* EUC 16-bit fixed-length representation applied to the first two bytes */
-                        if (value <= 0xff) {
-                            length = 1;
-                        } else if (value <= 0xffff) {
-                            length = 2;
-                        } else if ((value&0x800000) == 0) {
-                            value |= 0x08e800000;
-                            length = 4;
-                        } else if ((value&0x8000) == 0) {
-                            value |= 0x08f008000;
-                            length = 4;
-                        } else {
-                            length = 3;
-                        }
-                        break;
-                    default :
-                        /* must not occur */
-                        value = stage2Entry = 0;
-                        length = 0;
-                        break;
-                    }
-                    /* is this code point assigned, or do we use fallbacks? */
-                    if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, x.c)) || 
-                            (CharsetEncoderICU.isFromUUseFallback(useFallback, x.c) && value != 0)) {
-                        /*
-                         * We allow a 0 byte output if the "assigned" bit is set for this entry.
-                         * There is no way with this data structure for fallback output
-                         * to be a zero byte.
-                         */
-// unassigned label 
-                        int currentSourcePos = source.position();
-                        doLoop = unassigned(source, target, offsets, x, flush, cr);
-                        if (doLoop) {
-                            continue;
-                        } else {
-                            if (source.position() < currentSourcePos) {
-                                source.position(currentSourcePos);
-                            }
-                            break;
-                        }
-                    }
-                    
-                    /* write the output character bytes from value and length */
-                    /* from the first if in the loop we know that targetCapacity>0 */
-                    if (length <= targetCapacity) {
-                        switch (length) {
-                        /* each branch falls through to the next one */
-                        case 4:
-                            target.put((byte)(value>>24));
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        case 3:
-                            target.put((byte)(value>>16));
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        case 2:
-                            target.put((byte)(value>>8));
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        case 1:
-                            target.put((byte)value);
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        default :
-                            /* will never occur */
-                            break;
-                        }
-                        
-                        targetCapacity -= length;
-                    } else {
-                        /*
-                         * We actually do this backwards here:
-                         * In order to save an intermediate variable, we output
-                         * first to the overflow buffer what does not fit into the
-                         * regular target.
-                         */
-                        /* we know that 1<=targetCapacity<length<=4 */
-                        length -= targetCapacity;
-                        int i = 0; // index for errorBuffer
-                        switch (length) {
-                            /* each branch falls through to the next one */
-                        case 3:
-                            errorBuffer[i++] = (byte)(value>>16);
-                        case 2:
-                            errorBuffer[i++] = (byte)(value>>8);
-                        case 1:
-                            errorBuffer[i++] = (byte)value;
-                        default :
-                            /* will never occur */
-                            break;
-                        }
-                        errorBufferLength = length;
-                        
-                        /* now output what fits into the regular target */
-                        value>>=8*length; /* length was reduced by targetCapacity */
-                        switch (targetCapacity) {
-                            /* each branch falls through to the next one */
-                        case 3:
-                            target.put((byte)(value>>16));
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        case 2:
-                            target.put((byte)(value>>8));
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        case 1:
-                            target.put((byte)value);
-                            if (offsets != null) {
-                                offsets.put(x.sourceIndex);
-                            }
-                        default :
-                            /* will never occur */
-                            break;
-                        }
-                        
-                        /* target overflow */
-                        targetCapacity = 0;
-                        cr[0] = CoderResult.OVERFLOW;
-                        x.c = 0;
-                        break;
-                    }
-                    
-                    /* normal end of conversion: prepare for a new character */
-                    x.c = 0;
-                    if (offsets != null) {
-                        x.prevSourceIndex = x.sourceIndex;
-                        x.sourceIndex = x.nextSourceIndex;
-                    }
-                    continue;
-                } else {
-                    /* target is full */
-                    cr[0] = CoderResult.OVERFLOW;
-                    break;
-                }
-            }
-            
-            /*
-             * the end of the input stream and detection of truncated input
-             * are handled by the framework, but for EBCDIC_STATEFUL conversion
-             * we need to emit an SI at the very end
-             * 
-             * conditions:
-             *  successful
-             *  EBCDIC_STATEFUL in DBCS mode
-             *  end of input and no truncated input
-             */
-            if (!cr[0].isError() && outputType == MBCS_OUTPUT_2_SISO && x.prevLength == 2 && flush && !source.hasRemaining() && x.c == 0) {
-                /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
-                if (targetCapacity > 0) {
-                    target.put((byte)UConverterConstants.SI);
-                    if (offsets != null) {
-                        /* set the last source character's index (sourceIndex points at sourceLimit now) */
-                        offsets.put(x.prevSourceIndex);
-                    }
-                } else {
-                    /* target is full */
-                    errorBuffer[0] = UConverterConstants.SI;
-                    errorBufferLength = 1;
-                    cr[0] = CoderResult.OVERFLOW;
-                }
-                x.prevLength = 1; /* we switched into SBCS */
-            }
-            /* set the converter state back into UConverter */
-            fromUChar32 = x.c;
-            fromUnicodeStatus = x.prevLength;
-            
-            return cr[0];
+            // Just call encodeLoop to remove duplicate code.
+            return encodeLoop(source, target, offsets, flush);
        }

        /*
--- a/icu4j/main/shared/data/testdata.jar
+++ b/icu4j/main/shared/data/testdata.jar
@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d8e824c842e59c326c65665a7e9f26ec7c7c4c8aa1cec2bde603fb8f27371184
-size 772451
+oid sha256:5e4ffe9070b3d419a5df23d222bb6dcd68790c8044172727c985f5fd8adbe555
+size 772538
--- a/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/main/tests/charset/src/com/ibm/icu/dev/test/charset/TestCharset.java
@ -5157,6 +5157,17 @@ public class TestCharset extends TestFmwk {
                if(!roundTripResult.equals(encoderBuffer)){
                    errln("Error occured while encoding "+ charset.name());
                }
+                // Test overflow for code coverage reasons
+                if (i == 0) {
+                    ByteBuffer test = encoderResult;
+                    test.position(0);
+                    CharBuffer smallBuffer = CharBuffer.allocate(11);
+                    decode.reset();
+                    CoderResult status = decode.decode(test, smallBuffer, true);
+                    if (status != CoderResult.OVERFLOW) {
+                        errln("Overflow buffer error should have been thrown.");
+                    }
+                }
            }catch(Exception e){
                errln("Exception while converting SCSU thrown: " + e);
            }