ICU-8038 Implement UnicodeLittle and UnicodeBig in ICU4J.

X-SVN-Rev: 28818
2010-10-13 20:21:16 +00:00 · 2010-10-13 20:21:16 +00:00 · 99102b01d2
commit 99102b01d2
parent 5dac211295
2 changed files with 37 additions and 4 deletions
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetICU.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetICU.java
@@ -125,7 +125,9 @@ public abstract class CharsetICU extends Charset{
        algorithmicCharsets.put("ISO-8859-1",            "com.ibm.icu.charset.Charset88591" );
        algorithmicCharsets.put("UTF-16",                "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-16BE",              "com.ibm.icu.charset.CharsetUTF16BE" );
+        algorithmicCharsets.put("UTF-16BE,version=1",    "com.ibm.icu.charset.CharsetUTF16BE" );
        algorithmicCharsets.put("UTF-16LE",              "com.ibm.icu.charset.CharsetUTF16LE" );
+        algorithmicCharsets.put("UTF-16LE,version=1",    "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_OppositeEndian",  "com.ibm.icu.charset.CharsetUTF16LE" );
        algorithmicCharsets.put("UTF16_PlatformEndian",  "com.ibm.icu.charset.CharsetUTF16" );
        algorithmicCharsets.put("UTF-32",                "com.ibm.icu.charset.CharsetUTF32" );
--- a/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF16.java
+++ b/icu4j/main/classes/charset/src/com/ibm/icu/charset/CharsetUTF16.java
@@ -35,10 +35,20 @@ class CharsetUTF16 extends CharsetICU {
    private int endianXOR;
    private byte[] bom;
    private byte[] fromUSubstitution;
+    
+    private int version;

    public CharsetUTF16(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
        super(icuCanonicalName, javaCanonicalName, aliases);

+        /* Get the version number (e.g. UTF-16LE,version=1) */
+        int versionIndex = icuCanonicalName.indexOf("version=");
+        if (versionIndex > 0) {
+            version = Integer.decode(icuCanonicalName.substring(versionIndex+8, versionIndex+9)).intValue();
+        } else {
+            version = 0;
+        }
+        
        this.isEndianSpecified = (this instanceof CharsetUTF16BE || this instanceof CharsetUTF16LE);
        this.isBigEndian = !(this instanceof CharsetUTF16LE);

@@ -98,10 +108,22 @@ class CharsetUTF16 extends CharsetICU {
                            actualEndianXOR = ENDIAN_XOR_LE;
                        } else {
                            // we do not have a BOM (and we have toULength==1 bytes)
-                            actualBOM = null;
-                            actualEndianXOR = endianXOR;
+                            if (isEndianSpecified && version == 1) {
+                                actualBOM = isBigEndian ? CharsetUTF16.BOM_BE : CharsetUTF16.BOM_LE;
+                                actualEndianXOR = isBigEndian ? CharsetUTF16.ENDIAN_XOR_BE : CharsetUTF16.ENDIAN_XOR_LE;
+                            } else {
+                                actualBOM = null;
+                                actualEndianXOR = endianXOR;
+                            }
                            break;
                        }
+                    } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
+                        return CoderResult.malformedForLength(2);
+                    } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
+                        // we found a BOM! at last!
+                        // too bad we have to ignore it now (like it was unwanted or something)
+                        toULength = 0;
+                        break;
                    } else if (isEndianSpecified || toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
                        // we do not have a BOM (and we have toULength bytes)
                        actualBOM = null;
@@ -135,6 +157,15 @@ class CharsetUTF16 extends CharsetICU {
                        return CoderResult.UNDERFLOW;
                    toUBytesArray[toULength++] = source.get();
                }
+                
+                if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 2] && toUBytesArray[toULength - 2] == actualBOM[toULength - 1])) {
+                    return CoderResult.malformedForLength(2);
+                } else if (isEndianSpecified && version == 1 && (toUBytesArray[toULength - 1] == actualBOM[toULength - 1] && toUBytesArray[toULength - 2] == actualBOM[toULength - 2])) {
+                    // we found a BOM! at last!
+                    // too bad we have to ignore it now (like it was unwanted or something)
+                    toULength = 0;
+                    continue;
+                }

                if (!target.hasRemaining())
                    return CoderResult.OVERFLOW;
@@ -202,12 +233,12 @@ class CharsetUTF16 extends CharsetICU {

        public CharsetEncoderUTF16(CharsetICU cs) {
            super(cs, fromUSubstitution);
-            fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
+            fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
        }

        protected void implReset() {
            super.implReset();
-            fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
+            fromUnicodeStatus = (isEndianSpecified && version != 1) ? 0 : NEED_TO_WRITE_BOM;
        }

        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {