ICU-3840 Update Charset ISCII to support new characters in Unicode.

X-SVN-Rev: 21869
2007-06-30 00:18:01 +00:00 · 2007-06-30 00:18:01 +00:00 · 9ef6426440
commit 9ef6426440
parent 3fc4373c3e
2 changed files with 125 additions and 4 deletions
--- a/icu4j/src/com/ibm/icu/charset/CharsetISCII.java
+++ b/icu4j/src/com/ibm/icu/charset/CharsetISCII.java
@ -34,6 +34,7 @@ class CharsetISCII extends CharsetICU {
    private final short ISCII_NUKTA = 0xe9;
    private final short ISCII_HALANT = 0xe8;
    private final short ISCII_DANDA = 0xea;
+    private final short ISCII_VOWEL_SIGN_E = 0xe0;
    private final short ISCII_INV = 0xd9;
    private final short INDIC_BLOCK_BEGIN = 0x0900;
    private final short INDIC_BLOCK_END = 0x0d7f;
@ -174,7 +175,7 @@ class CharsetISCII extends CharsetICU {
        /* 0xa1: 0xb8: 0x901 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
        /* 0xa2: 0xfe: 0x902 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK, 
        /* 0xa3: 0xbf: 0x903 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
-        /* 0x00: 0x00: 0x904 */ MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
+        /* 0x00: 0x00: 0x904 */ MaskEnum.DEV_MASK + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO + MaskEnum.ZERO,
        /* 0xa4: 0xff: 0x905 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
        /* 0xa5: 0xff: 0x906 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
        /* 0xa6: 0xff: 0x907 */ MaskEnum.DEV_MASK + MaskEnum.PNJ_MASK + MaskEnum.GJR_MASK + MaskEnum.ORI_MASK + MaskEnum.BNG_MASK + MaskEnum.KND_MASK + MaskEnum.MLM_MASK + MaskEnum.TML_MASK,
@ -296,7 +297,7 @@ class CharsetISCII extends CharsetICU {
      0x00a1, /* 0x0901 */
      0x00a2, /* 0x0902 */
      0x00a3, /* 0x0903 */
-      0xFFFF, /* 0x0904 */
+      0xa4e0, /* 0x0904 */
      0x00a4, /* 0x0905 */
      0x00a5, /* 0x0906 */
      0x00a6, /* 0x0907 */
@ -697,6 +698,10 @@ class CharsetISCII extends CharsetICU {
        { 0xDB, 0x0962 },
        { 0xDC, 0x0963 }
    };
+    private static final char vowelSignESpecialCases[][] = { /* rows of { preceding ISCII byte, Unicode result } for a byte that is followed by ISCII_VOWEL_SIGN_E */
+        { 2 /* length of array */ , 0 }, /* header row: [0][0] is the total row count and bounds the lookup loop, which starts at row 1 */
+        { 0xA4, 0x0904 } /* context byte 0xA4 followed by ISCII_VOWEL_SIGN_E (0xE0) yields U+0904 */
+    };
    
    private static final short lookupTable[][] = {
        { MaskEnum.ZERO, MaskEnum.ZERO }, /* DEFAULT */
@ -925,6 +930,25 @@ class CharsetISCII extends CharsetICU {
                                }
                                /* else fall through to default */
                            }
+                        case ISCII_VOWEL_SIGN_E:
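+                            /* find <CHAR> + ISCII_VOWEL_SIGN_E special mapping; NOTE(review): the case above falls through into this one, so this lookup also runs for that case's source byte -- confirm that is intended */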
+                            int i = 1;
+                            boolean found = false;
+                            for (; i < vowelSignESpecialCases[0][0]; i++) {
+                                if (vowelSignESpecialCases[i][0] == ((short)data.contextCharToUnicode & UConverterConstants.UNSIGNED_BYTE_MASK)) {
+                                    targetUniChar = vowelSignESpecialCases[i][1];
+                                    found = true;
+                                    break;
+                                }
+                            }
+                            if (found) {
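+                                /* find out if the mapping is valid in this state; NOTE(review): the (byte) cast below is signed, so a mapped code point whose low byte is >= 0x80 would give a negative validityTable index -- safe for the only current entry, 0x0904 */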
+                                if ((validityTable[(byte)targetUniChar] & data.currentMaskFromUnicode) > 0) {
+                                    data.contextCharToUnicode = NO_CHAR_MARKER;
+                                    this.toUnicodeStatus = UConverterConstants.missingCharMarker;
+                                    break;
+                                }
+                            }
                        default:
                            targetUniChar = GetMapping(sourceChar, targetUniChar, data);
                            data.contextCharToUnicode = (char)sourceChar;
@ -979,7 +1003,7 @@ class CharsetISCII extends CharsetICU {
                    toULength = 0;
                }
                
-                if (toUnicodeStatus != UConverterConstants.missingCharMarker) {
+                if (this.toUnicodeStatus != UConverterConstants.missingCharMarker) {
                    /* output a remaining target character */
                    WriteToTargetToU(offsets, (source.position() - 2), source, target, this.toUnicodeStatus, data.currentDeltaToUnicode);
                    this.toUnicodeStatus = UConverterConstants.missingCharMarker;    
--- a/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charset/TestCharset.java
@ -2286,6 +2286,34 @@ public class TestCharset extends TestFmwk {
            errln("ISCII round trip test failed.");
        }
        
+        //Test new characters in the ISCII charset
+        encoder = cs.newEncoder();
+        decoder = cs.newDecoder();
+        char u_pts[] = {
+                (char)0x0904
+            };
+        byte b_pts[] = {
+                /*(byte)0xef, (byte)0x42, */(byte)0xa4, (byte)0xe0
+            };
+        us = CharBuffer.allocate(u_pts.length);
+        bs = ByteBuffer.allocate(b_pts.length);
+        us.put(u_pts);
+        bs.put(b_pts);
+        
+        bs.limit(bs.position());
+        bs.position(0);
+        us.limit(us.position());
+        us.position(0);
+        
+        try {
+            smBufDecode(decoder, "ISCII-update", bs, us, true, true);         
+            bs.position(0);
+            us.position(0);
+            smBufEncode(encoder, "ISCII-update", us, bs, true, true);
+        } catch (Exception ex) {
+            errln("Error occurred while encoding/decoding ISCII with the new characters.");
+        }
+        
        //The rest of the code in this method is to provide better code coverage
        CharBuffer ccus = CharBuffer.allocate(0x10);
        ByteBuffer ccbs = ByteBuffer.allocate(0x10);
@ -2836,6 +2864,75 @@ public class TestCharset extends TestFmwk {
            errln("Exception while encoding UTF32LE (6) should have been thrown.");
        } catch (Exception ex) {
        }
-   
+    }
+    
+    // Test for charset UTF-16LE to provide better code coverage: feeds unpaired surrogates to the encoder and expects a malformed-input result each time
+    public void TestCharsetUTF16LE() {
+        CoderResult result = CoderResult.UNDERFLOW;
+        CharsetProvider provider = new CharsetProviderICU();
+        Charset cs = provider.charsetForName("UTF-16LE");        
+        CharsetEncoder encoder = cs.newEncoder();
+        CharsetDecoder decoder = cs.newDecoder(); // created here but not used again in this method
+        
+        // Test for malformed input that changes fromUChar32 for the next call
+        char u_pts1[] = {
+                (char)0xD805, // index 0: a lead surrogate on its own
+                (char)0xDC01, (char)0xDC02, (char)0xDC03, // indices 1-3: three trail surrogates
+                (char)0xD901, (char)0xD902 // indices 4-5: two lead surrogates in a row
+                };
+        byte b_pts1[] = { // seven zero bytes; they only pre-fill the output buffer
+                (byte)0x00, 
+                (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00, (byte)0x00
+                };
+        
+        CharBuffer us = CharBuffer.allocate(u_pts1.length);
+        ByteBuffer bs = ByteBuffer.allocate(b_pts1.length);
+        
+        us.put(u_pts1);
+        bs.put(b_pts1);
+        
+        us.limit(1); // expose only u_pts1[0]
+        us.position(0);
+        bs.limit(1); // a single byte of output room
+        bs.position(0);
+       
+        result = encoder.encode(us, bs, true); // third argument is endOfInput
+        
+        if (!result.isMalformed()) {
+            errln("Error while encoding UTF-16LE (1) should have occured.");
+        }
+        
+        // Test for malformed surrogate carried over from the previous buffer (the encoder is not reset before this call)
+        us.limit(4); // window is u_pts1[1..3]
+        us.position(1);
+        bs.limit(7); // output window is bytes 1..6
+        bs.position(1);
+        
+        result = encoder.encode(us, bs, true);
+        
+        if (!result.isMalformed()) {
+            errln("Error while encoding UTF-16LE (2) should have occured.");
+        }       
+        
+        // Test for malformed trail surrogate
+        encoder.reset(); // start again from a clean encoder state
+        
+        us.limit(1); // feed the lone lead surrogate u_pts1[0] again
+        us.position(0);
+        bs.limit(1);
+        bs.position(0);
+       
+        result = encoder.encode(us, bs, true);    // this result is overwritten below without being checked
+        
+        us.limit(6); // window is u_pts1[4..5]
+        us.position(4);
+        bs.limit(4); // output window is bytes 1..3
+        bs.position(1);
+        
+        result = encoder.encode(us, bs, true);
+        
+        if (!result.isMalformed()) {
+            errln("Error while encoding UTF-16LE (3) should have occured.");
+        }          
    }
 }