ICU-6229 detect buffer overflow in ISO-2022; improve confidence calculation; add buffer overflow tests.

X-SVN-Rev: 23853
2008-04-30 18:23:32 +00:00 · 2008-04-30 18:23:32 +00:00 · f74dcc0253
commit f74dcc0253
parent 835532c572
3 changed files with 69 additions and 5 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -335,6 +335,57 @@ public class TestCharsetDetector extends TestFmwk
        }
    }
    
+    public void TestBufferOverflow()  // Runs CharsetDetector over short inputs (truncated or complete ISO-2022 escapes, lone high bytes) and checks the detected name.
+    {
+        byte testStrings[][] = {  // Input byte sequences; row i is paired with testResults[i] below.
+            {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
+            {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
+            {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
+            {(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
+            {(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
+            {(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
+            {(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
+            {(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
+        };
+        // Expected match.getName() for each input row; a null entry means detect() is expected to return null.
+        String testResults[] = {
+            "windows-1252",
+            "windows-1252",
+            "windows-1252",
+            "windows-1252",
+            "ISO-2022-JP",
+            null,
+            null,
+            "ISO-8859-1"
+        };
+        // A single detector instance is reused for every input row.
+        CharsetDetector det = new CharsetDetector();
+        CharsetMatch match;
+        // The declared encoding is set once here, before the loop, rather than per input.
+        det.setDeclaredEncoding("ISO-2022-JP");
+
+        for (int idx = 0; idx < testStrings.length; idx += 1) {
+            det.setText(testStrings[idx]);
+            match = det.detect();
+            // No match returned: that is an error only when a charset name was expected for this row.
+            if (match == null) {
+                if (testResults[idx] != null) {
+                    errln("Unexpectedly got no results at index " + idx);
+                }
+                else {
+                    logln("Got no result as expected at index " + idx);
+                }
+                continue;
+            }
+            // A match was returned: a name must have been expected and it must equal the detected name.
+            if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
+                errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
+                      " at index " + idx + " with confidence " + match.getConfidence());
+                return;  // Stops at the first mismatch; remaining rows are not checked (unlike the no-match branch, which continues).
+            }
+        }
+    }
+    
    public void TestDetection()
    {
        //
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_2022.java
@ -1,6 +1,6 @@
 /*
 *******************************************************************************
-* Copyright (C) 2005, International Business Machines Corporation and         *
+* Copyright (C) 2005 - 2008, International Business Machines Corporation and  *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
@ -45,6 +45,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
                        for (escN=0; escN<escapeSequences.length; escN++) {
                            byte [] seq = escapeSequences[escN];
                            
+                            if ((textLen - i) < seq.length) {
+                                continue checkEscapes;
+                            }
+                            
                            for (j=1; j<seq.length; j++) {
                                if (seq[j] != text[i+j])  {
                                    continue checkEscapes;
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
@ -1,6 +1,6 @@
 /*
 ****************************************************************************
- * Copyright (C) 2005-2007, International Business Machines Corporation and *
+ * Copyright (C) 2005-2008, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 *
@ -81,9 +81,18 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
            
            if (doubleByteCharCount <= 10 && badCharCount== 0) {
                // Not many multi-byte chars.
-                //   ASCII or ISO file?  It's probably not our encoding,
-                //   but is not incompatible with our encoding, so don't give it a zero.
-                confidence = 10;
+                if (doubleByteCharCount == 0 && totalCharCount < 10) {
+                    // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
+                    // We don't have enough data to have any confidence.
+                    // Statistical analysis of single byte non-ASCII characters would probably help here.
+                    confidence = 0;
+                }
+                else {
+                    //   ASCII or ISO file?  It's probably not our encoding,
+                    //   but is not incompatible with our encoding, so don't give it a zero.
+                    confidence = 10;
+                }
+                
                break detectBlock;
            }