ICU-6229 detect buffer overflow in ISO-2022; improve confidence calculation; add buffer overflow tests.

X-SVN-Rev: 23853
This commit is contained in:
Eric Mader 2008-04-30 18:23:32 +00:00
parent 835532c572
commit f74dcc0253
3 changed files with 69 additions and 5 deletions

View File

@ -335,6 +335,57 @@ public class TestCharsetDetector extends TestFmwk
}
}
/**
 * Feeds the detector short inputs that end in truncated or minimal
 * multi-byte / escape sequences and checks the name of the best match.
 * Each row of {@code inputs} is paired with the entry at the same index
 * in {@code expectedNames}; a {@code null} expectation means that
 * {@code detect()} is expected to return no match at all.
 * The method stops at the first input whose detected name is wrong.
 */
public void TestBufferOverflow()
{
    byte[][] inputs = {
        // 0x80 + " This is English " followed by a lone ESC (partial ISO-2022 escape at the end)
        {(byte) 0x80, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
         0x45, 0x6E, 0x67, 0x6C, 0x69, 0x73, 0x68, 0x20, 0x1b},
        // same text followed by ESC '$' (still a partial escape)
        {(byte) 0x80, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
         0x45, 0x6E, 0x67, 0x6C, 0x69, 0x73, 0x68, 0x20, 0x1b, 0x24},
        // same text followed by ESC '$' '(' (still a partial escape)
        {(byte) 0x80, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
         0x45, 0x6E, 0x67, 0x6C, 0x69, 0x73, 0x68, 0x20, 0x1b, 0x24, 0x28},
        // same text followed by the complete escape ESC '$' '(' 'D'; the leading 0x80 is the bad byte
        {(byte) 0x80, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20,
         0x45, 0x6E, 0x67, 0x6C, 0x69, 0x73, 0x68, 0x20, 0x1b, 0x24, 0x28, 0x44},
        // nothing but the complete escape ESC '$' '(' 'D'
        {0x1b, 0x24, 0x28, 0x44},
        // a lone 0xA1, which could be a single-byte Shift-JIS character
        {(byte) 0xa1},
        // "th" followed by 0xA1
        {0x74, 0x68, (byte) 0xa1},
        // "the" followed by 0xA1: enough English to tip the result
        {0x74, 0x68, 0x65, (byte) 0xa1}
    };

    // Expected best-match name per input; null means "no match expected".
    String[] expectedNames = {
        "windows-1252",
        "windows-1252",
        "windows-1252",
        "windows-1252",
        "ISO-2022-JP",
        null,
        null,
        "ISO-8859-1"
    };

    CharsetDetector detector = new CharsetDetector();

    detector.setDeclaredEncoding("ISO-2022-JP");

    for (int i = 0; i < inputs.length; i++) {
        String expected = expectedNames[i];

        detector.setText(inputs[i]);

        CharsetMatch best = detector.detect();

        if (best != null) {
            String actual = best.getName();

            if (expected != null && expected.equals(actual)) {
                // Detected the expected charset; move on to the next input.
                continue;
            }

            errln("Unexpectedly got " + actual + " instead of " + expected +
                  " at index " + i + " with confidence " + best.getConfidence());
            // A wrong name ends the test; later inputs are not examined.
            return;
        }

        // The detector produced no match for this input.
        if (expected == null) {
            logln("Got no result as expected at index " + i);
        } else {
            errln("Unexpectedly got no results at index " + i);
        }
    }
}
public void TestDetection()
{
//

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -45,6 +45,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
for (escN=0; escN<escapeSequences.length; escN++) {
byte [] seq = escapeSequences[escN];
if ((textLen - i) < seq.length) {
continue checkEscapes;
}
for (j=1; j<seq.length; j++) {
if (seq[j] != text[i+j]) {
continue checkEscapes;

View File

@ -1,6 +1,6 @@
/*
****************************************************************************
* Copyright (C) 2005-2007, International Business Machines Corporation and *
* Copyright (C) 2005-2008, International Business Machines Corporation and *
* others. All Rights Reserved. *
****************************************************************************
*
@ -81,9 +81,18 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
if (doubleByteCharCount <= 10 && badCharCount== 0) {
// Not many multi-byte chars.
// ASCII or ISO file? It's probably not our encoding,
// but is not incompatible with our encoding, so don't give it a zero.
confidence = 10;
if (doubleByteCharCount == 0 && totalCharCount < 10) {
// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
// We don't have enough data to have any confidence.
// Statistical analysis of single byte non-ASCII characters would probably help here.
confidence = 0;
}
else {
// ASCII or ISO file? It's probably not our encoding,
// but is not incompatible with our encoding, so don't give it a zero.
confidence = 10;
}
break detectBlock;
}