ICU-6229 detect buffer overflow in ISO-2022; improve confidence calculation; add buffer overflow tests.
X-SVN-Rev: 23853
This commit is contained in:
parent
835532c572
commit
f74dcc0253
@ -335,6 +335,57 @@ public class TestCharsetDetector extends TestFmwk
|
||||
}
|
||||
}
|
||||
|
||||
public void TestBufferOverflow()
|
||||
{
|
||||
byte testStrings[][] = {
|
||||
{(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b}, /* A partial ISO-2022 shift state at the end */
|
||||
{(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24}, /* A partial ISO-2022 shift state at the end */
|
||||
{(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28}, /* A partial ISO-2022 shift state at the end */
|
||||
{(byte) 0x80, (byte) 0x20, (byte) 0x54, (byte) 0x68, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x69, (byte) 0x73, (byte) 0x20, (byte) 0x45, (byte) 0x6E, (byte) 0x67, (byte) 0x6C, (byte) 0x69, (byte) 0x73, (byte) 0x68, (byte) 0x20, (byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end with a bad one at the start */
|
||||
{(byte) 0x1b, (byte) 0x24, (byte) 0x28, (byte) 0x44}, /* A complete ISO-2022 shift state at the end */
|
||||
{(byte) 0xa1}, /* Could be a single byte shift-jis at the end */
|
||||
{(byte) 0x74, (byte) 0x68, (byte) 0xa1}, /* Could be a single byte shift-jis at the end */
|
||||
{(byte) 0x74, (byte) 0x68, (byte) 0x65, (byte) 0xa1} /* Could be a single byte shift-jis at the end, but now we have English creeping in. */
|
||||
};
|
||||
|
||||
String testResults[] = {
|
||||
"windows-1252",
|
||||
"windows-1252",
|
||||
"windows-1252",
|
||||
"windows-1252",
|
||||
"ISO-2022-JP",
|
||||
null,
|
||||
null,
|
||||
"ISO-8859-1"
|
||||
};
|
||||
|
||||
CharsetDetector det = new CharsetDetector();
|
||||
CharsetMatch match;
|
||||
|
||||
det.setDeclaredEncoding("ISO-2022-JP");
|
||||
|
||||
for (int idx = 0; idx < testStrings.length; idx += 1) {
|
||||
det.setText(testStrings[idx]);
|
||||
match = det.detect();
|
||||
|
||||
if (match == null) {
|
||||
if (testResults[idx] != null) {
|
||||
errln("Unexpectedly got no results at index " + idx);
|
||||
}
|
||||
else {
|
||||
logln("Got no result as expected at index " + idx);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (testResults[idx] == null || ! testResults[idx].equals(match.getName())) {
|
||||
errln("Unexpectedly got " + match.getName() + " instead of " + testResults[idx] +
|
||||
" at index " + idx + " with confidence " + match.getConfidence());
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestDetection()
|
||||
{
|
||||
//
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* Copyright (C) 2005 - 2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -45,6 +45,10 @@ abstract class CharsetRecog_2022 extends CharsetRecognizer {
|
||||
for (escN=0; escN<escapeSequences.length; escN++) {
|
||||
byte [] seq = escapeSequences[escN];
|
||||
|
||||
if ((textLen - i) < seq.length) {
|
||||
continue checkEscapes;
|
||||
}
|
||||
|
||||
for (j=1; j<seq.length; j++) {
|
||||
if (seq[j] != text[i+j]) {
|
||||
continue checkEscapes;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
****************************************************************************
|
||||
* Copyright (C) 2005-2007, International Business Machines Corporation and *
|
||||
* Copyright (C) 2005-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
****************************************************************************
|
||||
*
|
||||
@ -81,9 +81,18 @@ abstract class CharsetRecog_mbcs extends CharsetRecognizer {
|
||||
|
||||
if (doubleByteCharCount <= 10 && badCharCount== 0) {
|
||||
// Not many multi-byte chars.
|
||||
// ASCII or ISO file? It's probably not our encoding,
|
||||
// but is not incompatible with our encoding, so don't give it a zero.
|
||||
confidence = 10;
|
||||
if (doubleByteCharCount == 0 && totalCharCount < 10) {
|
||||
// There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
|
||||
// We don't have enough data to have any confidence.
|
||||
// Statistical analysis of single byte non-ASCII characters would probably help here.
|
||||
confidence = 0;
|
||||
}
|
||||
else {
|
||||
// ASCII or ISO file? It's probably not our encoding,
|
||||
// but is not incompatible with our encoding, so don't give it a zero.
|
||||
confidence = 10;
|
||||
}
|
||||
|
||||
break detectBlock;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user