diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java index 735b667cc8..454e81c97c 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_UTF8.java @@ -1,6 +1,6 @@ /** ******************************************************************************* -* Copyright (C) 2005 - 2012, International Business Machines Corporation and * +* Copyright (C) 2005 - 2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -85,8 +85,10 @@ class CharsetRecog_UTF8 extends CharsetRecognizer { } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid == 0 && numInvalid == 0) { - // Plain ASCII. - confidence = 10; + // Plain ASCII. Confidence must be > 10; it's more likely than UTF-16, which + // accepts ASCII with confidence = 10. + // TODO: add plain ASCII as an explicitly detected type. + confidence = 15; } else if (numValid > numInvalid*10) { // Probably corrupt utf-8 data. Valid sequences aren't likely by chance. confidence = 25; diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml index 671e885b50..1fd633dc7a 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml @@ -553,4 +553,9 @@ Conference Program foo 東京・銀座の歌舞伎座。4月に新調された4枚の緞帳のうち3枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員(有料・無料)にご登録いただくと、様々な特典・サービスが受けられます。 + + + + ,1,,,5 +