ICU-10532 Charset Detect of plain ASCII, boost UTF-8 likelihood.
X-SVN-Rev: 34694
This commit is contained in:
parent
5f748fda1a
commit
7c5ff0aad6
@ -1,6 +1,6 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005 - 2012, International Business Machines Corporation and *
|
||||
* Copyright (C) 2005 - 2013, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -85,8 +85,10 @@ class CharsetRecog_UTF8 extends CharsetRecognizer {
|
||||
} else if (numValid > 0 && numInvalid == 0) {
|
||||
confidence = 80;
|
||||
} else if (numValid == 0 && numInvalid == 0) {
|
||||
// Plain ASCII.
|
||||
confidence = 10;
|
||||
// Plain ASCII. Confidence must be > 10: UTF-8 is more likely than UTF-16, which
|
||||
// accepts ASCII with confidence = 10.
|
||||
// TODO: add plain ASCII as an explicitly detected type.
|
||||
confidence = 15;
|
||||
} else if (numValid > numInvalid*10) {
|
||||
// Probably corrupt UTF-8 data. Valid sequences aren't likely by chance.
|
||||
confidence = 25;
|
||||
|
@ -553,4 +553,9 @@ Conference Program
|
||||
<test-case id="bug-10532-utf-16" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
|
||||
foo 東京・銀座の歌舞伎座。4月に新調された4枚の緞帳のうち3枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員(有料・無料)にご登録いただくと、様々な特典・サービスが受けられます。
|
||||
</test-case>
|
||||
|
||||
<test-case id="bug-10532-ASCII" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
|
||||
<!-- Note that plain 7 bit ASCII is detected as UTF-8 -->
|
||||
,1,,,5
|
||||
</test-case>
|
||||
</charset-detection-tests>
|
||||
|
Loading…
Reference in New Issue
Block a user