From e28793a4a646205f25de1e420d89ff535e194fed Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Tue, 10 Dec 2013 02:30:58 +0000 Subject: [PATCH] ICU-10532 ICU Charset Detector, port UTF-16 enhancements from Java to C X-SVN-Rev: 34730 --- icu4c/source/i18n/csrucode.cpp | 64 +++++++++++++++++---- icu4c/source/i18n/csrutf8.cpp | 9 +-- icu4c/source/test/testdata/csdetest.xml | 76 ++++++++++++++----------- 3 files changed, 102 insertions(+), 47 deletions(-) diff --git a/icu4c/source/i18n/csrucode.cpp b/icu4c/source/i18n/csrucode.cpp index 21239b7eaf..f098343068 100644 --- a/icu4c/source/i18n/csrucode.cpp +++ b/icu4c/source/i18n/csrucode.cpp @@ -29,17 +29,48 @@ const char *CharsetRecog_UTF_16_BE::getName() const return "UTF-16BE"; } +// UTF-16 confidence calculation. Very simple minded, but better than nothing. +// Any 8 bit non-control characters bump the confidence up. These have a zero high byte, +// and are very likely to be UTF-16, although they could also be part of a UTF-32 code. +// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32. +// NULs should be rare in actual text. + +static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) { + if (codeUnit == 0) { + confidence -= 10; + } else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) { + confidence += 10; + } + if (confidence < 0) { + confidence = 0; + } else if (confidence > 100) { + confidence = 100; + } + return confidence; +} + + UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; - int32_t confidence = 0; + int32_t confidence = 10; int32_t length = textIn->fRawLength; - if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) { - confidence = 100; + int32_t bytesToCheck = (length > 30) ? 30 : length; + for (int32_t charIndex=0; charIndexset(textIn, this, confidence); return (confidence > 0); } @@ -57,14 +88,27 @@ const char *CharsetRecog_UTF_16_LE::getName() const UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const { const uint8_t *input = textIn->fRawInput; - int32_t confidence = 0; + int32_t confidence = 10; int32_t length = textIn->fRawLength; - if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) { - confidence = 100; + int32_t bytesToCheck = (length > 30) ? 30 : length; + for (int32_t charIndex=0; charIndex= 4 && input[2] == 0 && input[3] == 0) { + confidence = 0; // UTF-32 BOM + } + break; + } + confidence = adjustConfidence(codeUnit, confidence); + if (confidence == 0 || confidence == 100) { + break; + } + } + if (bytesToCheck < 4 && confidence < 100) { + confidence = 0; } - - // TODO: Do some statastics to check for unsigned UTF-16LE results->set(textIn, this, confidence); return (confidence > 0); } diff --git a/icu4c/source/i18n/csrutf8.cpp b/icu4c/source/i18n/csrutf8.cpp index 420c66909d..bb0a909c04 100644 --- a/icu4c/source/i18n/csrutf8.cpp +++ b/icu4c/source/i18n/csrutf8.cpp @@ -1,6 +1,6 @@ /* ********************************************************************** - * Copyright (C) 2005-2012, International Business Machines + * Copyright (C) 2005-2013, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ @@ -86,7 +86,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { } - // Cook up some sort of confidence score, based on presense of a BOM + // Cook up some sort of confidence score, based on presence of a BOM // and the existence of valid and/or invalid multi-byte sequences. confidence = 0; if (hasBOM && numInvalid == 0) { @@ -98,8 +98,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const { } else if (numValid > 0 && numInvalid == 0) { confidence = 80; } else if (numValid == 0 && numInvalid == 0) { - // Plain ASCII. - confidence = 10; + // Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which + // accepts ASCII with confidence = 10. + confidence = 15; } else if (numValid > numInvalid*10) { // Probably corruput utf-8 data. Valid sequences aren't likely by chance. confidence = 25; diff --git a/icu4c/source/test/testdata/csdetest.xml b/icu4c/source/test/testdata/csdetest.xml index 03b0ac8583..8b151a3c4a 100644 --- a/icu4c/source/test/testdata/csdetest.xml +++ b/icu4c/source/test/testdata/csdetest.xml @@ -1,10 +1,10 @@ - + - + أوروبا, برمجيات الحاسوب + انترنيت : @@ -20,7 +20,7 @@ - + Europa, Software + Internet: @@ -36,7 +36,7 @@ - + Europa, Software + Internet: @@ -52,7 +52,7 @@ - + Europa, Software + das Internet: @@ -69,7 +69,7 @@ - + Europe, Software + the Internet: @@ -85,7 +85,7 @@ - + Europa, Software + el Internet: @@ -101,7 +101,7 @@ - + L'Europe, le logiciel et l'Internet : @@ -118,7 +118,7 @@ - + אירופה, תוכנה והאינטרנט: @@ -133,7 +133,7 @@ - + אירופה, תוכנה והאינטרנט: @@ -148,7 +148,7 @@ - + Európa, a Szoftver s az Internet - @@ -165,7 +165,7 @@ - + Európa, a Szoftver s az Internet - @@ -182,7 +182,7 @@ - + Europa, software e Internet: @@ -199,7 +199,7 @@ - + ヨーロッパ、ソフトウェア、そしてインターネット: @@ -214,7 +214,7 @@ - + 유럽, 소프트웨어 그리고 인터넷: @@ -230,7 +230,7 @@ - + Europa, Software + het Internet: @@ -247,7 +247,7 @@ - + Europa, Programvare og Internet: @@ -262,7 +262,7 @@ - + Europa, programvare og Internett: @@ -278,7 +278,7 @@ - + Europa, Software e a Internet: @@ -294,7 +294,7 @@ - + Europa, Software e a Internet: @@ -311,7 +311,7 @@ - + Europa, Software şi Internet: @@ -328,7 +328,7 @@ - + Европа, Программное обеспечение + Интернет: @@ -345,7 +345,7 @@ - + Europa, programvara och Internet: @@ -361,7 +361,7 @@ - + אײראָפּע: פּראָגראַמװאַרג און די װעלטנעץ: @@ -377,7 +377,7 @@ - + 歐洲,軟體及網際網路: @@ -393,7 +393,7 @@ - + 欧洲,软件+互联网 @@ -409,7 +409,7 @@ Conference Program - + Co je Unicode? @@ -432,7 +432,7 @@ Conference Program - + Τι είναι το Unicode; @@ -458,7 +458,7 @@ Conference Program - + Τι είναι το “Unicode”; @@ -484,7 +484,7 @@ Conference Program - + Czym jest Unikod ? @@ -505,7 +505,7 @@ Conference Program - + Evrensel Kod Nedir? @@ -527,7 +527,7 @@ Conference Program - + “Evrensel Kod” Nedir? @@ -548,4 +548,14 @@ Conference Program şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar. - \ No newline at end of file + + + + foo 東京・銀座の歌舞伎座。4月に新調された4枚の緞帳のうち3枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員(有料・無料)にご登録いただくと、様々な特典・サービスが受けられます。 + + + + + ,1,,,5 + +