ICU-10532 ICU Charset Detector, port UTF-16 enhancements from Java to C
X-SVN-Rev: 34730
This commit is contained in:
parent
63a9fef24b
commit
e28793a4a6
@ -29,17 +29,48 @@ const char *CharsetRecog_UTF_16_BE::getName() const
|
||||
return "UTF-16BE";
|
||||
}
|
||||
|
||||
// UTF-16 confidence calculation. Very simple minded, but better than nothing.
|
||||
// Any 8 bit non-control characters bump the confidence up. These have a zero high byte,
|
||||
// and are very likely to be UTF-16, although they could also be part of a UTF-32 code.
|
||||
// NULs are a contra-indication, they will appear commonly if the actual encoding is UTF-32.
|
||||
// NULs should be rare in actual text.
|
||||
|
||||
static int32_t adjustConfidence(UChar codeUnit, int32_t confidence) {
|
||||
if (codeUnit == 0) {
|
||||
confidence -= 10;
|
||||
} else if ((codeUnit >= 0x20 && codeUnit <= 0xff) || codeUnit == 0x0a) {
|
||||
confidence += 10;
|
||||
}
|
||||
if (confidence < 0) {
|
||||
confidence = 0;
|
||||
} else if (confidence > 100) {
|
||||
confidence = 100;
|
||||
}
|
||||
return confidence;
|
||||
}
|
||||
|
||||
|
||||
UBool CharsetRecog_UTF_16_BE::match(InputText* textIn, CharsetMatch *results) const
|
||||
{
|
||||
const uint8_t *input = textIn->fRawInput;
|
||||
int32_t confidence = 0;
|
||||
int32_t confidence = 10;
|
||||
int32_t length = textIn->fRawLength;
|
||||
|
||||
if (length >=2 && input[0] == 0xFE && input[1] == 0xFF) {
|
||||
confidence = 100;
|
||||
int32_t bytesToCheck = (length > 30) ? 30 : length;
|
||||
for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
|
||||
UChar codeUnit = (input[charIndex] << 8) | input[charIndex + 1];
|
||||
if (charIndex == 0 && codeUnit == 0xFEFF) {
|
||||
confidence = 100;
|
||||
break;
|
||||
}
|
||||
confidence = adjustConfidence(codeUnit, confidence);
|
||||
if (confidence == 0 || confidence == 100) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (bytesToCheck < 4 && confidence < 100) {
|
||||
confidence = 0;
|
||||
}
|
||||
|
||||
// TODO: Do some statistics to check for unsigned UTF-16BE
|
||||
results->set(textIn, this, confidence);
|
||||
return (confidence > 0);
|
||||
}
|
||||
@ -57,14 +88,27 @@ const char *CharsetRecog_UTF_16_LE::getName() const
|
||||
UBool CharsetRecog_UTF_16_LE::match(InputText* textIn, CharsetMatch *results) const
|
||||
{
|
||||
const uint8_t *input = textIn->fRawInput;
|
||||
int32_t confidence = 0;
|
||||
int32_t confidence = 10;
|
||||
int32_t length = textIn->fRawLength;
|
||||
|
||||
if (length >= 4 && input[0] == 0xFF && input[1] == 0xFE && (input[2] != 0x00 || input[3] != 0x00)) {
|
||||
confidence = 100;
|
||||
int32_t bytesToCheck = (length > 30) ? 30 : length;
|
||||
for (int32_t charIndex=0; charIndex<bytesToCheck-1; charIndex+=2) {
|
||||
UChar codeUnit = input[charIndex] | (input[charIndex + 1] << 8);
|
||||
if (charIndex == 0 && codeUnit == 0xFEFF) {
|
||||
confidence = 100; // UTF-16 BOM
|
||||
if (length >= 4 && input[2] == 0 && input[3] == 0) {
|
||||
confidence = 0; // UTF-32 BOM
|
||||
}
|
||||
break;
|
||||
}
|
||||
confidence = adjustConfidence(codeUnit, confidence);
|
||||
if (confidence == 0 || confidence == 100) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (bytesToCheck < 4 && confidence < 100) {
|
||||
confidence = 0;
|
||||
}
|
||||
|
||||
// TODO: Do some statistics to check for unsigned UTF-16LE
|
||||
results->set(textIn, this, confidence);
|
||||
return (confidence > 0);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2005-2012, International Business Machines
|
||||
* Copyright (C) 2005-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
@ -86,7 +86,7 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
|
||||
|
||||
}
|
||||
|
||||
// Cook up some sort of confidence score, based on presense of a BOM
|
||||
// Cook up some sort of confidence score, based on presence of a BOM
|
||||
// and the existence of valid and/or invalid multi-byte sequences.
|
||||
confidence = 0;
|
||||
if (hasBOM && numInvalid == 0) {
|
||||
@ -98,8 +98,9 @@ UBool CharsetRecog_UTF8::match(InputText* input, CharsetMatch *results) const {
|
||||
} else if (numValid > 0 && numInvalid == 0) {
|
||||
confidence = 80;
|
||||
} else if (numValid == 0 && numInvalid == 0) {
|
||||
// Plain ASCII.
|
||||
confidence = 10;
|
||||
// Plain ASCII. Confidence must be > 10, it's more likely than UTF-16, which
|
||||
// accepts ASCII with confidence = 10.
|
||||
confidence = 15;
|
||||
} else if (numValid > numInvalid*10) {
|
||||
// Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
|
||||
confidence = 25;
|
||||
|
76
icu4c/source/test/testdata/csdetest.xml
vendored
76
icu4c/source/test/testdata/csdetest.xml
vendored
@ -1,10 +1,10 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<!-- Copyright (c) 2005-2007 IBM Corporation and others. All rights reserved -->
|
||||
<!-- Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved -->
|
||||
<!-- See individual test cases for their specific copyright. -->
|
||||
|
||||
<charset-detection-tests>
|
||||
<test-case id="IUC10-ar" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
|
||||
<test-case id="IUC10-ar" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-6/ar windows-1256/ar">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
أوروبا, برمجيات الحاسوب + انترنيت :
|
||||
@ -20,7 +20,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-da-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1252/da">
|
||||
<test-case id="IUC10-da-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1252/da">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software + Internet:
|
||||
@ -36,7 +36,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-da" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
|
||||
<test-case id="IUC10-da" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software + Internet:
|
||||
@ -52,7 +52,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-de" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/de">
|
||||
<test-case id="IUC10-de" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/de">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software + das Internet:
|
||||
@ -69,7 +69,7 @@
|
||||
</test-case>
|
||||
|
||||
<!-- No UTF-8 in this test because there are no non-ASCII characters. -->
|
||||
<test-case id="IUC10-en" encodings="UTF-32BE UTF-32LE ISO-8859-1/en">
|
||||
<test-case id="IUC10-en" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/en">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europe, Software + the Internet:
|
||||
@ -85,7 +85,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-es" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/es">
|
||||
<test-case id="IUC10-es" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/es">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software + el Internet:
|
||||
@ -101,7 +101,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-fr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/fr">
|
||||
<test-case id="IUC10-fr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/fr">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
L'Europe, le logiciel et l'Internet :
|
||||
@ -118,7 +118,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-he" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-8-I/he">
|
||||
<test-case id="IUC10-he" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-8-I/he">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
אירופה, תוכנה והאינטרנט:
|
||||
@ -133,7 +133,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-he-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1255/he">
|
||||
<test-case id="IUC10-he-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1255/he">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
אירופה, תוכנה והאינטרנט:
|
||||
@ -148,7 +148,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-hu" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/hu">
|
||||
<test-case id="IUC10-hu" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/hu">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Európa, a Szoftver s az Internet -
|
||||
@ -165,7 +165,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1250/hu">
|
||||
<test-case id="IUC10-hu-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1250/hu">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Európa, a Szoftver s az Internet -
|
||||
@ -182,7 +182,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-it" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/it">
|
||||
<test-case id="IUC10-it" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/it">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, software e Internet:
|
||||
@ -199,7 +199,7 @@
|
||||
</test-case>
|
||||
|
||||
<!-- No EUC-JP in this test because it detects as GB18030 -->
|
||||
<test-case id="IUC10-jp" encodings="UTF-8 UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
|
||||
<test-case id="IUC10-jp" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Shift_JIS/ja ISO-2022-JP">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
ヨーロッパ、ソフトウェア、そしてインターネット:
|
||||
@ -214,7 +214,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-ko" encodings="UTF-8 UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
|
||||
<test-case id="IUC10-ko" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE EUC-KR/ko ISO-2022-KR">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
유럽, 소프트웨어 그리고 인터넷:
|
||||
@ -230,7 +230,7 @@
|
||||
</test-case>
|
||||
|
||||
<!-- No UTF-8 in this test because there are no non-ASCII characters. -->
|
||||
<test-case id="IUC10-nl" encodings="UTF-32BE UTF-32LE ISO-8859-1/nl">
|
||||
<test-case id="IUC10-nl" encodings="UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/nl">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software + het Internet:
|
||||
@ -247,7 +247,7 @@
|
||||
</test-case>
|
||||
|
||||
<!-- No language for ISO-8859-1 in this test because no-NO is recognized as Danish... -->
|
||||
<test-case id="IUC10-no-NO" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/da">
|
||||
<test-case id="IUC10-no-NO" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/da">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Programvare og Internet:
|
||||
@ -262,7 +262,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/no">
|
||||
<test-case id="IUC10-no-NO-NY" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/no">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, programvare og Internett:
|
||||
@ -278,7 +278,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
|
||||
<test-case id="IUC10-pt-BR" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software e a Internet:
|
||||
@ -294,7 +294,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/pt">
|
||||
<test-case id="IUC10-pt-PT" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/pt">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software e a Internet:
|
||||
@ -311,7 +311,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-ro" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/ro">
|
||||
<test-case id="IUC10-ro" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/ro">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, Software şi Internet:
|
||||
@ -328,7 +328,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-ru" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
|
||||
<test-case id="IUC10-ru" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-5/ru windows-1251/ru KOI8-R/ru">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Европа, Программное обеспечение + Интернет:
|
||||
@ -345,7 +345,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-sv" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-1/sv">
|
||||
<test-case id="IUC10-sv" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-1/sv">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Europa, programvara och Internet:
|
||||
@ -361,7 +361,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-yi" encodings="UTF-8 UTF-32BE UTF-32LE">
|
||||
<test-case id="IUC10-yi" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
אײראָפּע: פּראָגראַמװאַרג און די װעלטנעץ:
|
||||
@ -377,7 +377,7 @@
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-32BE UTF-32LE Big5/zh">
|
||||
<test-case id="IUC10-zh-Hant" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE Big5/zh">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
歐洲,軟體及網際網路:
|
||||
@ -393,7 +393,7 @@
|
||||
</test-case>
|
||||
|
||||
<!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
|
||||
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
|
||||
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
欧洲,软件+互联网
|
||||
@ -409,7 +409,7 @@
|
||||
Conference Program
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-cz" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/cs">
|
||||
<test-case id="WIU-cz" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/cs">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Co je Unicode?
|
||||
@ -432,7 +432,7 @@ Conference Program
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-el" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-7/el">
|
||||
<test-case id="WIU-el" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-7/el">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Τι είναι το Unicode;
|
||||
@ -458,7 +458,7 @@ Conference Program
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-el-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1253/el">
|
||||
<test-case id="WIU-el-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1253/el">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Τι είναι το “Unicode”;
|
||||
@ -484,7 +484,7 @@ Conference Program
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-pl" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-2/pl">
|
||||
<test-case id="WIU-pl" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-2/pl">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Czym jest Unikod ?
|
||||
@ -505,7 +505,7 @@ Conference Program
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-tr" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-9/tr">
|
||||
<test-case id="WIU-tr" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-8859-9/tr">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
Evrensel Kod Nedir?
|
||||
@ -527,7 +527,7 @@ Conference Program
|
||||
|
||||
</test-case>
|
||||
|
||||
<test-case id="WIU-tr-Q" encodings="UTF-8 UTF-32BE UTF-32LE windows-1254/tr">
|
||||
<test-case id="WIU-tr-Q" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE windows-1254/tr">
|
||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||
|
||||
“Evrensel Kod” Nedir?
|
||||
@ -548,4 +548,14 @@ Conference Program
|
||||
şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar.
|
||||
|
||||
</test-case>
|
||||
</charset-detection-tests>
|
||||
|
||||
|
||||
<test-case id="bug-10532-utf-16" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
|
||||
foo 東京・銀座の歌舞伎座。4月に新調された4枚の緞帳のうち3枚は、京都の川島織物セルコンが織った朝日新聞デジタル会員(有料・無料)にご登録いただくと、様々な特典・サービスが受けられます。
|
||||
</test-case>
|
||||
|
||||
<test-case id="bug-10532-ASCII" encodings="UTF-8 UTF-16BE UTF-16LE UTF-32BE UTF-32LE">
|
||||
<!-- Note that plain 7 bit ASCII is detected as UTF-8 -->
|
||||
,1,,,5
|
||||
</test-case>
|
||||
</charset-detection-tests>
|
||||
|
Loading…
Reference in New Issue
Block a user