ICU-11498 TestCharsetDetector: hardcode ISO-2022-CN bytes in CharsetDetectionTests.xml and do not check roundtrip conversion for it
X-SVN-Rev: 36998
This commit is contained in:
parent
774a23e4a7
commit
6f38f0a727
@ -1,6 +1,6 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
<!-- Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved -->
|
<!-- Copyright (c) 2005-2015 IBM Corporation and others. All rights reserved -->
|
||||||
<!-- See individual test cases for their specific copyright. -->
|
<!-- See individual test cases for their specific copyright. -->
|
||||||
|
|
||||||
<charset-detection-tests>
|
<charset-detection-tests>
|
||||||
@ -392,8 +392,8 @@
|
|||||||
|
|
||||||
</test-case>
|
</test-case>
|
||||||
|
|
||||||
<!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
|
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN//noroundtrip GB18030/zh">
|
||||||
<test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
|
<bytes encoding="ISO-2022-CN">\n\ \ \ \ \n\ \ \ \ \n\n\ \ \ \ \x1b$)A\x0eE7V^#,Hm\x3c~#+;%A*Mx\x0f\n\ \ \ \ \x1b$)A\x0eSCM3R;Bk\x0f\ (Unicode)\ \x0eW_1iJ@=g\x0f\n\ \ \ \ IUC10\n\ \ \ \ \x1b$)A\x0e=+SZ\x0f1997\x0eDj\x0f\ 3\ \x0eTB\x0f10\x0eHU#-\x0f12\x0eHUTZ5B9z\x0f\ Mainz\ \x0eJP\x3eYPP5D5ZJ.=lM3R;Bk9z\x3cJQPLV;aOVTZ?*J\x3cW"2a!#\x0f\n\ \ \ \ \x1b$)A\x0e1\x3e4N;aRi=+;c\x3c/8w7=Cf5DW(\x3cR!#If\x3c05DAlSr0|@(#:9z\x3cJ;%A*Mx:MM3R;Bk#,9z\x3cJ;/:M1\x3e5X;/#,\x0f\n\ \ \ \ \x1b$)A\x0eM3R;BkTZ2YWwO5M3:MS\x26SCHm\x3c~VP5DJ5OV#,WVPM#,ND1\x3e8qJ=RT\x3c06`NDVV\x3cFKc5H!#\x0f\n\n\ \ \ \ Unicode\n\ \ \ \ \x1b$)A\x0e51J@=gPhR*95M(J1#,GkSC\x0fUnicode\x0e#!\x0f\n\nConference\ Program\n\ \ \ \ </bytes>
|
||||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||||
|
|
||||||
欧洲,软件+互联网
|
欧洲,软件+互联网
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* Copyright (C) 2005-2013, International Business Machines Corporation and *
|
* Copyright (C) 2005-2015, International Business Machines Corporation and
|
||||||
* others. All Rights Reserved. *
|
* others. All Rights Reserved.
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*/
|
*/
|
||||||
package com.ibm.icu.dev.test.charsetdet;
|
package com.ibm.icu.dev.test.charsetdet;
|
||||||
@ -12,6 +12,8 @@ import java.io.InputStream;
|
|||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.TreeMap;
|
||||||
|
|
||||||
import javax.xml.parsers.DocumentBuilder;
|
import javax.xml.parsers.DocumentBuilder;
|
||||||
import javax.xml.parsers.DocumentBuilderFactory;
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
@ -23,6 +25,7 @@ import org.w3c.dom.Node;
|
|||||||
import org.w3c.dom.NodeList;
|
import org.w3c.dom.NodeList;
|
||||||
|
|
||||||
import com.ibm.icu.dev.test.TestFmwk;
|
import com.ibm.icu.dev.test.TestFmwk;
|
||||||
|
import com.ibm.icu.impl.Utility;
|
||||||
import com.ibm.icu.text.CharsetDetector;
|
import com.ibm.icu.text.CharsetDetector;
|
||||||
import com.ibm.icu.text.CharsetMatch;
|
import com.ibm.icu.text.CharsetMatch;
|
||||||
|
|
||||||
@ -328,8 +331,6 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
errln("Could not open test data file CharsetDetectionTests.xml");
|
errln("Could not open test data file CharsetDetectionTests.xml");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
//isr = new InputStreamReader(is, "UTF-8");
|
|
||||||
|
|
||||||
// Set up an xml parser.
|
// Set up an xml parser.
|
||||||
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
|
||||||
@ -345,6 +346,7 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
NodeList testCases = root.getElementsByTagName("test-case");
|
NodeList testCases = root.getElementsByTagName("test-case");
|
||||||
|
|
||||||
// Process each test case
|
// Process each test case
|
||||||
|
Map<String, byte[]> encToBytes = new TreeMap<String, byte[]>();
|
||||||
for (int n = 0; n < testCases.getLength(); n += 1) {
|
for (int n = 0; n < testCases.getLength(); n += 1) {
|
||||||
Node testCase = testCases.item(n);
|
Node testCase = testCases.item(n);
|
||||||
NamedNodeMap attrs = testCase.getAttributes();
|
NamedNodeMap attrs = testCase.getAttributes();
|
||||||
@ -352,20 +354,53 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
StringBuffer testText = new StringBuffer();
|
StringBuffer testText = new StringBuffer();
|
||||||
String id = attrs.getNamedItem("id").getNodeValue();
|
String id = attrs.getNamedItem("id").getNodeValue();
|
||||||
String encodings = attrs.getNamedItem("encodings").getNodeValue();
|
String encodings = attrs.getNamedItem("encodings").getNodeValue();
|
||||||
|
|
||||||
// Collect the test case text.
|
// Collect the test case text and optional bytes.
|
||||||
|
// A <bytes encoding="name">ASCII with \xhh</bytes> element
|
||||||
|
// specifies the byte sequence to be tested.
|
||||||
|
// This is useful when not all platforms encode the test text the same way
|
||||||
|
// (or do not support encoding for that charset).
|
||||||
for (int t = 0; t < testData.getLength(); t += 1) {
|
for (int t = 0; t < testData.getLength(); t += 1) {
|
||||||
Node textNode = testData.item(t);
|
Node node = testData.item(t);
|
||||||
|
if (node.getNodeType() == Node.TEXT_NODE) {
|
||||||
testText.append(textNode.getNodeValue());
|
testText.append(node.getNodeValue());
|
||||||
|
} else if (node.getNodeType() == Node.ELEMENT_NODE &&
|
||||||
|
node.getNodeName().equals("bytes")) {
|
||||||
|
String name = node.getAttributes().getNamedItem("encoding").getNodeValue();
|
||||||
|
Node valueNode = node.getFirstChild();
|
||||||
|
if (valueNode.getNodeType() != Node.TEXT_NODE) {
|
||||||
|
throw new IllegalArgumentException("<bytes> node does not contain text");
|
||||||
|
}
|
||||||
|
// The bytes are stored as ASCII characters and \xhh escaped bytes.
|
||||||
|
// We unescape the string to turn the \xhh into chars U+0000..U+00ff,
|
||||||
|
// then use the deprecated String.getBytes() to turn those into bytes
|
||||||
|
// by essentially casting each char to a byte.
|
||||||
|
String bytesString = Utility.unescape(valueNode.getNodeValue());
|
||||||
|
byte[] bytes = new byte[bytesString.length()];
|
||||||
|
bytesString.getBytes(0, bytesString.length(), bytes, 0);
|
||||||
|
encToBytes.put(name, bytes);
|
||||||
|
} else {
|
||||||
|
throw new IllegalArgumentException("unknown <test-case> child node: " + node);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process test text with each encoding / language pair.
|
// Process test text with each encoding / language pair.
|
||||||
String testString = testText.toString();
|
String testString = testText.toString();
|
||||||
String[] encodingList = encodings.split(" ");
|
String[] encodingList = encodings.split(" ");
|
||||||
for (int e = 0; e < encodingList.length; e += 1) {
|
for (int e = 0; e < encodingList.length; e += 1) {
|
||||||
checkEncoding(testString, encodingList[e], id);
|
String[] params = encodingList[e].split("/");
|
||||||
|
String encoding = params[0];
|
||||||
|
String language = params.length == 1 || params[1].length() == 0 ? null : params[1];
|
||||||
|
|
||||||
|
// With a few charsets, the conversion back to Unicode
|
||||||
|
// may depend on the implementation.
|
||||||
|
boolean checkRoundtrip =
|
||||||
|
!encoding.startsWith("UTF-32") &&
|
||||||
|
!(params.length >= 3 && params[2].equals("noroundtrip"));
|
||||||
|
checkEncoding(testString, encoding, language, checkRoundtrip,
|
||||||
|
encToBytes.get(encoding), id);
|
||||||
}
|
}
|
||||||
|
encToBytes.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@ -373,11 +408,9 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
|
private void checkMatch(CharsetDetector det, String testString,
|
||||||
{
|
String encoding, String language, boolean checkRoundtrip, String id) throws Exception {
|
||||||
CharsetMatch m = det.detect();
|
CharsetMatch m = det.detect();
|
||||||
String decoded;
|
|
||||||
|
|
||||||
if (! m.getName().equals(encoding)) {
|
if (! m.getName().equals(encoding)) {
|
||||||
errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
|
errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
|
||||||
return;
|
return;
|
||||||
@ -390,12 +423,12 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
{
|
{
|
||||||
errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
|
errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
|
||||||
}
|
}
|
||||||
|
|
||||||
if (encoding.startsWith("UTF-32")) {
|
if (!checkRoundtrip) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
decoded = m.getString();
|
String decoded = m.getString();
|
||||||
|
|
||||||
if (! testString.equals(decoded)) {
|
if (! testString.equals(decoded)) {
|
||||||
errln(id + ", " + encoding + ": getString() didn't return the original string!");
|
errln(id + ", " + encoding + ": getString() didn't return the original string!");
|
||||||
@ -408,62 +441,35 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkEncoding(String testString, String encoding, String id)
|
private void checkEncoding(String testString,
|
||||||
{
|
String encoding, String language, boolean checkRoundtrip,
|
||||||
String enc = null, lang = null;
|
byte[] bytes, String id) {
|
||||||
String[] split = encoding.split("/");
|
if (bytes == null) {
|
||||||
|
try {
|
||||||
enc = split[0];
|
bytes = testString.getBytes(encoding);
|
||||||
|
} catch (UnsupportedOperationException uoe) {
|
||||||
if (split.length > 1) {
|
// Ignore any converters that can't
|
||||||
lang = split[1];
|
// convert from Unicode.
|
||||||
|
logln("Unsupported encoding for conversion from Unicode: " + encoding);
|
||||||
|
return;
|
||||||
|
} catch (UnsupportedEncodingException uee) {
|
||||||
|
// Ignore any encodings that this runtime
|
||||||
|
// doesn't support.
|
||||||
|
logln("Unsupported encoding: " + encoding);
|
||||||
|
return;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
CharsetDetector det = new CharsetDetector();
|
CharsetDetector det = new CharsetDetector();
|
||||||
byte[] bytes;
|
|
||||||
|
|
||||||
//if (enc.startsWith("UTF-32")) {
|
|
||||||
// UTF32 utf32 = UTF32.getInstance(enc);
|
|
||||||
|
|
||||||
// bytes = utf32.toBytes(testString);
|
|
||||||
//} else {
|
|
||||||
String from = enc;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
try {
|
|
||||||
bytes = testString.getBytes(from);
|
|
||||||
} catch (UnsupportedOperationException uoe) {
|
|
||||||
// In some runtimes, the ISO-2022-CN converter
|
|
||||||
// only converts *to* Unicode - we have to use
|
|
||||||
// x-ISO-2022-CN-GB to convert *from* Unicode.
|
|
||||||
if (from.equals("ISO-2022-CN")) {
|
|
||||||
from = "x-ISO-2022-CN-GB";
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Ignore any other converters that can't
|
|
||||||
// convert from Unicode.
|
|
||||||
logln("Unsupported encoding" + from);
|
|
||||||
return;
|
|
||||||
} catch (UnsupportedEncodingException uee) {
|
|
||||||
// Ignore any encodings that this runtime
|
|
||||||
// doesn't support.
|
|
||||||
logln("Unsupported encoding" + from);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
//}
|
|
||||||
|
|
||||||
det.setText(bytes);
|
det.setText(bytes);
|
||||||
checkMatch(det, testString, enc, lang, id);
|
checkMatch(det, testString, encoding, language, checkRoundtrip, id);
|
||||||
|
|
||||||
det.setText(new ByteArrayInputStream(bytes));
|
det.setText(new ByteArrayInputStream(bytes));
|
||||||
checkMatch(det, testString, enc, lang, id);
|
checkMatch(det, testString, encoding, language, checkRoundtrip, id);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
errln(id + ": " + e.toString() + "enc=" + enc);
|
errln(id + ": " + e.toString() + "enc=" + encoding);
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user