From 6f38f0a727feba5c44362a31ea86eb23ccfee4b1 Mon Sep 17 00:00:00 2001
From: Markus Scherer <markus.icu@gmail.com>
Date: Mon, 2 Feb 2015 22:53:15 +0000
Subject: [PATCH] ICU-11498 TestCharsetDetector: hardcode ISO-2022-CN bytes in
 CharsetDetectionTests.xml and do not check roundtrip conversion for it

X-SVN-Rev: 36998
---
 .../test/charsetdet/CharsetDetectionTests.xml |   6 +-
 .../test/charsetdet/TestCharsetDetector.java  | 138 +++++++++---------
 2 files changed, 75 insertions(+), 69 deletions(-)
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
index 1fd633dc7a..07f82fae6b 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 
-<!-- Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved -->
+<!-- Copyright (c) 2005-2015 IBM Corporation and others. All rights reserved -->
 <!-- See individual test cases for their specific copyright. -->
 
 <charset-detection-tests>
@@ -392,8 +392,8 @@
 
     </test-case>
 
-    <!-- No ISO-2022-CN in this test because Java doesn't support it in both directions :-( -->
-    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN GB18030/zh">
+    <test-case id="IUC10-zh-Hans" encodings="UTF-8 UTF-16LE UTF-16BE UTF-32BE UTF-32LE ISO-2022-CN//noroundtrip GB18030/zh">
+    <bytes encoding="ISO-2022-CN">\n\ \ \ \ \n\ \ \ \ \n\n\ \ \ \ \x1b$)A\x0eE7V^#,Hm\x3c~#+;%A*Mx\x0f\n\ \ \ \ \x1b$)A\x0eSCM3R;Bk\x0f\ (Unicode)\ \x0eW_1iJ@=g\x0f\n\ \ \ \ IUC10\n\ \ \ \ \x1b$)A\x0e=+SZ\x0f1997\x0eDj\x0f\ 3\ \x0eTB\x0f10\x0eHU#-\x0f12\x0eHUTZ5B9z\x0f\ Mainz\ \x0eJP\x3eYPP5D5ZJ.=lM3R;Bk9z\x3cJQPLV;aOVTZ?*J\x3cW"2a!#\x0f\n\ \ \ \ \x1b$)A\x0e1\x3e4N;aRi=+;c\x3c/8w7=Cf5DW(\x3cR!#If\x3c05DAlSr0|@(#:9z\x3cJ;%A*Mx:MM3R;Bk#,9z\x3cJ;/:M1\x3e5X;/#,\x0f\n\ \ \ \ \x1b$)A\x0eM3R;BkTZ2YWwO5M3:MS\x26SCHm\x3c~VP5DJ5OV#,WVPM#,ND1\x3e8qJ=RT\x3c06`NDVV\x3cFKc5H!#\x0f\n\n\ \ \ \ Unicode\n\ \ \ \ \x1b$)A\x0e51J@=gPhR*95M(J1#,GkSC\x0fUnicode\x0e#!\x0f\n\nConference\ Program\n\ \ \ \ </bytes>
     <!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
 
     欧洲，软件＋互联网
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
index 8fd6de0655..577469893e 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -1,7 +1,7 @@
 /**
  *******************************************************************************
- * Copyright (C) 2005-2013, International Business Machines Corporation and    *
- * others. All Rights Reserved.                                                *
+ * Copyright (C) 2005-2015, International Business Machines Corporation and
+ * others. All Rights Reserved.
  *******************************************************************************
  */
 package com.ibm.icu.dev.test.charsetdet;
@@ -12,6 +12,8 @@ import java.io.InputStream;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.util.HashSet;
+import java.util.Map;
+import java.util.TreeMap;
 
 import javax.xml.parsers.DocumentBuilder;
 import javax.xml.parsers.DocumentBuilderFactory;
@@ -23,6 +25,7 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 
 import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.impl.Utility;
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;
 
@@ -328,8 +331,6 @@ public class TestCharsetDetector extends TestFmwk
                 errln("Could not open test data file CharsetDetectionTests.xml");
                 return;
             }
-            
-            //isr = new InputStreamReader(is, "UTF-8"); 
 
             // Set up an xml parser.
             DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
@@ -345,6 +346,7 @@ public class TestCharsetDetector extends TestFmwk
             NodeList testCases = root.getElementsByTagName("test-case");
             
             // Process each test case
+            Map<String, byte[]> encToBytes = new TreeMap<String, byte[]>();
             for (int n = 0; n < testCases.getLength(); n += 1) {
                 Node testCase = testCases.item(n);
                 NamedNodeMap attrs = testCase.getAttributes();
@@ -352,20 +354,53 @@ public class TestCharsetDetector extends TestFmwk
                 StringBuffer testText = new StringBuffer();
                 String id = attrs.getNamedItem("id").getNodeValue();
                 String encodings = attrs.getNamedItem("encodings").getNodeValue();
-                
-                // Collect the test case text.
+
+                // Collect the test case text and optional bytes.
+                // A <bytes encoding="name">ASCII with \xhh</bytes> element
+                // specifies the byte sequence to be tested.
+                // This is useful when not all platforms encode the test text the same way
+                // (or do not support encoding for that charset).
                 for (int t = 0; t < testData.getLength(); t += 1) {
-                    Node textNode = testData.item(t);
-                    
-                    testText.append(textNode.getNodeValue());                    
+                    Node node = testData.item(t);
+                    if (node.getNodeType() == Node.TEXT_NODE) {
+                        testText.append(node.getNodeValue());
+                    } else if (node.getNodeType() == Node.ELEMENT_NODE &&
+                            node.getNodeName().equals("bytes")) {
+                        String name = node.getAttributes().getNamedItem("encoding").getNodeValue();
+                        Node valueNode = node.getFirstChild();
+                        if (valueNode.getNodeType() != Node.TEXT_NODE) {
+                            throw new IllegalArgumentException("<bytes> node does not contain text");
+                        }
+                        // The bytes are stored as ASCII characters and \xhh escaped bytes.
+                        // We unescape the string to turn the \xhh into chars U+0000..U+00ff,
+                        // then use the deprecated String.getBytes() to turn those into bytes
+                        // by essentially casting each char to a byte.
+                        String bytesString = Utility.unescape(valueNode.getNodeValue());
+                        byte[] bytes = new byte[bytesString.length()];
+                        bytesString.getBytes(0, bytesString.length(), bytes, 0);
+                        encToBytes.put(name, bytes);
+                    } else {
+                        throw new IllegalArgumentException("unknown <test-case> child node: " + node);
+                    }
                 }
-                
+
                 // Process test text with each encoding / language pair.
                 String testString = testText.toString();
                 String[] encodingList = encodings.split(" ");
                 for (int e = 0; e < encodingList.length; e += 1) {
-                    checkEncoding(testString, encodingList[e], id);
+                    String[] params = encodingList[e].split("/");
+                    String encoding = params[0];
+                    String language = params.length == 1 || params[1].length() == 0 ? null : params[1];
+
+                    // With a few charsets, the conversion back to Unicode
+                    // may depend on the implementation.
+                    boolean checkRoundtrip =
+                            !encoding.startsWith("UTF-32") &&
+                            !(params.length >= 3 && params[2].equals("noroundtrip"));
+                    checkEncoding(testString, encoding, language, checkRoundtrip,
+                            encToBytes.get(encoding), id);
                 }
+                encToBytes.clear();
             }
             
         } catch (Exception e) {
@@ -373,11 +408,9 @@ public class TestCharsetDetector extends TestFmwk
         }
     }
 
-    private void checkMatch(CharsetDetector det, String testString, String encoding, String language, String id) throws Exception
-    {
+    private void checkMatch(CharsetDetector det, String testString,
+            String encoding, String language, boolean checkRoundtrip, String id) throws Exception {
         CharsetMatch m = det.detect();
-        String decoded;
-        
         if (! m.getName().equals(encoding)) {
             errln(id + ": encoding detection failure - expected " + encoding + ", got " + m.getName());
             return;
@@ -390,12 +423,12 @@ public class TestCharsetDetector extends TestFmwk
         {
             errln(id + ", " + encoding + ": language detection failure - expected " + language + ", got " + m.getLanguage());
         }
-        
-        if (encoding.startsWith("UTF-32")) {
+
+        if (!checkRoundtrip) {
             return;
         }
         
-        decoded = m.getString();
+        String decoded = m.getString();
         
         if (! testString.equals(decoded)) {
             errln(id + ", " + encoding + ": getString() didn't return the original string!");
@@ -408,62 +441,35 @@ public class TestCharsetDetector extends TestFmwk
         }
     }
     
-    private void checkEncoding(String testString, String encoding, String id)
-    {
-        String enc = null, lang = null;
-        String[] split = encoding.split("/");
-        
-        enc = split[0];
-        
-        if (split.length > 1) {
-            lang = split[1];
+    private void checkEncoding(String testString,
+            String encoding, String language, boolean checkRoundtrip,
+            byte[] bytes, String id) {
+        if (bytes == null) {
+            try {
+                bytes = testString.getBytes(encoding);
+            } catch (UnsupportedOperationException uoe) {
+                // Ignore any converters that can't
+                // convert from Unicode.
+                logln("Unsupported encoding for conversion from Unicode: " + encoding);
+                return;
+            } catch (UnsupportedEncodingException uee) {
+                // Ignore any encodings that this runtime
+                // doesn't support.
+                logln("Unsupported encoding: " + encoding);
+                return;
+            }
         }
 
         try {
             CharsetDetector det = new CharsetDetector();
-            byte[] bytes;
-            
-            //if (enc.startsWith("UTF-32")) {
-            //    UTF32 utf32 = UTF32.getInstance(enc);
-                
-            //    bytes = utf32.toBytes(testString);
-            //} else {
-                String from = enc;
 
-                while (true) {
-                    try {
-                        bytes = testString.getBytes(from);
-                    } catch (UnsupportedOperationException uoe) {
-                         // In some runtimes, the ISO-2022-CN converter
-                         // only converts *to* Unicode - we have to use
-                         // x-ISO-2022-CN-GB to convert *from* Unicode.
-                        if (from.equals("ISO-2022-CN")) {
-                            from = "x-ISO-2022-CN-GB";
-                            continue;
-                        }
-                        
-                        // Ignore any other converters that can't
-                        // convert from Unicode.
-                        logln("Unsupported encoding" + from);
-                        return;
-                    } catch (UnsupportedEncodingException uee) {
-                        // Ignore any encodings that this runtime
-                        // doesn't support.
-                        logln("Unsupported encoding" + from);
-                        return;
-                    }
-                    
-                    break;
-                }
-            //}
-        
             det.setText(bytes);
-            checkMatch(det, testString, enc, lang, id);
-            
+            checkMatch(det, testString, encoding, language, checkRoundtrip, id);
+
             det.setText(new ByteArrayInputStream(bytes));
-            checkMatch(det, testString, enc, lang, id);
+            checkMatch(det, testString, encoding, language, checkRoundtrip, id);
          } catch (Exception e) {
-            errln(id + ": " + e.toString() + "enc=" + enc);
+            errln(id + ": " + e.toString() + "enc=" + encoding);
             e.printStackTrace();
         }
     }