ICU-8130 UTS 46 conformance test using Unicode IdnaTest.txt

X-SVN-Rev: 40130
2017-05-23 04:44:58 +00:00 · 2017-05-23 04:44:58 +00:00 · b2ead3e2e1
commit b2ead3e2e1
parent 1b2cc7d1fb
9 changed files with 15969 additions and 68 deletions
--- a/icu4c/source/common/uts46.cpp
+++ b/icu4c/source/common/uts46.cpp
@@ -1015,8 +1015,8 @@ UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c
    ) {
        info.isOkBiDi=FALSE;
    }
-    // Get the directionalities of the intervening characters.
-    uint32_t mask=0;
+    // Add the directionalities of the intervening characters.
+    uint32_t mask=firstMask|lastMask;
    while(i<labelLength) {
        U16_NEXT_UNSAFE(label, i, c);
        mask|=U_MASK(u_charDirection(c));
@@ -1045,7 +1045,7 @@ UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c
    // label. [...]
    // The following rule, consisting of six conditions, applies to labels
    // in BIDI domain names.
-    if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
+    if((mask&R_AL_AN_MASK)!=0) {
        info.isBiDi=TRUE;
    }
 }
--- a/icu4c/source/data/unidata/changes.txt
+++ b/icu4c/source/data/unidata/changes.txt
@@ -373,7 +373,7 @@ or
    cd $ICU_SRC/icu4c/source/data/unidata
    cp confusables.txt confusablesWholeScript.txt NormalizationCorrections.txt NormalizationTest.txt SpecialCasing.txt UnicodeData.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
    cd ../../test/testdata
-    cp BidiCharacterTest.txt BidiTest.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
+    cp BidiCharacterTest.txt BidiTest.txt IdnaTest.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
    cp $UNICODE_DATA/ucd/CompositionExclusions.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode

 * run & fix ICU4J tests
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@@ -26,8 +26,10 @@
 #include "unicode/stringpiece.h"
 #include "unicode/uidna.h"
 #include "unicode/unistr.h"
-#include "intltest.h"
+#include "charstr.h"
 #include "cmemory.h"
+#include "intltest.h"
+#include "uparse.h"

 class UTS46Test : public IntlTest {
 public:
@@ -38,6 +40,13 @@ public:
    void TestAPI();
    void TestNotSTD3();
    void TestSomeCases();
+    void IdnaTest();
+
+    void checkIdnaTestResult(const char *line, const char *type,
+                             const UnicodeString &expected, const UnicodeString &result,
+                             const IDNAInfo &info);
+    void idnaTestOneLine(char *fields[][2], UErrorCode &errorCode);
+
 private:
    IDNA *trans, *nontrans;
 };
@@ -74,6 +83,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
    TESTCASE_AUTO(TestAPI);
    TESTCASE_AUTO(TestNotSTD3);
    TESTCASE_AUTO(TestSomeCases);
+    TESTCASE_AUTO(IdnaTest);
    TESTCASE_AUTO_END;
 }

@@ -517,8 +527,11 @@ static const TestCase testCases[]={
    { "\\u05D07\\u05EA", "B", "\\u05D07\\u05EA", 0 },
    { "\\u05D0\\u0667\\u05EA", "B", "\\u05D0\\u0667\\u05EA", 0 },  // Arabic 7 in the middle
    { "a7\\u0667z", "B", "a7\\u0667z", UIDNA_ERROR_BIDI },  // AN digit in LTR
+    { "a7\\u0667", "B", "a7\\u0667", UIDNA_ERROR_BIDI },  // AN digit in LTR
    { "\\u05D07\\u0667\\u05EA", "B",  // mixed EN/AN digits in RTL
      "\\u05D07\\u0667\\u05EA", UIDNA_ERROR_BIDI },
+    { "\\u05D07\\u0667", "B",  // mixed EN/AN digits in RTL
+      "\\u05D07\\u0667", UIDNA_ERROR_BIDI },
    // ZWJ
    { "\\u0BB9\\u0BCD\\u200D", "N", "\\u0BB9\\u0BCD\\u200D", 0 },  // Virama+ZWJ
    { "\\u0BB9\\u200D", "N", "\\u0BB9\\u200D", UIDNA_ERROR_CONTEXTJ },  // no Virama
@@ -881,4 +894,117 @@ void UTS46Test::TestSomeCases() {
    }
 }

+namespace {
+
+const int32_t kNumFields = 4;  // Will need 5 when we read NV8 from the optional fifth column.
+
+void U_CALLCONV
+idnaTestLineFn(void *context,
+               char *fields[][2], int32_t /* fieldCount */,
+               UErrorCode *pErrorCode) {
+    reinterpret_cast<UTS46Test *>(context)->idnaTestOneLine(fields, *pErrorCode);
+}
+
+}  // namespace
+
+void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
+                                    const UnicodeString &expected, const UnicodeString &result,
+                                    const IDNAInfo &info) {
+    // An error in toUnicode or toASCII is indicated by a value in square brackets,
+    // such as "[B5 B6]".
+    UBool expectedHasErrors = !expected.isEmpty() && expected[0] == u'[';
+    if (expectedHasErrors != info.hasErrors()) {
+        errln("%s  expected errors %d != %d = actual has errors: %04lx\n    %s",
+              type, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
+    }
+    if (!expectedHasErrors && expected != result) {
+        errln("%s  expected != actual\n    %s", type, line);
+        errln(UnicodeString(u"    ") + expected);
+        errln(UnicodeString(u"    ") + result);
+    }
+}
+
+void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
+    // Column 1: type - T for transitional, N for nontransitional, B for both
+    const char *typePtr = u_skipWhitespace(fields[0][0]);
+    const char *limit;
+    char typeChar;
+    if (typePtr == fields[0][1] ||
+            ((typeChar = *typePtr) != 'B' && typeChar != 'N' && typeChar != 'T') ||
+            (limit = u_skipWhitespace(typePtr + 1)) != fields[0][1]) {
+        errln("empty or unknown type field: %s", fields[0][0]);
+        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+
+    // Column 2: source - the source string to be tested
+    int32_t length = (int32_t)(fields[1][1] - fields[1][0]);
+    UnicodeString source16 = UnicodeString::fromUTF8(StringPiece(fields[1][0], length)).
+        trim().unescape();
+
+    // Column 3: toUnicode - the result of applying toUnicode to the source.
+    // A blank value means the same as the source value.
+    length = (int32_t)(fields[2][1] - fields[2][0]);
+    UnicodeString unicode16 = UnicodeString::fromUTF8(StringPiece(fields[2][0], length)).
+        trim().unescape();
+    if (unicode16.isEmpty()) {
+        unicode16 = source16;
+    }
+
+    // Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
+    // A blank value means the same as the toUnicode value.
+    length = (int32_t)(fields[3][1] - fields[3][0]);
+    UnicodeString ascii16 = UnicodeString::fromUTF8(StringPiece(fields[3][0], length)).
+        trim().unescape();
+    if (ascii16.isEmpty()) {
+        ascii16 = unicode16;
+    }
+
+    // Column 5: NV8 - present if the toUnicode value would not be a valid domain name under IDNA2008. Not a normative field.
+    // Ignored as long as we do not implement and test vanilla IDNA2008.
+
+    // ToASCII/ToUnicode, transitional/nontransitional
+    UnicodeString uN, aN, aT;
+    IDNAInfo uNInfo, aNInfo, aTInfo;
+    nontrans->nameToUnicode(source16, uN, uNInfo, errorCode);
+    checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", unicode16, uN, uNInfo);
+    if (typeChar == 'T' || typeChar == 'B') {
+        trans->nameToASCII(source16, aT, aTInfo, errorCode);
+        checkIdnaTestResult(fields[0][0], "toASCIITrans", ascii16, aT, aTInfo);
+    }
+    if (typeChar == 'N' || typeChar == 'B') {
+        nontrans->nameToASCII(source16, aN, aNInfo, errorCode);
+        checkIdnaTestResult(fields[0][0], "toASCIINontrans", ascii16, aN, aNInfo);
+    }
+}
+
+// TODO: de-duplicate
+U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
+
+// http://www.unicode.org/Public/idna/latest/IdnaTest.txt
+void UTS46Test::IdnaTest() {
+    IcuTestErrorCode errorCode(*this, "IdnaTest");
+    const char *sourceTestDataPath = getSourceTestData(errorCode);
+    if (errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
+                                       "folder (getSourceTestData())")) {
+        return;
+    }
+    CharString path(sourceTestDataPath, errorCode);
+    path.appendPathPart("IdnaTest.txt", errorCode);
+    LocalStdioFilePointer idnaTestFile(fopen(path.data(), "r"));
+    if (idnaTestFile.isNull()) {
+        errln("unable to open %s", path.data());
+        return;
+    }
+
+    // Columns (c1, c2,...) are separated by semicolons.
+    // Leading and trailing spaces and tabs in each column are ignored.
+    // Comments are indicated with hash marks.
+    char *fields[kNumFields][2];
+    u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
+    if (errorCode.logIfFailureAndReset("error parsing IdnaTest.txt")) {
+        return;
+    }
+}
+
 #endif  // UCONFIG_NO_IDNA
--- a/icu4c/source/test/testdata/IdnaTest.txt
+++ b/icu4c/source/test/testdata/IdnaTest.txt
--- a/icu4c/source/tools/toolutil/uparse.cpp
+++ b/icu4c/source/tools/toolutil/uparse.cpp
@@ -77,7 +77,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
                     UParseLineFn *lineFn, void *context,
                     UErrorCode *pErrorCode) {
    FileStream *file;
-    char line[300];
+    char line[10000];
    char *start, *limit;
    int32_t i, length;

@@ -163,7 +163,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
            }
        }

-        /* error in a field function? */
+        /* too few fields? */
        if(U_FAILURE(*pErrorCode)) {
            break;
        }
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
@@ -586,8 +586,8 @@ public final class UTS46 extends IDNA {
        ) {
            setNotOkBiDi(info);
        }
-        // Get the directionalities of the intervening characters.
-        int mask=0;
+        // Add the directionalities of the intervening characters.
+        int mask=firstMask|lastMask;
        while(i<labelLimit) {
            c=Character.codePointAt(label, i);
            i+=Character.charCount(c);
@@ -617,7 +617,7 @@ public final class UTS46 extends IDNA {
        // label. [...]
        // The following rule, consisting of six conditions, applies to labels
        // in BIDI domain names.
-        if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
+        if((mask&R_AL_AN_MASK)!=0) {
            setBiDi(info);
        }
    }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/IdnaTest.txt
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/IdnaTest.txt
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@@ -8,16 +8,21 @@
 */
 package com.ibm.icu.dev.test.normalizer;

+import java.io.BufferedReader;
+import java.io.IOException;
 import java.util.Collections;
 import java.util.EnumSet;
 import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
+import java.util.regex.Pattern;

 import org.junit.Test;

 import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
+import com.ibm.icu.impl.Utility;
 import com.ibm.icu.text.IDNA;

 /**
@@ -413,8 +418,11 @@ public class UTS46Test extends TestFmwk {
        { "\u05D07\u05EA", "B", "\u05D07\u05EA", "" },
        { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" },  // Arabic 7 in the middle
        { "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" },  // AN digit in LTR
+        { "a7\u0667", "B", "a7\u0667", "UIDNA_ERROR_BIDI" },  // AN digit in LTR
        { "\u05D07\u0667\u05EA", "B",  // mixed EN/AN digits in RTL
          "\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" },
+        { "\u05D07\u0667", "B",  // mixed EN/AN digits in RTL
+          "\u05D07\u0667", "UIDNA_ERROR_BIDI" },
        // ZWJ
        { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" },  // Virama+ZWJ
        { "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" },  // no Virama
@@ -716,6 +724,88 @@ public class UTS46Test extends TestFmwk {
        }
    }

+    private void checkIdnaTestResult(String line, String type,
+            String expected, CharSequence result, IDNA.Info info) {
+        // An error in toUnicode or toASCII is indicated by a value in square brackets,
+        // such as "[B5 B6]".
+        boolean expectedHasErrors = !expected.isEmpty() && expected.charAt(0) == '[';
+        if (expectedHasErrors != info.hasErrors()) {
+            errln(String.format(
+                    "%s  expected errors %b != %b = actual has errors: %s\n    %s",
+                    type, expectedHasErrors, info.hasErrors(), info.getErrors(), line));
+        }
+        if (!expectedHasErrors && !UTF16Plus.equal(expected, result)) {
+            errln(String.format("%s  expected != actual\n    %s", type, line));
+            errln("    " + expected);
+            errln("    " + result);
+        }
+    }
+
+    @Test
+    public void IdnaTest() throws IOException {
+        BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTest.txt");
+        Pattern semi = Pattern.compile(";");
+        try {
+            String line;
+            while ((line = idnaTestFile.readLine()) != null) {
+                // Remove trailing comments and whitespace.
+                int commentStart = line.indexOf('#');
+                if (commentStart >= 0) {
+                    line = line.substring(0, commentStart);
+                }
+                String[] fields = semi.split(line, -1);
+                if (fields.length <= 1) {
+                    continue;  // Skip empty and comment-only lines.
+                }
+
+                // Column 1: type - T for transitional, N for nontransitional, B for both
+                String type = fields[0].trim();
+                char typeChar;
+                if (type.length() != 1 ||
+                        ((typeChar = type.charAt(0)) != 'B' && typeChar != 'N' && typeChar != 'T')) {
+                    errln("empty or unknown type field: " + line);
+                    return;
+                }
+
+                // Column 2: source - the source string to be tested
+                String source16 = Utility.unescape(fields[1].trim());
+
+                // Column 3: toUnicode - the result of applying toUnicode to the source.
+                // A blank value means the same as the source value.
+                String unicode16 = Utility.unescape(fields[2].trim());
+                if (unicode16.isEmpty()) {
+                    unicode16 = source16;
+                }
+
+                // Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
+                // A blank value means the same as the toUnicode value.
+                String ascii16 = Utility.unescape(fields[3].trim());
+                if (ascii16.isEmpty()) {
+                    ascii16 = unicode16;
+                }
+
+                // Column 5: NV8 - present if the toUnicode value would not be a valid domain name under IDNA2008. Not a normative field.
+                // Ignored as long as we do not implement and test vanilla IDNA2008.
+
+                // ToASCII/ToUnicode, transitional/nontransitional
+                StringBuilder uN, aN, aT;
+                IDNA.Info uNInfo, aNInfo, aTInfo;
+                nontrans.nameToUnicode(source16, uN = new StringBuilder(), uNInfo = new IDNA.Info());
+                checkIdnaTestResult(line, "toUnicodeNontrans", unicode16, uN, uNInfo);
+                if (typeChar == 'T' || typeChar == 'B') {
+                    trans.nameToASCII(source16, aT = new StringBuilder(), aTInfo = new IDNA.Info());
+                    checkIdnaTestResult(line, "toASCIITrans", ascii16, aT, aTInfo);
+                }
+                if (typeChar == 'N' || typeChar == 'B') {
+                    nontrans.nameToASCII(source16, aN = new StringBuilder(), aNInfo = new IDNA.Info());
+                    checkIdnaTestResult(line, "toASCIINontrans", ascii16, aN, aNInfo);
+                }
+            }
+        } finally {
+            idnaTestFile.close();
+        }
+    }
+
    private final IDNA trans, nontrans;

    private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of(
--- a/tools/unicode/py/preparseucd.py
+++ b/tools/unicode/py/preparseucd.py
@@ -1493,65 +1493,63 @@ _code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")

 def CopyAndStripWithOptionalMerge(s, t, do_merge):
  # TODO: We do not seem to need the do_merge argument and logic any more.
-  # TODO: With Python 2.7+, combine the two with statements into one.
-  with open(s, "r") as in_file:
-    with open(t, "w") as out_file:
-      first = -1  # First code point with first_data.
-      last = -1  # Last code point with first_data.
-      first_data = ""  # Common data for code points [first..last].
-      for line in in_file:
-        match = _strip_re.match(line)
+  with open(s, "r") as in_file, open(t, "w") as out_file:
+    first = -1  # First code point with first_data.
+    last = -1  # Last code point with first_data.
+    first_data = ""  # Common data for code points [first..last].
+    for line in in_file:
+      match = _strip_re.match(line)
+      if match:
+        line = match.group(1)
+      else:
+        line = line.rstrip()
+      if do_merge:
+        match = _code_point_re.match(line)
        if match:
-          line = match.group(1)
+          c = int(match.group(1), 16)
+          data = line[match.end() - 1:]
        else:
-          line = line.rstrip()
-        if do_merge:
-          match = _code_point_re.match(line)
-          if match:
-            c = int(match.group(1), 16)
-            data = line[match.end() - 1:]
+          c = -1
+          data = ""
+        if last >= 0 and (c != (last + 1) or data != first_data):
+          # output the current range
+          if first == last:
+            out_file.write("%04X%s\n" % (first, first_data))
          else:
-            c = -1
-            data = ""
-          if last >= 0 and (c != (last + 1) or data != first_data):
-            # output the current range
-            if first == last:
-              out_file.write("%04X%s\n" % (first, first_data))
-            else:
-              out_file.write("%04X..%04X%s\n" % (first, last, first_data))
-            first = -1
-            last = -1
-            first_data = ""
-          if c < 0:
-            # no data on this line, output as is
-            out_file.write(line)
-            out_file.write("\n")
-          else:
-            # data on this line, store for possible range compaction
-            if last < 0:
-              # set as the first line in a possible range
-              first = c
-              last = c
-              first_data = data
-            else:
-              # must be c == (last + 1) and data == first_data
-              # because of previous conditions
-              # continue with the current range
-              last = c
-        else:
-          # Only strip, don't merge: just output the stripped line.
+            out_file.write("%04X..%04X%s\n" % (first, last, first_data))
+          first = -1
+          last = -1
+          first_data = ""
+        if c < 0:
+          # no data on this line, output as is
          out_file.write(line)
          out_file.write("\n")
-      if do_merge and last >= 0:
-        # output the last range in the file
-        if first == last:
-          out_file.write("%04X%s\n" % (first, first_data))
        else:
-          out_file.write("%04X..%04X%s\n" % (first, last, first_data))
-        first = -1
-        last = -1
-        first_data = ""
-      out_file.flush()
+          # data on this line, store for possible range compaction
+          if last < 0:
+            # set as the first line in a possible range
+            first = c
+            last = c
+            first_data = data
+          else:
+            # must be c == (last + 1) and data == first_data
+            # because of previous conditions
+            # continue with the current range
+            last = c
+      else:
+        # Only strip, don't merge: just output the stripped line.
+        out_file.write(line)
+        out_file.write("\n")
+    if do_merge and last >= 0:
+      # output the last range in the file
+      if first == last:
+        out_file.write("%04X%s\n" % (first, first_data))
+      else:
+        out_file.write("%04X..%04X%s\n" % (first, last, first_data))
+      first = -1
+      last = -1
+      first_data = ""
+    out_file.flush()
  return t


@@ -1571,11 +1569,9 @@ def CopyAndStripAndMerge(s, t):


 def PrependBOM(s, t):
-  # TODO: With Python 2.7+, combine the two with statements into one.
-  with open(s, "r") as in_file:
-    with open(t, "w") as out_file:
-      out_file.write("\xef\xbb\xbf")  # UTF-8 BOM for ICU svn
-      shutil.copyfileobj(in_file, out_file)
+  with open(s, "r") as in_file, open(t, "w") as out_file:
+    out_file.write("\xef\xbb\xbf")  # UTF-8 BOM for ICU svn
+    shutil.copyfileobj(in_file, out_file)
  return t


@@ -1613,6 +1609,7 @@ _files = {
  "emoji-data.txt": (DontCopy, ParseNamedProperties),
  "GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
  "GraphemeBreakTest.txt": (PrependBOM, "testdata"),
+  "IdnaTest.txt": (CopyOnly, "testdata"),
  "IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
  "IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
  "LineBreak.txt": (DontCopy, ParseLineBreak),