ICU-8130 UTS 46 conformance test using Unicode IdnaTest.txt

X-SVN-Rev: 40130
This commit is contained in:
Markus Scherer 2017-05-23 04:44:58 +00:00
parent 1b2cc7d1fb
commit b2ead3e2e1
9 changed files with 15969 additions and 68 deletions

View File

@ -1015,8 +1015,8 @@ UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c
) {
info.isOkBiDi=FALSE;
}
// Get the directionalities of the intervening characters.
uint32_t mask=0;
// Add the directionalities of the intervening characters.
uint32_t mask=firstMask|lastMask;
while(i<labelLength) {
U16_NEXT_UNSAFE(label, i, c);
mask|=U_MASK(u_charDirection(c));
@ -1045,7 +1045,7 @@ UTS46::checkLabelBiDi(const UChar *label, int32_t labelLength, IDNAInfo &info) c
// label. [...]
// The following rule, consisting of six conditions, applies to labels
// in BIDI domain names.
if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
if((mask&R_AL_AN_MASK)!=0) {
info.isBiDi=TRUE;
}
}

View File

@ -373,7 +373,7 @@ or
cd $ICU_SRC/icu4c/source/data/unidata
cp confusables.txt confusablesWholeScript.txt NormalizationCorrections.txt NormalizationTest.txt SpecialCasing.txt UnicodeData.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
cd ../../test/testdata
cp BidiCharacterTest.txt BidiTest.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
cp BidiCharacterTest.txt BidiTest.txt IdnaTest.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
cp $UNICODE_DATA/ucd/CompositionExclusions.txt $ICU_SRC/icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode
* run & fix ICU4J tests

View File

@ -26,8 +26,10 @@
#include "unicode/stringpiece.h"
#include "unicode/uidna.h"
#include "unicode/unistr.h"
#include "intltest.h"
#include "charstr.h"
#include "cmemory.h"
#include "intltest.h"
#include "uparse.h"
class UTS46Test : public IntlTest {
public:
@ -38,6 +40,13 @@ public:
void TestAPI();
void TestNotSTD3();
void TestSomeCases();
void IdnaTest();
void checkIdnaTestResult(const char *line, const char *type,
const UnicodeString &expected, const UnicodeString &result,
const IDNAInfo &info);
void idnaTestOneLine(char *fields[][2], UErrorCode &errorCode);
private:
IDNA *trans, *nontrans;
};
@ -74,6 +83,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestNotSTD3);
TESTCASE_AUTO(TestSomeCases);
TESTCASE_AUTO(IdnaTest);
TESTCASE_AUTO_END;
}
@ -517,8 +527,11 @@ static const TestCase testCases[]={
{ "\\u05D07\\u05EA", "B", "\\u05D07\\u05EA", 0 },
{ "\\u05D0\\u0667\\u05EA", "B", "\\u05D0\\u0667\\u05EA", 0 }, // Arabic 7 in the middle
{ "a7\\u0667z", "B", "a7\\u0667z", UIDNA_ERROR_BIDI }, // AN digit in LTR
{ "a7\\u0667", "B", "a7\\u0667", UIDNA_ERROR_BIDI }, // AN digit in LTR
{ "\\u05D07\\u0667\\u05EA", "B", // mixed EN/AN digits in RTL
"\\u05D07\\u0667\\u05EA", UIDNA_ERROR_BIDI },
{ "\\u05D07\\u0667", "B", // mixed EN/AN digits in RTL
"\\u05D07\\u0667", UIDNA_ERROR_BIDI },
// ZWJ
{ "\\u0BB9\\u0BCD\\u200D", "N", "\\u0BB9\\u0BCD\\u200D", 0 }, // Virama+ZWJ
{ "\\u0BB9\\u200D", "N", "\\u0BB9\\u200D", UIDNA_ERROR_CONTEXTJ }, // no Virama
@ -881,4 +894,117 @@ void UTS46Test::TestSomeCases() {
}
}
namespace {

// Number of semicolon-separated columns requested from the parser for each line.
const int32_t kNumFields = 4;  // Will need 5 when we read NV8 from the optional fifth column.

/**
 * Line callback handed to u_parseDelimitedFile(): forwards one parsed line
 * to UTS46Test::idnaTestOneLine().
 *
 * @param context    the UTS46Test object that was passed to the parser as its context
 * @param fields     per-column pairs of start/limit pointers into the current line
 * @param pErrorCode in/out ICU error code, handed through to idnaTestOneLine()
 */
void U_CALLCONV
idnaTestLineFn(void *context,
               char *fields[][2], int32_t /* fieldCount */,
               UErrorCode *pErrorCode) {
    // static_cast is the named cast for turning a void * back into its object type.
    UTS46Test *test = static_cast<UTS46Test *>(context);
    test->idnaTestOneLine(fields, *pErrorCode);
}

}  // namespace
void UTS46Test::checkIdnaTestResult(const char *line, const char *type,
const UnicodeString &expected, const UnicodeString &result,
const IDNAInfo &info) {
// An error in toUnicode or toASCII is indicated by a value in square brackets,
// such as "[B5 B6]".
UBool expectedHasErrors = !expected.isEmpty() && expected[0] == u'[';
if (expectedHasErrors != info.hasErrors()) {
errln("%s expected errors %d != %d = actual has errors: %04lx\n %s",
type, expectedHasErrors, info.hasErrors(), (long)info.getErrors(), line);
}
if (!expectedHasErrors && expected != result) {
errln("%s expected != actual\n %s", type, line);
errln(UnicodeString(u" ") + expected);
errln(UnicodeString(u" ") + result);
}
}
void UTS46Test::idnaTestOneLine(char *fields[][2], UErrorCode &errorCode) {
// Column 1: type - T for transitional, N for nontransitional, B for both
const char *typePtr = u_skipWhitespace(fields[0][0]);
const char *limit;
char typeChar;
if (typePtr == fields[0][1] ||
((typeChar = *typePtr) != 'B' && typeChar != 'N' && typeChar != 'T') ||
(limit = u_skipWhitespace(typePtr + 1)) != fields[0][1]) {
errln("empty or unknown type field: %s", fields[0][0]);
errorCode = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
// Column 2: source - the source string to be tested
int32_t length = (int32_t)(fields[1][1] - fields[1][0]);
UnicodeString source16 = UnicodeString::fromUTF8(StringPiece(fields[1][0], length)).
trim().unescape();
// Column 3: toUnicode - the result of applying toUnicode to the source.
// A blank value means the same as the source value.
length = (int32_t)(fields[2][1] - fields[2][0]);
UnicodeString unicode16 = UnicodeString::fromUTF8(StringPiece(fields[2][0], length)).
trim().unescape();
if (unicode16.isEmpty()) {
unicode16 = source16;
}
// Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
// A blank value means the same as the toUnicode value.
length = (int32_t)(fields[3][1] - fields[3][0]);
UnicodeString ascii16 = UnicodeString::fromUTF8(StringPiece(fields[3][0], length)).
trim().unescape();
if (ascii16.isEmpty()) {
ascii16 = unicode16;
}
// Column 5: NV8 - present if the toUnicode value would not be a valid domain name under IDNA2008. Not a normative field.
// Ignored as long as we do not implement and test vanilla IDNA2008.
// ToASCII/ToUnicode, transitional/nontransitional
UnicodeString uN, aN, aT;
IDNAInfo uNInfo, aNInfo, aTInfo;
nontrans->nameToUnicode(source16, uN, uNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toUnicodeNontrans", unicode16, uN, uNInfo);
if (typeChar == 'T' || typeChar == 'B') {
trans->nameToASCII(source16, aT, aTInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIITrans", ascii16, aT, aTInfo);
}
if (typeChar == 'N' || typeChar == 'B') {
nontrans->nameToASCII(source16, aN, aNInfo, errorCode);
checkIdnaTestResult(fields[0][0], "toASCIINontrans", ascii16, aN, aNInfo);
}
}
// TODO: de-duplicate
// Declares LocalStdioFilePointer from FILE and fclose; IdnaTest() wraps its fopen() result in one.
// NOTE(review): presumably the wrapper calls fclose() when it goes out of scope --
// confirm against the definition of U_DEFINE_LOCAL_OPEN_POINTER.
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
// http://www.unicode.org/Public/idna/latest/IdnaTest.txt
void UTS46Test::IdnaTest() {
IcuTestErrorCode errorCode(*this, "IdnaTest");
const char *sourceTestDataPath = getSourceTestData(errorCode);
if (errorCode.logIfFailureAndReset("unable to find the source/test/testdata "
"folder (getSourceTestData())")) {
return;
}
CharString path(sourceTestDataPath, errorCode);
path.appendPathPart("IdnaTest.txt", errorCode);
LocalStdioFilePointer idnaTestFile(fopen(path.data(), "r"));
if (idnaTestFile.isNull()) {
errln("unable to open %s", path.data());
return;
}
// Columns (c1, c2,...) are separated by semicolons.
// Leading and trailing spaces and tabs in each column are ignored.
// Comments are indicated with hash marks.
char *fields[kNumFields][2];
u_parseDelimitedFile(path.data(), ';', fields, kNumFields, idnaTestLineFn, this, errorCode);
if (errorCode.logIfFailureAndReset("error parsing IdnaTest.txt")) {
return;
}
}
#endif // UCONFIG_NO_IDNA

7844
icu4c/source/test/testdata/IdnaTest.txt vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -77,7 +77,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
UParseLineFn *lineFn, void *context,
UErrorCode *pErrorCode) {
FileStream *file;
char line[300];
char line[10000];
char *start, *limit;
int32_t i, length;
@ -163,7 +163,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
}
}
/* error in a field function? */
/* too few fields? */
if(U_FAILURE(*pErrorCode)) {
break;
}

View File

@ -586,8 +586,8 @@ public final class UTS46 extends IDNA {
) {
setNotOkBiDi(info);
}
// Get the directionalities of the intervening characters.
int mask=0;
// Add the directionalities of the intervening characters.
int mask=firstMask|lastMask;
while(i<labelLimit) {
c=Character.codePointAt(label, i);
i+=Character.charCount(c);
@ -617,7 +617,7 @@ public final class UTS46 extends IDNA {
// label. [...]
// The following rule, consisting of six conditions, applies to labels
// in BIDI domain names.
if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
if((mask&R_AL_AN_MASK)!=0) {
setBiDi(info);
}
}

File diff suppressed because it is too large Load Diff

View File

@ -8,16 +8,21 @@
*/
package com.ibm.icu.dev.test.normalizer;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.Collections;
import java.util.EnumSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import org.junit.Test;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.text.IDNA;
/**
@ -413,8 +418,11 @@ public class UTS46Test extends TestFmwk {
{ "\u05D07\u05EA", "B", "\u05D07\u05EA", "" },
{ "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" }, // Arabic 7 in the middle
{ "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" }, // AN digit in LTR
{ "a7\u0667", "B", "a7\u0667", "UIDNA_ERROR_BIDI" }, // AN digit in LTR
{ "\u05D07\u0667\u05EA", "B", // mixed EN/AN digits in RTL
"\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" },
{ "\u05D07\u0667", "B", // mixed EN/AN digits in RTL
"\u05D07\u0667", "UIDNA_ERROR_BIDI" },
// ZWJ
{ "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" }, // Virama+ZWJ
{ "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama
@ -716,6 +724,88 @@ public class UTS46Test extends TestFmwk {
}
}
/**
 * Compares one IDNA operation's outcome with the expectation from a test line.
 *
 * @param line     text of the test line, printed in failure messages
 * @param type     label of the operation that was run, printed in failure messages
 * @param expected expected output string, or a bracketed error list
 * @param result   output actually produced
 * @param info     IDNA.Info filled in by the operation
 */
private void checkIdnaTestResult(String line, String type,
        String expected, CharSequence result, IDNA.Info info) {
    // An expected failure of toUnicode or toASCII is written as a value
    // in square brackets, such as "[B5 B6]".
    final boolean wantErrors = expected.startsWith("[");
    final boolean gotErrors = info.hasErrors();
    if (wantErrors != gotErrors) {
        String message = String.format(
                "%s expected errors %b != %b = actual has errors: %s\n %s",
                type, wantErrors, gotErrors, info.getErrors(), line);
        errln(message);
    }
    if (wantErrors) {
        // The bracketed error list is not an output string; nothing to compare.
        return;
    }
    if (!UTF16Plus.equal(expected, result)) {
        errln(String.format("%s expected != actual\n %s", type, line));
        errln(" " + expected);
        errln(" " + result);
    }
}
/**
 * Reads unicode/IdnaTest.txt line by line, runs the IDNA operations that each
 * line asks for, and checks the results with checkIdnaTestResult().
 * Stops at the first line whose type column is invalid or that has fewer
 * than four columns.
 *
 * @throws IOException if reading or closing the data file fails
 */
@Test
public void IdnaTest() throws IOException {
    BufferedReader idnaTestFile = TestUtil.getDataReader("unicode/IdnaTest.txt");
    Pattern semi = Pattern.compile(";");
    try {
        String line;
        while ((line = idnaTestFile.readLine()) != null) {
            // Remove trailing comments and whitespace.
            int commentStart = line.indexOf('#');
            if (commentStart >= 0) {
                line = line.substring(0, commentStart);
            }
            // Limit -1 keeps trailing empty columns, so blank values still count as fields.
            String[] fields = semi.split(line, -1);
            if (fields.length <= 1) {
                continue;  // Skip empty and comment-only lines.
            }
            if (fields.length < 4) {
                // Columns 1..4 are indexed below; report a malformed line
                // instead of failing with an ArrayIndexOutOfBoundsException.
                errln("too few fields: " + line);
                return;
            }

            // Column 1: type - T for transitional, N for nontransitional, B for both
            String type = fields[0].trim();
            char typeChar;
            if (type.length() != 1 ||
                    ((typeChar = type.charAt(0)) != 'B' && typeChar != 'N' && typeChar != 'T')) {
                errln("empty or unknown type field: " + line);
                return;
            }

            // Column 2: source - the source string to be tested
            String source16 = Utility.unescape(fields[1].trim());

            // Column 3: toUnicode - the result of applying toUnicode to the source.
            // A blank value means the same as the source value.
            String unicode16 = Utility.unescape(fields[2].trim());
            if (unicode16.isEmpty()) {
                unicode16 = source16;
            }

            // Column 4: toASCII - the result of applying toASCII to the source, using the specified type.
            // A blank value means the same as the toUnicode value.
            String ascii16 = Utility.unescape(fields[3].trim());
            if (ascii16.isEmpty()) {
                ascii16 = unicode16;
            }

            // Column 5: NV8 - present if the toUnicode value would not be a valid domain name
            // under IDNA2008. Not a normative field.
            // Ignored as long as we do not implement and test vanilla IDNA2008.

            // ToASCII/ToUnicode, transitional/nontransitional
            StringBuilder uN = new StringBuilder();
            IDNA.Info uNInfo = new IDNA.Info();
            nontrans.nameToUnicode(source16, uN, uNInfo);
            checkIdnaTestResult(line, "toUnicodeNontrans", unicode16, uN, uNInfo);

            if (typeChar == 'T' || typeChar == 'B') {
                StringBuilder aT = new StringBuilder();
                IDNA.Info aTInfo = new IDNA.Info();
                trans.nameToASCII(source16, aT, aTInfo);
                checkIdnaTestResult(line, "toASCIITrans", ascii16, aT, aTInfo);
            }
            if (typeChar == 'N' || typeChar == 'B') {
                StringBuilder aN = new StringBuilder();
                IDNA.Info aNInfo = new IDNA.Info();
                nontrans.nameToASCII(source16, aN, aNInfo);
                checkIdnaTestResult(line, "toASCIINontrans", ascii16, aN, aNInfo);
            }
        }
    } finally {
        // Runs on the early returns above as well as on normal completion.
        idnaTestFile.close();
    }
}
private final IDNA trans, nontrans;
private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of(

View File

@ -1493,65 +1493,63 @@ _code_point_re = re.compile("\s*([0-9a-fA-F]+)\s*;")
def CopyAndStripWithOptionalMerge(s, t, do_merge):
# TODO: We do not seem to need the do_merge argument and logic any more.
# TODO: With Python 2.7+, combine the two with statements into one.
with open(s, "r") as in_file:
with open(t, "w") as out_file:
first = -1 # First code point with first_data.
last = -1 # Last code point with first_data.
first_data = "" # Common data for code points [first..last].
for line in in_file:
match = _strip_re.match(line)
with open(s, "r") as in_file, open(t, "w") as out_file:
first = -1 # First code point with first_data.
last = -1 # Last code point with first_data.
first_data = "" # Common data for code points [first..last].
for line in in_file:
match = _strip_re.match(line)
if match:
line = match.group(1)
else:
line = line.rstrip()
if do_merge:
match = _code_point_re.match(line)
if match:
line = match.group(1)
c = int(match.group(1), 16)
data = line[match.end() - 1:]
else:
line = line.rstrip()
if do_merge:
match = _code_point_re.match(line)
if match:
c = int(match.group(1), 16)
data = line[match.end() - 1:]
c = -1
data = ""
if last >= 0 and (c != (last + 1) or data != first_data):
# output the current range
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
c = -1
data = ""
if last >= 0 and (c != (last + 1) or data != first_data):
# output the current range
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
if c < 0:
# no data on this line, output as is
out_file.write(line)
out_file.write("\n")
else:
# data on this line, store for possible range compaction
if last < 0:
# set as the first line in a possible range
first = c
last = c
first_data = data
else:
# must be c == (last + 1) and data == first_data
# because of previous conditions
# continue with the current range
last = c
else:
# Only strip, don't merge: just output the stripped line.
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
if c < 0:
# no data on this line, output as is
out_file.write(line)
out_file.write("\n")
if do_merge and last >= 0:
# output the last range in the file
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
out_file.flush()
# data on this line, store for possible range compaction
if last < 0:
# set as the first line in a possible range
first = c
last = c
first_data = data
else:
# must be c == (last + 1) and data == first_data
# because of previous conditions
# continue with the current range
last = c
else:
# Only strip, don't merge: just output the stripped line.
out_file.write(line)
out_file.write("\n")
if do_merge and last >= 0:
# output the last range in the file
if first == last:
out_file.write("%04X%s\n" % (first, first_data))
else:
out_file.write("%04X..%04X%s\n" % (first, last, first_data))
first = -1
last = -1
first_data = ""
out_file.flush()
return t
@ -1571,11 +1569,9 @@ def CopyAndStripAndMerge(s, t):
def PrependBOM(s, t):
# TODO: With Python 2.7+, combine the two with statements into one.
with open(s, "r") as in_file:
with open(t, "w") as out_file:
out_file.write("\xef\xbb\xbf") # UTF-8 BOM for ICU svn
shutil.copyfileobj(in_file, out_file)
with open(s, "r") as in_file, open(t, "w") as out_file:
out_file.write("\xef\xbb\xbf") # UTF-8 BOM for ICU svn
shutil.copyfileobj(in_file, out_file)
return t
@ -1613,6 +1609,7 @@ _files = {
"emoji-data.txt": (DontCopy, ParseNamedProperties),
"GraphemeBreakProperty.txt": (DontCopy, ParseGraphemeBreakProperty),
"GraphemeBreakTest.txt": (PrependBOM, "testdata"),
"IdnaTest.txt": (CopyOnly, "testdata"),
"IndicPositionalCategory.txt": (DontCopy, ParseIndicPositionalCategory),
"IndicSyllabicCategory.txt": (DontCopy, ParseIndicSyllabicCategory),
"LineBreak.txt": (DontCopy, ParseLineBreak),