From e19d12997b1ab9599125c77709bfaf3c8c9ec09f Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 13 Aug 2020 15:35:17 -0700 Subject: [PATCH] ICU-21212 full range check for Punycode digits --- icu4c/source/common/punycode.cpp | 50 +++++------- icu4c/source/test/intltest/uts46test.cpp | 67 ++++++++++++++++ .../core/src/com/ibm/icu/impl/Punycode.java | 79 ++++++++----------- .../icu/dev/test/normalizer/UTS46Test.java | 49 +++++++++++- 4 files changed, 170 insertions(+), 75 deletions(-) diff --git a/icu4c/source/common/punycode.cpp b/icu4c/source/common/punycode.cpp index 90fe1ec3c8..8f14a7abbe 100644 --- a/icu4c/source/common/punycode.cpp +++ b/icu4c/source/common/punycode.cpp @@ -107,36 +107,26 @@ digitToBasic(int32_t digit, UBool uppercase) { } /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. + * @return the numeric value of a basic code point (for use in representing integers) + * in the range 0 to BASE-1, or a negative value if cp is invalid. */ -static const int8_t -basicToDigit[256]={ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 -}; +static int32_t decodeDigit(int32_t cp) { + if(cp<=u'Z') { + if(cp<=u'9') { + if(cp 26..35 + } + } else { + return cp-u'A'; // A-Z -> 0..25 + } + } else if(cp<=u'z') { + return cp-'a'; // a..z -> 0..25 + } else { + return -1; + } +} static inline char asciiCaseMap(char b, UBool uppercase) { @@ -455,7 +445,7 @@ u_strFromPunycode(const UChar *src, int32_t srcLength, return 0; } - digit=basicToDigit[(uint8_t)src[in++]]; + digit=decodeDigit(src[in++]); if(digit<0) { *pErrorCode=U_INVALID_CHAR_FOUND; return 0; diff --git a/icu4c/source/test/intltest/uts46test.cpp b/icu4c/source/test/intltest/uts46test.cpp index b399d2dd72..39ba01d385 100644 --- a/icu4c/source/test/intltest/uts46test.cpp +++ b/icu4c/source/test/intltest/uts46test.cpp @@ -39,6 +39,7 @@ public: void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL); void TestAPI(); void TestNotSTD3(); + void TestInvalidPunycodeDigits(); void TestSomeCases(); void IdnaTest(); @@ -82,6 +83,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha TESTCASE_AUTO_BEGIN; TESTCASE_AUTO(TestAPI); TESTCASE_AUTO(TestNotSTD3); + TESTCASE_AUTO(TestInvalidPunycodeDigits); TESTCASE_AUTO(TestSomeCases); TESTCASE_AUTO(IdnaTest); TESTCASE_AUTO_END; @@ -245,6 +247,71 @@ void UTS46Test::TestNotSTD3() { } } +void UTS46Test::TestInvalidPunycodeDigits() { + IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()"); + LocalPointer idna(IDNA::createUTS46Instance(0, errorCode)); + if(errorCode.isFailure()) { + return; + } + UnicodeString result; + { + IDNAInfo info; + idna->nameToUnicode(u"xn--pleP", result, info, errorCode); // P=U+0050 + assertFalse("nameToUnicode() should succeed", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + assertEquals("normal result", u"ᔼᔴ", result); + } + { + IDNAInfo info; + idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode); // ends with non-ASCII U+0450 + assertTrue("nameToUnicode() should detect non-ASCII", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + // Test with ASCII characters adjacent to LDH. + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple/", result, info, errorCode); + assertTrue("nameToUnicode() should detect '/'", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple:", result, info, errorCode); + assertTrue("nameToUnicode() should detect ':'", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple@", result, info, errorCode); + assertTrue("nameToUnicode() should detect '@'", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple[", result, info, errorCode); + assertTrue("nameToUnicode() should detect '['", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple`", result, info, errorCode); + assertTrue("nameToUnicode() should detect '`'", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } + + { + IDNAInfo info; + idna->nameToUnicode(u"xn--ple{", result, info, errorCode); + assertTrue("nameToUnicode() should detect '{'", + (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0); + } +} + struct TestCase { // Input string and options string (Nontransitional/Transitional/Both). const char *s, *o; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java index 5cdcdb88fb..7b2395ca26 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java @@ -13,7 +13,7 @@ import com.ibm.icu.text.StringPrepParseException; import com.ibm.icu.text.UTF16; /** - * Ported code from ICU punycode.c + * Ported code from ICU punycode.c * @author ram */ public final class Punycode { @@ -26,17 +26,17 @@ public final class Punycode { private static final int DAMP = 700; private static final int INITIAL_BIAS = 72; private static final int INITIAL_N = 0x80; - + /* "Basic" Unicode/ASCII code points */ private static final char HYPHEN = 0x2d; private static final char DELIMITER = HYPHEN; - + private static final int ZERO = 0x30; //private static final int NINE = 0x39; - + private static final int SMALL_A = 0x61; private static final int SMALL_Z = 0x7a; - + private static final int CAPITAL_A = 0x41; private static final int CAPITAL_Z = 0x5a; @@ -53,39 +53,30 @@ public final class Punycode { delta/=(BASE-TMIN); } - return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); + return count+(((BASE-TMIN+1)*delta)/(delta+SKEW)); } /** - * basicToDigit[] contains the numeric value of a basic code - * point (for use in representing integers) in the range 0 to - * BASE-1, or -1 if b is does not represent a value. + * @return the numeric value of a basic code point (for use in representing integers) + * in the range 0 to BASE-1, or a negative value if cp is invalid. */ - static final int[] basicToDigit= new int[]{ - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; + private static final int decodeDigit(int cp) { + if(cp<='Z') { + if(cp<='9') { + if(cp<'0') { + return -1; + } else { + return cp-'0'+26; // 0..9 -> 26..35 + } + } else { + return cp-'A'; // A-Z -> 0..25 + } + } else if(cp<='z') { + return cp-'a'; // a..z -> 0..25 + } else { + return -1; + } + } ///CLOVER:OFF private static char asciiCaseMap(char b, boolean uppercase) { @@ -99,7 +90,7 @@ public final class Punycode { } } return b; - } + } ///CLOVER:ON /** * digitToBasic() returns the basic code point whose value @@ -124,7 +115,7 @@ public final class Punycode { * Converts Unicode to Punycode. * The input string must not contain single, unpaired surrogates. * The output will be represented as an array of ASCII code points. - * + * * @param src The source of the String Buffer passed. * @param caseFlags The boolean array of case flags. * @return An array of ASCII code points. @@ -140,7 +131,7 @@ public final class Punycode { * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit): */ srcCPCount=0; - + for(j=0; j errorNamesToErrors; static { - errorNamesToErrors=new TreeMap(); + errorNamesToErrors=new TreeMap<>(); errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL); errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG); errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);