ICU-21212 full range check for Punycode digits

2020-08-13 15:35:17 -07:00 · 2020-08-13 15:35:17 -07:00 · e19d12997b
commit e19d12997b
parent e3f2c0dd70
4 changed files with 170 additions and 75 deletions
--- a/icu4c/source/common/punycode.cpp
+++ b/icu4c/source/common/punycode.cpp
@ -107,36 +107,26 @@ digitToBasic(int32_t digit, UBool uppercase) {
 }

 /**
- * basicToDigit[] contains the numeric value of a basic code
- * point (for use in representing integers) in the range 0 to
- * BASE-1, or -1 if b is does not represent a value.
+ * @return the numeric value of a basic code point (for use in representing integers)
+ *         in the range 0 to BASE-1, or a negative value if cp is invalid.
 */
-static const int8_t
-basicToDigit[256]={
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-
-    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-    -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-};
+static int32_t decodeDigit(int32_t cp) {
+    if(cp<=u'Z') {
+        if(cp<=u'9') {
+            if(cp<u'0') {
+                return -1;
+            } else {
+                return cp-u'0'+26;  // 0..9 -> 26..35
+            }
+        } else {
+            return cp-u'A';  // A-Z -> 0..25
+        }
+    } else if(cp<=u'z') {
+        return cp-'a';  // a..z -> 0..25
+    } else {
+        return -1;
+    }
+}

 static inline char
 asciiCaseMap(char b, UBool uppercase) {
@ -455,7 +445,7 @@ u_strFromPunycode(const UChar *src, int32_t srcLength,
                return 0;
            }

-            digit=basicToDigit[(uint8_t)src[in++]];
+            digit=decodeDigit(src[in++]);
            if(digit<0) {
                *pErrorCode=U_INVALID_CHAR_FOUND;
                return 0;
--- a/icu4c/source/test/intltest/uts46test.cpp
+++ b/icu4c/source/test/intltest/uts46test.cpp
@ -39,6 +39,7 @@ public:
    void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
    void TestAPI();
    void TestNotSTD3();
+    void TestInvalidPunycodeDigits();
    void TestSomeCases();
    void IdnaTest();

@ -82,6 +83,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(TestAPI);
    TESTCASE_AUTO(TestNotSTD3);
+    TESTCASE_AUTO(TestInvalidPunycodeDigits);
    TESTCASE_AUTO(TestSomeCases);
    TESTCASE_AUTO(IdnaTest);
    TESTCASE_AUTO_END;
@ -245,6 +247,71 @@ void UTS46Test::TestNotSTD3() {
    }
 }

+void UTS46Test::TestInvalidPunycodeDigits() {
+    IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
+    LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
+    if(errorCode.isFailure()) {
+        return;
+    }
+    UnicodeString result;
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--pleP", result, info, errorCode);  // P=U+0050
+        assertFalse("nameToUnicode() should succeed",
+                    (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+        assertEquals("normal result", u"ᔼᔴ", result);
+    }
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode);  // ends with non-ASCII U+0450
+        assertTrue("nameToUnicode() should detect non-ASCII",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    // Test with ASCII characters adjacent to LDH.
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple/", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '/'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple:", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect ':'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple@", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '@'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple[", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '['",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple`", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '`'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+
+    {
+        IDNAInfo info;
+        idna->nameToUnicode(u"xn--ple{", result, info, errorCode);
+        assertTrue("nameToUnicode() should detect '{'",
+                   (info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
+    }
+}
+
 struct TestCase {
    // Input string and options string (Nontransitional/Transitional/Both).
    const char *s, *o;
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Punycode.java
@ -13,7 +13,7 @@ import com.ibm.icu.text.StringPrepParseException;
 import com.ibm.icu.text.UTF16;

 /**
- * Ported code from ICU punycode.c 
+ * Ported code from ICU punycode.c
 * @author ram
 */
 public final class Punycode {
@ -26,17 +26,17 @@ public final class Punycode {
    private static final int DAMP           = 700;
    private static final int INITIAL_BIAS   = 72;
    private static final int INITIAL_N      = 0x80;
-    
+
    /* "Basic" Unicode/ASCII code points */
    private static final char HYPHEN        = 0x2d;
    private static final char DELIMITER     = HYPHEN;
-    
+
    private static final int ZERO           = 0x30;
    //private static final int NINE           = 0x39;
-    
+
    private static final int SMALL_A        = 0x61;
    private static final int SMALL_Z        = 0x7a;
-    
+
    private static final int CAPITAL_A      = 0x41;
    private static final int CAPITAL_Z      = 0x5a;

@ -53,39 +53,30 @@ public final class Punycode {
            delta/=(BASE-TMIN);
        }

-        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));         
+        return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
    }

    /**
-     * basicToDigit[] contains the numeric value of a basic code
-     * point (for use in representing integers) in the range 0 to
-     * BASE-1, or -1 if b is does not represent a value.
+     * @return the numeric value of a basic code point (for use in representing integers)
+     *         in the range 0 to BASE-1, or a negative value if cp is invalid.
     */
-    static final int[]    basicToDigit= new int[]{
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-    
-        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-    
-        -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
-        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-    
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-    };
+    private static final int decodeDigit(int cp) {
+        if(cp<='Z') {
+            if(cp<='9') {
+                if(cp<'0') {
+                    return -1;
+                } else {
+                    return cp-'0'+26;  // 0..9 -> 26..35
+                }
+            } else {
+                return cp-'A';  // A-Z -> 0..25
+            }
+        } else if(cp<='z') {
+            return cp-'a';  // a..z -> 0..25
+        } else {
+            return -1;
+        }
+    }

    ///CLOVER:OFF
    private static char asciiCaseMap(char b, boolean uppercase) {
@ -99,7 +90,7 @@ public final class Punycode {
            }
        }
        return b;
-    }    
+    }
    ///CLOVER:ON
    /**
     * digitToBasic() returns the basic code point whose value
@ -124,7 +115,7 @@ public final class Punycode {
     * Converts Unicode to Punycode.
     * The input string must not contain single, unpaired surrogates.
     * The output will be represented as an array of ASCII code points.
-     * 
+     *
     * @param src The source of the String Buffer passed.
     * @param caseFlags The boolean array of case flags.
     * @return An array of ASCII code points.
@ -140,7 +131,7 @@ public final class Punycode {
         * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
         */
        srcCPCount=0;
-        
+
        for(j=0; j<srcLength; ++j) {
            c=src.charAt(j);
            if(isBasic(c)) {
@ -152,7 +143,7 @@ public final class Punycode {
                    n|=c;
                } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                    ++j;
-                    
+
                    n|=UCharacter.getCodePoint(c, c2);
                } else {
                    /* error: unmatched surrogate */
@ -211,7 +202,7 @@ public final class Punycode {
                    /* Represent delta as a generalized variable-length integer: */
                    for(q=delta, k=BASE; /* no condition */; k+=BASE) {

-                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt   
+                        /** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt

                        t=k-bias;
                        if(t<TMIN) {
@ -220,7 +211,7 @@ public final class Punycode {
                            t=TMAX;
                        }
                        */
-                    
+
                        t=k-bias;
                        if(t<TMIN) {
                            t=TMIN;
@ -249,7 +240,7 @@ public final class Punycode {

        return dest;
    }
-    
+
    private static boolean isBasic(int ch){
        return (ch < INITIAL_N);
    }
@ -264,12 +255,12 @@ public final class Punycode {
    /**
     * Converts Punycode to Unicode.
     * The Unicode string will be at most as long as the Punycode string.
-     * 
+     *
     * @param src The source of the string buffer being passed.
     * @param caseFlags The array of boolean case flags.
     * @return StringBuilder string.
     */
-    public static StringBuilder decode(CharSequence src, boolean[] caseFlags) 
+    public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
                               throws StringPrepParseException{
        int srcLength = src.length();
        StringBuilder dest = new StringBuilder(src.length());
@ -330,7 +321,7 @@ public final class Punycode {
                    throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
                }

-                digit=basicToDigit[src.charAt(in++) & 0xFF];
+                digit=decodeDigit(src.charAt(in++));
                if(digit<0) {
                    throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
                }
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java
@ -105,9 +105,56 @@ public class UTS46Test extends TestFmwk {
        }
    }

+    @Test
+    public void TestInvalidPunycodeDigits() {
+        IDNA idna=IDNA.getUTS46Instance(0);
+        StringBuilder result=new StringBuilder();
+        IDNA.Info info=new IDNA.Info();
+        idna.nameToUnicode("xn--pleP", result, info);  // P=U+0050
+        assertFalse("nameToUnicode() should succeed",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+        assertEquals("normal result", "ᔼᔴ", result.toString());
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--pleѐ", result, info);  // ends with non-ASCII U+0450
+        assertTrue("nameToUnicode() should detect non-ASCII",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        // Test with ASCII characters adjacent to LDH.
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--PLE/", result, info);
+        assertTrue("nameToUnicode() should detect '/'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple:", result, info);
+        assertTrue("nameToUnicode() should detect ':'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple@", result, info);
+        assertTrue("nameToUnicode() should detect '@'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple[", result, info);
+        assertTrue("nameToUnicode() should detect '['",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple`", result, info);
+        assertTrue("nameToUnicode() should detect '`'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+
+        info=new IDNA.Info();
+        idna.nameToUnicode("xn--ple{", result, info);
+        assertTrue("nameToUnicode() should detect '{'",
+                info.getErrors().contains(IDNA.Error.PUNYCODE));
+    }
+
    private static final Map<String, IDNA.Error> errorNamesToErrors;
    static {
-        errorNamesToErrors=new TreeMap<String, IDNA.Error>();
+        errorNamesToErrors=new TreeMap<>();
        errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
        errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
        errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);