ICU-21212 full range check for Punycode digits

This commit is contained in:
Markus Scherer 2020-08-13 15:35:17 -07:00
parent e3f2c0dd70
commit e19d12997b
4 changed files with 170 additions and 75 deletions

View File

@ -107,36 +107,26 @@ digitToBasic(int32_t digit, UBool uppercase) {
}
/**
* basicToDigit[] contains the numeric value of a basic code
* point (for use in representing integers) in the range 0 to
* BASE-1, or -1 if b does not represent a value.
* @return the numeric value of a basic code point (for use in representing integers)
* in the range 0 to BASE-1, or a negative value if cp is invalid.
*/
/*
 * Lookup table from a byte value to its Punycode digit value:
 * letters of either case map to 0..25, '0'..'9' map to 26..35,
 * and every other entry is -1.
 * Each row below holds 16 entries, so row r covers byte values 0xr0..0xrF.
 * NOTE(review): the visible call site indexes this table with
 * (uint8_t)src[in++], so only the low 8 bits of the UTF-16 code unit select
 * the entry; a code unit above 0xFF selects the entry of its low byte.
 */
static const int8_t
basicToDigit[256]={
/* 0x00..0x2F: no digits */
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
/* 0x30..0x3F: '0'..'9' -> 26..35, then six non-digits */
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
/* 0x40..0x5F: '@' is not a digit, 'A'..'Z' -> 0..25, then five non-digits */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
/* 0x60..0x7F: '`' is not a digit, 'a'..'z' -> 0..25, then five non-digits */
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
/* 0x80..0xFF: no digits */
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/**
 * Maps one Punycode digit character to its numeric value.
 * Works on the full code point value, so nothing is truncated before the range checks.
 *
 * @param cp the code point to decode
 * @return 0..25 for a letter of either case, 26..35 for '0'..'9',
 *         or a negative value (not necessarily -1) for anything else
 */
static int32_t decodeDigit(int32_t cp) {
    // Everything below '0' or above 'z' is rejected outright.
    if(cp<u'0' || u'z'<cp) {
        return -1;
    }
    if(cp<=u'9') {
        return 26+(cp-u'0');  // 0..9 -> 26..35
    }
    // The gaps between the runs (':'..'@' and '['..'`') need no explicit test:
    // subtracting the first letter of the following run is negative there.
    const int32_t runStart= cp<=u'Z' ? u'A' : 'a';
    return cp-runStart;  // A..Z and a..z -> 0..25
}
static inline char
asciiCaseMap(char b, UBool uppercase) {
@ -455,7 +445,7 @@ u_strFromPunycode(const UChar *src, int32_t srcLength,
return 0;
}
digit=basicToDigit[(uint8_t)src[in++]];
digit=decodeDigit(src[in++]);
if(digit<0) {
*pErrorCode=U_INVALID_CHAR_FOUND;
return 0;

View File

@ -39,6 +39,7 @@ public:
void runIndexedTest(int32_t index, UBool exec, const char *&name, char *par=NULL);
void TestAPI();
void TestNotSTD3();
void TestInvalidPunycodeDigits();
void TestSomeCases();
void IdnaTest();
@ -82,6 +83,7 @@ void UTS46Test::runIndexedTest(int32_t index, UBool exec, const char *&name, cha
TESTCASE_AUTO_BEGIN;
TESTCASE_AUTO(TestAPI);
TESTCASE_AUTO(TestNotSTD3);
TESTCASE_AUTO(TestInvalidPunycodeDigits);
TESTCASE_AUTO(TestSomeCases);
TESTCASE_AUTO(IdnaTest);
TESTCASE_AUTO_END;
@ -245,6 +247,71 @@ void UTS46Test::TestNotSTD3() {
}
}
void UTS46Test::TestInvalidPunycodeDigits() {
IcuTestErrorCode errorCode(*this, "TestInvalidPunycodeDigits()");
LocalPointer<IDNA> idna(IDNA::createUTS46Instance(0, errorCode));
if(errorCode.isFailure()) {
return;
}
UnicodeString result;
{
IDNAInfo info;
idna->nameToUnicode(u"xn--pleP", result, info, errorCode); // P=U+0050
assertFalse("nameToUnicode() should succeed",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
assertEquals("normal result", u"ᔼᔴ", result);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--pleѐ", result, info, errorCode); // ends with non-ASCII U+0450
assertTrue("nameToUnicode() should detect non-ASCII",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
// Test with ASCII characters adjacent to LDH.
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple/", result, info, errorCode);
assertTrue("nameToUnicode() should detect '/'",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple:", result, info, errorCode);
assertTrue("nameToUnicode() should detect ':'",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple@", result, info, errorCode);
assertTrue("nameToUnicode() should detect '@'",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple[", result, info, errorCode);
assertTrue("nameToUnicode() should detect '['",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple`", result, info, errorCode);
assertTrue("nameToUnicode() should detect '`'",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
{
IDNAInfo info;
idna->nameToUnicode(u"xn--ple{", result, info, errorCode);
assertTrue("nameToUnicode() should detect '{'",
(info.getErrors()&UIDNA_ERROR_PUNYCODE)!=0);
}
}
struct TestCase {
// Input string and options string (Nontransitional/Transitional/Both).
const char *s, *o;

View File

@ -13,7 +13,7 @@ import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
/**
* Ported code from ICU punycode.c
* Ported code from ICU punycode.c
* @author ram
*/
public final class Punycode {
@ -26,17 +26,17 @@ public final class Punycode {
// NOTE(review): DAMP and INITIAL_BIAS presumably are the "damp" and "initial_bias"
// Punycode parameters of RFC 3492 section 5 -- confirm against the spec.
private static final int DAMP = 700;
private static final int INITIAL_BIAS = 72;
// isBasic() treats every code point below this value as "basic".
private static final int INITIAL_N = 0x80;
/* "Basic" Unicode/ASCII code points */
// '-'
private static final char HYPHEN = 0x2d;
// The delimiter is the same character as HYPHEN.
private static final char DELIMITER = HYPHEN;
// '0'
private static final int ZERO = 0x30;
//private static final int NINE = 0x39;
// 'a' and 'z'
private static final int SMALL_A = 0x61;
private static final int SMALL_Z = 0x7a;
// 'A' and 'Z'
private static final int CAPITAL_A = 0x41;
private static final int CAPITAL_Z = 0x5a;
@ -53,39 +53,30 @@ public final class Punycode {
delta/=(BASE-TMIN);
}
return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
return count+(((BASE-TMIN+1)*delta)/(delta+SKEW));
}
/**
* basicToDigit[] contains the numeric value of a basic code
* point (for use in representing integers) in the range 0 to
* BASE-1, or -1 if b does not represent a value.
* @return the numeric value of a basic code point (for use in representing integers)
* in the range 0 to BASE-1, or a negative value if cp is invalid.
*/
/*
 * Lookup table from a byte value to its Punycode digit value:
 * letters of either case map to 0..25, '0'..'9' map to 26..35,
 * and every other entry is -1.
 * Each row below holds 16 entries, so row r covers values 0xr0..0xrF.
 * NOTE(review): the visible call site indexes this table with
 * src.charAt(in++) & 0xFF, so only the low 8 bits of the char select the
 * entry; a char above 0xFF selects the entry of its low byte.
 */
static final int[] basicToDigit= new int[]{
// 0x00..0x2F: no digits
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
// 0x30..0x3F: '0'..'9' -> 26..35, then six non-digits
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
// 0x40..0x5F: '@' is not a digit, 'A'..'Z' -> 0..25, then five non-digits
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
// 0x60..0x7F: '`' is not a digit, 'a'..'z' -> 0..25, then five non-digits
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
// 0x80..0xFF: no digits
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/**
 * Maps one Punycode digit character to its numeric value.
 * Works on the full code point value, so nothing is masked off before the range checks.
 *
 * @param cp the code point to decode
 * @return 0..25 for a letter of either case, 26..35 for '0'..'9',
 *         or a negative value (not necessarily -1) for anything else
 */
private static final int decodeDigit(int cp) {
    // Everything below '0' or above 'z' is rejected outright.
    if(cp<'0' || 'z'<cp) {
        return -1;
    }
    if(cp<='9') {
        return 26+(cp-'0');  // 0..9 -> 26..35
    }
    // The gaps between the runs (':'..'@' and '['..'`') need no explicit test:
    // subtracting the first letter of the following run is negative there.
    final int runStart= cp<='Z' ? 'A' : 'a';
    return cp-runStart;  // A..Z and a..z -> 0..25
}
///CLOVER:OFF
private static char asciiCaseMap(char b, boolean uppercase) {
@ -99,7 +90,7 @@ public final class Punycode {
}
}
return b;
}
}
///CLOVER:ON
/**
* digitToBasic() returns the basic code point whose value
@ -124,7 +115,7 @@ public final class Punycode {
* Converts Unicode to Punycode.
* The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
*
* @param src The source of the String Buffer passed.
* @param caseFlags The boolean array of case flags.
* @return An array of ASCII code points.
@ -140,7 +131,7 @@ public final class Punycode {
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount=0;
for(j=0; j<srcLength; ++j) {
c=src.charAt(j);
if(isBasic(c)) {
@ -152,7 +143,7 @@ public final class Punycode {
n|=c;
} else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
++j;
n|=UCharacter.getCodePoint(c, c2);
} else {
/* error: unmatched surrogate */
@ -211,7 +202,7 @@ public final class Punycode {
/* Represent delta as a generalized variable-length integer: */
for(q=delta, k=BASE; /* no condition */; k+=BASE) {
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
/** RAM: comment out the old code for conformance with draft-ietf-idn-punycode-03.txt
t=k-bias;
if(t<TMIN) {
@ -220,7 +211,7 @@ public final class Punycode {
t=TMAX;
}
*/
t=k-bias;
if(t<TMIN) {
t=TMIN;
@ -249,7 +240,7 @@ public final class Punycode {
return dest;
}
/**
 * Tells whether a code point counts as "basic" for Punycode.
 *
 * @param ch the code point to classify
 * @return true if ch is below INITIAL_N
 */
private static boolean isBasic(int ch){
    return INITIAL_N > ch;
}
@ -264,12 +255,12 @@ public final class Punycode {
/**
* Converts Punycode to Unicode.
* The Unicode string will be at most as long as the Punycode string.
*
*
* @param src The source of the string buffer being passed.
* @param caseFlags The array of boolean case flags.
* @return StringBuilder string.
*/
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException{
int srcLength = src.length();
StringBuilder dest = new StringBuilder(src.length());
@ -330,7 +321,7 @@ public final class Punycode {
throw new StringPrepParseException("Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
digit=basicToDigit[src.charAt(in++) & 0xFF];
digit=decodeDigit(src.charAt(in++));
if(digit<0) {
throw new StringPrepParseException("Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
}

View File

@ -105,9 +105,56 @@ public class UTS46Test extends TestFmwk {
}
}
/**
 * Verifies how nameToUnicode() treats the last character of an "xn--" label:
 * a valid Punycode digit must decode without IDNA.Error.PUNYCODE, while a
 * non-ASCII character or an ASCII character next to the letter/digit ranges
 * must report IDNA.Error.PUNYCODE.
 */
@Test
public void TestInvalidPunycodeDigits() {
    IDNA idna=IDNA.getUTS46Instance(0);
    StringBuilder result=new StringBuilder();

    // Positive control: an upper-case digit is accepted and decodes normally.
    IDNA.Info info=new IDNA.Info();
    idna.nameToUnicode("xn--pleP", result, info);  // P=U+0050
    boolean punycodeError=info.getErrors().contains(IDNA.Error.PUNYCODE);
    assertFalse("nameToUnicode() should succeed", punycodeError);
    assertEquals("normal result", "ᔼᔴ", result.toString());

    // Inputs whose final character is not a Punycode digit, each paired with
    // the message to report if the error is missed.
    final String[][] badInputs={
        // ends with non-ASCII U+0450
        { "xn--pleѐ", "nameToUnicode() should detect non-ASCII" },
        // Test with ASCII characters adjacent to LDH.
        // NOTE(review): this input is upper-case unlike its siblings -- confirm it is intentional.
        { "xn--PLE/", "nameToUnicode() should detect '/'" },
        { "xn--ple:", "nameToUnicode() should detect ':'" },
        { "xn--ple@", "nameToUnicode() should detect '@'" },
        { "xn--ple[", "nameToUnicode() should detect '['" },
        { "xn--ple`", "nameToUnicode() should detect '`'" },
        { "xn--ple{", "nameToUnicode() should detect '{'" },
    };
    for(String[] bad : badInputs) {
        info=new IDNA.Info();  // fresh error set for every input
        idna.nameToUnicode(bad[0], result, info);
        assertTrue(bad[1], info.getErrors().contains(IDNA.Error.PUNYCODE));
    }
}
private static final Map<String, IDNA.Error> errorNamesToErrors;
static {
errorNamesToErrors=new TreeMap<String, IDNA.Error>();
errorNamesToErrors=new TreeMap<>();
errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);