diff --git a/.gitattributes b/.gitattributes index 83b835f8d4..da6048c892 100644 --- a/.gitattributes +++ b/.gitattributes @@ -88,6 +88,8 @@ icu4j/main/classes/core/.classpath -text icu4j/main/classes/core/.project -text icu4j/main/classes/core/.settings/org.eclipse.jdt.core.prefs -text icu4j/main/classes/core/manifest.stub -text +icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java -text +icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java -text icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java -text icu4j/main/classes/currdata/.externalToolBuilders/copy-data-currdata.launch -text icu4j/main/classes/currdata/.settings/org.eclipse.jdt.core.prefs -text @@ -142,6 +144,7 @@ icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/BidiTest.txt -text icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/confusables.txt -text icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/confusablesWholeScript.txt -text icu4j/main/tests/core/src/com/ibm/icu/dev/test/bidi/BiDiConformanceTest.java -text +icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java -text icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.OlsonTimeZone.dat -text icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.TimeZoneAdapter.dat -text icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.math.BigDecimal.dat -text diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java new file mode 100644 index 0000000000..33bcc0610b --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java @@ -0,0 +1,437 @@ +/* +******************************************************************************* +* Copyright (C) 2003-2010, International Business Machines +* Corporation and others. All Rights Reserved. 
+******************************************************************************* +*/ +package com.ibm.icu.impl; + +import com.ibm.icu.impl.Punycode; +import com.ibm.icu.text.IDNA; +import com.ibm.icu.text.StringPrep; +import com.ibm.icu.text.StringPrepParseException; +import com.ibm.icu.text.UCharacterIterator; + +/** + * IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java + * while extending that class to support IDNA2008/UTS #46 as well. + * @author Ram Viswanadha + */ +public final class IDNA2003 { + /* IDNA ACE Prefix is "xn--" */ + private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; + //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length; + + private static final int MAX_LABEL_LENGTH = 63; + private static final int HYPHEN = 0x002D; + private static final int CAPITAL_A = 0x0041; + private static final int CAPITAL_Z = 0x005A; + private static final int LOWER_CASE_DELTA = 0x0020; + private static final int FULL_STOP = 0x002E; + private static final int MAX_DOMAIN_NAME_LENGTH = 255; + + // The NamePrep profile object + private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); + + private static boolean startsWithPrefix(StringBuffer src){ + boolean startsWithPrefix = true; + + if(src.length() < ACE_PREFIX.length){ + return false; + } + for(int i=0; i0x007A){ + return false; + } + //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] + if( (ch==0x002D) || + (0x0030 <= ch && ch <= 0x0039) || + (0x0041 <= ch && ch <= 0x005A) || + (0x0061 <= ch && ch <= 0x007A) + ){ + return true; + } + return false; + } + + /** + * Ascertain if the given code point is a label separator as + * defined by the IDNA RFC + * + * @param ch The code point to be ascertained + * @return true if the char is a label separator + * @stable ICU 2.8 + */ + private static boolean isLabelSeparator(int ch){ + switch(ch){ + case 0x002e: + case 0x3002: + case 0xFF0E: + case 0xFF61: + return true; + 
default: + return false; + } + } + + public static StringBuffer convertToASCII(UCharacterIterator src, int options) + throws StringPrepParseException{ + + boolean[] caseFlags = null; + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + boolean srcIsLDH = true; + + //get the options + boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0); + int ch; + // step 1 + while((ch = src.next())!= UCharacterIterator.DONE){ + if(ch> 0x7f){ + srcIsASCII = false; + } + } + int failPos = -1; + src.setToStart(); + StringBuffer processOut = null; + // step 2 is performed only if the source contains non ASCII + if(!srcIsASCII){ + // step 2 + processOut = namePrep.prepare(src, options); + }else{ + processOut = new StringBuffer(src.getText()); + } + int poLen = processOut.length(); + + if(poLen==0){ + throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); + } + StringBuffer dest = new StringBuffer(); + + // reset the variable to verify if output of prepare is ASCII or not + srcIsASCII = true; + + // step 3 & 4 + for(int j=0;j 0x7F){ + srcIsASCII = false; + }else if(isLDHChar(ch)==false){ + // here we do not assemble surrogates + // since we know that LDH code points + // are in the ASCII range only + srcIsLDH = false; + failPos = j; + } + } + + if(useSTD3ASCIIRules == true){ + // verify 3a and 3b + if( srcIsLDH == false /* source contains some non-LDH characters */ + || processOut.charAt(0) == HYPHEN + || processOut.charAt(processOut.length()-1) == HYPHEN){ + + /* populate the parseError struct */ + if(srcIsLDH==false){ + throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules", + StringPrepParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (failPos>0) ? 
(failPos-1) : failPos); + }else if(processOut.charAt(0) == HYPHEN){ + throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", + StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); + + }else{ + throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", + StringPrepParseException.STD3_ASCII_RULES_ERROR, + processOut.toString(), + (poLen>0) ? poLen-1 : poLen); + + } + } + } + if(srcIsASCII){ + dest = processOut; + }else{ + // step 5 : verify the sequence does not begin with ACE prefix + if(!startsWithPrefix(processOut)){ + + //step 6: encode the sequence with punycode + caseFlags = new boolean[poLen]; + + StringBuilder punyout = Punycode.encode(processOut,caseFlags); + + // convert all codepoints to lower case ASCII + StringBuffer lowerOut = toASCIILower(punyout); + + //Step 7: prepend the ACE prefix + dest.append(ACE_PREFIX,0,ACE_PREFIX.length); + //Step 6: copy the contents in b2 into dest + dest.append(lowerOut); + }else{ + + throw new StringPrepParseException("The input does not start with the ACE Prefix.", + StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0); + } + } + if(dest.length() > MAX_LABEL_LENGTH){ + throw new StringPrepParseException("The labels in the input are too long. Length > 63.", + StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); + } + return dest; + } + + public static StringBuffer convertIDNToASCII(String src,int options) + throws StringPrepParseException{ + + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex=0; + for(;;){ + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); + //make sure this is not a root label separator. 
+ if(!(label.length()==0 && sepIndex==srcArr.length)){ + UCharacterIterator iter = UCharacterIterator.getInstance(label); + result.append(convertToASCII(iter,options)); + } + if(sepIndex==srcArr.length){ + break; + } + + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex = sepIndex; + result.append((char)FULL_STOP); + } + if(result.length() > MAX_DOMAIN_NAME_LENGTH){ + throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); + } + return result; + } + + public static StringBuffer convertToUnicode(UCharacterIterator src, int options) + throws StringPrepParseException{ + + boolean[] caseFlags = null; + + // the source contains all ascii codepoints + boolean srcIsASCII = true; + // assume the source contains all LDH codepoints + //boolean srcIsLDH = true; + + //get the options + //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); + + //int failPos = -1; + int ch; + int saveIndex = src.getIndex(); + // step 1: find out if all the codepoints in src are ASCII + while((ch=src.next())!= UCharacterIterator.DONE){ + if(ch>0x7F){ + srcIsASCII = false; + }/*else if((srcIsLDH = isLDHChar(ch))==false){ + failPos = src.getIndex(); + }*/ + } + StringBuffer processOut; + + if(srcIsASCII == false){ + try { + // step 2: process the string + src.setIndex(saveIndex); + processOut = namePrep.prepare(src,options); + } catch (StringPrepParseException ex) { + return new StringBuffer(src.getText()); + } + + }else{ + //just point to source + processOut = new StringBuffer(src.getText()); + } + // TODO: + // The RFC states that + // + // ToUnicode never fails. If any step fails, then the original input + // is returned immediately in that step. 
+ // + + //step 3: verify ACE Prefix + if(startsWithPrefix(processOut)){ + StringBuffer decodeOut = null; + + //step 4: Remove the ACE Prefix + String temp = processOut.substring(ACE_PREFIX.length,processOut.length()); + + //step 5: Decode using punycode + try { + decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags)); + } catch (StringPrepParseException e) { + decodeOut = null; + } + + //step 6:Apply toASCII + if (decodeOut != null) { + StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options); + + //step 7: verify + if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ +// throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed", +// StringPrepParseException.VERIFICATION_ERROR); + decodeOut = null; + } + } + + //step 8: return output of step 5 + if (decodeOut != null) { + return decodeOut; + } + } + +// }else{ +// // verify that STD3 ASCII rules are satisfied +// if(useSTD3ASCIIRules == true){ +// if( srcIsLDH == false /* source contains some non-LDH characters */ +// || processOut.charAt(0) == HYPHEN +// || processOut.charAt(processOut.length()-1) == HYPHEN){ +// +// if(srcIsLDH==false){ +// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", +// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), +// (failPos>0) ? 
(failPos-1) : failPos); +// }else if(processOut.charAt(0) == HYPHEN){ +// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", +// StringPrepParseException.STD3_ASCII_RULES_ERROR, +// processOut.toString(),0); +// +// }else{ +// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", +// StringPrepParseException.STD3_ASCII_RULES_ERROR, +// processOut.toString(), +// processOut.length()); +// +// } +// } +// } +// // just return the source +// return new StringBuffer(src.getText()); +// } + + return new StringBuffer(src.getText()); + } + + public static StringBuffer convertIDNToUnicode(String src, int options) + throws StringPrepParseException{ + + char[] srcArr = src.toCharArray(); + StringBuffer result = new StringBuffer(); + int sepIndex=0; + int oldSepIndex=0; + for(;;){ + sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); + String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); + if(label.length()==0 && sepIndex!=srcArr.length ){ + throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); + } + UCharacterIterator iter = UCharacterIterator.getInstance(label); + result.append(convertToUnicode(iter,options)); + if(sepIndex==srcArr.length){ + break; + } + // Unlike the ToASCII operation we don't normalize the label separators + result.append(srcArr[sepIndex]); + // increment the sepIndex to skip past the separator + sepIndex++; + oldSepIndex =sepIndex; + } + if(result.length() > MAX_DOMAIN_NAME_LENGTH){ + throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); + } + return result; + } + + public static int compare(String s1, String s2, int options) throws StringPrepParseException{ + StringBuffer s1Out = convertIDNToASCII(s1, options); + StringBuffer s2Out = convertIDNToASCII(s2, options); + return 
compareCaseInsensitiveASCII(s1Out,s2Out); + } +} diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java index 01b59830a7..920e515c80 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Norm2AllModes.java @@ -8,7 +8,6 @@ package com.ibm.icu.impl; import java.io.IOException; import java.io.InputStream; -import java.util.MissingResourceException; import com.ibm.icu.text.Normalizer; import com.ibm.icu.text.Normalizer2; @@ -328,13 +327,12 @@ public final class Norm2AllModes { private static CacheBase cache = new SoftCache() { protected Norm2AllModes createInstance(String key, InputStream data) { + Normalizer2Impl impl; if(data==null) { - throw new MissingResourceException( - "No Normalizer2 data name \""+key+"\" cached, and InputStream is null", - "Normalizer2", - key); + impl=new Normalizer2Impl().load(ICUResourceBundle.ICU_BUNDLE+"/"+key+".nrm"); + } else { + impl=new Normalizer2Impl().load(data); } - Normalizer2Impl impl=new Normalizer2Impl().load(data); return new Norm2AllModes(impl); } }; diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java index a20ff4eb5e..d4882a3f58 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java @@ -340,6 +340,9 @@ public final class Normalizer2Impl { * @draft ICU 4.6 */ public static boolean equal(CharSequence s1, CharSequence s2) { + if(s1==s2) { + return true; + } int length=s1.length(); if(length!=s2.length()) { return false; @@ -368,6 +371,9 @@ public final class Normalizer2Impl { if((limit1-start1)!=(limit2-start2)) { return false; } + if(s1==s2 && start1==start2) { + return true; + } while(start1=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) && + isASCIIString(dest) 
&& + (dest.length()>254 || dest.charAt(253)!='.') + ) { + addError(info, Error.DOMAIN_NAME_TOO_LONG); + } + return dest; + } + + @Override + public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) { + return process(name, false, false, dest, info); + } + + private static final Normalizer2 uts46Norm2= + Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm + final int options; + + // Severe errors which usually result in a U+FFFD replacement character in the result string. + private static final EnumSet severeErrors=EnumSet.of( + Error.LEADING_COMBINING_MARK, + Error.DISALLOWED, + Error.PUNYCODE, + Error.LABEL_HAS_DOT, + Error.INVALID_ACE_LABEL); + + private static boolean + isASCIIString(CharSequence dest) { + int length=dest.length(); + for(int i=0; i0x7f) { + return false; + } + } + return true; + } + + // UTS #46 data for ASCII characters. + // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase + // and passes through all other ASCII characters. + // If USE_STD3_RULES is set, then non-LDH characters are disallowed + // using this data. + // The ASCII fastpath also uses this data. 
+ // Values: -1=disallowed 0==valid 1==mapped (lowercase) + private static final byte asciiData[]={ + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // 002D..002E; valid # HYPHEN-MINUS..FULL STOP + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1, + // 0030..0039; valid # DIGIT ZERO..DIGIT NINE + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, + // 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z + -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, + // 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z + -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 + }; + + private StringBuilder + process(CharSequence src, + boolean isLabel, boolean toASCII, + StringBuilder dest, + Info info) { + // uts46Norm2.normalize() would do all of this error checking and setup, + // but with the ASCII fastpath we do not always call it, and do not + // call it first. + if(dest==src) { + throw new IllegalArgumentException(); + } + // Arguments are fine, reset output values. + dest.delete(0, 0x7fffffff); + resetInfo(info); + int srcLength=src.length(); + if(srcLength==0) { + if(toASCII) { + addError(info, Error.EMPTY_LABEL); + } + return dest; + } + // ASCII fastpath + boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; + int labelStart=0; + int i; + for(i=0;; ++i) { + if(i==srcLength) { + if(toASCII) { + if((i-labelStart)>63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + // There is a trailing dot if labelStart==i. + if(!isLabel && i>=254 && (i>254 || labelStart0x7f) { + break; + } + int cData=asciiData[c]; + if(cData>0) { + dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter. + } else if(cData<0 && disallowNonLDHDot) { + break; // Replacing with U+FFFD can be complicated for toASCII. 
+ } else { + dest.append(c); + if(c=='-') { // hyphen + if(i==(labelStart+3) && src.charAt(i-1)=='-') { + // "??--..." is Punycode or forbidden. + ++i; // '-' was copied to dest already + break; + } + if(i==labelStart) { + // label starts with "-" + addLabelError(info, Error.LEADING_HYPHEN); + } + if((i+1)==srcLength || src.charAt(i+1)=='.') { + // label ends with "-" + addLabelError(info, Error.TRAILING_HYPHEN); + } + } else if(c=='.') { // dot + if(isLabel) { + // Replacing with U+FFFD can be complicated for toASCII. + ++i; // '.' was copied to dest already + break; + } + if(toASCII) { + // Permit an empty label at the end but not elsewhere. + if(i==labelStart && i<(srcLength-1)) { + addLabelError(info, Error.EMPTY_LABEL); + } else if((i-labelStart)>63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + } + promoteAndResetLabelErrors(info); + labelStart=i+1; + } + } + } + promoteAndResetLabelErrors(info); + processUnicode(src, labelStart, i, isLabel, toASCII, dest, info); + if( isBiDi(info) && !hasCertainErrors(info, severeErrors) && + (!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart))) + ) { + addError(info, Error.BIDI); + } + return dest; + } + + private StringBuilder + processUnicode(CharSequence src, + int labelStart, int mappingStart, + boolean isLabel, boolean toASCII, + StringBuilder dest, + Info info) { + if(mappingStart==0) { + uts46Norm2.normalize(src, dest); + } else { + uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length())); + } + boolean doMapDevChars= + toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 : + (options&NONTRANSITIONAL_TO_UNICODE)==0; + int destLength=dest.length(); + int labelLimit=labelStart; + while(labelLimit=0x200c)) { + setTransitionalDifferent(info); + if(doMapDevChars) { + destLength=mapDevChars(dest, labelStart, labelLimit); + // Do not increment labelLimit in case c was removed. + // All deviation characters have been mapped, no need to check for them again. 
+ doMapDevChars=false; + } else { + ++labelLimit; + } + } else { + ++labelLimit; + } + } + // Permit an empty label at the end (0=4 && + dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' && + dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-' + ) { + // Label starts with "xn--", try to un-Punycode it. + wasPunycode=true; + try { + fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null); + } catch (StringPrepParseException e) { + addLabelError(info, Error.PUNYCODE); + return markBadACELabel(dest, labelStart, labelLength, toASCII, info); + } + // Check for NFC, and for characters that are not + // valid or deviation characters according to the normalizer. + // If there is something wrong, then the string will change. + // Note that the normalizer passes through non-LDH ASCII and deviation characters. + // Deviation characters are ok in Punycode even in transitional processing. + // In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES + // then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too. 
+ boolean isValid=uts46Norm2.isNormalized(fromPunycode); + if(!isValid) { + addLabelError(info, Error.INVALID_ACE_LABEL); + return markBadACELabel(dest, labelStart, labelLength, toASCII, info); + } + labelString=fromPunycode; + labelStart=0; + labelLength=fromPunycode.length(); + } else { + wasPunycode=false; + labelString=dest; + } + // Validity check + if(labelLength==0) { + if(toASCII) { + addLabelError(info, Error.EMPTY_LABEL); + } + return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); + } + // labelLength>0 + if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') { + // label starts with "??--" + addLabelError(info, Error.HYPHEN_3_4); + } + if(labelString.charAt(labelStart)=='-') { + // label starts with "-" + addLabelError(info, Error.LEADING_HYPHEN); + } + if(labelString.charAt(labelStart+labelLength-1)=='-') { + // label ends with "-" + addLabelError(info, Error.TRAILING_HYPHEN); + } + // If the label was not a Punycode label, then it was the result of + // mapping, normalization and label segmentation. + // If the label was in Punycode, then we mapped it again above + // and checked its validity. + // Now we handle the STD3 restriction to LDH characters (if set) + // and we look for U+FFFD which indicates disallowed characters + // in a non-Punycode label or U+FFFD itself in a Punycode label. + // We also check for dots which can come from the input to a single-label function. + // Ok to cast away const because we own the UnicodeString. + int i=labelStart; + int limit=labelStart+labelLength; + char oredChars=0; + // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed. 
+ boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; + do { + char c=labelString.charAt(i); + if(c<=0x7f) { + if(c=='.') { + addLabelError(info, Error.LABEL_HAS_DOT); + labelString.setCharAt(i, '\ufffd'); + } else if(disallowNonLDHDot && asciiData[c]<0) { + addLabelError(info, Error.DISALLOWED); + labelString.setCharAt(i, '\ufffd'); + } + } else { + oredChars|=c; + if(c==0xfffd) { + addLabelError(info, Error.DISALLOWED); + ++i; + } + } + ++i; + } while(i0xffff) { + // Remove c's trail surrogate. + labelString.deleteCharAt(labelStart+1); + --labelLength; + if(labelString==dest) { + --destLabelLength; + } + } + } + if(!hasCertainLabelErrors(info, severeErrors)) { + // Do contextual checks only if we do not have U+FFFD from a severe error + // because U+FFFD can make these checks fail. + if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) { + checkLabelBiDi(labelString, labelStart, labelLength, info); + } + if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c && + !isLabelOkContextJ(labelString, labelStart, labelLength) + ) { + addLabelError(info, Error.CONTEXTJ); + } + if(toASCII) { + if(wasPunycode) { + // Leave a Punycode label unchanged if it has no severe errors. + if(destLabelLength>63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + return destLabelLength; + } else if(oredChars>=0x80) { + // Contains non-ASCII characters. 
+ StringBuilder punycode; + try { + punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null); + } catch (StringPrepParseException e) { + throw new RuntimeException(e); // unexpected + } + punycode.insert(0, "xn--"); + if(punycode.length()>63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + return replaceLabel(dest, destLabelStart, destLabelLength, + punycode, punycode.length()); + } else { + // all-ASCII label + if(labelLength>63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + } + } + } else { + // If a Punycode label has severe errors, + // then leave it but make sure it does not look valid. + if(wasPunycode) { + addLabelError(info, Error.INVALID_ACE_LABEL); + return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info); + } + } + return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength); + } + private int + markBadACELabel(StringBuilder dest, + int labelStart, int labelLength, + boolean toASCII, Info info) { + boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0; + boolean isASCII=true; + boolean onlyLDH=true; + int i=labelStart+4; // After the initial "xn--". 
+ int limit=labelStart+labelLength; + do { + char c=dest.charAt(i); + if(c<=0x7f) { + if(c=='.') { + addLabelError(info, Error.LABEL_HAS_DOT); + dest.setCharAt(i, '\ufffd'); + isASCII=onlyLDH=false; + } else if(asciiData[c]<0) { + onlyLDH=false; + if(disallowNonLDHDot) { + dest.setCharAt(i, '\ufffd'); + isASCII=false; + } + } + } else { + isASCII=onlyLDH=false; + } + } while(++i63) { + addLabelError(info, Error.LABEL_TOO_LONG); + } + } + return labelLength; + } + + private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT); + private static final int R_AL_MASK= + U_MASK(UCharacterDirection.RIGHT_TO_LEFT)| + U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC); + private static final int L_R_AL_MASK=L_MASK|R_AL_MASK; + + private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER); + + private static final int EN_AN_MASK= + U_MASK(UCharacterDirection.EUROPEAN_NUMBER)| + U_MASK(UCharacterDirection.ARABIC_NUMBER); + private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK; + private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER); + + private static final int ES_CS_ET_ON_BN_NSM_MASK= + U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)| + U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)| + U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)| + U_MASK(UCharacterDirection.OTHER_NEUTRAL)| + U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)| + U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK); + private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK; + private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK; + + // We scan the whole label and check both for whether it contains RTL characters + // and whether it passes the BiDi Rule. 
+ // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find + // that a domain name is a BiDi domain name (has an RTL label) only after + // processing several earlier labels. + private void + checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) { + // IDNA2008 BiDi rule + // Get the directionality of the first character. + int c; + int i=labelStart; + c=Character.codePointAt(label, i); + i+=Character.charCount(c); + int firstMask=U_MASK(UCharacter.getDirection(c)); + // 1. The first character must be a character with BIDI property L, R + // or AL. If it has the R or AL property, it is an RTL label; if it + // has the L property, it is an LTR label. + if((firstMask&~L_R_AL_MASK)!=0) { + setNotOkBiDi(info); + } + // Get the directionality of the last non-NSM character. + int lastMask; + int labelLimit=labelStart+labelLength; + for(;;) { + if(i>=labelLimit) { + lastMask=firstMask; + break; + } + c=Character.codePointBefore(label, labelLimit); + labelLimit-=Character.charCount(c); + int dir=UCharacter.getDirection(c); + if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) { + lastMask=U_MASK(dir); + break; + } + } + // 3. In an RTL label, the end of the label must be a character with + // BIDI property R, AL, EN or AN, followed by zero or more + // characters with BIDI property NSM. + // 6. In an LTR label, the end of the label must be a character with + // BIDI property L or EN, followed by zero or more characters with + // BIDI property NSM. + if( (firstMask&L_MASK)!=0 ? + (lastMask&~L_EN_MASK)!=0 : + (lastMask&~R_AL_EN_AN_MASK)!=0 + ) { + setNotOkBiDi(info); + } + // Get the directionalities of the intervening characters. + int mask=0; + while(ilabelStart) { + c=s.charAt(i-1); + if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) { + // Last character in the label is not an L or EN. + return false; + } + } + labelStart=i+1; + } else if(i==labelStart) { + if(!('a'<=c && c<='z')) { + // First character in the label is not an L. 
+ return false; + } + } else { + if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) { + // Intermediate character in the label is a B, S or WS. + return false; + } + } + } + return true; + } + + private boolean + isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) { + // [IDNA2008-Tables] + // 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER + int labelLimit=labelStart+labelLength; + for(int i=labelStart; iIDNA RFC. + * Abstract base class for IDNA processing. + * See http://www.unicode.org/reports/tr46/ + * and http://www.ietf.org/rfc/rfc3490.txt + *

+ * The IDNA class is not intended for public subclassing. + *

+ * The non-static methods implement UTS #46 and IDNA2008. + * IDNA2008 is implemented according to UTS #46, see getUTS46Instance(). + *

+ * The static methods implement IDNA2003. + *

+ * IDNA2003 API Overview: + *

+ * The static IDNA API methods implement the IDNA protocol as defined in the + * IDNA RFC. * The draft defines 2 operations: ToASCII and ToUnicode. Domain labels * containing non-ASCII code points are required to be processed by * ToASCII operation before passing it to resolver libraries. Domain names @@ -30,177 +48,369 @@ import com.ibm.icu.impl.Punycode; * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) * ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string). * - * @author Ram Viswanadha + * @author Ram Viswanadha, Markus Scherer * @stable ICU 2.8 */ -public final class IDNA { - - /* IDNA ACE Prefix is "xn--" */ - private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ; - //private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length; - - private static final int MAX_LABEL_LENGTH = 63; - private static final int HYPHEN = 0x002D; - private static final int CAPITAL_A = 0x0041; - private static final int CAPITAL_Z = 0x005A; - private static final int LOWER_CASE_DELTA = 0x0020; - private static final int FULL_STOP = 0x002E; - private static final int MAX_DOMAIN_NAME_LENGTH = 255; +public abstract class IDNA { /** - * Option to prohibit processing of unassigned codepoints in the input and - * do not check if the input conforms to STD-3 ASCII rules. - * - * @see #convertToASCII #convertToUnicode + * Default options value: None of the other options are set. * @stable ICU 2.8 */ - public static final int DEFAULT = 0x0000; + public static final int DEFAULT = 0; /** - * Option to allow processing of unassigned codepoints in the input - * - * @see #convertToASCII #convertToUnicode + * Option to allow unassigned code points in domain names and labels. + * This option is ignored by the UTS46 implementation. + * (UTS #46 disallows unassigned code points.) 
* @stable ICU 2.8 */ - public static final int ALLOW_UNASSIGNED = 0x0001; + public static final int ALLOW_UNASSIGNED = 1; /** - * Option to check if input conforms to STD-3 ASCII rules - * - * @see #convertToASCII #convertToUnicode + * Option to check whether the input conforms to the STD3 ASCII rules, + * for example the restriction of labels to LDH characters + * (ASCII Letters, Digits and Hyphen-Minus). * @stable ICU 2.8 */ - public static final int USE_STD3_RULES = 0x0002; - - // static final singleton object that is initialized - // at class initialization time, hence guaranteed to - // be initialized and thread safe - private static final IDNA singleton = new IDNA(); - - // The NamePrep profile object - private StringPrep namePrep; - - /* private constructor to prevent construction of the object */ - private IDNA(){ - namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP); - } - - private static boolean startsWithPrefix(StringBuffer src){ - boolean startsWithPrefix = true; - - if(src.length() < ACE_PREFIX.length){ - return false; - } - for(int i=0; i0x007A){ - return false; - } - //[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A] - if( (ch==0x002D) || - (0x0030 <= ch && ch <= 0x0039) || - (0x0041 <= ch && ch <= 0x005A) || - (0x0061 <= ch && ch <= 0x007A) - ){ - return true; - } - return false; - } - + public static final int USE_STD3_RULES = 2; /** - * Ascertain if the given code point is a label separator as - * defined by the IDNA RFC - * - * @param ch The code point to be ascertained - * @return true if the char is a label separator - * @stable ICU 2.8 + * IDNA option to check for whether the input conforms to the BiDi rules. + * This option is ignored by the IDNA2003 implementation. + * (IDNA2003 always performs a BiDi check.) + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. 
*/ - private static boolean isLabelSeparator(int ch){ - switch(ch){ - case 0x002e: - case 0x3002: - case 0xFF0E: - case 0xFF61: - return true; - default: - return false; + public static final int CHECK_BIDI = 4; + /** + * IDNA option to check for whether the input conforms to the CONTEXTJ rules. + * This option is ignored by the IDNA2003 implementation. + * (The CONTEXTJ check is new in IDNA2008.) + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static final int CHECK_CONTEXTJ = 8; + /** + * IDNA option for nontransitional processing in ToASCII(). + * By default, ToASCII() uses transitional processing. + * This option is ignored by the IDNA2003 implementation. + * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static final int NONTRANSITIONAL_TO_ASCII = 0x10; + /** + * IDNA option for nontransitional processing in ToUnicode(). + * By default, ToUnicode() uses transitional processing. + * This option is ignored by the IDNA2003 implementation. + * (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.) + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static final int NONTRANSITIONAL_TO_UNICODE = 0x20; + + /** + * Returns an IDNA instance which implements UTS #46. + * Returns an unmodifiable instance, owned by the caller. + * Cache it for multiple operations, and delete it when done. + * The instance is thread-safe, that is, it can be used concurrently. + *

<p>
+ * UTS #46 defines Unicode IDNA Compatibility Processing, + * updated to the latest version of Unicode and compatible with both + * IDNA2003 and IDNA2008. + *
<p>
+ * The worker functions use transitional processing, including deviation mappings, + * unless NONTRANSITIONAL_TO_ASCII or NONTRANSITIONAL_TO_UNICODE + * is used in which case the deviation characters are passed through without change. + *
<p>
+ * Disallowed characters are mapped to U+FFFD. + *
<p>
+ * Operations with the UTS #46 instance do not support the + * ALLOW_UNASSIGNED option. + *
<p>
+ * By default, the UTS #46 implementation allows all ASCII characters (as valid or mapped). + * When the USE_STD3_RULES option is used, ASCII characters other than + * letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD. + * + * @param options Bit set to modify the processing and error checking. + * @return the UTS #46 IDNA instance, if successful + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static IDNA getUTS46Instance(int options) { + return new UTS46(options); + } + + /** + * Converts a single domain name label into its ASCII form for DNS lookup. + * If any processing step fails, then info.hasErrors() will be true and + * the result might not be an ASCII string. + * The label might be modified according to the types of errors. + * Labels with severe errors will be left in (or turned into) their Unicode form. + * + * @param label Input domain name label + * @param dest Destination string object + * @param info Output container of IDNA processing details. + * @return dest + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public abstract StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info); + + /** + * Converts a single domain name label into its Unicode form for human-readable display. + * If any processing step fails, then info.hasErrors() will be true. + * The label might be modified according to the types of errors. + * + * @param label Input domain name label + * @param dest Destination string object + * @param info Output container of IDNA processing details. + * @return dest + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public abstract StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info); + + /** + * Converts a whole domain name into its ASCII form for DNS lookup. 
+ * If any processing step fails, then info.hasErrors() will be true and + * the result might not be an ASCII string. + * The domain name might be modified according to the types of errors. + * Labels with severe errors will be left in (or turned into) their Unicode form. + * + * @param name Input domain name + * @param dest Destination string object + * @param info Output container of IDNA processing details. + * @return dest + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public abstract StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info); + + /** + * Converts a whole domain name into its Unicode form for human-readable display. + * If any processing step fails, then info.hasErrors() will be true. + * The domain name might be modified according to the types of errors. + * + * @param name Input domain name + * @param dest Destination string object + * @param info Output container of IDNA processing details. + * @return dest + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public abstract StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info); + + /** + * Output container for IDNA processing errors. + * The Info class is not suitable for subclassing. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static final class Info { + /** + * Constructor. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public Info() { + errors=EnumSet.noneOf(Error.class); + labelErrors=EnumSet.noneOf(Error.class); + isTransDiff=false; + isBiDi=false; + isOkBiDi=true; + } + /** + * Were there IDNA processing errors? + * @return true if there were processing errors + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. 
+ */ + public boolean hasErrors() { return !errors.isEmpty(); } + /** + * Returns a set indicating IDNA processing errors. + * @return set of processing errors (modifiable, and not null) + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public Set getErrors() { return errors; } + /** + * Returns true if transitional and nontransitional processing produce different results. + * This is the case when the input label or domain name contains + * one or more deviation characters outside a Punycode label (see UTS #46). + *

<ul>
+ * <li>With nontransitional processing, such characters are + * copied to the destination string.
+ * <li>With transitional processing, such characters are + * mapped (sharp s/sigma) or removed (joiner/nonjoiner).
+ * </ul>
+ * @return true if transitional and nontransitional processing produce different results + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public boolean isTransitionalDifferent() { return isTransDiff; } + + private void reset() { + errors.clear(); + labelErrors.clear(); + isTransDiff=false; + isBiDi=false; + isOkBiDi=true; + } + + private EnumSet errors, labelErrors; + private boolean isTransDiff; + private boolean isBiDi; + private boolean isOkBiDi; + } + + // The following protected methods give IDNA subclasses access to the private IDNAInfo fields. + // The IDNAInfo also provides intermediate state that is publicly invisible, + // avoiding the allocation of another worker object. + protected static void resetInfo(Info info) { + info.reset(); + } + protected static boolean hasCertainErrors(Info info, EnumSet errors) { + return !info.errors.isEmpty() && !Collections.disjoint(info.errors, errors); + } + protected static boolean hasCertainLabelErrors(Info info, EnumSet errors) { + return !info.labelErrors.isEmpty() && !Collections.disjoint(info.labelErrors, errors); + } + protected static void addLabelError(Info info, Error error) { + info.labelErrors.add(error); + } + protected static void promoteAndResetLabelErrors(Info info) { + if(!info.labelErrors.isEmpty()) { + info.errors.addAll(info.labelErrors); + info.labelErrors.clear(); } } - + protected static void addError(Info info, Error error) { + info.errors.add(error); + } + protected static void setTransitionalDifferent(Info info) { + info.isTransDiff=true; + } + protected static void setBiDi(Info info) { + info.isBiDi=true; + } + protected static boolean isBiDi(Info info) { + return info.isBiDi; + } + protected static void setNotOkBiDi(Info info) { + info.isOkBiDi=false; + } + protected static boolean isOkBiDi(Info info) { + return info.isOkBiDi; + } + /** - * This function implements the ToASCII operation as defined in the IDNA RFC. 
+ * IDNA error bit set values. + * When a domain name or label fails a processing step or does not meet the + * validity criteria, then one or more of these error bits are set. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + public static enum Error { + /** + * A non-final domain name label (or the whole domain name) is empty. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + EMPTY_LABEL, + /** + * A domain name label is longer than 63 bytes. + * (See STD13/RFC1034 3.1. Name space specifications and terminology.) + * This is only checked in ToASCII operations, and only if the output label is all-ASCII. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + LABEL_TOO_LONG, + /** + * A domain name is longer than 255 bytes in its storage form. + * (See STD13/RFC1034 3.1. Name space specifications and terminology.) + * This is only checked in ToASCII operations, and only if the output domain name is all-ASCII. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + DOMAIN_NAME_TOO_LONG, + /** + * A label starts with a hyphen-minus ('-'). + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + LEADING_HYPHEN, + /** + * A label ends with a hyphen-minus ('-'). + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + TRAILING_HYPHEN, + /** + * A label contains hyphen-minus ('-') in the third and fourth positions. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + HYPHEN_3_4, + /** + * A label starts with a combining mark. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + LEADING_COMBINING_MARK, + /** + * A label or domain name contains disallowed characters. 
+ * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + DISALLOWED, + /** + * A label starts with "xn--" but does not contain valid Punycode. + * That is, an xn-- label failed Punycode decoding. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + PUNYCODE, + /** + * A label contains a dot=full stop. + * This can occur in an input string for a single-label function. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + LABEL_HAS_DOT, + /** + * An ACE label does not contain a valid label string. + * The label was successfully ACE (Punycode) decoded but the resulting + * string had severe validation errors. For example, + * it might contain characters that are not allowed in ACE labels, + * or it might not be normalized. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + INVALID_ACE_LABEL, + /** + * A label does not meet the IDNA BiDi requirements (for right-to-left characters). + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + BIDI, + /** + * A label does not meet the IDNA CONTEXTJ requirements. + * @draft ICU 4.6 + * @provisional This API might change or be removed in a future release. + */ + CONTEXTJ + } + + /** + * Sole constructor. (For invocation by subclass constructors, typically implicit.) + * @internal + * @deprecated This API is ICU internal only. + */ + protected IDNA() { + } + + /* IDNA2003 API ------------------------------------------------------------- */ + + /** + * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually * separated by dots; e.g." 
"www.example.com" is composed of 3 labels @@ -231,7 +441,7 @@ public final class IDNA { } /** - * This function implements the ToASCII operation as defined in the IDNA RFC. + * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually * separated by dots; e.g." "www.example.com" is composed of 3 labels @@ -261,7 +471,7 @@ public final class IDNA { } /** - * This function implements the ToASCII operation as defined in the IDNA RFC. + * IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * ASCII names. A label is an individual part of a domain name. Labels are usually * separated by dots; e.g." "www.example.com" is composed of 3 labels @@ -286,115 +496,11 @@ public final class IDNA { */ public static StringBuffer convertToASCII(UCharacterIterator src, int options) throws StringPrepParseException{ - - boolean[] caseFlags = null; - - // the source contains all ascii codepoints - boolean srcIsASCII = true; - // assume the source contains all LDH codepoints - boolean srcIsLDH = true; - - //get the options - boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); - int ch; - // step 1 - while((ch = src.next())!= UCharacterIterator.DONE){ - if(ch> 0x7f){ - srcIsASCII = false; - } - } - int failPos = -1; - src.setToStart(); - StringBuffer processOut = null; - // step 2 is performed only if the source contains non ASCII - if(!srcIsASCII){ - // step 2 - processOut = singleton.namePrep.prepare(src, options); - }else{ - processOut = new StringBuffer(src.getText()); - } - int poLen = processOut.length(); - - if(poLen==0){ - throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); - } - StringBuffer 
dest = new StringBuffer(); - - // reset the variable to verify if output of prepare is ASCII or not - srcIsASCII = true; - - // step 3 & 4 - for(int j=0;j 0x7F){ - srcIsASCII = false; - }else if(isLDHChar(ch)==false){ - // here we do not assemble surrogates - // since we know that LDH code points - // are in the ASCII range only - srcIsLDH = false; - failPos = j; - } - } - - if(useSTD3ASCIIRules == true){ - // verify 3a and 3b - if( srcIsLDH == false /* source contains some non-LDH characters */ - || processOut.charAt(0) == HYPHEN - || processOut.charAt(processOut.length()-1) == HYPHEN){ - - /* populate the parseError struct */ - if(srcIsLDH==false){ - throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules", - StringPrepParseException.STD3_ASCII_RULES_ERROR, - processOut.toString(), - (failPos>0) ? (failPos-1) : failPos); - }else if(processOut.charAt(0) == HYPHEN){ - throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", - StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0); - - }else{ - throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", - StringPrepParseException.STD3_ASCII_RULES_ERROR, - processOut.toString(), - (poLen>0) ? 
poLen-1 : poLen); - - } - } - } - if(srcIsASCII){ - dest = processOut; - }else{ - // step 5 : verify the sequence does not begin with ACE prefix - if(!startsWithPrefix(processOut)){ - - //step 6: encode the sequence with punycode - caseFlags = new boolean[poLen]; - - StringBuffer punyout = Punycode.encode(processOut,caseFlags); - - // convert all codepoints to lower case ASCII - StringBuffer lowerOut = toASCIILower(punyout); - - //Step 7: prepend the ACE prefix - dest.append(ACE_PREFIX,0,ACE_PREFIX.length); - //Step 6: copy the contents in b2 into dest - dest.append(lowerOut); - }else{ - - throw new StringPrepParseException("The input does not start with the ACE Prefix.", - StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0); - } - } - if(dest.length() > MAX_LABEL_LENGTH){ - throw new StringPrepParseException("The labels in the input are too long. Length > 63.", - StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0); - } - return dest; + return IDNA2003.convertToASCII(src, options); } - + /** - * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". * It is important to note that this operation can fail. If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application @@ -428,7 +534,7 @@ public final class IDNA { } /** - * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". * It is important to note that this operation can fail. 
If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application @@ -462,7 +568,7 @@ public final class IDNA { } /** - * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". * It is important to note that this operation can fail. If it fails, then the input * domain name cannot be used as an Internationalized Domain Name and the application @@ -492,37 +598,12 @@ public final class IDNA { */ public static StringBuffer convertIDNToASCII(String src,int options) throws StringPrepParseException{ - - char[] srcArr = src.toCharArray(); - StringBuffer result = new StringBuffer(); - int sepIndex=0; - int oldSepIndex=0; - for(;;){ - sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); - String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); - //make sure this is not a root label separator. - if(!(label.length()==0 && sepIndex==srcArr.length)){ - UCharacterIterator iter = UCharacterIterator.getInstance(label); - result.append(convertToASCII(iter,options)); - } - if(sepIndex==srcArr.length){ - break; - } - - // increment the sepIndex to skip past the separator - sepIndex++; - oldSepIndex = sepIndex; - result.append((char)FULL_STOP); - } - if(result.length() > MAX_DOMAIN_NAME_LENGTH){ - throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); - } - return result; + return IDNA2003.convertIDNToASCII(src, options); } /** - * This function implements the ToUnicode operation as defined in the IDNA RFC. + * IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. 
A label is an individual part of a domain name. Labels are usually * separated by dots; for e.g." "www.example.com" is composed of 3 labels @@ -552,7 +633,7 @@ public final class IDNA { } /** - * This function implements the ToUnicode operation as defined in the IDNA RFC. + * IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. A label is an individual part of a domain name. Labels are usually * separated by dots; for e.g." "www.example.com" is composed of 3 labels @@ -582,7 +663,7 @@ public final class IDNA { } /** - * Function that implements the ToUnicode operation as defined in the IDNA RFC. + * IDNA2003: Function that implements the ToUnicode operation as defined in the IDNA RFC. * This operation is done on single labels before sending it to something that expects * Unicode names. A label is an individual part of a domain name. Labels are usually * separated by dots; for e.g." 
"www.example.com" is composed of 3 labels @@ -607,116 +688,11 @@ public final class IDNA { */ public static StringBuffer convertToUnicode(UCharacterIterator src, int options) throws StringPrepParseException{ - - boolean[] caseFlags = null; - - // the source contains all ascii codepoints - boolean srcIsASCII = true; - // assume the source contains all LDH codepoints - //boolean srcIsLDH = true; - - //get the options - //boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0); - - //int failPos = -1; - int ch; - int saveIndex = src.getIndex(); - // step 1: find out if all the codepoints in src are ASCII - while((ch=src.next())!= UCharacterIterator.DONE){ - if(ch>0x7F){ - srcIsASCII = false; - }/*else if((srcIsLDH = isLDHChar(ch))==false){ - failPos = src.getIndex(); - }*/ - } - StringBuffer processOut; - - if(srcIsASCII == false){ - try { - // step 2: process the string - src.setIndex(saveIndex); - processOut = singleton.namePrep.prepare(src,options); - } catch (StringPrepParseException ex) { - return new StringBuffer(src.getText()); - } - - }else{ - //just point to source - processOut = new StringBuffer(src.getText()); - } - // TODO: - // The RFC states that - // - // ToUnicode never fails. If any step fails, then the original input - // is returned immediately in that step. 
- // - - //step 3: verify ACE Prefix - if(startsWithPrefix(processOut)){ - StringBuffer decodeOut = null; - - //step 4: Remove the ACE Prefix - String temp = processOut.substring(ACE_PREFIX.length,processOut.length()); - - //step 5: Decode using punycode - try { - decodeOut = Punycode.decode(new StringBuffer(temp),caseFlags); - } catch (StringPrepParseException e) { - decodeOut = null; - } - - //step 6:Apply toASCII - if (decodeOut != null) { - StringBuffer toASCIIOut = convertToASCII(decodeOut, options); - - //step 7: verify - if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){ -// throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed", -// StringPrepParseException.VERIFICATION_ERROR); - decodeOut = null; - } - } - - //step 8: return output of step 5 - if (decodeOut != null) { - return decodeOut; - } - } - -// }else{ -// // verify that STD3 ASCII rules are satisfied -// if(useSTD3ASCIIRules == true){ -// if( srcIsLDH == false /* source contains some non-LDH characters */ -// || processOut.charAt(0) == HYPHEN -// || processOut.charAt(processOut.length()-1) == HYPHEN){ -// -// if(srcIsLDH==false){ -// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", -// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(), -// (failPos>0) ? 
(failPos-1) : failPos); -// }else if(processOut.charAt(0) == HYPHEN){ -// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", -// StringPrepParseException.STD3_ASCII_RULES_ERROR, -// processOut.toString(),0); -// -// }else{ -// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules", -// StringPrepParseException.STD3_ASCII_RULES_ERROR, -// processOut.toString(), -// processOut.length()); -// -// } -// } -// } -// // just return the source -// return new StringBuffer(src.getText()); -// } - - return new StringBuffer(src.getText()); + return IDNA2003.convertToUnicode(src, options); } /** - * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". * * Note: IDNA RFC specifies that a conformant application should divide a domain name @@ -747,7 +723,7 @@ public final class IDNA { } /** - * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". * * Note: IDNA RFC specifies that a conformant application should divide a domain name @@ -778,7 +754,7 @@ public final class IDNA { } /** - * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. + * IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC. * This operation is done on complete domain names, e.g: "www.example.com". 
* * Note: IDNA RFC specifies that a conformant application should divide a domain name @@ -804,37 +780,12 @@ public final class IDNA { * @stable ICU 2.8 */ public static StringBuffer convertIDNToUnicode(String src, int options) - throws StringPrepParseException{ - - char[] srcArr = src.toCharArray(); - StringBuffer result = new StringBuffer(); - int sepIndex=0; - int oldSepIndex=0; - for(;;){ - sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length); - String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex); - if(label.length()==0 && sepIndex!=srcArr.length ){ - throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL); - } - UCharacterIterator iter = UCharacterIterator.getInstance(label); - result.append(convertToUnicode(iter,options)); - if(sepIndex==srcArr.length){ - break; - } - // Unlike the ToASCII operation we don't normalize the label separators - result.append(srcArr[sepIndex]); - // increment the sepIndex to skip past the separator - sepIndex++; - oldSepIndex =sepIndex; - } - if(result.length() > MAX_DOMAIN_NAME_LENGTH){ - throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR); - } - return result; + throws StringPrepParseException{ + return IDNA2003.convertIDNToUnicode(src, options); } /** - * Compare two IDN strings for equivalence. + * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. 
* According to IDN RFC, whenever two labels are compared, they are * considered equal if and only if their ASCII forms (obtained by @@ -860,19 +811,16 @@ public final class IDNA { * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 * @stable ICU 2.8 */ - // TODO: optimize public static int compare(StringBuffer s1, StringBuffer s2, int options) throws StringPrepParseException{ if(s1==null || s2 == null){ throw new IllegalArgumentException("One of the source buffers is null"); } - StringBuffer s1Out = convertIDNToASCII(s1.toString(),options); - StringBuffer s2Out = convertIDNToASCII(s2.toString(), options); - return compareCaseInsensitiveASCII(s1Out,s2Out); + return IDNA2003.compare(s1.toString(), s2.toString(), options); } /** - * Compare two IDN strings for equivalence. + * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. * According to IDN RFC, whenever two labels are compared, they are * considered equal if and only if their ASCII forms (obtained by @@ -898,18 +846,14 @@ public final class IDNA { * @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2 * @stable ICU 2.8 */ - // TODO: optimize - public static int compare(String s1, String s2, int options) - throws StringPrepParseException{ + public static int compare(String s1, String s2, int options) throws StringPrepParseException{ if(s1==null || s2 == null){ throw new IllegalArgumentException("One of the source buffers is null"); } - StringBuffer s1Out = convertIDNToASCII(s1, options); - StringBuffer s2Out = convertIDNToASCII(s2, options); - return compareCaseInsensitiveASCII(s1Out,s2Out); + return IDNA2003.compare(s1, s2, options); } /** - * Compare two IDN strings for equivalence. + * IDNA2003: Compare two IDN strings for equivalence. * This function splits the domain names into labels and compares them. 
* According to IDN RFC, whenever two labels are compared, they are * considered equal if and only if their ASCII forms (obtained by @@ -935,14 +879,11 @@ public final class IDNA { * @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2 * @stable ICU 2.8 */ - // TODO: optimize public static int compare(UCharacterIterator s1, UCharacterIterator s2, int options) throws StringPrepParseException{ if(s1==null || s2 == null){ throw new IllegalArgumentException("One of the source buffers is null"); } - StringBuffer s1Out = convertIDNToASCII(s1.getText(), options); - StringBuffer s2Out = convertIDNToASCII(s2.getText(), options); - return compareCaseInsensitiveASCII(s1Out,s2Out); + return IDNA2003.compare(s1.getText(), s2.getText(), options); } } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/TestAll.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/TestAll.java index ba236ed284..99348a6b2e 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/TestAll.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/TestAll.java @@ -1,7 +1,7 @@ /* ******************************************************************************* - * Copyright (C) 1996-2008, International Business Machines Corporation and * - * others. All Rights Reserved. * + * Copyright (C) 1996-2010, International Business Machines Corporation and + * others. All Rights Reserved. 
******************************************************************************* */ package com.ibm.icu.dev.test.normalizer; @@ -25,6 +25,7 @@ public class TestAll extends TestGroup { "TestCanonicalIterator", "NormalizationMonkeyTest", "NormalizerRegressionTests", + "UTS46Test" }); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java new file mode 100644 index 0000000000..3f85dbba2a --- /dev/null +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java @@ -0,0 +1,714 @@ +/* +******************************************************************************* +* Copyright (C) 2010, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +*/ +package com.ibm.icu.dev.test.normalizer; + +import java.util.Collections; +import java.util.EnumSet; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import com.ibm.icu.dev.test.TestFmwk; +import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus; +import com.ibm.icu.text.IDNA; + +/** + * UTS #46 (IDNA2008) test. 
+ * @author Markus Scherer + * @since 2010jul10 + */ +public class UTS46Test extends TestFmwk { + public static void main(String[] args) throws Exception { + new UTS46Test().run(args); + } + public UTS46Test() { + trans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ); + nontrans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ| + IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE); + } + + public void TestAPI() { + StringBuilder result=new StringBuilder(); + IDNA.Info info=new IDNA.Info(); + String input="www.eXample.cOm"; + String expected="www.example.com"; + trans.nameToASCII(input, result, info); + if(info.hasErrors() || !UTF16Plus.equal(result, expected)) { + errln(String.format("T.nameToASCII(www.example.com) info.errors=%s result matches=%b", + info.getErrors(), UTF16Plus.equal(result, expected))); + } + input="xn--bcher.de-65a"; + expected="xn--bcher\uFFFDde-65a"; + nontrans.labelToASCII(input, result, info); + if( !info.getErrors().equals(EnumSet.of(IDNA.Error.LABEL_HAS_DOT, IDNA.Error.INVALID_ACE_LABEL)) || + !UTF16Plus.equal(result, expected) + ) { + errln(String.format("N.labelToASCII(label-with-dot) failed with errors %s", + info.getErrors())); + } + // Java API tests that are not parallel to C++ tests + // because the C++ specifics (error codes etc.) do not apply here. 
+ String resultString=trans.nameToUnicode("fA\u00DF.de", result, info).toString(); + if(info.hasErrors() || !resultString.equals("fass.de")) { + errln(String.format("T.nameToUnicode(fA\u00DF.de) info.errors=%s result matches=%b", + info.getErrors(), resultString.equals("fass.de"))); + } + try { + nontrans.labelToUnicode(result, result, info); + errln("N.labelToUnicode(result, result) did not throw an Exception"); + } catch(Exception e) { + // as expected (should be an IllegalArgumentException, or an ICU version of it) + } + } + + public void TestNotSTD3() { + IDNA not3=IDNA.getUTS46Instance(IDNA.CHECK_BIDI); + String input="\u0000A_2+2=4\n.e\u00DFen.net"; + StringBuilder result=new StringBuilder(); + IDNA.Info info=new IDNA.Info(); + if( !not3.nameToUnicode(input, result, info).toString().equals("\u0000a_2+2=4\n.essen.net") || + info.hasErrors() + ) { + errln(String.format("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %s string %s", + info.getErrors(), prettify(result.toString()))); + } + // A space (BiDi class WS) is not allowed in a BiDi domain name. 
+ input="a z.xn--4db.edu"; + not3.nameToASCII(input, result, info); + if(!UTF16Plus.equal(result, input) || !info.getErrors().equals(EnumSet.of(IDNA.Error.BIDI))) { + errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed"); + } + } + + private static final Map errorNamesToErrors; + static { + errorNamesToErrors=new TreeMap(); + errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL); + errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG); + errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG); + errorNamesToErrors.put("UIDNA_ERROR_LEADING_HYPHEN", IDNA.Error.LEADING_HYPHEN); + errorNamesToErrors.put("UIDNA_ERROR_TRAILING_HYPHEN", IDNA.Error.TRAILING_HYPHEN); + errorNamesToErrors.put("UIDNA_ERROR_HYPHEN_3_4", IDNA.Error.HYPHEN_3_4); + errorNamesToErrors.put("UIDNA_ERROR_LEADING_COMBINING_MARK", IDNA.Error.LEADING_COMBINING_MARK); + errorNamesToErrors.put("UIDNA_ERROR_DISALLOWED", IDNA.Error.DISALLOWED); + errorNamesToErrors.put("UIDNA_ERROR_PUNYCODE", IDNA.Error.PUNYCODE); + errorNamesToErrors.put("UIDNA_ERROR_LABEL_HAS_DOT", IDNA.Error.LABEL_HAS_DOT); + errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL); + errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI); + errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ); + } + + private static final class TestCase { + private TestCase() { + errors=EnumSet.noneOf(IDNA.Error.class); + } + private void set(String[] data) { + s=data[0]; + o=data[1]; + u=data[2]; + errors.clear(); + if(data[3].length()!=0) { + for(String e: data[3].split("\\|")) { + errors.add(errorNamesToErrors.get(e)); + } + } + } + // Input string and options string (Nontransitional/Transitional/Both). + private String s, o; + // Expected Unicode result string. 
+ private String u; + private EnumSet errors; + }; + + private static final String testCases[][]={ + { "www.eXample.cOm", "B", // all ASCII + "www.example.com", "" }, + { "B\u00FCcher.de", "B", // u-umlaut + "b\u00FCcher.de", "" }, + { "\u00D6BB", "B", // O-umlaut + "\u00F6bb", "" }, + { "fa\u00DF.de", "N", // sharp s + "fa\u00DF.de", "" }, + { "fa\u00DF.de", "T", // sharp s + "fass.de", "" }, + { "XN--fA-hia.dE", "B", // sharp s in Punycode + "fa\u00DF.de", "" }, + { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N", // Greek with final sigma + "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "" }, + { "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T", // Greek with final sigma + "\u03B2\u03CC\u03BB\u03BF\u03C3.com", "" }, + { "xn--nxasmm1c", "B", // Greek with final sigma in Punycode + "\u03B2\u03CC\u03BB\u03BF\u03C2", "" }, + { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N", // "Sri" in "Sri Lanka" has a ZWJ + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" }, + { "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T", // "Sri" in "Sri Lanka" has a ZWJ + "www.\u0DC1\u0DCA\u0DBB\u0DD3.com", "" }, + { "www.xn--10cl1a0b660p.com", "B", // "Sri" in Punycode + "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" }, + { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N", // ZWNJ + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "" }, + { "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T", // ZWNJ + "\u0646\u0627\u0645\u0647\u0627\u06CC", "" }, + { "xn--mgba3gch31f060k.com", "B", // ZWNJ in Punycode + "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", "" }, + { "a.b\uFF0Ec\u3002d\uFF61", "B", + "a.b.c.d.", "" }, + { "U\u0308.xn--tda", "B", // U+umlaut.u-umlaut + "\u00FC.\u00FC", "" }, + { "xn--u-ccb", "B", // u+umlaut in Punycode + "xn--u-ccb\uFFFD", "UIDNA_ERROR_INVALID_ACE_LABEL" }, + { "a\u2488com", "B", // contains 1-dot + "a\uFFFDcom", "UIDNA_ERROR_DISALLOWED" }, + { "xn--a-ecp.ru", "B", // contains 1-dot in Punycode + "xn--a-ecp\uFFFD.ru", "UIDNA_ERROR_INVALID_ACE_LABEL" }, + { "xn--0.pt", "B", // invalid Punycode + 
"xn--0\uFFFD.pt", "UIDNA_ERROR_PUNYCODE" }, + { "xn--a.pt", "B", // U+0080 + "xn--a\uFFFD.pt", "UIDNA_ERROR_INVALID_ACE_LABEL" }, + { "xn--a-\u00C4.pt", "B", // invalid Punycode + "xn--a-\u00E4.pt", "UIDNA_ERROR_PUNYCODE" }, + { "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B", // Japanese with fullwidth ".jp" + "\u65E5\u672C\u8A9E.jp", "" }, + { "\u2615", "B", "\u2615", "" }, // Unicode 4.0 HOT BEVERAGE + // many deviation characters, test the special mapping code + { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N", + "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", + "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ" }, + { "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+ + "\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+ + "\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T", + "1.assbcssssssssd"+ + "\u03C3\u03C3sssssssssssssssse"+ + "ssssssssssssssssssssx"+ + "ssssssssssssssssssssy"+ + "sssssssssssssss\u015Dssz", "UIDNA_ERROR_LABEL_TOO_LONG" }, + // "xn--bss" with deviation characters + { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N", + "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "UIDNA_ERROR_CONTEXTJ" }, + { "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T", + "\u5919", "" }, + // "xn--bssffl" written as: + // 02E3 MODIFIER LETTER SMALL X + // 
034F COMBINING GRAPHEME JOINER (ignored) + // 2115 DOUBLE-STRUCK CAPITAL N + // 200B ZERO WIDTH SPACE (ignored) + // FE63 SMALL HYPHEN-MINUS + // 00AD SOFT HYPHEN (ignored) + // FF0D FULLWIDTH HYPHEN-MINUS + // 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored) + // 212C SCRIPT CAPITAL B + // FE00 VARIATION SELECTOR-1 (ignored) + // 017F LATIN SMALL LETTER LONG S + // 2064 INVISIBLE PLUS (ignored) + // 1D530 MATHEMATICAL FRAKTUR SMALL S + // E01EF VARIATION SELECTOR-256 (ignored) + // FB04 LATIN SMALL LIGATURE FFL + { "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C"+ + "\u212C\uFE00\u017F\u2064"+"\uD835\uDD30\uDB40\uDDEF"/*1D530 E01EF*/+"\uFB04", "B", + "\u5921\u591E\u591C\u5919", "" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", "" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901.", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901.", "" }, + // Domain name >256 characters, forces slow path in UTF-8 processing. 
+ { "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "12345678901234567890123456789012345678901234567890123456789012", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "12345678901234567890123456789012345678901234567890123456789012", + "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789\u05D0", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789\u05D0", + "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + 
"123456789012345678901234567890123456789012345678901234567890", + "UIDNA_ERROR_LABEL_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890.", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890.", + "UIDNA_ERROR_LABEL_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901234."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", + "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, + // label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te + { "\u00E41234567890123456789012345678901234567890123456789012345", "B", + "\u00E41234567890123456789012345678901234567890123456789012345", "" }, + { "1234567890\u00E41234567890123456789012345678901234567890123456", "B", + "1234567890\u00E41234567890123456789012345678901234567890123456", "UIDNA_ERROR_LABEL_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + 
"1234567890123456789012345678901234567890123456789012345678901", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", "" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901.", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901.", "" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "12345678901234567890123456789012345678901234567890123456789012", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E4123456789012345678901234567890123456789012345."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "12345678901234567890123456789012345678901234567890123456789012", + "UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + 
"123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890", + "UIDNA_ERROR_LABEL_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890.", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "123456789012345678901234567890123456789012345678901234567890.", + "UIDNA_ERROR_LABEL_TOO_LONG" }, + { "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", "B", + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890\u00E41234567890123456789012345678901234567890123456."+ + "123456789012345678901234567890123456789012345678901234567890123."+ + "1234567890123456789012345678901234567890123456789012345678901", + "UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" }, + // hyphen errors and empty-label errors + // "xn---q----jra"=="-q--a-umlaut-" + { "a.b..-q--a-.e", "B", "a.b..-q--a-.e", + "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ + "UIDNA_ERROR_HYPHEN_3_4" }, + { "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e", + "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ + "UIDNA_ERROR_HYPHEN_3_4" }, + { "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e", + "UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+ + "UIDNA_ERROR_HYPHEN_3_4" }, + { "a..c", "B", "a..c", 
"UIDNA_ERROR_EMPTY_LABEL" }, + { "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, + { "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, + { "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" }, + { "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" }, + { "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" }, + { "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" }, + { "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" }, + { "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" }, + { "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" }, + { "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", "UIDNA_ERROR_LEADING_COMBINING_MARK" }, + { "a.b.xn--c-bcb.d", "B", + "a.b.xn--c-bcb\uFFFD.d", "UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL" }, + // BiDi + { "A0", "B", "a0", "" }, + { "0A", "B", "0a", "" }, // all-LTR is ok to start with a digit (EN) + { "0A.\u05D0", "B", // ASCII label does not start with L/R/AL + "0a.\u05D0", "UIDNA_ERROR_BIDI" }, + { "c.xn--0-eha.xn--4db", "B", // 2nd label does not start with L/R/AL + "c.0\u00FC.\u05D0", "UIDNA_ERROR_BIDI" }, + { "b-.\u05D0", "B", // label does not end with L/EN + "b-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" }, + { "d.xn----dha.xn--4db", "B", // 2nd label does not end with L/EN + "d.\u00FC-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" }, + { "a\u05D0", "B", "a\u05D0", "UIDNA_ERROR_BIDI" }, // first dir != last dir + { "\u05D0\u05C7", "B", "\u05D0\u05C7", "" }, + { "\u05D09\u05C7", "B", "\u05D09\u05C7", "" }, + { "\u05D0a\u05C7", "B", "\u05D0a\u05C7", "UIDNA_ERROR_BIDI" }, // first dir != last dir + { "\u05D0\u05EA", "B", "\u05D0\u05EA", "" }, + { "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", "" }, + { "a\u05D0Tz", "B", "a\u05D0tz", "UIDNA_ERROR_BIDI" }, // mixed dir + { "\u05D0T\u05EA", "B", "\u05D0t\u05EA", "UIDNA_ERROR_BIDI" }, // mixed dir + { 
"\u05D07\u05EA", "B", "\u05D07\u05EA", "" }, + { "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" }, // Arabic 7 in the middle + { "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" }, // AN digit in LTR + { "\u05D07\u0667\u05EA", "B", // mixed EN/AN digits in RTL + "\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" }, + // ZWJ + { "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" }, // Virama+ZWJ + { "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama + { "\u200D", "N", "\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama + // ZWNJ + { "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", "" }, // Virama+ZWNJ + { "\u0BB9\u200C", "N", "\u0BB9\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama + { "\u200C", "N", "\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama + { "\u0644\u0670\u200C\u06ED\u06EF", "N", // Joining types D T ZWNJ T R + "\u0644\u0670\u200C\u06ED\u06EF", "" }, + { "\u0644\u0670\u200C\u06EF", "N", // D T ZWNJ R + "\u0644\u0670\u200C\u06EF", "" }, + { "\u0644\u200C\u06ED\u06EF", "N", // D ZWNJ T R + "\u0644\u200C\u06ED\u06EF", "" }, + { "\u0644\u200C\u06EF", "N", // D ZWNJ R + "\u0644\u200C\u06EF", "" }, + { "\u0644\u0670\u200C\u06ED", "N", // D T ZWNJ T + "\u0644\u0670\u200C\u06ED", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" }, + { "\u06EF\u200C\u06EF", "N", // R ZWNJ R + "\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" }, + { "\u0644\u200C", "N", // D ZWNJ + "\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" }, + // { "", "B", + // "", "" }, + }; + + public void TestSomeCases() { + StringBuilder aT=new StringBuilder(), uT=new StringBuilder(); + StringBuilder aN=new StringBuilder(), uN=new StringBuilder(); + IDNA.Info aTInfo=new IDNA.Info(), uTInfo=new IDNA.Info(); + IDNA.Info aNInfo=new IDNA.Info(), uNInfo=new IDNA.Info(); + + StringBuilder aTuN=new StringBuilder(), uTaN=new StringBuilder(); + StringBuilder aNuN=new StringBuilder(), uNaN=new StringBuilder(); + IDNA.Info aTuNInfo=new IDNA.Info(), uTaNInfo=new IDNA.Info(); + IDNA.Info aNuNInfo=new 
IDNA.Info(), uNaNInfo=new IDNA.Info(); + + StringBuilder aTL=new StringBuilder(), uTL=new StringBuilder(); + StringBuilder aNL=new StringBuilder(), uNL=new StringBuilder(); + IDNA.Info aTLInfo=new IDNA.Info(), uTLInfo=new IDNA.Info(); + IDNA.Info aNLInfo=new IDNA.Info(), uNLInfo=new IDNA.Info(); + + EnumSet uniErrors=EnumSet.noneOf(IDNA.Error.class); + + TestCase testCase=new TestCase(); + int i; + for(i=0; i severeErrors=EnumSet.of( + IDNA.Error.LEADING_COMBINING_MARK, + IDNA.Error.DISALLOWED, + IDNA.Error.PUNYCODE, + IDNA.Error.LABEL_HAS_DOT, + IDNA.Error.INVALID_ACE_LABEL); + private static final EnumSet lengthErrors=EnumSet.of( + IDNA.Error.EMPTY_LABEL, + IDNA.Error.LABEL_TOO_LONG, + IDNA.Error.DOMAIN_NAME_TOO_LONG); + + private boolean hasError(IDNA.Info info, IDNA.Error error) { + return info.getErrors().contains(error); + } + // assumes that certainErrors is not empty + private boolean hasCertainErrors(Set errors, Set certainErrors) { + return !errors.isEmpty() && !Collections.disjoint(errors, certainErrors); + } + private boolean hasCertainErrors(IDNA.Info info, Set certainErrors) { + return hasCertainErrors(info.getErrors(), certainErrors); + } + private boolean sameErrors(Set a, Set b) { + return a.equals(b); + } + private boolean sameErrors(IDNA.Info a, IDNA.Info b) { + return sameErrors(a.getErrors(), b.getErrors()); + } + private boolean sameErrors(IDNA.Info a, Set b) { + return sameErrors(a.getErrors(), b); + } + + private static boolean + isASCII(CharSequence str) { + int length=str.length(); + for(int i=0; i=0x80) { + return false; + } + } + return true; + } +}