parent
c2228ba03e
commit
a3d88afb60
3
.gitattributes
vendored
3
.gitattributes
vendored
@ -88,6 +88,8 @@ icu4j/main/classes/core/.classpath -text
|
||||
icu4j/main/classes/core/.project -text
|
||||
icu4j/main/classes/core/.settings/org.eclipse.jdt.core.prefs -text
|
||||
icu4j/main/classes/core/manifest.stub -text
|
||||
icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java -text
|
||||
icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java -text
|
||||
icu4j/main/classes/core/src/com/ibm/icu/text/SpoofChecker.java -text
|
||||
icu4j/main/classes/currdata/.externalToolBuilders/copy-data-currdata.launch -text
|
||||
icu4j/main/classes/currdata/.settings/org.eclipse.jdt.core.prefs -text
|
||||
@ -142,6 +144,7 @@ icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/BidiTest.txt -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/confusables.txt -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/confusablesWholeScript.txt -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/test/bidi/BiDiConformanceTest.java -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/UTS46Test.java -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.OlsonTimeZone.dat -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.impl.TimeZoneAdapter.dat -text
|
||||
icu4j/main/tests/core/src/com/ibm/icu/dev/test/serializable/data/ICU_3.6/com.ibm.icu.math.BigDecimal.dat -text
|
||||
|
437
icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java
Normal file
437
icu4j/main/classes/core/src/com/ibm/icu/impl/IDNA2003.java
Normal file
@ -0,0 +1,437 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.impl.Punycode;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.StringPrep;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
import com.ibm.icu.text.UCharacterIterator;
|
||||
|
||||
/**
|
||||
* IDNA2003 implementation code, moved out of com.ibm.icu.text.IDNA.java
|
||||
* while extending that class to support IDNA2008/UTS #46 as well.
|
||||
* @author Ram Viswanadha
|
||||
*/
|
||||
public final class IDNA2003 {
|
||||
/* IDNA ACE Prefix is "xn--" */
|
||||
private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
|
||||
//private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
|
||||
|
||||
private static final int MAX_LABEL_LENGTH = 63;
|
||||
private static final int HYPHEN = 0x002D;
|
||||
private static final int CAPITAL_A = 0x0041;
|
||||
private static final int CAPITAL_Z = 0x005A;
|
||||
private static final int LOWER_CASE_DELTA = 0x0020;
|
||||
private static final int FULL_STOP = 0x002E;
|
||||
private static final int MAX_DOMAIN_NAME_LENGTH = 255;
|
||||
|
||||
// The NamePrep profile object
|
||||
private static final StringPrep namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
|
||||
|
||||
private static boolean startsWithPrefix(StringBuffer src){
|
||||
boolean startsWithPrefix = true;
|
||||
|
||||
if(src.length() < ACE_PREFIX.length){
|
||||
return false;
|
||||
}
|
||||
for(int i=0; i<ACE_PREFIX.length;i++){
|
||||
if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
|
||||
startsWithPrefix = false;
|
||||
}
|
||||
}
|
||||
return startsWithPrefix;
|
||||
}
|
||||
|
||||
private static char toASCIILower(char ch){
|
||||
if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
|
||||
return (char)(ch + LOWER_CASE_DELTA);
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
private static StringBuffer toASCIILower(CharSequence src){
|
||||
StringBuffer dest = new StringBuffer();
|
||||
for(int i=0; i<src.length();i++){
|
||||
dest.append(toASCIILower(src.charAt(i)));
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
|
||||
char c1,c2;
|
||||
int rc;
|
||||
for(int i =0;/* no condition */;i++) {
|
||||
/* If we reach the ends of both strings then they match */
|
||||
if(i == s1.length()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
c1 = s1.charAt(i);
|
||||
c2 = s2.charAt(i);
|
||||
|
||||
/* Case-insensitive comparison */
|
||||
if(c1!=c2) {
|
||||
rc=toASCIILower(c1)-toASCIILower(c2);
|
||||
if(rc!=0) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static int getSeparatorIndex(char[] src,int start, int limit){
|
||||
for(; start<limit;start++){
|
||||
if(isLabelSeparator(src[start])){
|
||||
return start;
|
||||
}
|
||||
}
|
||||
// we have not found the separator just return length
|
||||
return start;
|
||||
}
|
||||
|
||||
/*
|
||||
private static int getSeparatorIndex(UCharacterIterator iter){
|
||||
int currentIndex = iter.getIndex();
|
||||
int separatorIndex = 0;
|
||||
int ch;
|
||||
while((ch=iter.next())!= UCharacterIterator.DONE){
|
||||
if(isLabelSeparator(ch)){
|
||||
separatorIndex = iter.getIndex();
|
||||
iter.setIndex(currentIndex);
|
||||
return separatorIndex;
|
||||
}
|
||||
}
|
||||
// reset index
|
||||
iter.setIndex(currentIndex);
|
||||
// we have not found the separator just return the length
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
private static boolean isLDHChar(int ch){
|
||||
// high runner case
|
||||
if(ch>0x007A){
|
||||
return false;
|
||||
}
|
||||
//[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
|
||||
if( (ch==0x002D) ||
|
||||
(0x0030 <= ch && ch <= 0x0039) ||
|
||||
(0x0041 <= ch && ch <= 0x005A) ||
|
||||
(0x0061 <= ch && ch <= 0x007A)
|
||||
){
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ascertain if the given code point is a label separator as
|
||||
* defined by the IDNA RFC
|
||||
*
|
||||
* @param ch The code point to be ascertained
|
||||
* @return true if the char is a label separator
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
private static boolean isLabelSeparator(int ch){
|
||||
switch(ch){
|
||||
case 0x002e:
|
||||
case 0x3002:
|
||||
case 0xFF0E:
|
||||
case 0xFF61:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static StringBuffer convertToASCII(UCharacterIterator src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
boolean[] caseFlags = null;
|
||||
|
||||
// the source contains all ascii codepoints
|
||||
boolean srcIsASCII = true;
|
||||
// assume the source contains all LDH codepoints
|
||||
boolean srcIsLDH = true;
|
||||
|
||||
//get the options
|
||||
boolean useSTD3ASCIIRules = ((options & IDNA.USE_STD3_RULES) != 0);
|
||||
int ch;
|
||||
// step 1
|
||||
while((ch = src.next())!= UCharacterIterator.DONE){
|
||||
if(ch> 0x7f){
|
||||
srcIsASCII = false;
|
||||
}
|
||||
}
|
||||
int failPos = -1;
|
||||
src.setToStart();
|
||||
StringBuffer processOut = null;
|
||||
// step 2 is performed only if the source contains non ASCII
|
||||
if(!srcIsASCII){
|
||||
// step 2
|
||||
processOut = namePrep.prepare(src, options);
|
||||
}else{
|
||||
processOut = new StringBuffer(src.getText());
|
||||
}
|
||||
int poLen = processOut.length();
|
||||
|
||||
if(poLen==0){
|
||||
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
|
||||
}
|
||||
StringBuffer dest = new StringBuffer();
|
||||
|
||||
// reset the variable to verify if output of prepare is ASCII or not
|
||||
srcIsASCII = true;
|
||||
|
||||
// step 3 & 4
|
||||
for(int j=0;j<poLen;j++ ){
|
||||
ch=processOut.charAt(j);
|
||||
if(ch > 0x7F){
|
||||
srcIsASCII = false;
|
||||
}else if(isLDHChar(ch)==false){
|
||||
// here we do not assemble surrogates
|
||||
// since we know that LDH code points
|
||||
// are in the ASCII range only
|
||||
srcIsLDH = false;
|
||||
failPos = j;
|
||||
}
|
||||
}
|
||||
|
||||
if(useSTD3ASCIIRules == true){
|
||||
// verify 3a and 3b
|
||||
if( srcIsLDH == false /* source contains some non-LDH characters */
|
||||
|| processOut.charAt(0) == HYPHEN
|
||||
|| processOut.charAt(processOut.length()-1) == HYPHEN){
|
||||
|
||||
/* populate the parseError struct */
|
||||
if(srcIsLDH==false){
|
||||
throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
processOut.toString(),
|
||||
(failPos>0) ? (failPos-1) : failPos);
|
||||
}else if(processOut.charAt(0) == HYPHEN){
|
||||
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
|
||||
|
||||
}else{
|
||||
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
processOut.toString(),
|
||||
(poLen>0) ? poLen-1 : poLen);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
if(srcIsASCII){
|
||||
dest = processOut;
|
||||
}else{
|
||||
// step 5 : verify the sequence does not begin with ACE prefix
|
||||
if(!startsWithPrefix(processOut)){
|
||||
|
||||
//step 6: encode the sequence with punycode
|
||||
caseFlags = new boolean[poLen];
|
||||
|
||||
StringBuilder punyout = Punycode.encode(processOut,caseFlags);
|
||||
|
||||
// convert all codepoints to lower case ASCII
|
||||
StringBuffer lowerOut = toASCIILower(punyout);
|
||||
|
||||
//Step 7: prepend the ACE prefix
|
||||
dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
|
||||
//Step 6: copy the contents in b2 into dest
|
||||
dest.append(lowerOut);
|
||||
}else{
|
||||
|
||||
throw new StringPrepParseException("The input does not start with the ACE Prefix.",
|
||||
StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
|
||||
}
|
||||
}
|
||||
if(dest.length() > MAX_LABEL_LENGTH){
|
||||
throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
|
||||
StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static StringBuffer convertIDNToASCII(String src,int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
char[] srcArr = src.toCharArray();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int sepIndex=0;
|
||||
int oldSepIndex=0;
|
||||
for(;;){
|
||||
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
|
||||
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
|
||||
//make sure this is not a root label separator.
|
||||
if(!(label.length()==0 && sepIndex==srcArr.length)){
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
||||
result.append(convertToASCII(iter,options));
|
||||
}
|
||||
if(sepIndex==srcArr.length){
|
||||
break;
|
||||
}
|
||||
|
||||
// increment the sepIndex to skip past the separator
|
||||
sepIndex++;
|
||||
oldSepIndex = sepIndex;
|
||||
result.append((char)FULL_STOP);
|
||||
}
|
||||
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
|
||||
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
boolean[] caseFlags = null;
|
||||
|
||||
// the source contains all ascii codepoints
|
||||
boolean srcIsASCII = true;
|
||||
// assume the source contains all LDH codepoints
|
||||
//boolean srcIsLDH = true;
|
||||
|
||||
//get the options
|
||||
//boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
|
||||
|
||||
//int failPos = -1;
|
||||
int ch;
|
||||
int saveIndex = src.getIndex();
|
||||
// step 1: find out if all the codepoints in src are ASCII
|
||||
while((ch=src.next())!= UCharacterIterator.DONE){
|
||||
if(ch>0x7F){
|
||||
srcIsASCII = false;
|
||||
}/*else if((srcIsLDH = isLDHChar(ch))==false){
|
||||
failPos = src.getIndex();
|
||||
}*/
|
||||
}
|
||||
StringBuffer processOut;
|
||||
|
||||
if(srcIsASCII == false){
|
||||
try {
|
||||
// step 2: process the string
|
||||
src.setIndex(saveIndex);
|
||||
processOut = namePrep.prepare(src,options);
|
||||
} catch (StringPrepParseException ex) {
|
||||
return new StringBuffer(src.getText());
|
||||
}
|
||||
|
||||
}else{
|
||||
//just point to source
|
||||
processOut = new StringBuffer(src.getText());
|
||||
}
|
||||
// TODO:
|
||||
// The RFC states that
|
||||
// <quote>
|
||||
// ToUnicode never fails. If any step fails, then the original input
|
||||
// is returned immediately in that step.
|
||||
// </quote>
|
||||
|
||||
//step 3: verify ACE Prefix
|
||||
if(startsWithPrefix(processOut)){
|
||||
StringBuffer decodeOut = null;
|
||||
|
||||
//step 4: Remove the ACE Prefix
|
||||
String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
|
||||
|
||||
//step 5: Decode using punycode
|
||||
try {
|
||||
decodeOut = new StringBuffer(Punycode.decode(temp,caseFlags));
|
||||
} catch (StringPrepParseException e) {
|
||||
decodeOut = null;
|
||||
}
|
||||
|
||||
//step 6:Apply toASCII
|
||||
if (decodeOut != null) {
|
||||
StringBuffer toASCIIOut = convertToASCII(UCharacterIterator.getInstance(decodeOut), options);
|
||||
|
||||
//step 7: verify
|
||||
if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
|
||||
// throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
|
||||
// StringPrepParseException.VERIFICATION_ERROR);
|
||||
decodeOut = null;
|
||||
}
|
||||
}
|
||||
|
||||
//step 8: return output of step 5
|
||||
if (decodeOut != null) {
|
||||
return decodeOut;
|
||||
}
|
||||
}
|
||||
|
||||
// }else{
|
||||
// // verify that STD3 ASCII rules are satisfied
|
||||
// if(useSTD3ASCIIRules == true){
|
||||
// if( srcIsLDH == false /* source contains some non-LDH characters */
|
||||
// || processOut.charAt(0) == HYPHEN
|
||||
// || processOut.charAt(processOut.length()-1) == HYPHEN){
|
||||
//
|
||||
// if(srcIsLDH==false){
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
|
||||
// (failPos>0) ? (failPos-1) : failPos);
|
||||
// }else if(processOut.charAt(0) == HYPHEN){
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
// processOut.toString(),0);
|
||||
//
|
||||
// }else{
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
// processOut.toString(),
|
||||
// processOut.length());
|
||||
//
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// // just return the source
|
||||
// return new StringBuffer(src.getText());
|
||||
// }
|
||||
|
||||
return new StringBuffer(src.getText());
|
||||
}
|
||||
|
||||
public static StringBuffer convertIDNToUnicode(String src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
char[] srcArr = src.toCharArray();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int sepIndex=0;
|
||||
int oldSepIndex=0;
|
||||
for(;;){
|
||||
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
|
||||
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
|
||||
if(label.length()==0 && sepIndex!=srcArr.length ){
|
||||
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
|
||||
}
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
||||
result.append(convertToUnicode(iter,options));
|
||||
if(sepIndex==srcArr.length){
|
||||
break;
|
||||
}
|
||||
// Unlike the ToASCII operation we don't normalize the label separators
|
||||
result.append(srcArr[sepIndex]);
|
||||
// increment the sepIndex to skip past the separator
|
||||
sepIndex++;
|
||||
oldSepIndex =sepIndex;
|
||||
}
|
||||
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
|
||||
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public static int compare(String s1, String s2, int options) throws StringPrepParseException{
|
||||
StringBuffer s1Out = convertIDNToASCII(s1, options);
|
||||
StringBuffer s2Out = convertIDNToASCII(s2, options);
|
||||
return compareCaseInsensitiveASCII(s1Out,s2Out);
|
||||
}
|
||||
}
|
@ -8,7 +8,6 @@ package com.ibm.icu.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.MissingResourceException;
|
||||
|
||||
import com.ibm.icu.text.Normalizer;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
@ -328,13 +327,12 @@ public final class Norm2AllModes {
|
||||
private static CacheBase<String, Norm2AllModes, InputStream> cache =
|
||||
new SoftCache<String, Norm2AllModes, InputStream>() {
|
||||
protected Norm2AllModes createInstance(String key, InputStream data) {
|
||||
Normalizer2Impl impl;
|
||||
if(data==null) {
|
||||
throw new MissingResourceException(
|
||||
"No Normalizer2 data name \""+key+"\" cached, and InputStream is null",
|
||||
"Normalizer2",
|
||||
key);
|
||||
impl=new Normalizer2Impl().load(ICUResourceBundle.ICU_BUNDLE+"/"+key+".nrm");
|
||||
} else {
|
||||
impl=new Normalizer2Impl().load(data);
|
||||
}
|
||||
Normalizer2Impl impl=new Normalizer2Impl().load(data);
|
||||
return new Norm2AllModes(impl);
|
||||
}
|
||||
};
|
||||
|
@ -340,6 +340,9 @@ public final class Normalizer2Impl {
|
||||
* @draft ICU 4.6
|
||||
*/
|
||||
public static boolean equal(CharSequence s1, CharSequence s2) {
|
||||
if(s1==s2) {
|
||||
return true;
|
||||
}
|
||||
int length=s1.length();
|
||||
if(length!=s2.length()) {
|
||||
return false;
|
||||
@ -368,6 +371,9 @@ public final class Normalizer2Impl {
|
||||
if((limit1-start1)!=(limit2-start2)) {
|
||||
return false;
|
||||
}
|
||||
if(s1==s2 && start1==start2) {
|
||||
return true;
|
||||
}
|
||||
while(start1<limit1) {
|
||||
if(s1.charAt(start1++)!=s2.charAt(start2++)) {
|
||||
return false;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2009, International Business Machines Corporation and *
|
||||
* Copyright (C) 2003-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -132,7 +132,7 @@ public final class Punycode {
|
||||
* @param caseFlags The boolean array of case flags.
|
||||
* @return An array of ASCII code points.
|
||||
*/
|
||||
public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws StringPrepParseException{
|
||||
public static StringBuilder encode(CharSequence src, boolean[] caseFlags) throws StringPrepParseException{
|
||||
|
||||
int[] cpBuffer = new int[MAX_CP_COUNT];
|
||||
int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
|
||||
@ -140,7 +140,7 @@ public final class Punycode {
|
||||
int srcLength = src.length();
|
||||
int destCapacity = MAX_CP_COUNT;
|
||||
char[] dest = new char[destCapacity];
|
||||
StringBuffer result = new StringBuffer();
|
||||
StringBuilder result = new StringBuilder();
|
||||
/*
|
||||
* Handle the basic code points and
|
||||
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
|
||||
@ -290,12 +290,12 @@ public final class Punycode {
|
||||
*
|
||||
* @param src The source of the string buffer being passed.
|
||||
* @param caseFlags The array of boolean case flags.
|
||||
* @return StringBuffer string.
|
||||
* @return StringBuilder string.
|
||||
*/
|
||||
public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
|
||||
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
|
||||
throws StringPrepParseException{
|
||||
int srcLength = src.length();
|
||||
StringBuffer result = new StringBuffer();
|
||||
StringBuilder result = new StringBuilder();
|
||||
int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
|
||||
destCPCount, firstSupplementaryIndex, cpLength;
|
||||
char b;
|
||||
|
739
icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
Normal file
739
icu4j/main/classes/core/src/com/ibm/icu/impl/UTS46.java
Normal file
@ -0,0 +1,739 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.util.EnumSet;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UCharacterDirection;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.StringPrepParseException;
|
||||
|
||||
// Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
|
||||
//
|
||||
// The domain name length limit is 255 octets in an internal DNS representation
|
||||
// where the last ("root") label is the empty label
|
||||
// represented by length byte 0 alone.
|
||||
// In a conventional string, this translates to 253 characters, or 254
|
||||
// if there is a trailing dot for the root label.
|
||||
|
||||
/**
|
||||
* UTS #46 (IDNA2008) implementation.
|
||||
* @author Markus Scherer
|
||||
* @since 2010jul09
|
||||
*/
|
||||
public final class UTS46 extends IDNA {
|
||||
/**
 * Creates an instance that processes names with the given option bits.
 *
 * @param options option bit set; stored unchanged in the options field
 */
public UTS46(int options) {
    this.options = options;
}
|
||||
|
||||
@Override
|
||||
public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
|
||||
return process(label, true, true, dest, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
|
||||
return process(label, true, false, dest, info);
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
|
||||
process(name, false, true, dest, info);
|
||||
if( dest.length()>=254 && !info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG) &&
|
||||
isASCIIString(dest) &&
|
||||
(dest.length()>254 || dest.charAt(253)!='.')
|
||||
) {
|
||||
addError(info, Error.DOMAIN_NAME_TOO_LONG);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
@Override
|
||||
public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
|
||||
return process(name, false, false, dest, info);
|
||||
}
|
||||
|
||||
private static final Normalizer2 uts46Norm2=
|
||||
Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm
|
||||
final int options;
|
||||
|
||||
// Severe errors which usually result in a U+FFFD replacement character in the result string.
|
||||
private static final EnumSet<Error> severeErrors=EnumSet.of(
|
||||
Error.LEADING_COMBINING_MARK,
|
||||
Error.DISALLOWED,
|
||||
Error.PUNYCODE,
|
||||
Error.LABEL_HAS_DOT,
|
||||
Error.INVALID_ACE_LABEL);
|
||||
|
||||
private static boolean
|
||||
isASCIIString(CharSequence dest) {
|
||||
int length=dest.length();
|
||||
for(int i=0; i<length; ++i) {
|
||||
if(dest.charAt(i)>0x7f) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// UTS #46 data for ASCII characters.
|
||||
// The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
|
||||
// and passes through all other ASCII characters.
|
||||
// If USE_STD3_RULES is set, then non-LDH characters are disallowed
|
||||
// using this data.
|
||||
// The ASCII fastpath also uses this data.
|
||||
// Values: -1=disallowed 0==valid 1==mapped (lowercase)
|
||||
private static final byte asciiData[]={
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
// 002D..002E; valid # HYPHEN-MINUS..FULL STOP
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, -1,
|
||||
// 0030..0039; valid # DIGIT ZERO..DIGIT NINE
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1,
|
||||
// 0041..005A; mapped # LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
|
||||
-1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1,
|
||||
// 0061..007A; valid # LATIN SMALL LETTER A..LATIN SMALL LETTER Z
|
||||
-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1
|
||||
};
|
||||
|
||||
private StringBuilder
|
||||
process(CharSequence src,
|
||||
boolean isLabel, boolean toASCII,
|
||||
StringBuilder dest,
|
||||
Info info) {
|
||||
// uts46Norm2.normalize() would do all of this error checking and setup,
|
||||
// but with the ASCII fastpath we do not always call it, and do not
|
||||
// call it first.
|
||||
if(dest==src) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
// Arguments are fine, reset output values.
|
||||
dest.delete(0, 0x7fffffff);
|
||||
resetInfo(info);
|
||||
int srcLength=src.length();
|
||||
if(srcLength==0) {
|
||||
if(toASCII) {
|
||||
addError(info, Error.EMPTY_LABEL);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
// ASCII fastpath
|
||||
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
|
||||
int labelStart=0;
|
||||
int i;
|
||||
for(i=0;; ++i) {
|
||||
if(i==srcLength) {
|
||||
if(toASCII) {
|
||||
if((i-labelStart)>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
// There is a trailing dot if labelStart==i.
|
||||
if(!isLabel && i>=254 && (i>254 || labelStart<i)) {
|
||||
addError(info, Error.DOMAIN_NAME_TOO_LONG);
|
||||
}
|
||||
}
|
||||
promoteAndResetLabelErrors(info);
|
||||
return dest;
|
||||
}
|
||||
char c=src.charAt(i);
|
||||
if(c>0x7f) {
|
||||
break;
|
||||
}
|
||||
int cData=asciiData[c];
|
||||
if(cData>0) {
|
||||
dest.append((char)(c+0x20)); // Lowercase an uppercase ASCII letter.
|
||||
} else if(cData<0 && disallowNonLDHDot) {
|
||||
break; // Replacing with U+FFFD can be complicated for toASCII.
|
||||
} else {
|
||||
dest.append(c);
|
||||
if(c=='-') { // hyphen
|
||||
if(i==(labelStart+3) && src.charAt(i-1)=='-') {
|
||||
// "??--..." is Punycode or forbidden.
|
||||
++i; // '-' was copied to dest already
|
||||
break;
|
||||
}
|
||||
if(i==labelStart) {
|
||||
// label starts with "-"
|
||||
addLabelError(info, Error.LEADING_HYPHEN);
|
||||
}
|
||||
if((i+1)==srcLength || src.charAt(i+1)=='.') {
|
||||
// label ends with "-"
|
||||
addLabelError(info, Error.TRAILING_HYPHEN);
|
||||
}
|
||||
} else if(c=='.') { // dot
|
||||
if(isLabel) {
|
||||
// Replacing with U+FFFD can be complicated for toASCII.
|
||||
++i; // '.' was copied to dest already
|
||||
break;
|
||||
}
|
||||
if(toASCII) {
|
||||
// Permit an empty label at the end but not elsewhere.
|
||||
if(i==labelStart && i<(srcLength-1)) {
|
||||
addLabelError(info, Error.EMPTY_LABEL);
|
||||
} else if((i-labelStart)>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
}
|
||||
promoteAndResetLabelErrors(info);
|
||||
labelStart=i+1;
|
||||
}
|
||||
}
|
||||
}
|
||||
promoteAndResetLabelErrors(info);
|
||||
processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
|
||||
if( isBiDi(info) && !hasCertainErrors(info, severeErrors) &&
|
||||
(!isOkBiDi(info) || (labelStart>0 && !isASCIIOkBiDi(dest, labelStart)))
|
||||
) {
|
||||
addError(info, Error.BIDI);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
private StringBuilder
|
||||
processUnicode(CharSequence src,
|
||||
int labelStart, int mappingStart,
|
||||
boolean isLabel, boolean toASCII,
|
||||
StringBuilder dest,
|
||||
Info info) {
|
||||
if(mappingStart==0) {
|
||||
uts46Norm2.normalize(src, dest);
|
||||
} else {
|
||||
uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
|
||||
}
|
||||
boolean doMapDevChars=
|
||||
toASCII ? (options&NONTRANSITIONAL_TO_ASCII)==0 :
|
||||
(options&NONTRANSITIONAL_TO_UNICODE)==0;
|
||||
int destLength=dest.length();
|
||||
int labelLimit=labelStart;
|
||||
while(labelLimit<destLength) {
|
||||
char c=dest.charAt(labelLimit);
|
||||
if(c=='.' && !isLabel) {
|
||||
int labelLength=labelLimit-labelStart;
|
||||
int newLength=processLabel(dest, labelStart, labelLength,
|
||||
toASCII, info);
|
||||
promoteAndResetLabelErrors(info);
|
||||
destLength+=newLength-labelLength;
|
||||
labelLimit=labelStart+=newLength+1;
|
||||
} else if(0xdf<=c && c<=0x200d && (c==0xdf || c==0x3c2 || c>=0x200c)) {
|
||||
setTransitionalDifferent(info);
|
||||
if(doMapDevChars) {
|
||||
destLength=mapDevChars(dest, labelStart, labelLimit);
|
||||
// Do not increment labelLimit in case c was removed.
|
||||
// All deviation characters have been mapped, no need to check for them again.
|
||||
doMapDevChars=false;
|
||||
} else {
|
||||
++labelLimit;
|
||||
}
|
||||
} else {
|
||||
++labelLimit;
|
||||
}
|
||||
}
|
||||
// Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
|
||||
// but not an empty label elsewhere nor a completely empty domain name.
|
||||
// processLabel() sets UIDNA_ERROR_EMPTY_LABEL when labelLength==0.
|
||||
if(0==labelStart || labelStart<labelLimit) {
|
||||
processLabel(dest, labelStart, labelLimit-labelStart, toASCII, info);
|
||||
promoteAndResetLabelErrors(info);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
// returns the new dest.length()
|
||||
private int
|
||||
mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
|
||||
int length=dest.length();
|
||||
boolean didMapDevChars=false;
|
||||
for(int i=mappingStart; i<length;) {
|
||||
char c=dest.charAt(i);
|
||||
switch(c) {
|
||||
case 0xdf:
|
||||
// Map sharp s to ss.
|
||||
didMapDevChars=true;
|
||||
dest.setCharAt(i++, 's');
|
||||
dest.insert(i++, 's');
|
||||
++length;
|
||||
break;
|
||||
case 0x3c2: // Map final sigma to nonfinal sigma.
|
||||
didMapDevChars=true;
|
||||
dest.setCharAt(i++, '\u03c3');
|
||||
break;
|
||||
case 0x200c: // Ignore/remove ZWNJ.
|
||||
case 0x200d: // Ignore/remove ZWJ.
|
||||
didMapDevChars=true;
|
||||
dest.delete(i, i+1);
|
||||
--length;
|
||||
break;
|
||||
default:
|
||||
++i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(didMapDevChars) {
|
||||
// Mapping deviation characters might have resulted in an un-NFC string.
|
||||
// We could use either the NFC or the UTS #46 normalizer.
|
||||
// By using the UTS #46 normalizer again, we avoid having to load a second .nrm data file.
|
||||
String normalized=uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
|
||||
dest.replace(labelStart, 0x7fffffff, normalized);
|
||||
return dest.length();
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
// Replace the label in dest with the label string, if the label was modified.
|
||||
// If label==dest then the label was modified in-place and labelLength
|
||||
// is the new label length, different from label.length().
|
||||
// If label!=dest then labelLength==label.length().
|
||||
// Returns labelLength (= the new label length).
|
||||
private static int
|
||||
replaceLabel(StringBuilder dest, int destLabelStart, int destLabelLength,
|
||||
CharSequence label, int labelLength) {
|
||||
if(label!=dest) {
|
||||
dest.delete(destLabelStart, destLabelStart+destLabelLength).insert(destLabelStart, label);
|
||||
// or dest.replace(destLabelStart, destLabelStart+destLabelLength, label.toString());
|
||||
// which would create a String rather than moving characters in the StringBuilder.
|
||||
}
|
||||
return labelLength;
|
||||
}
|
||||
|
||||
// returns the new label length
|
||||
private int
|
||||
processLabel(StringBuilder dest,
|
||||
int labelStart, int labelLength,
|
||||
boolean toASCII,
|
||||
Info info) {
|
||||
StringBuilder fromPunycode;
|
||||
StringBuilder labelString;
|
||||
int destLabelStart=labelStart;
|
||||
int destLabelLength=labelLength;
|
||||
boolean wasPunycode;
|
||||
if( labelLength>=4 &&
|
||||
dest.charAt(labelStart)=='x' && dest.charAt(labelStart+1)=='n' &&
|
||||
dest.charAt(labelStart+2)=='-' && dest.charAt(labelStart+3)=='-'
|
||||
) {
|
||||
// Label starts with "xn--", try to un-Punycode it.
|
||||
wasPunycode=true;
|
||||
try {
|
||||
fromPunycode=Punycode.decode(dest.subSequence(labelStart+4, labelStart+labelLength), null);
|
||||
} catch (StringPrepParseException e) {
|
||||
addLabelError(info, Error.PUNYCODE);
|
||||
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
|
||||
}
|
||||
// Check for NFC, and for characters that are not
|
||||
// valid or deviation characters according to the normalizer.
|
||||
// If there is something wrong, then the string will change.
|
||||
// Note that the normalizer passes through non-LDH ASCII and deviation characters.
|
||||
// Deviation characters are ok in Punycode even in transitional processing.
|
||||
// In the code further below, if we find non-LDH ASCII and we have UIDNA_USE_STD3_RULES
|
||||
// then we will set UIDNA_ERROR_INVALID_ACE_LABEL there too.
|
||||
boolean isValid=uts46Norm2.isNormalized(fromPunycode);
|
||||
if(!isValid) {
|
||||
addLabelError(info, Error.INVALID_ACE_LABEL);
|
||||
return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
|
||||
}
|
||||
labelString=fromPunycode;
|
||||
labelStart=0;
|
||||
labelLength=fromPunycode.length();
|
||||
} else {
|
||||
wasPunycode=false;
|
||||
labelString=dest;
|
||||
}
|
||||
// Validity check
|
||||
if(labelLength==0) {
|
||||
if(toASCII) {
|
||||
addLabelError(info, Error.EMPTY_LABEL);
|
||||
}
|
||||
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
|
||||
}
|
||||
// labelLength>0
|
||||
if(labelLength>=4 && labelString.charAt(labelStart+2)=='-' && labelString.charAt(labelStart+3)=='-') {
|
||||
// label starts with "??--"
|
||||
addLabelError(info, Error.HYPHEN_3_4);
|
||||
}
|
||||
if(labelString.charAt(labelStart)=='-') {
|
||||
// label starts with "-"
|
||||
addLabelError(info, Error.LEADING_HYPHEN);
|
||||
}
|
||||
if(labelString.charAt(labelStart+labelLength-1)=='-') {
|
||||
// label ends with "-"
|
||||
addLabelError(info, Error.TRAILING_HYPHEN);
|
||||
}
|
||||
// If the label was not a Punycode label, then it was the result of
|
||||
// mapping, normalization and label segmentation.
|
||||
// If the label was in Punycode, then we mapped it again above
|
||||
// and checked its validity.
|
||||
// Now we handle the STD3 restriction to LDH characters (if set)
|
||||
// and we look for U+FFFD which indicates disallowed characters
|
||||
// in a non-Punycode label or U+FFFD itself in a Punycode label.
|
||||
// We also check for dots which can come from the input to a single-label function.
|
||||
// Ok to cast away const because we own the UnicodeString.
|
||||
int i=labelStart;
|
||||
int limit=labelStart+labelLength;
|
||||
char oredChars=0;
|
||||
// If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
|
||||
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
|
||||
do {
|
||||
char c=labelString.charAt(i);
|
||||
if(c<=0x7f) {
|
||||
if(c=='.') {
|
||||
addLabelError(info, Error.LABEL_HAS_DOT);
|
||||
labelString.setCharAt(i, '\ufffd');
|
||||
} else if(disallowNonLDHDot && asciiData[c]<0) {
|
||||
addLabelError(info, Error.DISALLOWED);
|
||||
labelString.setCharAt(i, '\ufffd');
|
||||
}
|
||||
} else {
|
||||
oredChars|=c;
|
||||
if(c==0xfffd) {
|
||||
addLabelError(info, Error.DISALLOWED);
|
||||
++i;
|
||||
}
|
||||
}
|
||||
++i;
|
||||
} while(i<limit);
|
||||
// Check for a leading combining mark after other validity checks
|
||||
// so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
|
||||
int c;
|
||||
// "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
|
||||
c=labelString.codePointAt(labelStart);
|
||||
if((U_GET_GC_MASK(c)&U_GC_M_MASK)!=0) {
|
||||
addLabelError(info, Error.LEADING_COMBINING_MARK);
|
||||
labelString.setCharAt(labelStart, '\ufffd');
|
||||
if(c>0xffff) {
|
||||
// Remove c's trail surrogate.
|
||||
labelString.deleteCharAt(labelStart+1);
|
||||
--labelLength;
|
||||
if(labelString==dest) {
|
||||
--destLabelLength;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(!hasCertainLabelErrors(info, severeErrors)) {
|
||||
// Do contextual checks only if we do not have U+FFFD from a severe error
|
||||
// because U+FFFD can make these checks fail.
|
||||
if((options&CHECK_BIDI)!=0 && (!isBiDi(info) || isOkBiDi(info))) {
|
||||
checkLabelBiDi(labelString, labelStart, labelLength, info);
|
||||
}
|
||||
if( (options&CHECK_CONTEXTJ)!=0 && (oredChars&0x200c)==0x200c &&
|
||||
!isLabelOkContextJ(labelString, labelStart, labelLength)
|
||||
) {
|
||||
addLabelError(info, Error.CONTEXTJ);
|
||||
}
|
||||
if(toASCII) {
|
||||
if(wasPunycode) {
|
||||
// Leave a Punycode label unchanged if it has no severe errors.
|
||||
if(destLabelLength>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
return destLabelLength;
|
||||
} else if(oredChars>=0x80) {
|
||||
// Contains non-ASCII characters.
|
||||
StringBuilder punycode;
|
||||
try {
|
||||
punycode=Punycode.encode(labelString.subSequence(labelStart, labelStart+labelLength), null);
|
||||
} catch (StringPrepParseException e) {
|
||||
throw new RuntimeException(e); // unexpected
|
||||
}
|
||||
punycode.insert(0, "xn--");
|
||||
if(punycode.length()>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
return replaceLabel(dest, destLabelStart, destLabelLength,
|
||||
punycode, punycode.length());
|
||||
} else {
|
||||
// all-ASCII label
|
||||
if(labelLength>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If a Punycode label has severe errors,
|
||||
// then leave it but make sure it does not look valid.
|
||||
if(wasPunycode) {
|
||||
addLabelError(info, Error.INVALID_ACE_LABEL);
|
||||
return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
|
||||
}
|
||||
}
|
||||
return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
|
||||
}
|
||||
private int
|
||||
markBadACELabel(StringBuilder dest,
|
||||
int labelStart, int labelLength,
|
||||
boolean toASCII, Info info) {
|
||||
boolean disallowNonLDHDot=(options&USE_STD3_RULES)!=0;
|
||||
boolean isASCII=true;
|
||||
boolean onlyLDH=true;
|
||||
int i=labelStart+4; // After the initial "xn--".
|
||||
int limit=labelStart+labelLength;
|
||||
do {
|
||||
char c=dest.charAt(i);
|
||||
if(c<=0x7f) {
|
||||
if(c=='.') {
|
||||
addLabelError(info, Error.LABEL_HAS_DOT);
|
||||
dest.setCharAt(i, '\ufffd');
|
||||
isASCII=onlyLDH=false;
|
||||
} else if(asciiData[c]<0) {
|
||||
onlyLDH=false;
|
||||
if(disallowNonLDHDot) {
|
||||
dest.setCharAt(i, '\ufffd');
|
||||
isASCII=false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
isASCII=onlyLDH=false;
|
||||
}
|
||||
} while(++i<limit);
|
||||
if(onlyLDH) {
|
||||
dest.insert(labelStart+labelLength, '\ufffd');
|
||||
++labelLength;
|
||||
} else {
|
||||
if(toASCII && isASCII && labelLength>63) {
|
||||
addLabelError(info, Error.LABEL_TOO_LONG);
|
||||
}
|
||||
}
|
||||
return labelLength;
|
||||
}
|
||||
|
||||
private static final int L_MASK=U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
|
||||
private static final int R_AL_MASK=
|
||||
U_MASK(UCharacterDirection.RIGHT_TO_LEFT)|
|
||||
U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
|
||||
private static final int L_R_AL_MASK=L_MASK|R_AL_MASK;
|
||||
|
||||
private static final int R_AL_AN_MASK=R_AL_MASK|U_MASK(UCharacterDirection.ARABIC_NUMBER);
|
||||
|
||||
private static final int EN_AN_MASK=
|
||||
U_MASK(UCharacterDirection.EUROPEAN_NUMBER)|
|
||||
U_MASK(UCharacterDirection.ARABIC_NUMBER);
|
||||
private static final int R_AL_EN_AN_MASK=R_AL_MASK|EN_AN_MASK;
|
||||
private static final int L_EN_MASK=L_MASK|U_MASK(UCharacterDirection.EUROPEAN_NUMBER);
|
||||
|
||||
private static final int ES_CS_ET_ON_BN_NSM_MASK=
|
||||
U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)|
|
||||
U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)|
|
||||
U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)|
|
||||
U_MASK(UCharacterDirection.OTHER_NEUTRAL)|
|
||||
U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)|
|
||||
U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
|
||||
private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK=L_EN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
|
||||
private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK=R_AL_MASK|EN_AN_MASK|ES_CS_ET_ON_BN_NSM_MASK;
|
||||
|
||||
// We scan the whole label and check both for whether it contains RTL characters
|
||||
// and whether it passes the BiDi Rule.
|
||||
// In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
|
||||
// that a domain name is a BiDi domain name (has an RTL label) only after
|
||||
// processing several earlier labels.
|
||||
private void
|
||||
checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
|
||||
// IDNA2008 BiDi rule
|
||||
// Get the directionality of the first character.
|
||||
int c;
|
||||
int i=labelStart;
|
||||
c=Character.codePointAt(label, i);
|
||||
i+=Character.charCount(c);
|
||||
int firstMask=U_MASK(UCharacter.getDirection(c));
|
||||
// 1. The first character must be a character with BIDI property L, R
|
||||
// or AL. If it has the R or AL property, it is an RTL label; if it
|
||||
// has the L property, it is an LTR label.
|
||||
if((firstMask&~L_R_AL_MASK)!=0) {
|
||||
setNotOkBiDi(info);
|
||||
}
|
||||
// Get the directionality of the last non-NSM character.
|
||||
int lastMask;
|
||||
int labelLimit=labelStart+labelLength;
|
||||
for(;;) {
|
||||
if(i>=labelLimit) {
|
||||
lastMask=firstMask;
|
||||
break;
|
||||
}
|
||||
c=Character.codePointBefore(label, labelLimit);
|
||||
labelLimit-=Character.charCount(c);
|
||||
int dir=UCharacter.getDirection(c);
|
||||
if(dir!=UCharacterDirection.DIR_NON_SPACING_MARK) {
|
||||
lastMask=U_MASK(dir);
|
||||
break;
|
||||
}
|
||||
}
|
||||
// 3. In an RTL label, the end of the label must be a character with
|
||||
// BIDI property R, AL, EN or AN, followed by zero or more
|
||||
// characters with BIDI property NSM.
|
||||
// 6. In an LTR label, the end of the label must be a character with
|
||||
// BIDI property L or EN, followed by zero or more characters with
|
||||
// BIDI property NSM.
|
||||
if( (firstMask&L_MASK)!=0 ?
|
||||
(lastMask&~L_EN_MASK)!=0 :
|
||||
(lastMask&~R_AL_EN_AN_MASK)!=0
|
||||
) {
|
||||
setNotOkBiDi(info);
|
||||
}
|
||||
// Get the directionalities of the intervening characters.
|
||||
int mask=0;
|
||||
while(i<labelLimit) {
|
||||
c=Character.codePointAt(label, i);
|
||||
i+=Character.charCount(c);
|
||||
mask|=U_MASK(UCharacter.getDirection(c));
|
||||
}
|
||||
if((firstMask&L_MASK)!=0) {
|
||||
// 5. In an LTR label, only characters with the BIDI properties L, EN,
|
||||
// ES, CS, ET, ON, BN and NSM are allowed.
|
||||
if((mask&~L_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
|
||||
setNotOkBiDi(info);
|
||||
}
|
||||
} else {
|
||||
// 2. In an RTL label, only characters with the BIDI properties R, AL,
|
||||
// AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
|
||||
if((mask&~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK)!=0) {
|
||||
setNotOkBiDi(info);
|
||||
}
|
||||
// 4. In an RTL label, if an EN is present, no AN may be present, and
|
||||
// vice versa.
|
||||
if((mask&EN_AN_MASK)==EN_AN_MASK) {
|
||||
setNotOkBiDi(info);
|
||||
}
|
||||
}
|
||||
// An RTL label is a label that contains at least one character of type
|
||||
// R, AL or AN. [...]
|
||||
// A "BIDI domain name" is a domain name that contains at least one RTL
|
||||
// label. [...]
|
||||
// The following rule, consisting of six conditions, applies to labels
|
||||
// in BIDI domain names.
|
||||
if(((firstMask|mask|lastMask)&R_AL_AN_MASK)!=0) {
|
||||
setBiDi(info);
|
||||
}
|
||||
}
|
||||
|
||||
// Special code for the ASCII prefix of a BiDi domain name.
|
||||
// The ASCII prefix is all-LTR.
|
||||
|
||||
// IDNA2008 BiDi rule, parts relevant to ASCII labels:
|
||||
// 1. The first character must be a character with BIDI property L [...]
|
||||
// 5. In an LTR label, only characters with the BIDI properties L, EN,
|
||||
// ES, CS, ET, ON, BN and NSM are allowed.
|
||||
// 6. In an LTR label, the end of the label must be a character with
|
||||
// BIDI property L or EN [...]
|
||||
|
||||
// UTF-16 version, called for mapped ASCII prefix.
|
||||
// Cannot contain uppercase A-Z.
|
||||
// s[length-1] must be the trailing dot.
|
||||
private static boolean
|
||||
isASCIIOkBiDi(CharSequence s, int length) {
|
||||
int labelStart=0;
|
||||
for(int i=0; i<length; ++i) {
|
||||
char c=s.charAt(i);
|
||||
if(c=='.') { // dot
|
||||
if(i>labelStart) {
|
||||
c=s.charAt(i-1);
|
||||
if(!('a'<=c && c<='z') && !('0'<=c && c<='9')) {
|
||||
// Last character in the label is not an L or EN.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
labelStart=i+1;
|
||||
} else if(i==labelStart) {
|
||||
if(!('a'<=c && c<='z')) {
|
||||
// First character in the label is not an L.
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if(c<=0x20 && (c>=0x1c || (9<=c && c<=0xd))) {
|
||||
// Intermediate character in the label is a B, S or WS.
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean
|
||||
isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
|
||||
// [IDNA2008-Tables]
|
||||
// 200C..200D ; CONTEXTJ # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
|
||||
int labelLimit=labelStart+labelLength;
|
||||
for(int i=labelStart; i<labelLimit; ++i) {
|
||||
if(label.charAt(i)==0x200c) {
|
||||
// Appendix A.1. ZERO WIDTH NON-JOINER
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
|
||||
// If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
|
||||
// (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
|
||||
if(i==labelStart) {
|
||||
return false;
|
||||
}
|
||||
int c;
|
||||
int j=i;
|
||||
c=Character.codePointBefore(label, j);
|
||||
j-=Character.charCount(c);
|
||||
if(UCharacter.getCombiningClass(c)==9) {
|
||||
continue;
|
||||
}
|
||||
// check precontext (Joining_Type:{L,D})(Joining_Type:T)*
|
||||
for(;;) {
|
||||
/* UJoiningType */ int type=UCharacter.getIntPropertyValue(c, UProperty.JOINING_TYPE);
|
||||
if(type==UCharacter.JoiningType.TRANSPARENT) {
|
||||
if(j==0) {
|
||||
return false;
|
||||
}
|
||||
c=Character.codePointBefore(label, j);
|
||||
j-=Character.charCount(c);
|
||||
} else if(type==UCharacter.JoiningType.LEFT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
|
||||
break; // precontext fulfilled
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
|
||||
for(j=i+1;;) {
|
||||
if(j==labelLimit) {
|
||||
return false;
|
||||
}
|
||||
c=Character.codePointAt(label, j);
|
||||
j+=Character.charCount(c);
|
||||
/* UJoiningType */ int type=UCharacter.getIntPropertyValue(c, UProperty.JOINING_TYPE);
|
||||
if(type==UCharacter.JoiningType.TRANSPARENT) {
|
||||
// just skip this character
|
||||
} else if(type==UCharacter.JoiningType.RIGHT_JOINING || type==UCharacter.JoiningType.DUAL_JOINING) {
|
||||
break; // postcontext fulfilled
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else if(label.charAt(i)==0x200d) {
|
||||
// Appendix A.2. ZERO WIDTH JOINER (U+200D)
|
||||
// Rule Set:
|
||||
// False;
|
||||
// If Canonical_Combining_Class(Before(cp)) .eq. Virama Then True;
|
||||
if(i==labelStart) {
|
||||
return false;
|
||||
}
|
||||
int c=Character.codePointBefore(label, i);
|
||||
if(UCharacter.getCombiningClass(c)!=9) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// TODO: make public(?) -- in C, these are public in uchar.h
|
||||
private static int U_MASK(int x) {
|
||||
return 1<<x;
|
||||
}
|
||||
private static int U_GET_GC_MASK(int c) {
|
||||
return (1<<UCharacter.getType(c));
|
||||
}
|
||||
private static int U_GC_M_MASK=
|
||||
U_MASK(UCharacterCategory.NON_SPACING_MARK)|
|
||||
U_MASK(UCharacterCategory.ENCLOSING_MARK)|
|
||||
U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
|
||||
}
|
@ -1,17 +1,35 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2009, International Business Machines Corporation and *
|
||||
* Copyright (C) 2003-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import com.ibm.icu.impl.Punycode;
|
||||
import java.util.Collections;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Set;
|
||||
|
||||
import com.ibm.icu.impl.IDNA2003;
|
||||
import com.ibm.icu.impl.UTS46;
|
||||
|
||||
/**
|
||||
*
|
||||
* IDNA API implements the IDNA protocol as defined in the <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
|
||||
* Abstract base class for IDNA processing.
|
||||
* See http://www.unicode.org/reports/tr46/
|
||||
* and http://www.ietf.org/rfc/rfc3490.txt
|
||||
* <p>
|
||||
* The IDNA class is not intended for public subclassing.
|
||||
* <p>
|
||||
* The non-static methods implement UTS #46 and IDNA2008.
|
||||
* IDNA2008 is implemented according to UTS #46, see getUTS46Instance().
|
||||
* <p>
|
||||
* The static methods implement IDNA2003.
|
||||
* <p>
|
||||
* IDNA2003 API Overview:
|
||||
* <p>
|
||||
* The static IDNA API methods implement the IDNA protocol as defined in the
|
||||
* <a href="http://www.ietf.org/rfc/rfc3490.txt">IDNA RFC</a>.
|
||||
* The draft defines 2 operations: ToASCII and ToUnicode. Domain labels
|
||||
* containing non-ASCII code points are required to be processed by
|
||||
* ToASCII operation before passing it to resolver libraries. Domain names
|
||||
@ -30,177 +48,369 @@ import com.ibm.icu.impl.Punycode;
|
||||
* ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string)
|
||||
* ToASCII(ToASCII(ToASCII...(ToASCII(string))) == ToASCII(string).
|
||||
*
|
||||
* @author Ram Viswanadha
|
||||
* @author Ram Viswanadha, Markus Scherer
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public final class IDNA {
|
||||
|
||||
/* IDNA ACE Prefix is "xn--" */
|
||||
private static char[] ACE_PREFIX = new char[]{ 0x0078,0x006E,0x002d,0x002d } ;
|
||||
//private static final int ACE_PREFIX_LENGTH = ACE_PREFIX.length;
|
||||
|
||||
private static final int MAX_LABEL_LENGTH = 63;
|
||||
private static final int HYPHEN = 0x002D;
|
||||
private static final int CAPITAL_A = 0x0041;
|
||||
private static final int CAPITAL_Z = 0x005A;
|
||||
private static final int LOWER_CASE_DELTA = 0x0020;
|
||||
private static final int FULL_STOP = 0x002E;
|
||||
private static final int MAX_DOMAIN_NAME_LENGTH = 255;
|
||||
public abstract class IDNA {
|
||||
/**
|
||||
* Option to prohibit processing of unassigned codepoints in the input and
|
||||
* do not check if the input conforms to STD-3 ASCII rules.
|
||||
*
|
||||
* @see #convertToASCII #convertToUnicode
|
||||
* Default options value: None of the other options are set.
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int DEFAULT = 0x0000;
|
||||
public static final int DEFAULT = 0;
|
||||
/**
|
||||
* Option to allow processing of unassigned codepoints in the input
|
||||
*
|
||||
* @see #convertToASCII #convertToUnicode
|
||||
* Option to allow unassigned code points in domain names and labels.
|
||||
* This option is ignored by the UTS46 implementation.
|
||||
* (UTS #46 disallows unassigned code points.)
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int ALLOW_UNASSIGNED = 0x0001;
|
||||
public static final int ALLOW_UNASSIGNED = 1;
|
||||
/**
|
||||
* Option to check if input conforms to STD-3 ASCII rules
|
||||
*
|
||||
* @see #convertToASCII #convertToUnicode
|
||||
* Option to check whether the input conforms to the STD3 ASCII rules,
|
||||
* for example the restriction of labels to LDH characters
|
||||
* (ASCII Letters, Digits and Hyphen-Minus).
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static final int USE_STD3_RULES = 0x0002;
|
||||
|
||||
// static final singleton object that is initialized
|
||||
// at class initialization time, hence guaranteed to
|
||||
// be initialized and thread safe
|
||||
private static final IDNA singleton = new IDNA();
|
||||
|
||||
// The NamePrep profile object
|
||||
private StringPrep namePrep;
|
||||
|
||||
/* private constructor to prevent construction of the object */
|
||||
// Private so that the class cannot be instantiated from outside.
private IDNA(){
|
||||
// Obtain the StringPrep instance for the RFC3491_NAMEPREP profile.
namePrep = StringPrep.getInstance(StringPrep.RFC3491_NAMEPREP);
|
||||
}
|
||||
|
||||
private static boolean startsWithPrefix(StringBuffer src){
|
||||
boolean startsWithPrefix = true;
|
||||
|
||||
if(src.length() < ACE_PREFIX.length){
|
||||
return false;
|
||||
}
|
||||
for(int i=0; i<ACE_PREFIX.length;i++){
|
||||
if(toASCIILower(src.charAt(i)) != ACE_PREFIX[i]){
|
||||
startsWithPrefix = false;
|
||||
}
|
||||
}
|
||||
return startsWithPrefix;
|
||||
}
|
||||
|
||||
private static char toASCIILower(char ch){
|
||||
if(CAPITAL_A <= ch && ch <= CAPITAL_Z){
|
||||
return (char)(ch + LOWER_CASE_DELTA);
|
||||
}
|
||||
return ch;
|
||||
}
|
||||
|
||||
private static StringBuffer toASCIILower(StringBuffer src){
|
||||
StringBuffer dest = new StringBuffer();
|
||||
for(int i=0; i<src.length();i++){
|
||||
dest.append(toASCIILower(src.charAt(i)));
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
private static int compareCaseInsensitiveASCII(StringBuffer s1, StringBuffer s2){
|
||||
char c1,c2;
|
||||
int rc;
|
||||
for(int i =0;/* no condition */;i++) {
|
||||
/* If we reach the ends of both strings then they match */
|
||||
if(i == s1.length()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
c1 = s1.charAt(i);
|
||||
c2 = s2.charAt(i);
|
||||
|
||||
/* Case-insensitive comparison */
|
||||
if(c1!=c2) {
|
||||
rc=toASCIILower(c1)-toASCIILower(c2);
|
||||
if(rc!=0) {
|
||||
return rc;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static int getSeparatorIndex(char[] src,int start, int limit){
|
||||
for(; start<limit;start++){
|
||||
if(isLabelSeparator(src[start])){
|
||||
return start;
|
||||
}
|
||||
}
|
||||
// we have not found the separator just return length
|
||||
return start;
|
||||
}
|
||||
|
||||
/*
|
||||
private static int getSeparatorIndex(UCharacterIterator iter){
|
||||
int currentIndex = iter.getIndex();
|
||||
int separatorIndex = 0;
|
||||
int ch;
|
||||
while((ch=iter.next())!= UCharacterIterator.DONE){
|
||||
if(isLabelSeparator(ch)){
|
||||
separatorIndex = iter.getIndex();
|
||||
iter.setIndex(currentIndex);
|
||||
return separatorIndex;
|
||||
}
|
||||
}
|
||||
// reset index
|
||||
iter.setIndex(currentIndex);
|
||||
// we have not found the separator just return the length
|
||||
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
private static boolean isLDHChar(int ch){
|
||||
// high runner case
|
||||
if(ch>0x007A){
|
||||
return false;
|
||||
}
|
||||
//[\\u002D \\u0030-\\u0039 \\u0041-\\u005A \\u0061-\\u007A]
|
||||
if( (ch==0x002D) ||
|
||||
(0x0030 <= ch && ch <= 0x0039) ||
|
||||
(0x0041 <= ch && ch <= 0x005A) ||
|
||||
(0x0061 <= ch && ch <= 0x007A)
|
||||
){
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public static final int USE_STD3_RULES = 2;
|
||||
/**
|
||||
* Ascertain if the given code point is a label separator as
|
||||
* defined by the IDNA RFC
|
||||
*
|
||||
* @param ch The code point to be ascertained
|
||||
* @return true if the char is a label separator
|
||||
* @stable ICU 2.8
|
||||
* IDNA option to check for whether the input conforms to the BiDi rules.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* (IDNA2003 always performs a BiDi check.)
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
private static boolean isLabelSeparator(int ch){
|
||||
switch(ch){
|
||||
case 0x002e:
|
||||
case 0x3002:
|
||||
case 0xFF0E:
|
||||
case 0xFF61:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
public static final int CHECK_BIDI = 4;
|
||||
/**
|
||||
* IDNA option to check for whether the input conforms to the CONTEXTJ rules.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* (The CONTEXTJ check is new in IDNA2008.)
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int CHECK_CONTEXTJ = 8;
|
||||
/**
|
||||
* IDNA option for nontransitional processing in ToASCII().
|
||||
* By default, ToASCII() uses transitional processing.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int NONTRANSITIONAL_TO_ASCII = 0x10;
|
||||
/**
|
||||
* IDNA option for nontransitional processing in ToUnicode().
|
||||
* By default, ToUnicode() uses transitional processing.
|
||||
* This option is ignored by the IDNA2003 implementation.
|
||||
* (This is only relevant for compatibility of newer IDNA implementations with IDNA2003.)
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int NONTRANSITIONAL_TO_UNICODE = 0x20;
|
||||
|
||||
/**
|
||||
* Returns an IDNA instance which implements UTS #46.
|
||||
* Returns an unmodifiable instance, owned by the caller.
|
||||
* Cache it for multiple operations, and discard it when done.
|
||||
* The instance is thread-safe, that is, it can be used concurrently.
|
||||
* <p>
|
||||
* UTS #46 defines Unicode IDNA Compatibility Processing,
|
||||
* updated to the latest version of Unicode and compatible with both
|
||||
* IDNA2003 and IDNA2008.
|
||||
* <p>
|
||||
* The worker functions use transitional processing, including deviation mappings,
|
||||
* unless NONTRANSITIONAL_TO_ASCII or NONTRANSITIONAL_TO_UNICODE
|
||||
* is used in which case the deviation characters are passed through without change.
|
||||
* <p>
|
||||
* Disallowed characters are mapped to U+FFFD.
|
||||
* <p>
|
||||
* Operations with the UTS #46 instance do not support the
|
||||
* ALLOW_UNASSIGNED option.
|
||||
* <p>
|
||||
* By default, the UTS #46 implementation allows all ASCII characters (as valid or mapped).
|
||||
* When the USE_STD3_RULES option is used, ASCII characters other than
|
||||
* letters, digits, hyphen (LDH) and dot/full stop are disallowed and mapped to U+FFFD.
|
||||
*
|
||||
* @param options Bit set to modify the processing and error checking.
|
||||
* @return the UTS #46 IDNA instance, if successful
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static IDNA getUTS46Instance(int options) {
|
||||
// Every call constructs a new UTS46 object with the given option bits.
return new UTS46(options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a single domain name label into its ASCII form for DNS lookup.
|
||||
* If any processing step fails, then info.hasErrors() will be true and
|
||||
* the result might not be an ASCII string.
|
||||
* The label might be modified according to the types of errors.
|
||||
* Labels with severe errors will be left in (or turned into) their Unicode form.
|
||||
*
|
||||
* @param label Input domain name label
|
||||
* @param dest Destination string object
|
||||
* @param info Output container of IDNA processing details.
|
||||
* @return dest
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info);
|
||||
|
||||
/**
|
||||
* Converts a single domain name label into its Unicode form for human-readable display.
|
||||
* If any processing step fails, then info.hasErrors() will be true.
|
||||
* The label might be modified according to the types of errors.
|
||||
*
|
||||
* @param label Input domain name label
|
||||
* @param dest Destination string object
|
||||
* @param info Output container of IDNA processing details.
|
||||
* @return dest
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info);
|
||||
|
||||
/**
|
||||
* Converts a whole domain name into its ASCII form for DNS lookup.
|
||||
* If any processing step fails, then info.hasErrors() will be true and
|
||||
* the result might not be an ASCII string.
|
||||
* The domain name might be modified according to the types of errors.
|
||||
* Labels with severe errors will be left in (or turned into) their Unicode form.
|
||||
*
|
||||
* @param name Input domain name
|
||||
* @param dest Destination string object
|
||||
* @param info Output container of IDNA processing details.
|
||||
* @return dest
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info);
|
||||
|
||||
/**
|
||||
* Converts a whole domain name into its Unicode form for human-readable display.
|
||||
* If any processing step fails, then info.hasErrors() will be true.
|
||||
* The domain name might be modified according to the types of errors.
|
||||
*
|
||||
* @param name Input domain name
|
||||
* @param dest Destination string object
|
||||
* @param info Output container of IDNA processing details.
|
||||
* @return dest
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info);
|
||||
|
||||
/**
|
||||
* Output container for IDNA processing errors.
|
||||
* The Info class is not suitable for subclassing.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Info {
|
||||
/**
|
||||
* Constructor.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Info() {
|
||||
errors=EnumSet.noneOf(Error.class);
|
||||
labelErrors=EnumSet.noneOf(Error.class);
|
||||
isTransDiff=false;
|
||||
isBiDi=false;
|
||||
isOkBiDi=true;
|
||||
}
|
||||
/**
|
||||
* Were there IDNA processing errors?
|
||||
* @return true if there were processing errors
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public boolean hasErrors() { return !errors.isEmpty(); }
|
||||
/**
|
||||
* Returns a set indicating IDNA processing errors.
|
||||
* @return set of processing errors (modifiable, and not null)
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Set<Error> getErrors() { return errors; }
|
||||
/**
|
||||
* Returns true if transitional and nontransitional processing produce different results.
|
||||
* This is the case when the input label or domain name contains
|
||||
* one or more deviation characters outside a Punycode label (see UTS #46).
|
||||
* <ul>
|
||||
* <li>With nontransitional processing, such characters are
|
||||
* copied to the destination string.
|
||||
* <li>With transitional processing, such characters are
|
||||
* mapped (sharp s/sigma) or removed (joiner/nonjoiner).
|
||||
* </ul>
|
||||
* @return true if transitional and nontransitional processing produce different results
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public boolean isTransitionalDifferent() { return isTransDiff; }
|
||||
|
||||
private void reset() {
|
||||
errors.clear();
|
||||
labelErrors.clear();
|
||||
isTransDiff=false;
|
||||
isBiDi=false;
|
||||
isOkBiDi=true;
|
||||
}
|
||||
|
||||
private EnumSet<Error> errors, labelErrors;
|
||||
private boolean isTransDiff;
|
||||
private boolean isBiDi;
|
||||
private boolean isOkBiDi;
|
||||
}
|
||||
|
||||
// The following protected methods give IDNA subclasses access to the private IDNA.Info fields.
|
||||
// The IDNA.Info object also provides intermediate state that is publicly invisible,
|
||||
// avoiding the allocation of another worker object.
|
||||
protected static void resetInfo(Info info) {
|
||||
info.reset();
|
||||
}
|
||||
protected static boolean hasCertainErrors(Info info, EnumSet<Error> errors) {
|
||||
return !info.errors.isEmpty() && !Collections.disjoint(info.errors, errors);
|
||||
}
|
||||
protected static boolean hasCertainLabelErrors(Info info, EnumSet<Error> errors) {
|
||||
return !info.labelErrors.isEmpty() && !Collections.disjoint(info.labelErrors, errors);
|
||||
}
|
||||
protected static void addLabelError(Info info, Error error) {
|
||||
info.labelErrors.add(error);
|
||||
}
|
||||
protected static void promoteAndResetLabelErrors(Info info) {
|
||||
if(!info.labelErrors.isEmpty()) {
|
||||
info.errors.addAll(info.labelErrors);
|
||||
info.labelErrors.clear();
|
||||
}
|
||||
}
|
||||
|
||||
protected static void addError(Info info, Error error) {
|
||||
info.errors.add(error);
|
||||
}
|
||||
protected static void setTransitionalDifferent(Info info) {
|
||||
info.isTransDiff=true;
|
||||
}
|
||||
protected static void setBiDi(Info info) {
|
||||
info.isBiDi=true;
|
||||
}
|
||||
protected static boolean isBiDi(Info info) {
|
||||
return info.isBiDi;
|
||||
}
|
||||
protected static void setNotOkBiDi(Info info) {
|
||||
info.isOkBiDi=false;
|
||||
}
|
||||
protected static boolean isOkBiDi(Info info) {
|
||||
return info.isOkBiDi;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA error bit set values.
|
||||
* When a domain name or label fails a processing step or does not meet the
|
||||
* validity criteria, then one or more of these error bits are set.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static enum Error {
|
||||
/**
|
||||
* A non-final domain name label (or the whole domain name) is empty.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
EMPTY_LABEL,
|
||||
/**
|
||||
* A domain name label is longer than 63 bytes.
|
||||
* (See STD13/RFC1034 3.1. Name space specifications and terminology.)
|
||||
* This is only checked in ToASCII operations, and only if the output label is all-ASCII.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
LABEL_TOO_LONG,
|
||||
/**
|
||||
* A domain name is longer than 255 bytes in its storage form.
|
||||
* (See STD13/RFC1034 3.1. Name space specifications and terminology.)
|
||||
* This is only checked in ToASCII operations, and only if the output domain name is all-ASCII.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
DOMAIN_NAME_TOO_LONG,
|
||||
/**
|
||||
* A label starts with a hyphen-minus ('-').
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
LEADING_HYPHEN,
|
||||
/**
|
||||
* A label ends with a hyphen-minus ('-').
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
TRAILING_HYPHEN,
|
||||
/**
|
||||
* A label contains hyphen-minus ('-') in the third and fourth positions.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
HYPHEN_3_4,
|
||||
/**
|
||||
* A label starts with a combining mark.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
LEADING_COMBINING_MARK,
|
||||
/**
|
||||
* A label or domain name contains disallowed characters.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
DISALLOWED,
|
||||
/**
|
||||
* A label starts with "xn--" but does not contain valid Punycode.
|
||||
* That is, an xn-- label failed Punycode decoding.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
PUNYCODE,
|
||||
/**
|
||||
* A label contains a dot=full stop.
|
||||
* This can occur in an input string for a single-label function.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
LABEL_HAS_DOT,
|
||||
/**
|
||||
* An ACE label does not contain a valid label string.
|
||||
* The label was successfully ACE (Punycode) decoded but the resulting
|
||||
* string had severe validation errors. For example,
|
||||
* it might contain characters that are not allowed in ACE labels,
|
||||
* or it might not be normalized.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
INVALID_ACE_LABEL,
|
||||
/**
|
||||
* A label does not meet the IDNA BiDi requirements (for right-to-left characters).
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
BIDI,
|
||||
/**
|
||||
* A label does not meet the IDNA CONTEXTJ requirements.
|
||||
* @draft ICU 4.6
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
CONTEXTJ
|
||||
}
|
||||
|
||||
/**
|
||||
* Sole constructor. (For invocation by subclass constructors, typically implicit.)
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
protected IDNA() {
|
||||
}
|
||||
|
||||
/* IDNA2003 API ------------------------------------------------------------- */
|
||||
|
||||
/**
|
||||
* IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* ASCII names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -231,7 +441,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* ASCII names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -261,7 +471,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA2003: This function implements the ToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* ASCII names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -286,115 +496,11 @@ public final class IDNA {
|
||||
*/
|
||||
public static StringBuffer convertToASCII(UCharacterIterator src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
boolean[] caseFlags = null;
|
||||
|
||||
// the source contains all ascii codepoints
|
||||
boolean srcIsASCII = true;
|
||||
// assume the source contains all LDH codepoints
|
||||
boolean srcIsLDH = true;
|
||||
|
||||
//get the options
|
||||
boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
|
||||
int ch;
|
||||
// step 1
|
||||
while((ch = src.next())!= UCharacterIterator.DONE){
|
||||
if(ch> 0x7f){
|
||||
srcIsASCII = false;
|
||||
}
|
||||
}
|
||||
int failPos = -1;
|
||||
src.setToStart();
|
||||
StringBuffer processOut = null;
|
||||
// step 2 is performed only if the source contains non ASCII
|
||||
if(!srcIsASCII){
|
||||
// step 2
|
||||
processOut = singleton.namePrep.prepare(src, options);
|
||||
}else{
|
||||
processOut = new StringBuffer(src.getText());
|
||||
}
|
||||
int poLen = processOut.length();
|
||||
|
||||
if(poLen==0){
|
||||
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
|
||||
}
|
||||
StringBuffer dest = new StringBuffer();
|
||||
|
||||
// reset the variable to verify if output of prepare is ASCII or not
|
||||
srcIsASCII = true;
|
||||
|
||||
// step 3 & 4
|
||||
for(int j=0;j<poLen;j++ ){
|
||||
ch=processOut.charAt(j);
|
||||
if(ch > 0x7F){
|
||||
srcIsASCII = false;
|
||||
}else if(isLDHChar(ch)==false){
|
||||
// here we do not assemble surrogates
|
||||
// since we know that LDH code points
|
||||
// are in the ASCII range only
|
||||
srcIsLDH = false;
|
||||
failPos = j;
|
||||
}
|
||||
}
|
||||
|
||||
if(useSTD3ASCIIRules == true){
|
||||
// verify 3a and 3b
|
||||
if( srcIsLDH == false /* source contains some non-LDH characters */
|
||||
|| processOut.charAt(0) == HYPHEN
|
||||
|| processOut.charAt(processOut.length()-1) == HYPHEN){
|
||||
|
||||
/* populate the parseError struct */
|
||||
if(srcIsLDH==false){
|
||||
throw new StringPrepParseException( "The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
processOut.toString(),
|
||||
(failPos>0) ? (failPos-1) : failPos);
|
||||
}else if(processOut.charAt(0) == HYPHEN){
|
||||
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),0);
|
||||
|
||||
}else{
|
||||
throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
processOut.toString(),
|
||||
(poLen>0) ? poLen-1 : poLen);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
if(srcIsASCII){
|
||||
dest = processOut;
|
||||
}else{
|
||||
// step 5 : verify the sequence does not begin with ACE prefix
|
||||
if(!startsWithPrefix(processOut)){
|
||||
|
||||
//step 6: encode the sequence with punycode
|
||||
caseFlags = new boolean[poLen];
|
||||
|
||||
StringBuffer punyout = Punycode.encode(processOut,caseFlags);
|
||||
|
||||
// convert all codepoints to lower case ASCII
|
||||
StringBuffer lowerOut = toASCIILower(punyout);
|
||||
|
||||
//Step 7: prepend the ACE prefix
|
||||
dest.append(ACE_PREFIX,0,ACE_PREFIX.length);
|
||||
//Step 6: copy the contents in b2 into dest
|
||||
dest.append(lowerOut);
|
||||
}else{
|
||||
|
||||
throw new StringPrepParseException("The input does not start with the ACE Prefix.",
|
||||
StringPrepParseException.ACE_PREFIX_ERROR,processOut.toString(),0);
|
||||
}
|
||||
}
|
||||
if(dest.length() > MAX_LABEL_LENGTH){
|
||||
throw new StringPrepParseException("The labels in the input are too long. Length > 63.",
|
||||
StringPrepParseException.LABEL_TOO_LONG_ERROR,dest.toString(),0);
|
||||
}
|
||||
return dest;
|
||||
return IDNA2003.convertToASCII(src, options);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
* It is important to note that this operation can fail. If it fails, then the input
|
||||
* domain name cannot be used as an Internationalized Domain Name and the application
|
||||
@ -428,7 +534,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
* It is important to note that this operation can fail. If it fails, then the input
|
||||
* domain name cannot be used as an Internationalized Domain Name and the application
|
||||
@ -462,7 +568,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
* It is important to note that this operation can fail. If it fails, then the input
|
||||
* domain name cannot be used as an Internationalized Domain Name and the application
|
||||
@ -492,37 +598,12 @@ public final class IDNA {
|
||||
*/
|
||||
public static StringBuffer convertIDNToASCII(String src,int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
char[] srcArr = src.toCharArray();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int sepIndex=0;
|
||||
int oldSepIndex=0;
|
||||
for(;;){
|
||||
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
|
||||
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
|
||||
//make sure this is not a root label separator.
|
||||
if(!(label.length()==0 && sepIndex==srcArr.length)){
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
||||
result.append(convertToASCII(iter,options));
|
||||
}
|
||||
if(sepIndex==srcArr.length){
|
||||
break;
|
||||
}
|
||||
|
||||
// increment the sepIndex to skip past the separator
|
||||
sepIndex++;
|
||||
oldSepIndex = sepIndex;
|
||||
result.append((char)FULL_STOP);
|
||||
}
|
||||
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
|
||||
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
|
||||
}
|
||||
return result;
|
||||
return IDNA2003.convertIDNToASCII(src, options);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* This function implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* Unicode names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -552,7 +633,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* This function implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: This function implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* Unicode names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -582,7 +663,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* Function that implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Function that implements the ToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on <b>single labels</b> before sending it to something that expects
|
||||
* Unicode names. A label is an individual part of a domain name. Labels are usually
|
||||
* separated by dots; e.g., "www.example.com" is composed of 3 labels
|
||||
@ -607,116 +688,11 @@ public final class IDNA {
|
||||
*/
|
||||
public static StringBuffer convertToUnicode(UCharacterIterator src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
boolean[] caseFlags = null;
|
||||
|
||||
// the source contains all ascii codepoints
|
||||
boolean srcIsASCII = true;
|
||||
// assume the source contains all LDH codepoints
|
||||
//boolean srcIsLDH = true;
|
||||
|
||||
//get the options
|
||||
//boolean useSTD3ASCIIRules = ((options & USE_STD3_RULES) != 0);
|
||||
|
||||
//int failPos = -1;
|
||||
int ch;
|
||||
int saveIndex = src.getIndex();
|
||||
// step 1: find out if all the codepoints in src are ASCII
|
||||
while((ch=src.next())!= UCharacterIterator.DONE){
|
||||
if(ch>0x7F){
|
||||
srcIsASCII = false;
|
||||
}/*else if((srcIsLDH = isLDHChar(ch))==false){
|
||||
failPos = src.getIndex();
|
||||
}*/
|
||||
}
|
||||
StringBuffer processOut;
|
||||
|
||||
if(srcIsASCII == false){
|
||||
try {
|
||||
// step 2: process the string
|
||||
src.setIndex(saveIndex);
|
||||
processOut = singleton.namePrep.prepare(src,options);
|
||||
} catch (StringPrepParseException ex) {
|
||||
return new StringBuffer(src.getText());
|
||||
}
|
||||
|
||||
}else{
|
||||
//just point to source
|
||||
processOut = new StringBuffer(src.getText());
|
||||
}
|
||||
// TODO:
|
||||
// The RFC states that
|
||||
// <quote>
|
||||
// ToUnicode never fails. If any step fails, then the original input
|
||||
// is returned immediately in that step.
|
||||
// </quote>
|
||||
|
||||
//step 3: verify ACE Prefix
|
||||
if(startsWithPrefix(processOut)){
|
||||
StringBuffer decodeOut = null;
|
||||
|
||||
//step 4: Remove the ACE Prefix
|
||||
String temp = processOut.substring(ACE_PREFIX.length,processOut.length());
|
||||
|
||||
//step 5: Decode using punycode
|
||||
try {
|
||||
decodeOut = Punycode.decode(new StringBuffer(temp),caseFlags);
|
||||
} catch (StringPrepParseException e) {
|
||||
decodeOut = null;
|
||||
}
|
||||
|
||||
//step 6:Apply toASCII
|
||||
if (decodeOut != null) {
|
||||
StringBuffer toASCIIOut = convertToASCII(decodeOut, options);
|
||||
|
||||
//step 7: verify
|
||||
if(compareCaseInsensitiveASCII(processOut, toASCIIOut) !=0){
|
||||
// throw new StringPrepParseException("The verification step prescribed by the RFC 3491 failed",
|
||||
// StringPrepParseException.VERIFICATION_ERROR);
|
||||
decodeOut = null;
|
||||
}
|
||||
}
|
||||
|
||||
//step 8: return output of step 5
|
||||
if (decodeOut != null) {
|
||||
return decodeOut;
|
||||
}
|
||||
}
|
||||
|
||||
// }else{
|
||||
// // verify that STD3 ASCII rules are satisfied
|
||||
// if(useSTD3ASCIIRules == true){
|
||||
// if( srcIsLDH == false /* source contains some non-LDH characters */
|
||||
// || processOut.charAt(0) == HYPHEN
|
||||
// || processOut.charAt(processOut.length()-1) == HYPHEN){
|
||||
//
|
||||
// if(srcIsLDH==false){
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,processOut.toString(),
|
||||
// (failPos>0) ? (failPos-1) : failPos);
|
||||
// }else if(processOut.charAt(0) == HYPHEN){
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
// processOut.toString(),0);
|
||||
//
|
||||
// }else{
|
||||
// throw new StringPrepParseException("The input does not conform to the STD 3 ASCII rules",
|
||||
// StringPrepParseException.STD3_ASCII_RULES_ERROR,
|
||||
// processOut.toString(),
|
||||
// processOut.length());
|
||||
//
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// // just return the source
|
||||
// return new StringBuffer(src.getText());
|
||||
// }
|
||||
|
||||
return new StringBuffer(src.getText());
|
||||
return IDNA2003.convertToUnicode(src, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
*
|
||||
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
|
||||
@ -747,7 +723,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
*
|
||||
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
|
||||
@ -778,7 +754,7 @@ public final class IDNA {
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* IDNA2003: Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
|
||||
* This operation is done on complete domain names, e.g: "www.example.com".
|
||||
*
|
||||
* <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
|
||||
@ -804,37 +780,12 @@ public final class IDNA {
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
public static StringBuffer convertIDNToUnicode(String src, int options)
|
||||
throws StringPrepParseException{
|
||||
|
||||
char[] srcArr = src.toCharArray();
|
||||
StringBuffer result = new StringBuffer();
|
||||
int sepIndex=0;
|
||||
int oldSepIndex=0;
|
||||
for(;;){
|
||||
sepIndex = getSeparatorIndex(srcArr,sepIndex,srcArr.length);
|
||||
String label = new String(srcArr,oldSepIndex,sepIndex-oldSepIndex);
|
||||
if(label.length()==0 && sepIndex!=srcArr.length ){
|
||||
throw new StringPrepParseException("Found zero length lable after NamePrep.",StringPrepParseException.ZERO_LENGTH_LABEL);
|
||||
}
|
||||
UCharacterIterator iter = UCharacterIterator.getInstance(label);
|
||||
result.append(convertToUnicode(iter,options));
|
||||
if(sepIndex==srcArr.length){
|
||||
break;
|
||||
}
|
||||
// Unlike the ToASCII operation we don't normalize the label separators
|
||||
result.append(srcArr[sepIndex]);
|
||||
// increment the sepIndex to skip past the separator
|
||||
sepIndex++;
|
||||
oldSepIndex =sepIndex;
|
||||
}
|
||||
if(result.length() > MAX_DOMAIN_NAME_LENGTH){
|
||||
throw new StringPrepParseException("The output exceed the max allowed length.", StringPrepParseException.DOMAIN_NAME_TOO_LONG_ERROR);
|
||||
}
|
||||
return result;
|
||||
throws StringPrepParseException{
|
||||
return IDNA2003.convertIDNToUnicode(src, options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two IDN strings for equivalence.
|
||||
* IDNA2003: Compare two IDN strings for equivalence.
|
||||
* This function splits the domain names into labels and compares them.
|
||||
* According to IDN RFC, whenever two labels are compared, they are
|
||||
* considered equal if and only if their ASCII forms (obtained by
|
||||
@ -860,19 +811,16 @@ public final class IDNA {
|
||||
* @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// TODO: optimize
|
||||
public static int compare(StringBuffer s1, StringBuffer s2, int options)
|
||||
throws StringPrepParseException{
|
||||
if(s1==null || s2 == null){
|
||||
throw new IllegalArgumentException("One of the source buffers is null");
|
||||
}
|
||||
StringBuffer s1Out = convertIDNToASCII(s1.toString(),options);
|
||||
StringBuffer s2Out = convertIDNToASCII(s2.toString(), options);
|
||||
return compareCaseInsensitiveASCII(s1Out,s2Out);
|
||||
return IDNA2003.compare(s1.toString(), s2.toString(), options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two IDN strings for equivalence.
|
||||
* IDNA2003: Compare two IDN strings for equivalence.
|
||||
* This function splits the domain names into labels and compares them.
|
||||
* According to IDN RFC, whenever two labels are compared, they are
|
||||
* considered equal if and only if their ASCII forms (obtained by
|
||||
@ -898,18 +846,14 @@ public final class IDNA {
|
||||
* @return 0 if the strings are equal, > 0 if s1 > s2 and < 0 if s1 < s2
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// TODO: optimize
|
||||
public static int compare(String s1, String s2, int options)
|
||||
throws StringPrepParseException{
|
||||
public static int compare(String s1, String s2, int options) throws StringPrepParseException{
|
||||
if(s1==null || s2 == null){
|
||||
throw new IllegalArgumentException("One of the source buffers is null");
|
||||
}
|
||||
StringBuffer s1Out = convertIDNToASCII(s1, options);
|
||||
StringBuffer s2Out = convertIDNToASCII(s2, options);
|
||||
return compareCaseInsensitiveASCII(s1Out,s2Out);
|
||||
return IDNA2003.compare(s1, s2, options);
|
||||
}
|
||||
/**
|
||||
* Compare two IDN strings for equivalence.
|
||||
* IDNA2003: Compare two IDN strings for equivalence.
|
||||
* This function splits the domain names into labels and compares them.
|
||||
* According to IDN RFC, whenever two labels are compared, they are
|
||||
* considered equal if and only if their ASCII forms (obtained by
|
||||
@ -935,14 +879,11 @@ public final class IDNA {
|
||||
* @return 0 if the strings are equal, > 0 if i1 > i2 and < 0 if i1 < i2
|
||||
* @stable ICU 2.8
|
||||
*/
|
||||
// TODO: optimize
|
||||
public static int compare(UCharacterIterator s1, UCharacterIterator s2, int options)
|
||||
throws StringPrepParseException{
|
||||
if(s1==null || s2 == null){
|
||||
throw new IllegalArgumentException("One of the source buffers is null");
|
||||
}
|
||||
StringBuffer s1Out = convertIDNToASCII(s1.getText(), options);
|
||||
StringBuffer s2Out = convertIDNToASCII(s2.getText(), options);
|
||||
return compareCaseInsensitiveASCII(s1Out,s2Out);
|
||||
return IDNA2003.compare(s1.getText(), s2.getText(), options);
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2008, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.normalizer;
|
||||
@ -25,6 +25,7 @@ public class TestAll extends TestGroup {
|
||||
"TestCanonicalIterator",
|
||||
"NormalizationMonkeyTest",
|
||||
"NormalizerRegressionTests",
|
||||
"UTS46Test"
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,714 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.normalizer;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
|
||||
import com.ibm.icu.text.IDNA;
|
||||
|
||||
/**
|
||||
* UTS #46 (IDNA2008) test.
|
||||
* @author Markus Scherer
|
||||
* @since 2010jul10
|
||||
*/
|
||||
public class UTS46Test extends TestFmwk {
|
||||
public static void main(String[] args) throws Exception {
|
||||
new UTS46Test().run(args);
|
||||
}
|
||||
public UTS46Test() {
|
||||
trans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ);
|
||||
nontrans=IDNA.getUTS46Instance(IDNA.USE_STD3_RULES|IDNA.CHECK_BIDI|IDNA.CHECK_CONTEXTJ|
|
||||
IDNA.NONTRANSITIONAL_TO_ASCII|IDNA.NONTRANSITIONAL_TO_UNICODE);
|
||||
}
|
||||
|
||||
public void TestAPI() {
|
||||
StringBuilder result=new StringBuilder();
|
||||
IDNA.Info info=new IDNA.Info();
|
||||
String input="www.eXample.cOm";
|
||||
String expected="www.example.com";
|
||||
trans.nameToASCII(input, result, info);
|
||||
if(info.hasErrors() || !UTF16Plus.equal(result, expected)) {
|
||||
errln(String.format("T.nameToASCII(www.example.com) info.errors=%s result matches=%b",
|
||||
info.getErrors(), UTF16Plus.equal(result, expected)));
|
||||
}
|
||||
input="xn--bcher.de-65a";
|
||||
expected="xn--bcher\uFFFDde-65a";
|
||||
nontrans.labelToASCII(input, result, info);
|
||||
if( !info.getErrors().equals(EnumSet.of(IDNA.Error.LABEL_HAS_DOT, IDNA.Error.INVALID_ACE_LABEL)) ||
|
||||
!UTF16Plus.equal(result, expected)
|
||||
) {
|
||||
errln(String.format("N.labelToASCII(label-with-dot) failed with errors %s",
|
||||
info.getErrors()));
|
||||
}
|
||||
// Java API tests that are not parallel to C++ tests
|
||||
// because the C++ specifics (error codes etc.) do not apply here.
|
||||
String resultString=trans.nameToUnicode("fA\u00DF.de", result, info).toString();
|
||||
if(info.hasErrors() || !resultString.equals("fass.de")) {
|
||||
errln(String.format("T.nameToUnicode(fA\u00DF.de) info.errors=%s result matches=%b",
|
||||
info.getErrors(), resultString.equals("fass.de")));
|
||||
}
|
||||
try {
|
||||
nontrans.labelToUnicode(result, result, info);
|
||||
errln("N.labelToUnicode(result, result) did not throw an Exception");
|
||||
} catch(Exception e) {
|
||||
// as expected (should be an IllegalArgumentException, or an ICU version of it)
|
||||
}
|
||||
}
|
||||
|
||||
public void TestNotSTD3() {
|
||||
IDNA not3=IDNA.getUTS46Instance(IDNA.CHECK_BIDI);
|
||||
String input="\u0000A_2+2=4\n.e\u00DFen.net";
|
||||
StringBuilder result=new StringBuilder();
|
||||
IDNA.Info info=new IDNA.Info();
|
||||
if( !not3.nameToUnicode(input, result, info).toString().equals("\u0000a_2+2=4\n.essen.net") ||
|
||||
info.hasErrors()
|
||||
) {
|
||||
errln(String.format("notSTD3.nameToUnicode(non-LDH ASCII) unexpected errors %s string %s",
|
||||
info.getErrors(), prettify(result.toString())));
|
||||
}
|
||||
// A space (BiDi class WS) is not allowed in a BiDi domain name.
|
||||
input="a z.xn--4db.edu";
|
||||
not3.nameToASCII(input, result, info);
|
||||
if(!UTF16Plus.equal(result, input) || !info.getErrors().equals(EnumSet.of(IDNA.Error.BIDI))) {
|
||||
errln("notSTD3.nameToASCII(ASCII-with-space.alef.edu) failed");
|
||||
}
|
||||
}
|
||||
|
||||
private static final Map<String, IDNA.Error> errorNamesToErrors;
|
||||
static {
|
||||
errorNamesToErrors=new TreeMap<String, IDNA.Error>();
|
||||
errorNamesToErrors.put("UIDNA_ERROR_EMPTY_LABEL", IDNA.Error.EMPTY_LABEL);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_LABEL_TOO_LONG", IDNA.Error.LABEL_TOO_LONG);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_DOMAIN_NAME_TOO_LONG", IDNA.Error.DOMAIN_NAME_TOO_LONG);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_LEADING_HYPHEN", IDNA.Error.LEADING_HYPHEN);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_TRAILING_HYPHEN", IDNA.Error.TRAILING_HYPHEN);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_HYPHEN_3_4", IDNA.Error.HYPHEN_3_4);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_LEADING_COMBINING_MARK", IDNA.Error.LEADING_COMBINING_MARK);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_DISALLOWED", IDNA.Error.DISALLOWED);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_PUNYCODE", IDNA.Error.PUNYCODE);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_LABEL_HAS_DOT", IDNA.Error.LABEL_HAS_DOT);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_INVALID_ACE_LABEL", IDNA.Error.INVALID_ACE_LABEL);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_BIDI", IDNA.Error.BIDI);
|
||||
errorNamesToErrors.put("UIDNA_ERROR_CONTEXTJ", IDNA.Error.CONTEXTJ);
|
||||
}
|
||||
|
||||
private static final class TestCase {
|
||||
private TestCase() {
|
||||
errors=EnumSet.noneOf(IDNA.Error.class);
|
||||
}
|
||||
private void set(String[] data) {
|
||||
s=data[0];
|
||||
o=data[1];
|
||||
u=data[2];
|
||||
errors.clear();
|
||||
if(data[3].length()!=0) {
|
||||
for(String e: data[3].split("\\|")) {
|
||||
errors.add(errorNamesToErrors.get(e));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Input string and options string (Nontransitional/Transitional/Both).
|
||||
private String s, o;
|
||||
// Expected Unicode result string.
|
||||
private String u;
|
||||
private EnumSet<IDNA.Error> errors;
|
||||
};
|
||||
|
||||
private static final String testCases[][]={
|
||||
{ "www.eXample.cOm", "B", // all ASCII
|
||||
"www.example.com", "" },
|
||||
{ "B\u00FCcher.de", "B", // u-umlaut
|
||||
"b\u00FCcher.de", "" },
|
||||
{ "\u00D6BB", "B", // O-umlaut
|
||||
"\u00F6bb", "" },
|
||||
{ "fa\u00DF.de", "N", // sharp s
|
||||
"fa\u00DF.de", "" },
|
||||
{ "fa\u00DF.de", "T", // sharp s
|
||||
"fass.de", "" },
|
||||
{ "XN--fA-hia.dE", "B", // sharp s in Punycode
|
||||
"fa\u00DF.de", "" },
|
||||
{ "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "N", // Greek with final sigma
|
||||
"\u03B2\u03CC\u03BB\u03BF\u03C2.com", "" },
|
||||
{ "\u03B2\u03CC\u03BB\u03BF\u03C2.com", "T", // Greek with final sigma
|
||||
"\u03B2\u03CC\u03BB\u03BF\u03C3.com", "" },
|
||||
{ "xn--nxasmm1c", "B", // Greek with final sigma in Punycode
|
||||
"\u03B2\u03CC\u03BB\u03BF\u03C2", "" },
|
||||
{ "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "N", // "Sri" in "Sri Lanka" has a ZWJ
|
||||
"www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
|
||||
{ "www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "T", // "Sri" in "Sri Lanka" has a ZWJ
|
||||
"www.\u0DC1\u0DCA\u0DBB\u0DD3.com", "" },
|
||||
{ "www.xn--10cl1a0b660p.com", "B", // "Sri" in Punycode
|
||||
"www.\u0DC1\u0DCA\u200D\u0DBB\u0DD3.com", "" },
|
||||
{ "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "N", // ZWNJ
|
||||
"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "" },
|
||||
{ "\u0646\u0627\u0645\u0647\u200C\u0627\u06CC", "T", // ZWNJ
|
||||
"\u0646\u0627\u0645\u0647\u0627\u06CC", "" },
|
||||
{ "xn--mgba3gch31f060k.com", "B", // ZWNJ in Punycode
|
||||
"\u0646\u0627\u0645\u0647\u200C\u0627\u06CC.com", "" },
|
||||
{ "a.b\uFF0Ec\u3002d\uFF61", "B",
|
||||
"a.b.c.d.", "" },
|
||||
{ "U\u0308.xn--tda", "B", // U+umlaut.u-umlaut
|
||||
"\u00FC.\u00FC", "" },
|
||||
{ "xn--u-ccb", "B", // u+umlaut in Punycode
|
||||
"xn--u-ccb\uFFFD", "UIDNA_ERROR_INVALID_ACE_LABEL" },
|
||||
{ "a\u2488com", "B", // contains 1-dot
|
||||
"a\uFFFDcom", "UIDNA_ERROR_DISALLOWED" },
|
||||
{ "xn--a-ecp.ru", "B", // contains 1-dot in Punycode
|
||||
"xn--a-ecp\uFFFD.ru", "UIDNA_ERROR_INVALID_ACE_LABEL" },
|
||||
{ "xn--0.pt", "B", // invalid Punycode
|
||||
"xn--0\uFFFD.pt", "UIDNA_ERROR_PUNYCODE" },
|
||||
{ "xn--a.pt", "B", // U+0080
|
||||
"xn--a\uFFFD.pt", "UIDNA_ERROR_INVALID_ACE_LABEL" },
|
||||
{ "xn--a-\u00C4.pt", "B", // invalid Punycode
|
||||
"xn--a-\u00E4.pt", "UIDNA_ERROR_PUNYCODE" },
|
||||
{ "\u65E5\u672C\u8A9E\u3002\uFF2A\uFF30", "B", // Japanese with fullwidth ".jp"
|
||||
"\u65E5\u672C\u8A9E.jp", "" },
|
||||
{ "\u2615", "B", "\u2615", "" }, // Unicode 4.0 HOT BEVERAGE
|
||||
// many deviation characters, test the special mapping code
|
||||
{ "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
|
||||
"\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "N",
|
||||
"1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
|
||||
"\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_CONTEXTJ" },
|
||||
{ "1.a\u00DF\u200C\u200Db\u200C\u200Dc\u00DF\u00DF\u00DF\u00DFd"+
|
||||
"\u03C2\u03C3\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFe"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFx"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DFy"+
|
||||
"\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u00DF\u0302\u00DFz", "T",
|
||||
"1.assbcssssssssd"+
|
||||
"\u03C3\u03C3sssssssssssssssse"+
|
||||
"ssssssssssssssssssssx"+
|
||||
"ssssssssssssssssssssy"+
|
||||
"sssssssssssssss\u015Dssz", "UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
// "xn--bss" with deviation characters
|
||||
{ "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "N",
|
||||
"\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "UIDNA_ERROR_CONTEXTJ" },
|
||||
{ "\u200Cx\u200Dn\u200C-\u200D-b\u00DF", "T",
|
||||
"\u5919", "" },
|
||||
// "xn--bssffl" written as:
|
||||
// 02E3 MODIFIER LETTER SMALL X
|
||||
// 034F COMBINING GRAPHEME JOINER (ignored)
|
||||
// 2115 DOUBLE-STRUCK CAPITAL N
|
||||
// 200B ZERO WIDTH SPACE (ignored)
|
||||
// FE63 SMALL HYPHEN-MINUS
|
||||
// 00AD SOFT HYPHEN (ignored)
|
||||
// FF0D FULLWIDTH HYPHEN-MINUS
|
||||
// 180C MONGOLIAN FREE VARIATION SELECTOR TWO (ignored)
|
||||
// 212C SCRIPT CAPITAL B
|
||||
// FE00 VARIATION SELECTOR-1 (ignored)
|
||||
// 017F LATIN SMALL LETTER LONG S
|
||||
// 2064 INVISIBLE PLUS (ignored)
|
||||
// 1D530 MATHEMATICAL FRAKTUR SMALL S
|
||||
// E01EF VARIATION SELECTOR-256 (ignored)
|
||||
// FB04 LATIN SMALL LIGATURE FFL
|
||||
{ "\u02E3\u034F\u2115\u200B\uFE63\u00AD\uFF0D\u180C"+
|
||||
"\u212C\uFE00\u017F\u2064"+"\uD835\uDD30\uDB40\uDDEF"/*1D530 E01EF*/+"\uFB04", "B",
|
||||
"\u5921\u591E\u591C\u5919", "" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901.", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901.", "" },
|
||||
// Domain name >256 characters, forces slow path in UTF-8 processing.
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"12345678901234567890123456789012345678901234567890123456789012", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"12345678901234567890123456789012345678901234567890123456789012",
|
||||
"UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789\u05D0", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789\u05D0",
|
||||
"UIDNA_ERROR_DOMAIN_NAME_TOO_LONG|UIDNA_ERROR_BIDI" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890.", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890.",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901234."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
|
||||
// label length 63: xn--1234567890123456789012345678901234567890123456789012345-9te
|
||||
{ "\u00E41234567890123456789012345678901234567890123456789012345", "B",
|
||||
"\u00E41234567890123456789012345678901234567890123456789012345", "" },
|
||||
{ "1234567890\u00E41234567890123456789012345678901234567890123456", "B",
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456", "UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901.", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901.", "" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"12345678901234567890123456789012345678901234567890123456789012", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E4123456789012345678901234567890123456789012345."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"12345678901234567890123456789012345678901234567890123456789012",
|
||||
"UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890.", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"123456789012345678901234567890123456789012345678901234567890.",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG" },
|
||||
{ "123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901", "B",
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890\u00E41234567890123456789012345678901234567890123456."+
|
||||
"123456789012345678901234567890123456789012345678901234567890123."+
|
||||
"1234567890123456789012345678901234567890123456789012345678901",
|
||||
"UIDNA_ERROR_LABEL_TOO_LONG|UIDNA_ERROR_DOMAIN_NAME_TOO_LONG" },
|
||||
// hyphen errors and empty-label errors
|
||||
// "xn---q----jra"=="-q--a-umlaut-"
|
||||
{ "a.b..-q--a-.e", "B", "a.b..-q--a-.e",
|
||||
"UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
|
||||
"UIDNA_ERROR_HYPHEN_3_4" },
|
||||
{ "a.b..-q--\u00E4-.e", "B", "a.b..-q--\u00E4-.e",
|
||||
"UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
|
||||
"UIDNA_ERROR_HYPHEN_3_4" },
|
||||
{ "a.b..xn---q----jra.e", "B", "a.b..-q--\u00E4-.e",
|
||||
"UIDNA_ERROR_EMPTY_LABEL|UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN|"+
|
||||
"UIDNA_ERROR_HYPHEN_3_4" },
|
||||
{ "a..c", "B", "a..c", "UIDNA_ERROR_EMPTY_LABEL" },
|
||||
{ "a.-b.", "B", "a.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
|
||||
{ "a.b-.c", "B", "a.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
|
||||
{ "a.-.c", "B", "a.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
|
||||
{ "a.bc--de.f", "B", "a.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
|
||||
{ "\u00E4.\u00AD.c", "B", "\u00E4..c", "UIDNA_ERROR_EMPTY_LABEL" },
|
||||
{ "\u00E4.-b.", "B", "\u00E4.-b.", "UIDNA_ERROR_LEADING_HYPHEN" },
|
||||
{ "\u00E4.b-.c", "B", "\u00E4.b-.c", "UIDNA_ERROR_TRAILING_HYPHEN" },
|
||||
{ "\u00E4.-.c", "B", "\u00E4.-.c", "UIDNA_ERROR_LEADING_HYPHEN|UIDNA_ERROR_TRAILING_HYPHEN" },
|
||||
{ "\u00E4.bc--de.f", "B", "\u00E4.bc--de.f", "UIDNA_ERROR_HYPHEN_3_4" },
|
||||
{ "a.b.\u0308c.d", "B", "a.b.\uFFFDc.d", "UIDNA_ERROR_LEADING_COMBINING_MARK" },
|
||||
{ "a.b.xn--c-bcb.d", "B",
|
||||
"a.b.xn--c-bcb\uFFFD.d", "UIDNA_ERROR_LEADING_COMBINING_MARK|UIDNA_ERROR_INVALID_ACE_LABEL" },
|
||||
// BiDi
|
||||
{ "A0", "B", "a0", "" },
|
||||
{ "0A", "B", "0a", "" }, // all-LTR is ok to start with a digit (EN)
|
||||
{ "0A.\u05D0", "B", // ASCII label does not start with L/R/AL
|
||||
"0a.\u05D0", "UIDNA_ERROR_BIDI" },
|
||||
{ "c.xn--0-eha.xn--4db", "B", // 2nd label does not start with L/R/AL
|
||||
"c.0\u00FC.\u05D0", "UIDNA_ERROR_BIDI" },
|
||||
{ "b-.\u05D0", "B", // label does not end with L/EN
|
||||
"b-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
|
||||
{ "d.xn----dha.xn--4db", "B", // 2nd label does not end with L/EN
|
||||
"d.\u00FC-.\u05D0", "UIDNA_ERROR_TRAILING_HYPHEN|UIDNA_ERROR_BIDI" },
|
||||
{ "a\u05D0", "B", "a\u05D0", "UIDNA_ERROR_BIDI" }, // first dir != last dir
|
||||
{ "\u05D0\u05C7", "B", "\u05D0\u05C7", "" },
|
||||
{ "\u05D09\u05C7", "B", "\u05D09\u05C7", "" },
|
||||
{ "\u05D0a\u05C7", "B", "\u05D0a\u05C7", "UIDNA_ERROR_BIDI" }, // first dir != last dir
|
||||
{ "\u05D0\u05EA", "B", "\u05D0\u05EA", "" },
|
||||
{ "\u05D0\u05F3\u05EA", "B", "\u05D0\u05F3\u05EA", "" },
|
||||
{ "a\u05D0Tz", "B", "a\u05D0tz", "UIDNA_ERROR_BIDI" }, // mixed dir
|
||||
{ "\u05D0T\u05EA", "B", "\u05D0t\u05EA", "UIDNA_ERROR_BIDI" }, // mixed dir
|
||||
{ "\u05D07\u05EA", "B", "\u05D07\u05EA", "" },
|
||||
{ "\u05D0\u0667\u05EA", "B", "\u05D0\u0667\u05EA", "" }, // Arabic 7 in the middle
|
||||
{ "a7\u0667z", "B", "a7\u0667z", "UIDNA_ERROR_BIDI" }, // AN digit in LTR
|
||||
{ "\u05D07\u0667\u05EA", "B", // mixed EN/AN digits in RTL
|
||||
"\u05D07\u0667\u05EA", "UIDNA_ERROR_BIDI" },
|
||||
// ZWJ
|
||||
{ "\u0BB9\u0BCD\u200D", "N", "\u0BB9\u0BCD\u200D", "" }, // Virama+ZWJ
|
||||
{ "\u0BB9\u200D", "N", "\u0BB9\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama
|
||||
{ "\u200D", "N", "\u200D", "UIDNA_ERROR_CONTEXTJ" }, // no Virama
|
||||
// ZWNJ
|
||||
{ "\u0BB9\u0BCD\u200C", "N", "\u0BB9\u0BCD\u200C", "" }, // Virama+ZWNJ
|
||||
{ "\u0BB9\u200C", "N", "\u0BB9\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama
|
||||
{ "\u200C", "N", "\u200C", "UIDNA_ERROR_CONTEXTJ" }, // no Virama
|
||||
{ "\u0644\u0670\u200C\u06ED\u06EF", "N", // Joining types D T ZWNJ T R
|
||||
"\u0644\u0670\u200C\u06ED\u06EF", "" },
|
||||
{ "\u0644\u0670\u200C\u06EF", "N", // D T ZWNJ R
|
||||
"\u0644\u0670\u200C\u06EF", "" },
|
||||
{ "\u0644\u200C\u06ED\u06EF", "N", // D ZWNJ T R
|
||||
"\u0644\u200C\u06ED\u06EF", "" },
|
||||
{ "\u0644\u200C\u06EF", "N", // D ZWNJ R
|
||||
"\u0644\u200C\u06EF", "" },
|
||||
{ "\u0644\u0670\u200C\u06ED", "N", // D T ZWNJ T
|
||||
"\u0644\u0670\u200C\u06ED", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
|
||||
{ "\u06EF\u200C\u06EF", "N", // R ZWNJ R
|
||||
"\u06EF\u200C\u06EF", "UIDNA_ERROR_CONTEXTJ" },
|
||||
{ "\u0644\u200C", "N", // D ZWNJ
|
||||
"\u0644\u200C", "UIDNA_ERROR_BIDI|UIDNA_ERROR_CONTEXTJ" },
|
||||
// { "", "B",
|
||||
// "", "" },
|
||||
};
|
||||
|
||||
public void TestSomeCases() {
|
||||
StringBuilder aT=new StringBuilder(), uT=new StringBuilder();
|
||||
StringBuilder aN=new StringBuilder(), uN=new StringBuilder();
|
||||
IDNA.Info aTInfo=new IDNA.Info(), uTInfo=new IDNA.Info();
|
||||
IDNA.Info aNInfo=new IDNA.Info(), uNInfo=new IDNA.Info();
|
||||
|
||||
StringBuilder aTuN=new StringBuilder(), uTaN=new StringBuilder();
|
||||
StringBuilder aNuN=new StringBuilder(), uNaN=new StringBuilder();
|
||||
IDNA.Info aTuNInfo=new IDNA.Info(), uTaNInfo=new IDNA.Info();
|
||||
IDNA.Info aNuNInfo=new IDNA.Info(), uNaNInfo=new IDNA.Info();
|
||||
|
||||
StringBuilder aTL=new StringBuilder(), uTL=new StringBuilder();
|
||||
StringBuilder aNL=new StringBuilder(), uNL=new StringBuilder();
|
||||
IDNA.Info aTLInfo=new IDNA.Info(), uTLInfo=new IDNA.Info();
|
||||
IDNA.Info aNLInfo=new IDNA.Info(), uNLInfo=new IDNA.Info();
|
||||
|
||||
EnumSet<IDNA.Error> uniErrors=EnumSet.noneOf(IDNA.Error.class);
|
||||
|
||||
TestCase testCase=new TestCase();
|
||||
int i;
|
||||
for(i=0; i<testCases.length; ++i) {
|
||||
testCase.set(testCases[i]);
|
||||
String input=testCase.s;
|
||||
String expected=testCase.u;
|
||||
// ToASCII/ToUnicode, transitional/nontransitional
|
||||
try {
|
||||
trans.nameToASCII(input, aT, aTInfo);
|
||||
trans.nameToUnicode(input, uT, uTInfo);
|
||||
nontrans.nameToASCII(input, aN, aNInfo);
|
||||
nontrans.nameToUnicode(input, uN, uNInfo);
|
||||
} catch(Exception e) {
|
||||
errln(String.format("first-level processing [%d/%s] %s - %s",
|
||||
i, testCase.o, testCase.s, e));
|
||||
continue;
|
||||
}
|
||||
// ToUnicode does not set length errors.
|
||||
uniErrors.clear();
|
||||
uniErrors.addAll(testCase.errors);
|
||||
uniErrors.removeAll(lengthErrors);
|
||||
char mode=testCase.o.charAt(0);
|
||||
if(mode=='B' || mode=='N') {
|
||||
if(!sameErrors(uNInfo, uniErrors)) {
|
||||
errln(String.format("N.nameToUnicode([%d] %s) unexpected errors %s",
|
||||
i, testCase.s, uNInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(uN, expected)) {
|
||||
errln(String.format("N.nameToUnicode([%d] %s) unexpected string %s",
|
||||
i, testCase.s, prettify(uN.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!sameErrors(aNInfo, testCase.errors)) {
|
||||
errln(String.format("N.nameToASCII([%d] %s) unexpected errors %s",
|
||||
i, testCase.s, aNInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(mode=='B' || mode=='T') {
|
||||
if(!sameErrors(uTInfo, uniErrors)) {
|
||||
errln(String.format("T.nameToUnicode([%d] %s) unexpected errors %s",
|
||||
i, testCase.s, uTInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(uT, expected)) {
|
||||
errln(String.format("T.nameToUnicode([%d] %s) unexpected string %s",
|
||||
i, testCase.s, prettify(uT.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!sameErrors(aTInfo, testCase.errors)) {
|
||||
errln(String.format("T.nameToASCII([%d] %s) unexpected errors %s",
|
||||
i, testCase.s, aTInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// ToASCII is all-ASCII if no severe errors
|
||||
if(!hasCertainErrors(aNInfo, severeErrors) && !isASCII(aN)) {
|
||||
errln(String.format("N.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
|
||||
i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!hasCertainErrors(aTInfo, severeErrors) && !isASCII(aT)) {
|
||||
errln(String.format("T.nameToASCII([%d] %s) (errors %s) result is not ASCII %s",
|
||||
i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
|
||||
continue;
|
||||
}
|
||||
if(isVerbose()) {
|
||||
char m= mode=='B' ? mode : 'N';
|
||||
logln(String.format("%c.nameToASCII([%d] %s) (errors %s) result string: %s",
|
||||
m, i, testCase.s, aNInfo.getErrors(), prettify(aN.toString())));
|
||||
if(mode!='B') {
|
||||
logln(String.format("T.nameToASCII([%d] %s) (errors %s) result string: %s",
|
||||
i, testCase.s, aTInfo.getErrors(), prettify(aT.toString())));
|
||||
}
|
||||
}
|
||||
// second-level processing
|
||||
try {
|
||||
nontrans.nameToUnicode(aT, aTuN, aTuNInfo);
|
||||
nontrans.nameToASCII(uT, uTaN, uTaNInfo);
|
||||
nontrans.nameToUnicode(aN, aNuN, aNuNInfo);
|
||||
nontrans.nameToASCII(uN, uNaN, uNaNInfo);
|
||||
} catch(Exception e) {
|
||||
errln(String.format("second-level processing [%d/%s] %s - %s",
|
||||
i, testCase.o, testCase.s, e));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(aN, uNaN)) {
|
||||
errln(String.format("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() "+
|
||||
"(errors %s) %s vs. %s",
|
||||
i, testCase.s, aNInfo.getErrors(),
|
||||
prettify(aN.toString()), prettify(uNaN.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(aT, uTaN)) {
|
||||
errln(String.format("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() "+
|
||||
"(errors %s) %s vs. %s",
|
||||
i, testCase.s, aNInfo.getErrors(),
|
||||
prettify(aT.toString()), prettify(uTaN.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(uN, aNuN)) {
|
||||
errln(String.format("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() "+
|
||||
"(errors %s) %s vs. %s",
|
||||
i, testCase.s, uNInfo.getErrors(), prettify(uN.toString()), prettify(aNuN.toString())));
|
||||
continue;
|
||||
}
|
||||
if(!UTF16Plus.equal(uT, aTuN)) {
|
||||
errln(String.format("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() "+
|
||||
"(errors %s) %s vs. %s",
|
||||
i, testCase.s, uNInfo.getErrors(),
|
||||
prettify(uT.toString()), prettify(aTuN.toString())));
|
||||
continue;
|
||||
}
|
||||
// labelToUnicode
|
||||
try {
|
||||
trans.labelToASCII(input, aTL, aTLInfo);
|
||||
trans.labelToUnicode(input, uTL, uTLInfo);
|
||||
nontrans.labelToASCII(input, aNL, aNLInfo);
|
||||
nontrans.labelToUnicode(input, uNL, uNLInfo);
|
||||
} catch(Exception e) {
|
||||
errln(String.format("labelToXYZ processing [%d/%s] %s - %s",
|
||||
i, testCase.o, testCase.s, e));
|
||||
continue;
|
||||
}
|
||||
if(aN.indexOf(".")<0) {
|
||||
if(!UTF16Plus.equal(aN, aNL) || !sameErrors(aNInfo, aNLInfo)) {
|
||||
errln(String.format("N.nameToASCII([%d] %s)!=N.labelToASCII() "+
|
||||
"(errors %s vs %04lx) %s vs. %s",
|
||||
i, testCase.s, aNInfo.getErrors(), aNLInfo.getErrors(),
|
||||
prettify(aN.toString()), prettify(aNL.toString())));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if(!hasError(aNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
|
||||
errln(String.format("N.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
|
||||
i, testCase.s, aNLInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(aT.indexOf(".")<0) {
|
||||
if(!UTF16Plus.equal(aT, aTL) || !sameErrors(aTInfo, aTLInfo)) {
|
||||
errln(String.format("T.nameToASCII([%d] %s)!=T.labelToASCII() "+
|
||||
"(errors %s vs %04lx) %s vs. %s",
|
||||
i, testCase.s, aTInfo.getErrors(), aTLInfo.getErrors(),
|
||||
prettify(aT.toString()), prettify(aTL.toString())));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if(!hasError(aTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
|
||||
errln(String.format("T.labelToASCII([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
|
||||
i, testCase.s, aTLInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(uN.indexOf(".")<0) {
|
||||
if(!UTF16Plus.equal(uN, uNL) || !sameErrors(uNInfo, uNLInfo)) {
|
||||
errln(String.format("N.nameToUnicode([%d] %s)!=N.labelToUnicode() "+
|
||||
"(errors %s vs %04lx) %s vs. %s",
|
||||
i, testCase.s, uNInfo.getErrors(), uNLInfo.getErrors(),
|
||||
prettify(uN.toString()), prettify(uNL.toString())));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if(!hasError(uNLInfo, IDNA.Error.LABEL_HAS_DOT)) {
|
||||
errln(String.format("N.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
|
||||
i, testCase.s, uNLInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if(uT.indexOf(".")<0) {
|
||||
if(!UTF16Plus.equal(uT, uTL) || !sameErrors(uTInfo, uTLInfo)) {
|
||||
errln(String.format("T.nameToUnicode([%d] %s)!=T.labelToUnicode() "+
|
||||
"(errors %s vs %04lx) %s vs. %s",
|
||||
i, testCase.s, uTInfo.getErrors(), uTLInfo.getErrors(),
|
||||
prettify(uT.toString()), prettify(uTL.toString())));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if(!hasError(uTLInfo, IDNA.Error.LABEL_HAS_DOT)) {
|
||||
errln(String.format("T.labelToUnicode([%d] %s) errors %s missing UIDNA_ERROR_LABEL_HAS_DOT",
|
||||
i, testCase.s, uTLInfo.getErrors()));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// Differences between transitional and nontransitional processing
|
||||
if(mode=='B') {
|
||||
if( aNInfo.isTransitionalDifferent() ||
|
||||
aTInfo.isTransitionalDifferent() ||
|
||||
uNInfo.isTransitionalDifferent() ||
|
||||
uTInfo.isTransitionalDifferent() ||
|
||||
aNLInfo.isTransitionalDifferent() ||
|
||||
aTLInfo.isTransitionalDifferent() ||
|
||||
uNLInfo.isTransitionalDifferent() ||
|
||||
uTLInfo.isTransitionalDifferent()
|
||||
) {
|
||||
errln(String.format("B.process([%d] %s) isTransitionalDifferent()", i, testCase.s));
|
||||
continue;
|
||||
}
|
||||
if( !UTF16Plus.equal(aN, aT) || !UTF16Plus.equal(uN, uT) ||
|
||||
!UTF16Plus.equal(aNL, aTL) || !UTF16Plus.equal(uNL, uTL) ||
|
||||
!sameErrors(aNInfo, aTInfo) || !sameErrors(uNInfo, uTInfo) ||
|
||||
!sameErrors(aNLInfo, aTLInfo) || !sameErrors(uNLInfo, uTLInfo)
|
||||
) {
|
||||
errln(String.format("N.process([%d] %s) vs. T.process() different errors or result strings",
|
||||
i, testCase.s));
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if( !aNInfo.isTransitionalDifferent() ||
|
||||
!aTInfo.isTransitionalDifferent() ||
|
||||
!uNInfo.isTransitionalDifferent() ||
|
||||
!uTInfo.isTransitionalDifferent() ||
|
||||
!aNLInfo.isTransitionalDifferent() ||
|
||||
!aTLInfo.isTransitionalDifferent() ||
|
||||
!uNLInfo.isTransitionalDifferent() ||
|
||||
!uTLInfo.isTransitionalDifferent()
|
||||
) {
|
||||
errln(String.format("%s.process([%d] %s) !isTransitionalDifferent()",
|
||||
testCase.o, i, testCase.s));
|
||||
continue;
|
||||
}
|
||||
if( UTF16Plus.equal(aN, aT) || UTF16Plus.equal(uN, uT) ||
|
||||
UTF16Plus.equal(aNL, aTL) || UTF16Plus.equal(uNL, uTL)
|
||||
) {
|
||||
errln(String.format("N.process([%d] %s) vs. T.process() same result strings",
|
||||
i, testCase.s));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private final IDNA trans, nontrans;
|
||||
|
||||
private static final EnumSet<IDNA.Error> severeErrors=EnumSet.of(
|
||||
IDNA.Error.LEADING_COMBINING_MARK,
|
||||
IDNA.Error.DISALLOWED,
|
||||
IDNA.Error.PUNYCODE,
|
||||
IDNA.Error.LABEL_HAS_DOT,
|
||||
IDNA.Error.INVALID_ACE_LABEL);
|
||||
private static final EnumSet<IDNA.Error> lengthErrors=EnumSet.of(
|
||||
IDNA.Error.EMPTY_LABEL,
|
||||
IDNA.Error.LABEL_TOO_LONG,
|
||||
IDNA.Error.DOMAIN_NAME_TOO_LONG);
|
||||
|
||||
private boolean hasError(IDNA.Info info, IDNA.Error error) {
|
||||
return info.getErrors().contains(error);
|
||||
}
|
||||
// assumes that certainErrors is not empty
|
||||
private boolean hasCertainErrors(Set<IDNA.Error> errors, Set<IDNA.Error> certainErrors) {
|
||||
return !errors.isEmpty() && !Collections.disjoint(errors, certainErrors);
|
||||
}
|
||||
private boolean hasCertainErrors(IDNA.Info info, Set<IDNA.Error> certainErrors) {
|
||||
return hasCertainErrors(info.getErrors(), certainErrors);
|
||||
}
|
||||
private boolean sameErrors(Set<IDNA.Error> a, Set<IDNA.Error> b) {
|
||||
return a.equals(b);
|
||||
}
|
||||
private boolean sameErrors(IDNA.Info a, IDNA.Info b) {
|
||||
return sameErrors(a.getErrors(), b.getErrors());
|
||||
}
|
||||
private boolean sameErrors(IDNA.Info a, Set<IDNA.Error> b) {
|
||||
return sameErrors(a.getErrors(), b);
|
||||
}
|
||||
|
||||
private static boolean
|
||||
isASCII(CharSequence str) {
|
||||
int length=str.length();
|
||||
for(int i=0; i<length; ++i) {
|
||||
if(str.charAt(i)>=0x80) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user