ICU-7645 First cut at spoof detection changed. All marked @internal for now.

X-SVN-Rev: 32910
This commit is contained in:
Mark Davis 2012-11-30 17:51:08 +00:00
parent 517fd227cb
commit 95098e216b
3 changed files with 821 additions and 13 deletions

View File

@ -0,0 +1,653 @@
/*
***************************************************************************
* Copyright (C) 2008-2012, Google, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*/
package com.ibm.icu.text;
import java.util.BitSet;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Freezable;
/**
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
* then setIdentifier. At this point:
* <ol>
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
* each of these.
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
* either Katakana or Hiragana.
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
* the identifier.
* <li>call getRestrictionLevel to see what the UTS36 restriction level is. (This has some proposed changes from the
* current one, however.)
* </ol>
*
* @author markdavis
* @internal
*/
public class IdentifierInfo {
/**
 * The restriction levels an identifier can satisfy. The constants are declared from the tightest level to the
 * loosest one; SpoofChecker relies on that order when it compares levels with compareTo, so do not reorder them.
 *
 * @internal
 */
public enum RestrictionLevel {
    /**
     * Only ASCII characters: U+0000..U+007F
     *
     * @internal
     */
    ASCII,
    /**
     * All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
     * Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
     * vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
     *
     * @internal
     */
    HIGHLY_RESTRICTIVE,
    /**
     * Allow Latin with other scripts except Cyrillic, Greek, Cherokee. Otherwise, the same as Highly Restrictive.
     *
     * @internal
     */
    MODERATELY_RESTRICTIVE,
    /**
     * Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as
     * Moderately Restrictive.
     *
     * @internal
     */
    MINIMALLY_RESTRICTIVE,
    /**
     * Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
     *
     * @internal
     */
    UNRESTRICTIVE
}
// Frozen set of the ASCII code points U+0000..U+007F; used by getRestrictionLevel().
private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
// The identifier most recently passed to setIdentifier(); null until then.
private String identifier;
// Scripts of the characters that resolve to exactly one script (Common and Inherited are ignored).
private final BitSet requiredScripts = new BitSet();
// One entry per distinct set of alternative scripts, for characters that may belong to several scripts.
private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
// Scripts shared by the alternates recorded in scriptSetSet; computed by setIdentifier().
private final BitSet commonAmongAlternates = new BitSet();
// The zero digit of each decimal number system seen in the identifier.
private final UnicodeSet numerics = new UnicodeSet();
// Characters allowed in identifiers; starts out as every code point U+0000..U+10FFFF.
private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);
/**
 * Resets all per-identifier analysis results (scripts, alternates, common scripts, numerics).
 * The identifier string and the identifier profile are left untouched.
 *
 * @return this, for chaining
 */
private IdentifierInfo clear() {
    numerics.clear();
    commonAmongAlternates.clear();
    scriptSetSet.clear();
    requiredScripts.clear();
    return this;
}
/**
 * Set the identifier profile, that is, the set of characters that are allowed in identifiers.
 * The contents of the argument are copied, so later changes to it do not affect this object.
 *
 * @param identifierProfile the set of allowed characters
 * @return this, for chaining
 * @internal
 */
public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
    // Copy into the final field. The previous code assigned numerics to itself and dropped the argument.
    this.identifierProfile.set(identifierProfile);
    return this;
}
/**
 * Get the identifier profile, that is, the set of characters allowed in identifiers.
 *
 * @return a new copy of the profile; modifying it does not affect this object
 * @internal
 */
public UnicodeSet getIdentifierProfile() {
    final UnicodeSet profileCopy = new UnicodeSet(identifierProfile);
    return profileCopy;
}
/**
 * Set an identifier to analyse. This discards the results for any previous identifier and recomputes the
 * required scripts, the alternate script sets, the scripts common to all alternates, and the numerics.
 *
 * @param identifier the identifier to analyse
 * @return the identifier info.
 * @internal
 */
public IdentifierInfo setIdentifier(String identifier) {
    this.identifier = identifier;
    clear();
    BitSet temp = new BitSet(); // Reused until it is stored in scriptSetSet.
    int cp;
    // Advance by the width of the code point just read, so a supplementary character is visited exactly once.
    for (int i = 0; i < identifier.length(); i += Character.charCount(cp)) {
        cp = Character.codePointAt(identifier, i);
        // Store a representative character for each kind of decimal digit
        if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            numerics.add(cp - UCharacter.getNumericValue(cp));
        }
        UScript.getScriptExtensions(cp, temp);
        temp.clear(UScript.COMMON);
        temp.clear(UScript.INHERITED);
        switch (temp.cardinality()) {
        case 0:
            // Only Common/Inherited: the character constrains nothing, so it must not become an (empty) alternate.
            break;
        case 1:
            // Single script, record it.
            requiredScripts.or(temp);
            break;
        default:
            if (!requiredScripts.intersects(temp) && scriptSetSet.add(temp)) {
                // If the set hasn't been added already, add it and create new temporary for the next pass,
                // so we don't rewrite what's already in the set.
                temp = new BitSet();
            }
            break;
        }
    }
    // Now make a final pass through to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext();) {
        final BitSet next = it.next();
        if (requiredScripts.intersects(next)) {
            it.remove();
            continue;
        }
        // Drop an alternate that is a superset of another one:
        // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
        for (BitSet other : scriptSetSet) {
            if (next != other && contains(next, other)) {
                it.remove();
                break; // the inner iterator is stale after the removal, so stop using it
            }
        }
    }
    // Intersect only the alternates that survived the pruning above. clear() already emptied
    // commonAmongAlternates, so it stays empty when there are no alternates.
    if (!scriptSetSet.isEmpty()) {
        commonAmongAlternates.set(0, UScript.CODE_LIMIT);
        for (BitSet alternate : scriptSetSet) {
            commonAmongAlternates.and(alternate);
        }
    }
    return this;
}
// Script set holding exactly UScript.COMMON and UScript.INHERITED.
static final BitSet COMMON_AND_INHERITED = set(new BitSet(), UScript.COMMON, UScript.INHERITED);
// /**
// * Test whether an identifier has multiple scripts
// *
// * @param identifier
// * @return true if it does
// */
// public static boolean isMultiScript(String identifier) {
// // Non-optimized code, for simplicity
// Set<BitSet> setOfScriptSets = new HashSet<BitSet>();
// BitSet temp = new BitSet();
// int cp;
// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
// cp = Character.codePointAt(identifier, i);
// UScript.getScriptExtensions(cp, temp);
// if (temp.cardinality() == 0) {
// // HACK for older version of ICU
// final int script = UScript.getScript(cp);
// temp.set(script);
// }
// temp.andNot(COMMON_AND_INHERITED);
// if (temp.cardinality() != 0 && setOfScriptSets.add(temp)) {
// // If the set hasn't been added already, add it and create new temporary for the next pass,
// // so we don't rewrite what's already in the set.
// temp = new BitSet();
// }
// }
// if (setOfScriptSets.size() == 0) {
// return true; // trivially true
// }
// temp.clear();
// // check to see that there is at least one script common to all the sets
// boolean first = true;
// for (BitSet other : setOfScriptSets) {
// if (first) {
// temp.or(other);
// first = false;
// } else {
// temp.and(other);
// }
// }
// return temp.cardinality() != 0;
// }
//
// /**
// * Test whether an identifier has mixed number systems.
// *
// * @param identifier
// * @return true if mixed
// */
// public static boolean hasMixedNumberSystems(String identifier) {
// int cp;
// UnicodeSet numerics = new UnicodeSet();
// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
// cp = Character.codePointAt(identifier, i);
// // Store a representative character for each kind of decimal digit
// switch (UCharacter.getType(cp)) {
// case UCharacterCategory.DECIMAL_DIGIT_NUMBER:
// // Just store the zero character as a representative for comparison.
// // Unicode guarantees it is cp - value
// numerics.add(cp - UCharacter.getNumericValue(cp));
// break;
// case UCharacterCategory.OTHER_NUMBER:
// case UCharacterCategory.LETTER_NUMBER:
// throw new IllegalArgumentException("Should not be in identifiers.");
// }
// }
// return numerics.size() > 1;
// }
/**
 * Get the identifier that was analysed.
 *
 * @return the string most recently passed to setIdentifier
 * @internal
 */
public String getIdentifier() {
    return this.identifier;
}
/**
 * Get the scripts found in the identifier, counting only characters that belong to a single script.
 *
 * @return a copy of the set of explicit scripts, as UScript values.
 * @internal
 */
public BitSet getScripts() {
    final Object scriptsCopy = requiredScripts.clone();
    return (BitSet) scriptsCopy;
}
/**
 * Get the sets of alternate scripts found in the identifier. That is, when a character can be in two or more
 * scripts, the set consisting of those scripts is one element of the result.
 *
 * @return a new set holding a copy of each set of alternate scripts.
 * @internal
 */
public Set<BitSet> getAlternates() {
    final Set<BitSet> copies = new HashSet<BitSet>();
    final Iterator<BitSet> alternates = scriptSetSet.iterator();
    while (alternates.hasNext()) {
        final BitSet original = alternates.next();
        copies.add((BitSet) original.clone());
    }
    return copies;
}
/**
 * Get the representative characters (zeros) for the decimal number systems found in the identifier.
 *
 * @return a new copy of the set of zero digits.
 * @internal
 */
public UnicodeSet getNumerics() {
    final UnicodeSet zeros = new UnicodeSet(numerics);
    return zeros;
}
/**
 * Find out which scripts are in common among the alternates.
 *
 * @return a copy of the set of scripts shared by the alternates, as UScript values.
 */
public BitSet getCommonAmongAlternates() {
    final Object commonCopy = commonAmongAlternates.clone();
    return (BitSet) commonCopy;
}
// Script combinations consulted by getRestrictionLevel(). BitSet has no "contains(...)" method, so the
// containment tests go through the static contains(BitSet, BitSet) helper in this class.
// NOTE(review): an earlier comment called these "inverted constants"; the sets below list the allowed scripts
// directly — confirm whether that wording is stale.
// They are private; they can't be made immutable in Java, so they must never be modified.
// Latin + Han + Hiragana + Katakana
private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
        UScript.KATAKANA);
// Latin + Han + Bopomofo
private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
// Latin + Han + Hangul
private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
// Scripts that keep a Latin mixture out of MODERATELY_RESTRICTIVE: Cyrillic, Greek, Cherokee
private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
        UScript.CHEROKEE);
/**
 * Find the "tightest" restriction level that the identifier satisfies. Call this only after setIdentifier.
 *
 * @return the restriction level.
 * @internal
 */
public RestrictionLevel getRestrictionLevel() {
    // Characters outside the profile, or digits from more than one number system, fail every stricter level.
    final boolean insideProfile = identifierProfile.containsAll(identifier);
    if (!insideProfile || getNumerics().size() > 1) {
        return RestrictionLevel.UNRESTRICTIVE;
    }
    if (ASCII.containsAll(identifier)) {
        return RestrictionLevel.ASCII;
    }
    final BitSet scripts = new BitSet();
    scripts.or(requiredScripts);
    scripts.clear(UScript.COMMON);
    scripts.clear(UScript.INHERITED);
    // This is a bit tricky. The score is the number of explicit scripts in the text,
    // plus 1 if the alternates share some script (eg [Arab Thaa]; [Arab Syrc]),
    // plus the number of alternates otherwise (this only works because we only test the score up to 2).
    final int alternatesWeight = commonAmongAlternates.isEmpty() ? scriptSetSet.size() : 1;
    final int cardinalityPlus = scripts.cardinality() + alternatesWeight;
    // Effectively a single script, or one of the allowed CJK combinations.
    if (cardinalityPlus < 2
            || containsWithAlternates(JAPANESE, scripts)
            || containsWithAlternates(CHINESE, scripts)
            || containsWithAlternates(KOREAN, scripts)) {
        return RestrictionLevel.HIGHLY_RESTRICTIVE;
    }
    // Latin plus one other script that is not Cyrillic, Greek or Cherokee.
    final boolean latinPlusOneSafeScript = cardinalityPlus == 2
            && scripts.get(UScript.LATIN)
            && !scripts.intersects(CONFUSABLE_WITH_LATIN);
    return latinPlusOneSafeScript
            ? RestrictionLevel.MODERATELY_RESTRICTIVE
            : RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
/**
 * Produces a debugging summary: identifier, profile pattern, restriction level, explicit scripts, alternates
 * and numerics pattern, separated by ", ".
 */
@Override
public String toString() {
    final StringBuilder summary = new StringBuilder();
    summary.append(identifier).append(", ");
    summary.append(identifierProfile.toPattern(false)).append(", ");
    summary.append(getRestrictionLevel()).append(", ");
    summary.append(displayScripts(requiredScripts)).append(", ");
    summary.append(displayAlternates(scriptSetSet)).append(", ");
    summary.append(numerics.toPattern(false));
    return summary.toString();
}
/**
 * Tests whether container covers containee and, in addition, shares at least one script with every
 * recorded set of alternates.
 *
 * @param container the allowed scripts
 * @param containee the scripts that must all be allowed
 * @return true if both conditions hold
 */
private boolean containsWithAlternates(BitSet container, BitSet containee) {
    boolean accepted = contains(container, containee);
    final Iterator<BitSet> alternates = scriptSetSet.iterator();
    while (accepted && alternates.hasNext()) {
        accepted = container.intersects(alternates.next());
    }
    return accepted;
}
/**
 * Produce a readable string of alternates: the display form of each script set, joined with "; ".
 *
 * @param alternates the script sets to display
 * @return display form
 * @internal
 */
public static String displayAlternates(Collection<BitSet> alternates) {
    final StringBuilder text = new StringBuilder();
    final Iterator<BitSet> sets = alternates.iterator();
    while (sets.hasNext()) {
        final BitSet scripts = sets.next();
        // The separator is keyed on the text built so far, not on the element position.
        if (text.length() > 0) {
            text.append("; ");
        }
        text.append(displayScripts(scripts));
    }
    return text.toString();
}
/**
 * Produce a readable string of a set of scripts: their short names in increasing UScript code order,
 * separated by single spaces.
 *
 * @param scripts the UScript values to display
 * @return display form
 * @internal
 */
public static String displayScripts(BitSet scripts) {
    final StringBuilder names = new StringBuilder();
    int script = scripts.nextSetBit(0);
    while (script >= 0) {
        if (names.length() > 0) {
            names.append(' ');
        }
        names.append(UScript.getShortName(script));
        script = scripts.nextSetBit(script + 1);
    }
    return names.toString();
}
/**
 * Parse a list of scripts into a bitset. Names are separated by whitespace, optionally preceded by a comma.
 *
 * @param scriptsString the list of script names
 * @return BitSet of UScript values.
 * @internal
 */
public static BitSet parseScripts(String scriptsString) {
    final BitSet codes = new BitSet();
    final String[] names = scriptsString.trim().split(",?\\s+");
    for (int i = 0; i < names.length; ++i) {
        final String name = names[i];
        if (name.isEmpty()) {
            continue; // produced when the input is empty or blank
        }
        codes.set(UScript.getCodeFromName(name));
    }
    return codes;
}
/**
 * Parse a list of alternates into a set of sets of UScript values. The alternates are separated by
 * semicolons; each one is a script list as accepted by parseScripts.
 *
 * @param scriptsSetString the semicolon-separated script lists
 * @return the parsed script sets
 * @internal
 */
public static Set<BitSet> parseAlternates(String scriptsSetString) {
    final Set<BitSet> alternates = new HashSet<BitSet>();
    final String[] lists = scriptsSetString.trim().split("\\s*;\\s*");
    for (int i = 0; i < lists.length; ++i) {
        final String list = lists[i];
        if (list.isEmpty()) {
            continue;
        }
        alternates.add(parseScripts(list));
    }
    return alternates;
}
/**
 * Test containment: whether every bit set in containee is also set in container. Should be a method on
 * BitSet...
 *
 * @param container the candidate superset
 * @param containee the candidate subset
 * @return true if containee is a subset of container (an empty containee is always contained)
 * @internal
 */
public static final boolean contains(BitSet container, BitSet containee) {
    int bit = containee.nextSetBit(0);
    while (bit >= 0) {
        if (!container.get(bit)) {
            return false;
        }
        bit = containee.nextSetBit(bit + 1);
    }
    return true;
}
/**
 * Sets a number of values at once. Should be on BitSet.
 *
 * @param bitset the set to modify
 * @param values the bit indexes to turn on
 * @return the same bitset instance that was passed in
 * @internal
 */
public static final BitSet set(BitSet bitset, int... values) {
    for (int index = 0; index < values.length; ++index) {
        bitset.set(values[index]);
    }
    return bitset;
}
// public static final class FreezableBitSet extends BitSet implements Freezable<FreezableBitSet> {
// private boolean frozen;
//
// public FreezableBitSet() {
// super();
// }
// public FreezableBitSet(int nbits) {
// super(nbits);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#and(java.util.BitSet)
// */
// @Override
// public void and(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.and(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#andNot(java.util.BitSet)
// */
// @Override
// public void andNot(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.andNot(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#cardinality()
// */
//
// @Override
// public void clear() {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear();
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clear(int)
// */
// @Override
// public void clear(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clear(int, int)
// */
// @Override
// public void clear(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.clear(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#clone()
// */
// @Override
// public Object clone() {
// return super.clone();
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#equals(java.lang.Object)
// */
// @Override
// public boolean equals(Object obj) {
// if (obj == null || obj.getClass() != FreezableBitSet.class) {
// return false;
// }
// return super.equals((BitSet)obj);
// }
//
// /* (non-Javadoc)
// * @see java.util.BitSet#flip(int)
// */
// @Override
// public void flip(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.flip(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#flip(int, int)
// */
// @Override
// public void flip(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.flip(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#or(java.util.BitSet)
// */
// @Override
// public void or(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.or(set);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int)
// */
// @Override
// public void set(int bitIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(bitIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, boolean)
// */
// @Override
// public void set(int bitIndex, boolean value) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(bitIndex, value);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, int)
// */
// @Override
// public void set(int fromIndex, int toIndex) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(fromIndex, toIndex);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#set(int, int, boolean)
// */
// @Override
// public void set(int fromIndex, int toIndex, boolean value) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.set(fromIndex, toIndex, value);
// }
// /* (non-Javadoc)
// * @see java.util.BitSet#xor(java.util.BitSet)
// */
// @Override
// public void xor(BitSet set) {
// if (frozen) {
// throw new UnsupportedOperationException();
// }
// super.xor(set);
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#isFrozen()
// */
// public boolean isFrozen() {
// return frozen;
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#freeze()
// */
// public FreezableBitSet freeze() {
// frozen = true;
// return this;
// }
// /* (non-Javadoc)
// * @see com.ibm.icu.util.Freezable#cloneAsThawed()
// */
// public FreezableBitSet cloneAsThawed() {
// FreezableBitSet result = new FreezableBitSet(size());
// result.or(this);
// return result;
// }
// }
}

View File

@ -33,6 +33,7 @@ import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
import com.ibm.icu.util.ULocale;
/**
@ -219,13 +220,28 @@ public class SpoofChecker {
* @stable ICU 4.6
*/
public static final int CHAR_LIMIT = 64;
/**
* Check that an identifier is no looser than the specified RestrictionLevel.
*
* @internal
*/
public static final int RESTRICTION_LEVEL = 128;
/**
 * Check that an identifier does not mix decimal digits from more than one number system. The check fails
 * when the identifier's numerics (one representative zero digit per number system) contain more than one
 * entry.
 *
 * @internal
 */
public static final int MIXED_NUMBERS = 256;
/**
* Enable all spoof checks.
*
* @stable ICU 4.6
*/
public static final int ALL_CHECKS = 0x7f;
public static final int ALL_CHECKS = 0xFFFFFFFF;
// Magic number for sanity checking spoof binary resource data.
static final int MAGIC = 0x3845fdef;
@ -249,6 +265,7 @@ public class SpoofChecker {
UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
// for this Spoof Checker. Defaults to all chars.
Set<ULocale> fAllowedLocales; // The list of allowed locales.
private RestrictionLevel restrictionLevel;
/**
* Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
@ -263,6 +280,7 @@ public class SpoofChecker {
fSpoofData = null;
fAllowedCharsSet = new UnicodeSet(0, 0x10ffff);
fAllowedLocales = new LinkedHashSet<ULocale>();
restrictionLevel = RestrictionLevel.MINIMALLY_RESTRICTIVE;
}
/**
@ -279,6 +297,7 @@ public class SpoofChecker {
fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed();
fAllowedLocales = new LinkedHashSet<ULocale>();
fAllowedLocales.addAll(src.fAllowedLocales);
restrictionLevel = src.restrictionLevel;
}
/**
@ -305,11 +324,12 @@ public class SpoofChecker {
result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
result.fAllowedCharsSet.freeze();
result.fAllowedLocales = this.fAllowedLocales;
result.restrictionLevel = this.restrictionLevel;
return result;
}
/**
* Specify the source form of the spoof data Spoof Checker. The Three inputs correspond to the Unicode data
* Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data
* files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source
* data is as described in UAX 39 for these files, and the content of these files is acceptable input.
*
@ -447,6 +467,16 @@ public class SpoofChecker {
fChecks |= CHAR_LIMIT;
return this;
}
/**
 * Set the loosest restriction level allowed. This also turns on the RESTRICTION_LEVEL check, so that the
 * level takes effect even if an earlier setChecks call left it out.
 *
 * @param restrictionLevel The loosest restriction level allowed.
 * @return self
 * @internal
 */
public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
    this.restrictionLevel = restrictionLevel;
    // Enable the matching check, like the other setters in this builder do for their own checks.
    fChecks |= RESTRICTION_LEVEL;
    return this;
}
// Structure for the Whole Script Confusable Data
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
@ -1391,6 +1421,28 @@ public class SpoofChecker {
// haven't done it yet.
int scriptCount = -1;
// Allocate an identifier info if needed.
// Note: we may want to allocate one per SpoofChecker and synchronize
IdentifierInfo identifierInfo = null;
if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) {
identifierInfo = new IdentifierInfo().setIdentifier(text);
}
if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) {
RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel();
if (textRestrictionLevel.compareTo(restrictionLevel) > 0) {
result |= RESTRICTION_LEVEL;
}
}
if (0 != ((this.fChecks) & MIXED_NUMBERS)) {
UnicodeSet numerics = identifierInfo.getNumerics();
if (numerics.size() > 1) {
result |= MIXED_NUMBERS;
}
}
if (0 != ((this.fChecks) & SINGLE_SCRIPT)) {
scriptCount = this.scriptScan(text, checkResult);
// no need to set failPos, it will be set to checkResult.position inside this.scriptScan
@ -1881,6 +1933,7 @@ public class SpoofChecker {
private SpoofData fSpoofData;
private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
private RestrictionLevel restrictionLevel;
// for this Spoof Checker. Defaults to all chars.
//

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2009-2011, International Business Machines Corporation and *
* Copyright (C) 2009-2012, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -10,7 +10,11 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.text.ParseException;
import java.util.Arrays;
import java.util.BitSet;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -19,8 +23,12 @@ import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.IdentifierInfo;
import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.SpoofChecker;
import com.ibm.icu.text.SpoofChecker.CheckResult;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;
@ -185,7 +193,7 @@ public class SpoofCheckerTest extends TestFmwk {
* don't want to see in this test.
*/
sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
checkResults = sc.failsChecks(goodLatin);
assertFalse("", checkResults);
@ -254,7 +262,7 @@ public class SpoofCheckerTest extends TestFmwk {
assertTrue("", checkResults);
assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
assertEquals("", 2, result.position);
result.position = 666;
checkResults = sc.failsChecks(han_Hiragana, result);
assertFalse("", checkResults);
@ -294,7 +302,7 @@ public class SpoofCheckerTest extends TestFmwk {
public void TestSpoofAPI() {
SpoofChecker sc = new SpoofChecker.Builder().build();
String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts.
// If this test starts failing, consult confusablesWholeScript.txt
// If this test starts failing, consult confusablesWholeScript.txt
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
result.position = 666;
boolean checkResults = sc.failsChecks(s, result);
@ -317,7 +325,7 @@ public class SpoofCheckerTest extends TestFmwk {
SpoofChecker sc = new SpoofChecker.Builder().build();
checkSkeleton(sc, "TestSkeleton");
}
// testSkeleton. Spot check a number of confusable skeleton substitutions from the
// Unicode data file confusables.txt
// Test cases chosen for substitutions of various lengths, and
@ -337,11 +345,11 @@ public class SpoofCheckerTest extends TestFmwk {
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
testName);
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
testName);
checkSkeleton(sc, SL, "nochange", "nochange", testName);
checkSkeleton(sc, MA, "love", "love", testName);
@ -428,6 +436,100 @@ public class SpoofCheckerTest extends TestFmwk {
assertEquals("", 7, result.position);
}
/**
 * Checks IdentifierInfo.getRestrictionLevel() on sample identifiers, and checks that a SpoofChecker
 * configured with each possible level fails exactly those identifiers that are looser than that level.
 */
public void TestRestrictionLevel() {
    Object[][] tests = {
            {"a", RestrictionLevel.ASCII},
            {"γ", RestrictionLevel.HIGHLY_RESTRICTIVE},
            {"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE},
            {"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE},
            {"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE},
    };
    IdentifierInfo idInfo = new IdentifierInfo();
    CheckResult checkResult = new CheckResult();
    for (Object[] test : tests) {
        String testString = (String) test[0];
        RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
        idInfo.setIdentifier(testString);
        assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel());
        for (RestrictionLevel testLevel : RestrictionLevel.values()) {
            SpoofChecker sc = new SpoofChecker.Builder()
                    .setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this
                    .setRestrictionLevel(testLevel)
                    .build();
            boolean actualValue = sc.failsChecks(testString, checkResult);
            // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
            boolean expectedFailure = expectedLevel.compareTo(testLevel) > 0;
            assertEquals("Testing spoof restriction level for '" + testString + "', " + testLevel, expectedFailure, actualValue);
        }
    }
}
/**
 * Checks IdentifierInfo.getNumerics() on sample identifiers, and checks that the MIXED_NUMBERS spoof check
 * fails exactly when digits from more than one decimal number system are present.
 */
public void TestMixedNumbers() {
    Object[][] tests = {
            {"1", "[0]"},
            {"", "[]"},
            // ASCII one followed by Devanagari one: two number systems, so both zeros are expected
            // (U+0030 and U+0966 DEVANAGARI DIGIT ZERO).
            {"1१", "[0\u0966]"},
            {"١۱", "[٠۰]"},
    };
    IdentifierInfo idInfo = new IdentifierInfo();
    CheckResult checkResult = new CheckResult();
    for (Object[] test : tests) {
        String testString = (String) test[0];
        UnicodeSet expected = new UnicodeSet((String)test[1]);
        idInfo.setIdentifier(testString);
        assertEquals("Testing numerics for '" + testString + "'", expected, idInfo.getNumerics());
        SpoofChecker sc = new SpoofChecker.Builder()
                .setChecks(SpoofChecker.MIXED_NUMBERS) // only check this
                .build();
        boolean actualValue = sc.failsChecks(testString, checkResult);
        assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
    }
}
/**
 * Exercises the static helpers of IdentifierInfo (contains, set, display*, parse*) and the getters after
 * analysing an identifier that mixes several scripts and number systems.
 */
public void TestIdentifierInfo() {
    // contains(BitSet, BitSet)
    BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL);
    BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL);
    assertTrue("", IdentifierInfo.contains(bitset12, bitset2));
    assertTrue("", IdentifierInfo.contains(bitset12, bitset12));
    assertTrue("", !IdentifierInfo.contains(bitset2, bitset12));

    // displayAlternates(Collection<BitSet>)
    // displayScripts(BitSet)
    String scriptString = IdentifierInfo.displayScripts(bitset12);
    assertEquals("", "Hang Latn", scriptString);
    // Insertion-ordered set: displayAlternates follows iteration order, which a plain HashSet does not define.
    Set<BitSet> alternates = new LinkedHashSet<BitSet>(Arrays.asList(bitset12, bitset2));
    String alternatesString = IdentifierInfo.displayAlternates(alternates);
    assertEquals("", "Hang Latn; Hang", alternatesString);

    // parseAlternates(String)
    // parseScripts(String)
    assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString));
    assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString));

    IdentifierInfo idInfo = new IdentifierInfo();
    String manyAlternates = "aアー〼1१١۱";
    idInfo.setIdentifier(manyAlternates);
    assertEquals("", manyAlternates, idInfo.getIdentifier());

    // The getters return non-null copies. Only properties that do not depend on the exact
    // Script_Extensions data are checked here.
    BitSet scripts = idInfo.getScripts();
    assertTrue("Latin is a required script", scripts.get(UScript.LATIN));
    assertTrue("Katakana is a required script", scripts.get(UScript.KATAKANA));
    BitSet common = idInfo.getCommonAmongAlternates();
    for (BitSet alternate : idInfo.getAlternates()) {
        assertFalse("alternates never overlap the required scripts", scripts.intersects(alternate));
        assertTrue("common scripts are part of every alternate", IdentifierInfo.contains(alternate, common));
    }
    // Zero digits of ASCII, Devanagari, Arabic-Indic and Extended Arabic-Indic.
    assertEquals("", new UnicodeSet("[0\u0966\u0660\u06F0]"), idInfo.getNumerics());
    // More than one number system makes the identifier UNRESTRICTIVE.
    assertEquals("", RestrictionLevel.UNRESTRICTIVE, idInfo.getRestrictionLevel());

    // TODO
    // getIdentifierProfile()
    // setIdentifierProfile(UnicodeSet)
}
private String parseHex(String in) {
StringBuilder sb = new StringBuilder();
for (String oneCharAsHexString : in.split("\\s+")) {
@ -483,7 +585,7 @@ public class SpoofCheckerTest extends TestFmwk {
Matcher parseLine = Pattern.compile(
"\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
+ "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
matcher("");
matcher("");
Normalizer2 normalizer = Normalizer2.getNFDInstance();
int lineNum = 0;
String inputLine;