ICU-7645 First cut at spoof detection changed. All marked @internal for now.
X-SVN-Rev: 32910
This commit is contained in:
parent
517fd227cb
commit
95098e216b
653
icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java
Normal file
653
icu4j/main/classes/core/src/com/ibm/icu/text/IdentifierInfo.java
Normal file
@ -0,0 +1,653 @@
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2012, Google, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.Freezable;
|
||||
|
||||
/**
|
||||
* This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
|
||||
* then setIdentifier. At this point:
|
||||
* <ol>
|
||||
* <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
|
||||
* each of these.
|
||||
* <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
|
||||
* either Katakana or Hiragana.
|
||||
* <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
|
||||
* <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
|
||||
* the identifier.
|
||||
* <li>call getRestrictionLevel to see what the UTS36 restriction level is. (This has some proposed changes from the
|
||||
* current one, however.)
|
||||
* </ol>
|
||||
*
|
||||
* @author markdavis
|
||||
* @internal
|
||||
*/
|
||||
public class IdentifierInfo {
|
||||
|
||||
public enum RestrictionLevel {
|
||||
/**
|
||||
* Only ASCII characters: U+0000..U+007F
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
ASCII,
|
||||
/**
|
||||
* All characters in each identifier must be from a single script, or from the combinations: Latin + Han +
|
||||
* Hiragana + Katakana; Latin + Han + Bopomofo; or Latin + Han + Hangul. Note that this level will satisfy the
|
||||
* vast majority of Latin-script users; also that TR36 has ASCII instead of Latin.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
HIGHLY_RESTRICTIVE,
|
||||
/**
|
||||
* Allow Latin with other scripts except Cyrillic, Greek, and Cherokee. Otherwise, the same as Highly Restrictive.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
MODERATELY_RESTRICTIVE,
|
||||
/**
|
||||
* Allow arbitrary mixtures of scripts, such as Ωmega, Teχ, HλLF-LIFE, Toys-Я-Us. Otherwise, the same as
|
||||
* Moderately Restrictive
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
MINIMALLY_RESTRICTIVE,
|
||||
/**
|
||||
* Any valid identifiers, including characters outside of the Identifier Profile, such as I♥NY.org
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
UNRESTRICTIVE
|
||||
}
|
||||
|
||||
private static final UnicodeSet ASCII = new UnicodeSet(0, 0x7F).freeze();
|
||||
|
||||
private String identifier;
|
||||
private final BitSet requiredScripts = new BitSet();
|
||||
private final Set<BitSet> scriptSetSet = new HashSet<BitSet>();
|
||||
private final BitSet commonAmongAlternates = new BitSet();
|
||||
private final UnicodeSet numerics = new UnicodeSet();
|
||||
private final UnicodeSet identifierProfile = new UnicodeSet(0, 0x10FFFF);
|
||||
|
||||
private IdentifierInfo clear() {
|
||||
requiredScripts.clear();
|
||||
scriptSetSet.clear();
|
||||
numerics.clear();
|
||||
commonAmongAlternates.clear();
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the identifier profile, for what is allowed.
|
||||
*
|
||||
* @param identifierProfile
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public IdentifierInfo setIdentifierProfile(UnicodeSet identifierProfile) {
|
||||
this.numerics.set(numerics);
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the identifier profile
|
||||
*
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public UnicodeSet getIdentifierProfile() {
|
||||
return new UnicodeSet(identifierProfile);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set an identifier to analyse.
|
||||
*
|
||||
* @param identifier
|
||||
* @return the identifier info.
|
||||
* @internal
|
||||
*/
|
||||
public IdentifierInfo setIdentifier(String identifier) {
|
||||
this.identifier = identifier;
|
||||
clear();
|
||||
BitSet temp = new BitSet(); // Will reuse this.
|
||||
int cp;
|
||||
for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
|
||||
cp = Character.codePointAt(identifier, i);
|
||||
// Store a representative character for each kind of decimal digit
|
||||
if (UCharacter.getType(cp) == UCharacterCategory.DECIMAL_DIGIT_NUMBER) {
|
||||
// Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
|
||||
numerics.add(cp - UCharacter.getNumericValue(cp));
|
||||
}
|
||||
UScript.getScriptExtensions(cp, temp);
|
||||
temp.clear(UScript.COMMON);
|
||||
temp.clear(UScript.INHERITED);
|
||||
// if (temp.cardinality() == 0) {
|
||||
// // HACK for older version of ICU
|
||||
// requiredScripts.set(UScript.getScript(cp));
|
||||
// } else
|
||||
if (temp.cardinality() == 1) {
|
||||
// Single script, record it.
|
||||
requiredScripts.or(temp);
|
||||
} else if (!requiredScripts.intersects(temp) && scriptSetSet.add(temp)) {
|
||||
// If the set hasn't been added already, add it and create new temporary for the next pass,
|
||||
// so we don't rewrite what's already in the set.
|
||||
temp = new BitSet();
|
||||
}
|
||||
}
|
||||
// Now make a final pass through to remove alternates that came before singles.
|
||||
// [Kana], [Kana Hira] => [Kana]
|
||||
// This is relatively infrequent, so doesn't have to be optimized.
|
||||
if (scriptSetSet.size() == 0) {
|
||||
commonAmongAlternates.clear();
|
||||
} else {
|
||||
commonAmongAlternates.set(0, UScript.CODE_LIMIT);
|
||||
for (Iterator<BitSet> it = scriptSetSet.iterator(); it.hasNext();) {
|
||||
final BitSet next = it.next();
|
||||
if (requiredScripts.intersects(next)) {
|
||||
it.remove();
|
||||
} else {
|
||||
// [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
|
||||
for (BitSet other : scriptSetSet) {
|
||||
if (next != other && contains(next, other)) {
|
||||
it.remove();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
commonAmongAlternates.and(next); // get the intersection.
|
||||
}
|
||||
if (commonAmongAlternates.size() == 0) {
|
||||
commonAmongAlternates.clear();
|
||||
}
|
||||
}
|
||||
// Note that the above code doesn't minimize alternatives. That is, it does not collapse
|
||||
// [[Arab Syrc Thaa]; [Arab Syrc]] to [[Arab Syrc]]
|
||||
// That would be a possible optimization, but is probably not worth the extra processing
|
||||
return this;
|
||||
}
|
||||
|
||||
static final BitSet COMMON_AND_INHERITED = set(new BitSet(), UScript.COMMON, UScript.INHERITED);
|
||||
|
||||
// /**
|
||||
// * Test whether an identifier has multiple scripts
|
||||
// *
|
||||
// * @param identifier
|
||||
// * @return true if it does
|
||||
// */
|
||||
// public static boolean isMultiScript(String identifier) {
|
||||
// // Non-optimized code, for simplicity
|
||||
// Set<BitSet> setOfScriptSets = new HashSet<BitSet>();
|
||||
// BitSet temp = new BitSet();
|
||||
// int cp;
|
||||
// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
|
||||
// cp = Character.codePointAt(identifier, i);
|
||||
// UScript.getScriptExtensions(cp, temp);
|
||||
// if (temp.cardinality() == 0) {
|
||||
// // HACK for older version of ICU
|
||||
// final int script = UScript.getScript(cp);
|
||||
// temp.set(script);
|
||||
// }
|
||||
// temp.andNot(COMMON_AND_INHERITED);
|
||||
// if (temp.cardinality() != 0 && setOfScriptSets.add(temp)) {
|
||||
// // If the set hasn't been added already, add it and create new temporary for the next pass,
|
||||
// // so we don't rewrite what's already in the set.
|
||||
// temp = new BitSet();
|
||||
// }
|
||||
// }
|
||||
// if (setOfScriptSets.size() == 0) {
|
||||
// return true; // trivially true
|
||||
// }
|
||||
// temp.clear();
|
||||
// // check to see that there is at least one script common to all the sets
|
||||
// boolean first = true;
|
||||
// for (BitSet other : setOfScriptSets) {
|
||||
// if (first) {
|
||||
// temp.or(other);
|
||||
// first = false;
|
||||
// } else {
|
||||
// temp.and(other);
|
||||
// }
|
||||
// }
|
||||
// return temp.cardinality() != 0;
|
||||
// }
|
||||
//
|
||||
// /**
|
||||
// * Test whether an identifier has mixed number systems.
|
||||
// *
|
||||
// * @param identifier
|
||||
// * @return true if mixed
|
||||
// */
|
||||
// public static boolean hasMixedNumberSystems(String identifier) {
|
||||
// int cp;
|
||||
// UnicodeSet numerics = new UnicodeSet();
|
||||
// for (int i = 0; i < identifier.length(); i += Character.charCount(i)) {
|
||||
// cp = Character.codePointAt(identifier, i);
|
||||
// // Store a representative character for each kind of decimal digit
|
||||
// switch (UCharacter.getType(cp)) {
|
||||
// case UCharacterCategory.DECIMAL_DIGIT_NUMBER:
|
||||
// // Just store the zero character as a representative for comparison.
|
||||
// // Unicode guarantees it is cp - value
|
||||
// numerics.add(cp - UCharacter.getNumericValue(cp));
|
||||
// break;
|
||||
// case UCharacterCategory.OTHER_NUMBER:
|
||||
// case UCharacterCategory.LETTER_NUMBER:
|
||||
// throw new IllegalArgumentException("Should not be in identifiers.");
|
||||
// }
|
||||
// }
|
||||
// return numerics.size() > 1;
|
||||
// }
|
||||
|
||||
/**
|
||||
* Get the identifer that was analysed.
|
||||
*
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public String getIdentifier() {
|
||||
return identifier;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the scripts found in the identifiers
|
||||
*
|
||||
* @return the set of explicit scripts.
|
||||
* @internal
|
||||
*/
|
||||
public BitSet getScripts() {
|
||||
return (BitSet) requiredScripts.clone();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the set of alternate scripts found in the identifiers. That is, when a character can be in two scripts, then
|
||||
* the set consisting of those scripts will be returned.
|
||||
*
|
||||
* @return the set of explicit scripts.
|
||||
* @internal
|
||||
*/
|
||||
public Set<BitSet> getAlternates() {
|
||||
Set<BitSet> result = new HashSet<BitSet>();
|
||||
for (BitSet item : scriptSetSet) {
|
||||
result.add((BitSet) item.clone());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the representative characters (zeros) for the numerics found in the identifier.
|
||||
*
|
||||
* @return the set of explicit scripts.
|
||||
* @internal
|
||||
*/
|
||||
public UnicodeSet getNumerics() {
|
||||
return new UnicodeSet(numerics);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find out which scripts are in common among the alternates.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public BitSet getCommonAmongAlternates() {
|
||||
return (BitSet) commonAmongAlternates.clone();
|
||||
}
|
||||
|
||||
// BitSet doesn't support "contains(...)", so we have inverted constants
|
||||
// They are private; they can't be made immutable in Java.
|
||||
private final static BitSet JAPANESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HIRAGANA,
|
||||
UScript.KATAKANA);
|
||||
private final static BitSet CHINESE = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.BOPOMOFO);
|
||||
private final static BitSet KOREAN = set(new BitSet(), UScript.LATIN, UScript.HAN, UScript.HANGUL);
|
||||
private final static BitSet CONFUSABLE_WITH_LATIN = set(new BitSet(), UScript.CYRILLIC, UScript.GREEK,
|
||||
UScript.CHEROKEE);
|
||||
|
||||
/**
|
||||
* Find the "tightest" restriction level that the identifier satisfies.
|
||||
*
|
||||
* @return the restriction level.
|
||||
* @internal
|
||||
*/
|
||||
public RestrictionLevel getRestrictionLevel() {
|
||||
if (!identifierProfile.containsAll(identifier) || getNumerics().size() > 1) {
|
||||
return RestrictionLevel.UNRESTRICTIVE;
|
||||
}
|
||||
if (ASCII.containsAll(identifier)) {
|
||||
return RestrictionLevel.ASCII;
|
||||
}
|
||||
BitSet temp = new BitSet();
|
||||
temp.or(requiredScripts);
|
||||
temp.clear(UScript.COMMON);
|
||||
temp.clear(UScript.INHERITED);
|
||||
// This is a bit tricky. We look at a number of factors.
|
||||
// The number of scripts in the text.
|
||||
// Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
|
||||
// Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
|
||||
final int cardinalityPlus = temp.cardinality() + (commonAmongAlternates.isEmpty() ? scriptSetSet.size() : 1);
|
||||
if (cardinalityPlus < 2) {
|
||||
return RestrictionLevel.HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
if (containsWithAlternates(JAPANESE, temp) || containsWithAlternates(CHINESE, temp)
|
||||
|| containsWithAlternates(KOREAN, temp)) {
|
||||
return RestrictionLevel.HIGHLY_RESTRICTIVE;
|
||||
}
|
||||
if (cardinalityPlus == 2 && temp.get(UScript.LATIN) && !temp.intersects(CONFUSABLE_WITH_LATIN)) {
|
||||
return RestrictionLevel.MODERATELY_RESTRICTIVE;
|
||||
}
|
||||
return RestrictionLevel.MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return identifier + ", " + identifierProfile.toPattern(false) + ", " + getRestrictionLevel() + ", "
|
||||
+ displayScripts(requiredScripts) + ", " + displayAlternates(scriptSetSet) + ", "
|
||||
+ numerics.toPattern(false);
|
||||
}
|
||||
|
||||
private boolean containsWithAlternates(BitSet container, BitSet containee) {
|
||||
if (!contains(container, containee)) {
|
||||
return false;
|
||||
}
|
||||
for (BitSet alternatives : scriptSetSet) {
|
||||
if (!container.intersects(alternatives)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Produce a readable string of alternates.
|
||||
*
|
||||
* @param alternates
|
||||
* @return display form
|
||||
* @internal
|
||||
*/
|
||||
public static String displayAlternates(Collection<BitSet> alternates) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (BitSet item : alternates) {
|
||||
if (result.length() != 0) {
|
||||
result.append("; ");
|
||||
}
|
||||
result.append(displayScripts(item));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Produce a readable string of a set of scripts
|
||||
*
|
||||
* @param scripts
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public static String displayScripts(BitSet scripts) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
for (int i = scripts.nextSetBit(0); i >= 0; i = scripts.nextSetBit(i + 1)) {
|
||||
if (result.length() != 0) {
|
||||
result.append(' ');
|
||||
}
|
||||
result.append(UScript.getShortName(i));
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a list of scripts into a bitset.
|
||||
*
|
||||
* @param scripts
|
||||
* @return BitSet of UScript values.
|
||||
* @internal
|
||||
*/
|
||||
public static BitSet parseScripts(String scriptsString) {
|
||||
BitSet result = new BitSet();
|
||||
for (String item : scriptsString.trim().split(",?\\s+")) {
|
||||
if (!item.isEmpty()) {
|
||||
result.set(UScript.getCodeFromName(item));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a list of alternates into a set of sets of UScript values.
|
||||
*
|
||||
* @param scriptsSetString
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public static Set<BitSet> parseAlternates(String scriptsSetString) {
|
||||
Set<BitSet> result = new HashSet<BitSet>();
|
||||
for (String item : scriptsSetString.trim().split("\\s*;\\s*")) {
|
||||
if (!item.isEmpty()) {
|
||||
result.add(parseScripts(item));
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Test containment. Should be a method on BitSet...
|
||||
*
|
||||
* @param container
|
||||
* @param containee
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public static final boolean contains(BitSet container, BitSet containee) {
|
||||
for (int i = containee.nextSetBit(0); i >= 0; i = containee.nextSetBit(i + 1)) {
|
||||
if (!container.get(i)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets a number of values at once. Should be on BitSet.
|
||||
*
|
||||
* @param container
|
||||
* @param containee
|
||||
* @return
|
||||
* @internal
|
||||
*/
|
||||
public static final BitSet set(BitSet bitset, int... values) {
|
||||
for (int value : values) {
|
||||
bitset.set(value);
|
||||
}
|
||||
return bitset;
|
||||
}
|
||||
|
||||
// public static final class FreezableBitSet extends BitSet implements Freezable<FreezableBitSet> {
|
||||
// private boolean frozen;
|
||||
//
|
||||
// public FreezableBitSet() {
|
||||
// super();
|
||||
// }
|
||||
// public FreezableBitSet(int nbits) {
|
||||
// super(nbits);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#and(java.util.BitSet)
|
||||
// */
|
||||
// @Override
|
||||
// public void and(BitSet set) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.and(set);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#andNot(java.util.BitSet)
|
||||
// */
|
||||
// @Override
|
||||
// public void andNot(BitSet set) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.andNot(set);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#cardinality()
|
||||
// */
|
||||
//
|
||||
// @Override
|
||||
// public void clear() {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.clear();
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#clear(int)
|
||||
// */
|
||||
// @Override
|
||||
// public void clear(int bitIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.clear(bitIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#clear(int, int)
|
||||
// */
|
||||
// @Override
|
||||
// public void clear(int fromIndex, int toIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.clear(fromIndex, toIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#clone()
|
||||
// */
|
||||
// @Override
|
||||
// public Object clone() {
|
||||
// return super.clone();
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#equals(java.lang.Object)
|
||||
// */
|
||||
// @Override
|
||||
// public boolean equals(Object obj) {
|
||||
// if (obj == null || obj.getClass() != FreezableBitSet.class) {
|
||||
// return false;
|
||||
// }
|
||||
// return super.equals((BitSet)obj);
|
||||
// }
|
||||
//
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#flip(int)
|
||||
// */
|
||||
// @Override
|
||||
// public void flip(int bitIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.flip(bitIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#flip(int, int)
|
||||
// */
|
||||
// @Override
|
||||
// public void flip(int fromIndex, int toIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.flip(fromIndex, toIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#or(java.util.BitSet)
|
||||
// */
|
||||
// @Override
|
||||
// public void or(BitSet set) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.or(set);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#set(int)
|
||||
// */
|
||||
// @Override
|
||||
// public void set(int bitIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.set(bitIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#set(int, boolean)
|
||||
// */
|
||||
// @Override
|
||||
// public void set(int bitIndex, boolean value) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.set(bitIndex, value);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#set(int, int)
|
||||
// */
|
||||
// @Override
|
||||
// public void set(int fromIndex, int toIndex) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.set(fromIndex, toIndex);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#set(int, int, boolean)
|
||||
// */
|
||||
// @Override
|
||||
// public void set(int fromIndex, int toIndex, boolean value) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.set(fromIndex, toIndex, value);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see java.util.BitSet#xor(java.util.BitSet)
|
||||
// */
|
||||
// @Override
|
||||
// public void xor(BitSet set) {
|
||||
// if (frozen) {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// super.xor(set);
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see com.ibm.icu.util.Freezable#isFrozen()
|
||||
// */
|
||||
// public boolean isFrozen() {
|
||||
// return frozen;
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see com.ibm.icu.util.Freezable#freeze()
|
||||
// */
|
||||
// public FreezableBitSet freeze() {
|
||||
// frozen = true;
|
||||
// return this;
|
||||
// }
|
||||
// /* (non-Javadoc)
|
||||
// * @see com.ibm.icu.util.Freezable#cloneAsThawed()
|
||||
// */
|
||||
// public FreezableBitSet cloneAsThawed() {
|
||||
// FreezableBitSet result = new FreezableBitSet(size());
|
||||
// result.or(this);
|
||||
// return result;
|
||||
// }
|
||||
// }
|
||||
}
|
@ -33,6 +33,7 @@ import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.lang.UCharacterCategory;
|
||||
import com.ibm.icu.lang.UProperty;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
/**
|
||||
@ -219,13 +220,28 @@ public class SpoofChecker {
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public static final int CHAR_LIMIT = 64;
|
||||
|
||||
/**
|
||||
* Check that an identifier is no looser than the specified RestrictionLevel.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
public static final int RESTRICTION_LEVEL = 128;
|
||||
|
||||
/**
|
||||
* Check that an identifier does not contain decimal digits from more than one number system, for example
|
||||
* ASCII digits mixed with Devanagari digits.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
public static final int MIXED_NUMBERS = 256;
|
||||
|
||||
/**
|
||||
* Enable all spoof checks.
|
||||
*
|
||||
* @stable ICU 4.6
|
||||
*/
|
||||
public static final int ALL_CHECKS = 0x7f;
|
||||
public static final int ALL_CHECKS = 0xFFFFFFFF;
|
||||
|
||||
// Magic number for sanity checking spoof binary resource data.
|
||||
static final int MAGIC = 0x3845fdef;
|
||||
@ -249,6 +265,7 @@ public class SpoofChecker {
|
||||
UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
|
||||
// for this Spoof Checker. Defaults to all chars.
|
||||
Set<ULocale> fAllowedLocales; // The list of allowed locales.
|
||||
private RestrictionLevel restrictionLevel;
|
||||
|
||||
/**
|
||||
* Constructor: Create a default Unicode Spoof Checker Builder, configured to perform all checks except for
|
||||
@ -263,6 +280,7 @@ public class SpoofChecker {
|
||||
fSpoofData = null;
|
||||
fAllowedCharsSet = new UnicodeSet(0, 0x10ffff);
|
||||
fAllowedLocales = new LinkedHashSet<ULocale>();
|
||||
restrictionLevel = RestrictionLevel.MINIMALLY_RESTRICTIVE;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -279,6 +297,7 @@ public class SpoofChecker {
|
||||
fAllowedCharsSet = src.fAllowedCharsSet.cloneAsThawed();
|
||||
fAllowedLocales = new LinkedHashSet<ULocale>();
|
||||
fAllowedLocales.addAll(src.fAllowedLocales);
|
||||
restrictionLevel = src.restrictionLevel;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -305,11 +324,12 @@ public class SpoofChecker {
|
||||
result.fAllowedCharsSet = (UnicodeSet) (this.fAllowedCharsSet.clone());
|
||||
result.fAllowedCharsSet.freeze();
|
||||
result.fAllowedLocales = this.fAllowedLocales;
|
||||
result.restrictionLevel = this.restrictionLevel;
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Specify the source form of the spoof data Spoof Checker. The Three inputs correspond to the Unicode data
|
||||
* Specify the source form of the spoof data Spoof Checker. The inputs correspond to the Unicode data
|
||||
* files confusables.txt and confusablesWholeScript.txt as described in Unicode UAX 39. The syntax of the source
|
||||
* data is as described in UAX 39 for these files, and the content of these files is acceptable input.
|
||||
*
|
||||
@ -447,6 +467,16 @@ public class SpoofChecker {
|
||||
fChecks |= CHAR_LIMIT;
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the loosest restriction level allowed.
|
||||
* @param restrictionLevel The loosest restriction level allowed.
|
||||
* @return self
|
||||
*/
|
||||
public Builder setRestrictionLevel(RestrictionLevel restrictionLevel) {
|
||||
this.restrictionLevel = restrictionLevel;
|
||||
return this;
|
||||
}
|
||||
|
||||
// Structure for the Whole Script Confusable Data
|
||||
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
|
||||
@ -1391,6 +1421,28 @@ public class SpoofChecker {
|
||||
// haven't done it yet.
|
||||
int scriptCount = -1;
|
||||
|
||||
// Allocate an identifier info if needed.
|
||||
// Note: we may want to allocate one per SpoofChecker and synchronize
|
||||
|
||||
IdentifierInfo identifierInfo = null;
|
||||
if (0 != ((this.fChecks) & (RESTRICTION_LEVEL | MIXED_NUMBERS))) {
|
||||
identifierInfo = new IdentifierInfo().setIdentifier(text);
|
||||
}
|
||||
|
||||
if (0 != ((this.fChecks) & RESTRICTION_LEVEL)) {
|
||||
RestrictionLevel textRestrictionLevel = identifierInfo.getRestrictionLevel();
|
||||
if (textRestrictionLevel.compareTo(restrictionLevel) > 0) {
|
||||
result |= RESTRICTION_LEVEL;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 != ((this.fChecks) & MIXED_NUMBERS)) {
|
||||
UnicodeSet numerics = identifierInfo.getNumerics();
|
||||
if (numerics.size() > 1) {
|
||||
result |= MIXED_NUMBERS;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 != ((this.fChecks) & SINGLE_SCRIPT)) {
|
||||
scriptCount = this.scriptScan(text, checkResult);
|
||||
// no need to set failPos, it will be set to checkResult.position inside this.scriptScan
|
||||
@ -1881,6 +1933,7 @@ public class SpoofChecker {
|
||||
private SpoofData fSpoofData;
|
||||
private Set<ULocale> fAllowedLocales; // The Set of allowed locales.
|
||||
private UnicodeSet fAllowedCharsSet; // The UnicodeSet of allowed characters.
|
||||
private RestrictionLevel restrictionLevel;
|
||||
|
||||
// for this Spoof Checker. Defaults to all chars.
|
||||
//
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2009-2011, International Business Machines Corporation and *
|
||||
* Copyright (C) 2009-2012, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
@ -10,7 +10,11 @@ import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.text.ParseException;
|
||||
import java.util.Arrays;
|
||||
import java.util.BitSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
@ -19,8 +23,12 @@ import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.dev.test.TestUtil;
|
||||
import com.ibm.icu.dev.test.TestUtil.JavaVendor;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import com.ibm.icu.lang.UScript;
|
||||
import com.ibm.icu.text.IdentifierInfo;
|
||||
import com.ibm.icu.text.IdentifierInfo.RestrictionLevel;
|
||||
import com.ibm.icu.text.Normalizer2;
|
||||
import com.ibm.icu.text.SpoofChecker;
|
||||
import com.ibm.icu.text.SpoofChecker.CheckResult;
|
||||
import com.ibm.icu.text.UnicodeSet;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
|
||||
@ -185,7 +193,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
* don't want to see in this test.
|
||||
*/
|
||||
sc = new SpoofChecker.Builder().setChecks(SpoofChecker.CHAR_LIMIT).setAllowedLocales(allowedLocales).build();
|
||||
|
||||
|
||||
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
|
||||
checkResults = sc.failsChecks(goodLatin);
|
||||
assertFalse("", checkResults);
|
||||
@ -254,7 +262,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
assertTrue("", checkResults);
|
||||
assertEquals("", SpoofChecker.MIXED_SCRIPT_CONFUSABLE | SpoofChecker.SINGLE_SCRIPT, result.checks);
|
||||
assertEquals("", 2, result.position);
|
||||
|
||||
|
||||
result.position = 666;
|
||||
checkResults = sc.failsChecks(han_Hiragana, result);
|
||||
assertFalse("", checkResults);
|
||||
@ -294,7 +302,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
public void TestSpoofAPI() {
|
||||
SpoofChecker sc = new SpoofChecker.Builder().build();
|
||||
String s = "xyz"; // Many latin ranges are whole-script confusable with other scripts.
|
||||
// If this test starts failing, consult confusablesWholeScript.txt
|
||||
// If this test starts failing, consult confusablesWholeScript.txt
|
||||
SpoofChecker.CheckResult result = new SpoofChecker.CheckResult();
|
||||
result.position = 666;
|
||||
boolean checkResults = sc.failsChecks(s, result);
|
||||
@ -317,7 +325,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
SpoofChecker sc = new SpoofChecker.Builder().build();
|
||||
checkSkeleton(sc, "TestSkeleton");
|
||||
}
|
||||
|
||||
|
||||
// testSkeleton. Spot check a number of confusable skeleton substitutions from the
|
||||
// Unicode data file confusables.txt
|
||||
// Test cases chosen for substitutions of various lengths, and
|
||||
@ -337,11 +345,11 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
|
||||
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
|
||||
+ " A 1ong 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
|
||||
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
|
||||
testName);
|
||||
" A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations."
|
||||
+ " A long 'identifier' that vvill overflovv irnplernentation stack buffers, forcing heap allocations.",
|
||||
testName);
|
||||
|
||||
checkSkeleton(sc, SL, "nochange", "nochange", testName);
|
||||
checkSkeleton(sc, MA, "love", "love", testName);
|
||||
@ -428,6 +436,100 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
assertEquals("", 7, result.position);
|
||||
}
|
||||
|
||||
public void TestRestrictionLevel() {
|
||||
Object[][] tests = {
|
||||
{"a", RestrictionLevel.ASCII},
|
||||
{"γ", RestrictionLevel.HIGHLY_RESTRICTIVE},
|
||||
{"aアー", RestrictionLevel.HIGHLY_RESTRICTIVE},
|
||||
{"aऄ", RestrictionLevel.MODERATELY_RESTRICTIVE},
|
||||
{"aγ", RestrictionLevel.MINIMALLY_RESTRICTIVE},
|
||||
};
|
||||
IdentifierInfo idInfo = new IdentifierInfo();
|
||||
CheckResult checkResult = new CheckResult();
|
||||
for (Object[] test : tests) {
|
||||
String testString = (String) test[0];
|
||||
RestrictionLevel expectedLevel = (RestrictionLevel) test[1];
|
||||
idInfo.setIdentifier(testString);
|
||||
assertEquals("Testing restriction level for '" + testString + "'", expectedLevel, idInfo.getRestrictionLevel());
|
||||
for (RestrictionLevel testLevel : RestrictionLevel.values()) {
|
||||
SpoofChecker sc = new SpoofChecker.Builder()
|
||||
.setChecks(SpoofChecker.RESTRICTION_LEVEL) // only check this
|
||||
.setRestrictionLevel(testLevel)
|
||||
.build();
|
||||
boolean actualValue = sc.failsChecks(testString, checkResult);
|
||||
|
||||
// we want to fail if the text is (say) MODERATE and the testLevel is ASCII
|
||||
boolean expectedFailure = expectedLevel.compareTo(testLevel) > 0;
|
||||
boolean t = assertEquals("Testing spoof restriction level for '" + testString + "', " + testLevel, expectedFailure, actualValue);
|
||||
// if (!t) { // debugging
|
||||
// actualValue = sc.failsChecks(testString, checkResult);
|
||||
// // we want to fail if the text is (say) MODERATE and the testLevel is ASCII
|
||||
// expectedFailure = expectedLevel.compareTo(testLevel) > 0;
|
||||
// }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void TestMixedNumbers() {
|
||||
Object[][] tests = {
|
||||
{"1", "[0]"},
|
||||
{"१", "[०]"},
|
||||
{"1१", "[0०]"},
|
||||
{"١۱", "[٠۰]"},
|
||||
};
|
||||
IdentifierInfo idInfo = new IdentifierInfo();
|
||||
CheckResult checkResult = new CheckResult();
|
||||
for (Object[] test : tests) {
|
||||
String testString = (String) test[0];
|
||||
UnicodeSet expected = new UnicodeSet((String)test[1]);
|
||||
idInfo.setIdentifier(testString);
|
||||
assertEquals("", expected, idInfo.getNumerics());
|
||||
|
||||
SpoofChecker sc = new SpoofChecker.Builder()
|
||||
.setChecks(SpoofChecker.MIXED_NUMBERS) // only check this
|
||||
.build();
|
||||
boolean actualValue = sc.failsChecks(testString, checkResult);
|
||||
boolean t = assertEquals("Testing spoof mixed numbers for '" + testString + "', ", expected.size() > 1, actualValue);
|
||||
}
|
||||
}
|
||||
|
||||
public void TestIdentifierInfo() {
|
||||
// contains(BitSet, BitSet)
|
||||
BitSet bitset12 = IdentifierInfo.set(new BitSet(), UScript.LATIN, UScript.HANGUL);
|
||||
BitSet bitset2 = IdentifierInfo.set(new BitSet(), UScript.HANGUL);
|
||||
assertTrue("", IdentifierInfo.contains(bitset12, bitset2));
|
||||
assertTrue("", IdentifierInfo.contains(bitset12, bitset12));
|
||||
assertTrue("", !IdentifierInfo.contains(bitset2, bitset12));
|
||||
|
||||
// displayAlternates(Collection<BitSet>)
|
||||
// displayScripts(BitSet)
|
||||
String scriptString = IdentifierInfo.displayScripts(bitset12);
|
||||
assertEquals("", "Hang Latn", scriptString);
|
||||
Set<BitSet> alternates = new HashSet(Arrays.asList(bitset12, bitset2));
|
||||
String alternatesString = IdentifierInfo.displayAlternates(alternates);
|
||||
assertEquals("", "Hang Latn; Hang", alternatesString);
|
||||
|
||||
// parseAlternates(String)
|
||||
// parseScripts(String)
|
||||
assertEquals("", bitset12, IdentifierInfo.parseScripts(scriptString));
|
||||
assertEquals("", alternates, IdentifierInfo.parseAlternates(alternatesString));
|
||||
|
||||
IdentifierInfo idInfo = new IdentifierInfo();
|
||||
String manyAlternates = "aアー〼1१١۱";
|
||||
idInfo.setIdentifier(manyAlternates);
|
||||
assertEquals("", manyAlternates, idInfo.getIdentifier());
|
||||
|
||||
assertEquals("", null, idInfo.getScripts());
|
||||
assertEquals("", null, idInfo.getAlternates());
|
||||
assertEquals("", null, idInfo.getCommonAmongAlternates());
|
||||
assertEquals("", null, idInfo.getNumerics());
|
||||
assertEquals("", null, idInfo.getRestrictionLevel());
|
||||
|
||||
// TODO
|
||||
// getIdentifierProfile()
|
||||
// setIdentifierProfile(UnicodeSet)
|
||||
}
|
||||
|
||||
private String parseHex(String in) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (String oneCharAsHexString : in.split("\\s+")) {
|
||||
@ -483,7 +585,7 @@ public class SpoofCheckerTest extends TestFmwk {
|
||||
Matcher parseLine = Pattern.compile(
|
||||
"\\ufeff?" + "(?:([0-9A-F\\s]+);([0-9A-F\\s]+);\\s*(SL|ML|SA|MA)\\s*(?:#.*?)?$)"
|
||||
+ "|\\ufeff?(\\s*(?:#.*)?)"). // Comment line
|
||||
matcher("");
|
||||
matcher("");
|
||||
Normalizer2 normalizer = Normalizer2.getNFDInstance();
|
||||
int lineNum = 0;
|
||||
String inputLine;
|
||||
|
Loading…
Reference in New Issue
Block a user