ICU-12549 Revisions to uspoof.h documentation. Actually removing identifier_info.h and other obsolete files from r39218.

X-SVN-Rev: 39297
2016-09-20 21:06:55 +00:00 · 2016-09-20 21:06:55 +00:00 · d5d266654b
commit d5d266654b
parent 3a8a02cae1
8 changed files with 73 additions and 1083 deletions
--- a/icu4c/source/i18n/identifier_info.cpp
+++ b/icu4c/source/i18n/identifier_info.cpp
@ -1,313 +0,0 @@
 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 **********************************************************************
 *   Copyright (C) 2012-2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
 #include "unicode/utypes.h"
 #include "unicode/uchar.h"
 #include "unicode/utf16.h"
 #include "identifier_info.h"
 #include "mutex.h"
 #include "scriptset.h"
 #include "ucln_in.h"
 #include "uvector.h"
 U_NAMESPACE_BEGIN
 static UnicodeSet *ASCII;
 static ScriptSet *JAPANESE;
 static ScriptSet *CHINESE;
 static ScriptSet *KOREAN;
 static ScriptSet *CONFUSABLE_WITH_LATIN;
 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
 U_CDECL_BEGIN
 static UBool U_CALLCONV
 IdentifierInfo_cleanup(void) {
    // Cleanup hook (registered by IdentifierInfo_init): destroy every
    // file-level set, clear the pointers, and reset the init-once guard.
    delete ASCII;
    delete JAPANESE;
    delete CHINESE;
    delete KOREAN;
    delete CONFUSABLE_WITH_LATIN;
    ASCII = NULL;
    JAPANESE = CHINESE = KOREAN = CONFUSABLE_WITH_LATIN = NULL;
    gIdentifierInfoInitOnce.reset();
    return TRUE;
 }
 /**
  * One-time initializer for the file-level constant sets (ASCII, JAPANESE,
  * CHINESE, KOREAN, CONFUSABLE_WITH_LATIN). Run through umtx_initOnce() by the
  * IdentifierInfo constructor.
  *
  * @param status Set to U_MEMORY_ALLOCATION_ERROR if any set cannot be
  *               allocated; also passed to every ScriptSet::set() call.
  */
 static void U_CALLCONV
 IdentifierInfo_init(UErrorCode &status) {
    // Register the cleanup hook before anything can fail. If one allocation
    // below fails, the sets that were successfully allocated are then still
    // released by IdentifierInfo_cleanup() instead of being leaked.
    ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
    ASCII    = new UnicodeSet(0, 0x7f);
    JAPANESE = new ScriptSet();
    CHINESE  = new ScriptSet();
    KOREAN   = new ScriptSet();
    CONFUSABLE_WITH_LATIN = new ScriptSet();
    if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 
            || CONFUSABLE_WITH_LATIN == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ASCII->freeze();
    // Script combinations tested for USPOOF_HIGHLY_RESTRICTIVE in getRestrictionLevel().
    JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
             .set(USCRIPT_KATAKANA, status);
    CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
    KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
    // Scripts whose presence rules out USPOOF_MODERATELY_RESTRICTIVE.
    CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
              .set(USCRIPT_CHEROKEE, status);
 }
 U_CDECL_END
 /**
  * Construct an IdentifierInfo with an empty identifier and an identifier
  * profile covering U+0000..U+10FFFF.
  *
  * @param status Receives any error from the one-time static initialization
  *               or from uhash_open(); set to U_MEMORY_ALLOCATION_ERROR if a
  *               member cannot be allocated. On failure the object is only
  *               safe to destroy.
  */
 IdentifierInfo::IdentifierInfo(UErrorCode &status):
         fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 
         fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
    umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
    if (U_FAILURE(status)) {
        return;
    }
    fIdentifier = new UnicodeString();
    fRequiredScripts = new ScriptSet();
    fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
    if (fScriptSetSet != NULL) {
        // The table owns its ScriptSet keys. Skip this when uhash_open() failed:
        // a NULL table must not be handed to uhash_setKeyDeleter().
        uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
    }
    fCommonAmongAlternates = new ScriptSet();
    fNumerics = new UnicodeSet();
    fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
    // Do not overwrite an error already reported by uhash_open().
    if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
                              fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
 }
 IdentifierInfo::~IdentifierInfo() {
    // Release everything the constructor allocated. Closing the hash table
    // also disposes of its keys through the key deleter installed there.
    uhash_close(fScriptSetSet);
    delete fIdentifierProfile;
    delete fNumerics;
    delete fCommonAmongAlternates;
    delete fRequiredScripts;
    delete fIdentifier;
 }
 IdentifierInfo &IdentifierInfo::clear() {
    // Drop all per-identifier analysis results. The stored identifier and
    // the identifier profile are left untouched.
    uhash_removeAll(fScriptSetSet);
    fRequiredScripts->resetAll();
    fCommonAmongAlternates->resetAll();
    fNumerics->clear();
    return *this;
 }
 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    // Copy-assign the caller's set into the profile set owned by this object.
    UnicodeSet &ownedProfile = *fIdentifierProfile;
    ownedProfile = identifierProfile;
    return *this;
 }
 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    // Read-only view of the profile set owned by this object.
    const UnicodeSet &profile = *fIdentifierProfile;
    return profile;
 }
 /**
  * Store a copy of the identifier and analyze its scripts and decimal digits.
  *
  * For each code point, the script extensions (minus COMMON and INHERITED)
  * are examined: a single script is added to fRequiredScripts; a set of two
  * or more scripts is recorded in fScriptSetSet as an "alternate", unless it
  * already intersects the required scripts or is already recorded. A final
  * pass prunes alternates and computes fCommonAmongAlternates.
  *
  * @param identifier the identifier to analyze
  * @param status     Error code. Nothing is done if it is already a failure.
  *                   Set to U_MEMORY_ALLOCATION_ERROR if an alternate set
  *                   cannot be allocated; also receives errors from
  *                   uscript_getScriptExtensions(), ScriptSet and uhash calls.
  *                   On failure the analysis results are incomplete.
  * @return *this
  */
 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    *fIdentifier = identifier;
    clear();
    ScriptSet scriptsForCP;
    UChar32 cp;
    // Step by code point: advance by the number of UTF-16 units cp occupies.
    for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
        cp = identifier.char32At(i);
        // Store a representative character for each kind of decimal digit
        if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
            // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
            fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
        }
        UScriptCode extensions[500];
        int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
        if (U_FAILURE(status)) {
            return *this;
        }
        scriptsForCP.resetAll();
        for (int32_t j=0; j<extensionsCount; j++) {
            scriptsForCP.set(extensions[j], status);
        }
        // COMMON and INHERITED never count as scripts of the identifier.
        scriptsForCP.reset(USCRIPT_COMMON, status);
        scriptsForCP.reset(USCRIPT_INHERITED, status);
        switch (scriptsForCP.countMembers()) {
          case 0: break;
          case 1:
            // Single script, record it.
            fRequiredScripts->Union(scriptsForCP);
            break;
          default:
            if (!fRequiredScripts->intersects(scriptsForCP) 
                    && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
                // If the set hasn't been added already, add it
                //    (Add a copy, fScriptSetSet takes ownership of the copy.)
                ScriptSet *alternateCopy = new ScriptSet(scriptsForCP);
                if (alternateCopy == NULL) {
                    // Never hand a NULL key to the hash table; report the failure
                    // the same way the other allocations in this file do.
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return *this;
                }
                uhash_puti(fScriptSetSet, alternateCopy, 1, &status);
            }
            break;
        }
    }
    // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    // [Kana], [Kana Hira] => [Kana]
    // This is relatively infrequent, so doesn't have to be optimized.
    // We also compute any commonalities among the alternates.
    if (uhash_count(fScriptSetSet) > 0) {
        fCommonAmongAlternates->setAll();
        for (int32_t it = UHASH_FIRST;;) {
            const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
            if (nextHashEl == NULL) {
                break;
            }
            ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
            // [Kana], [Kana Hira] => [Kana]
            if (fRequiredScripts->intersects(*next)) {
                uhash_removeElement(fScriptSetSet, nextHashEl);
            } else {
                fCommonAmongAlternates->intersect(*next);
                // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
                for (int32_t otherIt = UHASH_FIRST;;) {
                    const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
                    if (otherHashEl == NULL) {
                        break;
                    }
                    ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
                    if (next != other && next->contains(*other)) {
                        // next is a superset of another alternate: drop next, and stop
                        // scanning because next must not be used after its removal.
                        uhash_removeElement(fScriptSetSet, nextHashEl);
                        break;
                    }
                }
            }
        }
    }
    // With no alternates left there is nothing in common among them.
    if (uhash_count(fScriptSetSet) == 0) {
        fCommonAmongAlternates->resetAll();
    }
    return *this;
 }
 const UnicodeString *IdentifierInfo::getIdentifier() const {
    // Borrowed pointer to this object's copy of the identifier.
    const UnicodeString *identifier = fIdentifier;
    return identifier;
 }
 const ScriptSet *IdentifierInfo::getScripts() const {
    // Borrowed pointer to the scripts recorded from single-script characters.
    const ScriptSet *requiredScripts = fRequiredScripts;
    return requiredScripts;
 }
 const UHashtable *IdentifierInfo::getAlternates() const {
    // Borrowed pointer to the table whose keys are the alternate ScriptSets.
    const UHashtable *alternates = fScriptSetSet;
    return alternates;
 }
 const UnicodeSet *IdentifierInfo::getNumerics() const {
    // Borrowed pointer to the set of representative zero digits.
    const UnicodeSet *numerics = fNumerics;
    return numerics;
 }
 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    // Borrowed pointer to the scripts shared by every remaining alternate.
    const ScriptSet *common = fCommonAmongAlternates;
    return common;
 }
 #if !UCONFIG_NO_NORMALIZATION
 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    // Characters outside the profile, or digits from more than one decimal
    // system, rule out every restriction level.
    if (!fIdentifierProfile->containsAll(*fIdentifier)) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (fNumerics->size() > 1) {
        return USPOOF_UNRESTRICTIVE;
    }
    if (ASCII->containsAll(*fIdentifier)) {
        return USPOOF_ASCII;
    }
    // Script count: the required scripts, plus 1 if the alternates share a
    // script (eg [Arab Thaa]; [Arab Syrc]), plus the number of alternates
    // otherwise. Only values up to 2 are distinguished below, which is why
    // this approximation is sufficient. COMMON and INHERITED were already
    // dropped by setIdentifier().
    const int32_t cardinalityPlus = getScriptCount();
    if (cardinalityPlus < 2) {
        return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    }
    ScriptSet &required = *fRequiredScripts;
    // Covered by one of the Japanese / Chinese / Korean combinations.
    if (containsWithAlternates(*JAPANESE, required)
            || containsWithAlternates(*CHINESE, required)
            || containsWithAlternates(*KOREAN, required)) {
        return USPOOF_HIGHLY_RESTRICTIVE;
    }
    // Latin plus one other script that is not Cyrillic, Greek or Cherokee.
    if (cardinalityPlus == 2
            && required.test(USCRIPT_LATIN, status)
            && !required.intersects(*CONFUSABLE_WITH_LATIN)) {
        return USPOOF_MODERATELY_RESTRICTIVE;
    }
    return USPOOF_MINIMALLY_RESTRICTIVE;
 }
 #endif /* !UCONFIG_NO_NORMALIZATION */
 int32_t IdentifierInfo::getScriptCount() const {
    // fRequiredScripts never holds Common or Inherited; setIdentifier() strips them.
    // Alternates count once in total when they share a script, else once each.
    int32_t alternatesContribution;
    if (fCommonAmongAlternates->countMembers() == 0) {
        alternatesContribution = uhash_count(fScriptSetSet);
    } else {
        alternatesContribution = 1;
    }
    return fRequiredScripts->countMembers() + alternatesContribution;
 }
 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    // True when container holds all of containee and also shares at least
    // one script with every alternate set recorded in fScriptSetSet.
    if (!container.contains(containee)) {
        return FALSE;
    }
    int32_t pos = UHASH_FIRST;
    for (const UHashElement *entry = uhash_nextElement(fScriptSetSet, &pos);
            entry != NULL;
            entry = uhash_nextElement(fScriptSetSet, &pos)) {
        ScriptSet *alternatives = static_cast<ScriptSet *>(entry->key.pointer);
        if (!container.intersects(*alternatives)) {
            return FALSE;
        }
    }
    return TRUE;
 }
 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    // Gather the ScriptSet keys, sort them, then write them into dest
    // separated by "; ". The vector only borrows the keys.
    UVector sorted(status);
    if (U_FAILURE(status)) {
        return dest;
    }
    int32_t pos = UHASH_FIRST;
    const UHashElement *el;
    while ((el = uhash_nextElement(alternates, &pos)) != NULL) {
        sorted.addElement(static_cast<ScriptSet *>(el->key.pointer), status);
    }
    sorted.sort(uhash_compareScriptSet, status);
    const UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    const int32_t count = sorted.size();
    for (int32_t i = 0; i < count; ++i) {
        if (i != 0) {
            dest.append(separator);
        }
        static_cast<ScriptSet *>(sorted.elementAt(i))->displayScripts(dest);
    }
    return dest;
 }
 U_NAMESPACE_END
--- a/icu4c/source/i18n/identifier_info.h
+++ b/icu4c/source/i18n/identifier_info.h
@ -1,192 +0,0 @@
 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 **********************************************************************
 *   Copyright (C) 2014, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 *
 * identifier_info.h
 * 
 * created on: 2013 Jan 7
 * created by: Andy Heninger
 */
 #ifndef __IDENTIFIER_INFO_H__
 #define __IDENTIFIER_INFO_H__
 #include "unicode/utypes.h"
 #include "unicode/uniset.h"
 #include "unicode/uspoof.h"
 #include "uhash.h"
 U_NAMESPACE_BEGIN
 class ScriptSet;
 // TODO(andy): review consistency of reference vs pointer arguments to the functions.
 /**
 * This class analyzes a possible identifier for script and identifier status. Use it by calling setIdentifierProfile
 * then setIdentifier. Available methods include:
 * <ol>
 * <li>call getScripts for the specific scripts in the identifier. The identifier contains at least one character in
 * each of these.
 * <li>call getAlternates to get cases where a character is not limited to a single script. For example, it could be
 * either Katakana or Hiragana.
 * <li>call getCommonAmongAlternates to find out if any scripts are common to all the alternates.
 * <li>call getNumerics to get a representative character (with value zero) for each of the decimal number systems in
 * the identifier.
 * <li>call getRestrictionLevel to see what the UTS36 restriction level is.
 * </ol>
 * 
 * This is a port from ICU4J of class com.ibm.icu.text.IdentifierInfo
 */
 class U_I18N_API IdentifierInfo : public UMemory {
  public:
    /**
     * Create an identifier info object. Subsequently, call setIdentifier(), etc.
     *
     * @param status Error code, set if initialization or an allocation fails.
     * @internal
     */
    IdentifierInfo(UErrorCode &status);
    /**
      * Destructor. Releases the stored identifier, the script sets, the
      * alternates table, the numerics set and the identifier profile.
      */
    virtual ~IdentifierInfo();
  private:
    /* Disallow copying for now. Can be added if there's a need. */
    IdentifierInfo(const IdentifierInfo &other);
  public:
    /**
     * Set the identifier profile: the characters that are to be allowed in the identifier.
     * The set is copied.
     *
     * @param identifierProfile the characters that are to be allowed in the identifier
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifierProfile(const UnicodeSet &identifierProfile);
    /**
     * Get the identifier profile: the characters that are to be allowed in the identifier.
     *
     * @return The characters that are to be allowed in the identifier.
     * @internal
     */
    const UnicodeSet &getIdentifierProfile() const;
    /**
     * Set an identifier to analyze. Afterwards, call methods like getScripts().
     * Results of any previous analysis are discarded.
     *
     * @param identifier the identifier to analyze
     * @param status Error code, set if errors occur.
     * @return this
     * @internal
     */
    IdentifierInfo &setIdentifier(const UnicodeString &identifier, UErrorCode &status);
    /**
     * Get the identifier that was analyzed. The returned string is owned by this
     * IdentifierInfo object, and must not be deleted by the caller.
     *
     * @return the identifier that was analyzed.
     * @internal
     */
    const UnicodeString *getIdentifier() const;
    /**
     * Get the scripts found in the identifier.
     *
     * @return the set of explicit scripts.
     * @internal
     */
    const ScriptSet *getScripts() const;
    /**
     * Get the set of alternate scripts found in the identifier. That is, when a character can be in two scripts, then
     * the set consisting of those scripts will be returned.
     *
     * @return a uhash, with each key being of type (ScriptSet *).
     *         This is a set, not a map, so the value stored in the uhash is not relevant.
     *         (It is, in fact, 1).
     *         Ownership of the uhash and its contents remains with the IdentifierInfo object;
     *         the result remains valid until a new identifier is set or until the object is deleted.
     * @internal
     */
    const UHashtable *getAlternates() const;
    /**
     * Get the representative characters (zeros) for the numerics found in the identifier.
     *
     * @return the set of zero digits, one for each decimal number system
     *         that appears in the identifier.
     * @internal
     */
    const UnicodeSet *getNumerics() const;
    /**
     * Find out which scripts are in common among the alternates.
     *
     * @return the set of scripts that are in common among the alternates.
     * @internal
     */
    const ScriptSet *getCommonAmongAlternates() const;
    /**
      * Get the number of scripts appearing in the identifier.
      *   Note: Common and Inherited scripts are omitted from the count.
      *   Note: Result may be high when the identifier contains characters
      *         with alternate scripts. The distinction between
      *         0, 1 and > 1 will remain valid, however.
      * @return the number of scripts.
      */
    int32_t getScriptCount() const;
 #if !UCONFIG_NO_NORMALIZATION
    /**
     * Find the "tightest" restriction level that the identifier satisfies.
     *
     * @param status Error code, passed through to the script-set test used internally.
     * @return the restriction level.
     * @internal
     */
    URestrictionLevel getRestrictionLevel(UErrorCode &status) const;
 #endif /*!UCONFIG_NO_NORMALIZATION */
    // NOTE(review): no definition of toString() appears in identifier_info.cpp; confirm it is unused.
    UnicodeString toString() const;
    /**
     * Produce a readable string of alternates: the script sets in sorted
     * order, separated by "; ".
     *
     * @param dest the string that receives the display form.
     * @param alternates a UHashtable of UScriptSets.
     *        Keys only, no meaningful values in the UHash.
     * @param status Error code, set if errors occur.
     * @return display form (a reference to dest)
     * @internal
     */
    static UnicodeString &displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status);
  private:
    // Reset the per-identifier analysis results (scripts, alternates, numerics).
    IdentifierInfo  & clear();
    // TRUE if container holds all of containee and intersects every alternate set.
    UBool             containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const;
    UnicodeString     *fIdentifier;             // Copy of the identifier being analyzed.
    ScriptSet         *fRequiredScripts;        // Scripts of the single-script characters.
    UHashtable        *fScriptSetSet;           // Alternates; keys are owned (ScriptSet *).
    ScriptSet         *fCommonAmongAlternates;  // Scripts common to all alternates.
    UnicodeSet        *fNumerics;               // Zero digit of each decimal system seen.
    UnicodeSet        *fIdentifierProfile;      // Characters allowed in the identifier.
 };
 U_NAMESPACE_END
 #endif // __IDENTIFIER_INFO_H__
--- a/icu4c/source/i18n/unicode/uspoof.h
+++ b/icu4c/source/i18n/unicode/uspoof.h
@ -42,10 +42,10 @@
 * <a href="http://unicode.org/reports/tr39">Unicode Technical Standard #39</a>, has two main functions:
 *
 * <ol>
- * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "desordenado" and
+ * <li>Checking whether two strings are visually <em>confusable</em> with each other, such as "Harvest" and
- * "ԁеѕогԁепаԁо".</li>
+ * &quot;&Eta;arvest&quot;, where the second string starts with the Greek capital letter Eta.</li>
 * <li>Checking whether an individual string is likely to be an attempt at confusing the reader (<em>spoof
- * detection</em>), such as "pаypаl" spelled with Cyrillic 'а' characters.</li>
+ * detection</em>), such as "paypal" with some Latin characters substituted with Cyrillic look-alikes.</li>
 * </ol>
 *
 * <p>
@ -63,19 +63,25 @@
 *
 * \code{.c}
 * UErrorCode status = U_ZERO_ERROR;
 * UChar* str1 = (UChar*) u"Harvest";
 * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
 *
 * USpoofChecker* sc = uspoof_open(&status);
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * int32_t bitmask = uspoof_areConfusable(sc, (UChar*) u"desordenado", -1, (UChar*) u"ԁеѕогԁепаԁо", -1, &status);
+ *
- * UBool result = (bitmask & USPOOF_ALL_CHECKS) != 0;
+ * int32_t bitmask = uspoof_areConfusable(sc, str1, -1, str2, -1, &status);
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status));  // areConfusable: 1 (success: 1)
+ * UBool result = bitmask != 0;
 * // areConfusable: 1 (status: U_ZERO_ERROR)
 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
 * uspoof_close(sc);
 * \endcode
 *
 * <p>
- * The second line of the example creates a <code>USpoofChecker</code> object; the third line enables confusable
+ * The call to {@link uspoof_open} creates a <code>USpoofChecker</code> object; the call to {@link uspoof_setChecks}
- * checking and disables all other checks; the fourth line performs the confusability test; and the fifth line extracts
+ * enables confusable checking and disables all other checks; the call to {@link uspoof_areConfusable} performs the
- * the result out of the confusability test. For best performance, the instance should be created once (e.g., upon
+ * confusability test; and the following line extracts the result out of the return value. For best performance,
- * application startup), and the efficient {@link uspoof_areConfusable} method can be used at runtime.
+ * the instance should be created once (e.g., upon application startup), and the efficient
 * {@link uspoof_areConfusable} method can be used at runtime.
 *
 * <p>
 * The type {@link LocalUSpoofCheckerPointer} is exposed for C++ programmers.  It will automatically call
@ -95,27 +101,28 @@
 *
 * \code{.c}
 * UErrorCode status = U_ZERO_ERROR;
- * UChar* str1 = (UChar*) u"desordenado";
+ * UChar* str1 = (UChar*) u"Harvest";
- * UChar* str2 = (UChar*) u"ԁеѕогԁепаԁо";
+ * UChar* str2 = (UChar*) u"\u0397arvest";  // with U+0397 GREEK CAPITAL LETTER ETA
 *
 * USpoofChecker* sc = uspoof_open(&status);
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
 *
 * // Get skeleton 1
 * int32_t skel1Len = uspoof_getSkeleton(sc, 0, str1, -1, NULL, 0, &status);
- * UChar* skel1 = (UChar*) malloc(skel1Len * sizeof(UChar));
+ * UChar* skel1 = (UChar*) malloc(++skel1Len * sizeof(UChar));
 * status = U_ZERO_ERROR;
 * uspoof_getSkeleton(sc, 0, str1, -1, skel1, skel1Len, &status);
 *
 * // Get skeleton 2
 * int32_t skel2Len = uspoof_getSkeleton(sc, 0, str2, -1, NULL, 0, &status);
- * UChar* skel2 = (UChar*) malloc(skel2Len * sizeof(UChar));
+ * UChar* skel2 = (UChar*) malloc(++skel2Len * sizeof(UChar));
 * status = U_ZERO_ERROR;
 * uspoof_getSkeleton(sc, 0, str2, -1, skel2, skel2Len, &status);
 *
 * // Are the skeletons the same?
- * UBool result = (skel1Len == skel2Len) && memcmp(skel1, skel2, skel1Len) == 0;
+ * UBool result = u_strCompare(skel1, -1, skel2, -1, FALSE) == 0;
- * printf("areConfusable: %d (success: %d)\n", result, U_SUCCESS(status));  // areConfusable: 1 (success: 1)
+ * // areConfusable: 1 (status: U_ZERO_ERROR)
 * printf("areConfusable: %d (status: %s)\n", result, u_errorName(status));
 * uspoof_close(sc);
 * free(skel1);
 * free(skel2);
@ -126,21 +133,19 @@
 * {uspoof_areConfusable} many times in a loop, {uspoof_getSkeleton} can be used instead, as shown below:
 *
 * \code{.c}
 * // Setup:
 * UErrorCode status = U_ZERO_ERROR;
- * UChar* dictionary[2] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
+ * #define DICTIONARY_LENGTH 2
- * UChar* skeletons[sizeof(dictionary)/sizeof(UChar*)];
+ * UChar* dictionary[DICTIONARY_LENGTH] = { (UChar*) u"lorem", (UChar*) u"ipsum" };
- * int32_t skeletonLengths[sizeof(dictionary)/sizeof(UChar*)];
+ * UChar* skeletons[DICTIONARY_LENGTH];
 * UChar* str = (UChar*) u"1orern";
 *
 * // Setup:
 * USpoofChecker* sc = uspoof_open(&status);
 * uspoof_setChecks(sc, USPOOF_CONFUSABLE, &status);
- * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
 *     UChar* word = dictionary[i];
 *     int32_t len = uspoof_getSkeleton(sc, 0, word, -1, NULL, 0, &status);
- *     skeletons[i] = (UChar*) malloc(len * sizeof(UChar));
+ *     skeletons[i] = (UChar*) malloc(++len * sizeof(UChar));
 *     skeletonLengths[i] = len;
 *     status = U_ZERO_ERROR;
 *     uspoof_getSkeleton(sc, 0, word, -1, skeletons[i], len, &status);
 * }
@ -148,22 +153,20 @@
 * // Live Check:
 * {
 *     int32_t len = uspoof_getSkeleton(sc, 0, str, -1, NULL, 0, &status);
- *     UChar* skel = (UChar*) malloc(len * sizeof(UChar));
+ *     UChar* skel = (UChar*) malloc(++len * sizeof(UChar));
 *     status = U_ZERO_ERROR;
 *     uspoof_getSkeleton(sc, 0, str, -1, skel, len, &status);
 *     UBool result = FALSE;
- *     for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
+ *     for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
- *         if (len == skeletonLengths[i] && memcmp(skel, skeletons[i], len) == 0) {
+ *         result = u_strCompare(skel, -1, skeletons[i], -1, FALSE) == 0;
- *             result = TRUE;
+ *         if (result == TRUE) { break; }
 *     }
- *     }
+ *     // Has confusable in dictionary: 1 (status: U_ZERO_ERROR)
- *     // Has confusable in dictionary: 1 (success: 1)
+ *     printf("Has confusable in dictionary: %d (status: %s)\n", result, u_errorName(status));
 *     printf("Has confusable in dictionary: %d (success: %d)\n", result, U_SUCCESS(status));
 *     free(skel);
 * }
 *
- * // Cleanup:
+ * for (size_t i=0; i<DICTIONARY_LENGTH; i++) {
 * for (size_t i=0; i<sizeof(dictionary)/sizeof(UChar*); i++) {
 *     free(skeletons[i]);
 * }
 * uspoof_close(sc);
@ -182,7 +185,7 @@
 *
 * \code{.c}
 * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl";  // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
 *
 * // Get the default set of allowable characters:
 * USet* allowed = uset_openEmpty();
@ -195,7 +198,8 @@
 *
 * int32_t bitmask = uspoof_check(sc, str, -1, NULL, &status);
 * UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status));  // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
 * uspoof_close(sc);
 * uset_close(allowed);
 * \endcode
@ -216,7 +220,7 @@
 *
 * \code{.c}
 * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"pаypаl";  // with Cyrillic 'а' characters
+ * UChar* str = (UChar*) u"p\u0430ypal";  // with U+0430 CYRILLIC SMALL LETTER A
 *
 * // Get the default set of allowable characters:
 * USet* allowed = uset_openEmpty();
@ -233,8 +237,8 @@
 * int32_t failures1 = bitmask;
 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult, &status);
 * assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
 *
 * // Cleanup:
 * uspoof_close(sc);
@ -247,7 +251,7 @@
 *
 * \code{.cpp}
 * UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl");  // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
 *
 * // Get the default set of allowable characters:
 * UnicodeSet allowed;
@ -264,8 +268,8 @@
 * int32_t failures1 = bitmask;
 * int32_t failures2 = uspoof_getCheckResultChecks(checkResult.getAlias(), &status);
 * assert(failures1 == failures2);
- * // checks that failed: 16 (success: 1)
+ * // checks that failed: 0x00000010 (status: U_ZERO_ERROR)
- * printf("checks that failed: %d (success: %d)\n", failures1, U_SUCCESS(status));
+ * printf("checks that failed: %#010x (status: %s)\n", failures1, u_errorName(status));
 *
 * // Explicit cleanup not necessary.
 * \endcode
@ -291,14 +295,15 @@
 *
 * \code{.c}
 * UErrorCode status = U_ZERO_ERROR;
- * UChar* str = (UChar*) u"৪8";
+ * UChar* str = (UChar*) u"8\u09EA";  // 8 mixed with U+09EA BENGALI DIGIT FOUR
 *
 * USpoofChecker* sc = uspoof_open(&status);
 * uspoof_setChecks(sc, USPOOF_INVISIBLE | USPOOF_MIXED_NUMBERS, &status);
 *
 * int32_t bitmask = uspoof_check2(sc, str, -1, NULL, &status);
 * UBool result = bitmask != 0;
- * printf("fails checks: %d (success: %d)\n", result, U_SUCCESS(status));  // fails checks: 1 (success: 1)
+ * // fails checks: 1 (status: U_ZERO_ERROR)
 * printf("fails checks: %d (status: %s)\n", result, u_errorName(status));
 * uspoof_close(sc);
 * \endcode
 *
@ -307,7 +312,7 @@
 *
 * \code{.cpp}
 * UErrorCode status = U_ZERO_ERROR;
- * UnicodeString str((UChar*) u"pаypаl");  // with Cyrillic 'а' characters
+ * UnicodeString str((UChar*) u"p\u0430ypal");  // with U+0430 CYRILLIC SMALL LETTER A
 *
 * // Get the default set of allowable characters:
 * UnicodeSet allowed;
@ -323,14 +328,14 @@
 * int32_t bitmask = uspoof_check2UnicodeString(sc.getAlias(), str, checkResult.getAlias(), &status);
 *
 * URestrictionLevel restrictionLevel = uspoof_getCheckResultRestrictionLevel(checkResult.getAlias(), &status);
- * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available via the bitmask:
+ * // Since USPOOF_AUX_INFO was enabled, the restriction level is also available in the upper bits of the bitmask:
 * assert((restrictionLevel & bitmask) == restrictionLevel);
- * // Restriction level: 1342177280 (success: 1)
+ * // Restriction level: 0x50000000 (status: U_ZERO_ERROR)
- * printf("Restriction level: %d (success: %d)\n", restrictionLevel, U_SUCCESS(status));
+ * printf("Restriction level: %#010x (status: %s)\n", restrictionLevel, u_errorName(status));
 * \endcode
 *
 * <p>
- * The code '1342177280' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
+ * The code '0x50000000' corresponds to the restriction level USPOOF_MINIMALLY_RESTRICTIVE.  Since
 * USPOOF_MINIMALLY_RESTRICTIVE is weaker than USPOOF_MODERATELY_RESTRICTIVE, the string fails the check.
 *
 * <p>
@ -351,13 +356,13 @@
 * A <code>USpoofChecker</code> instance may be used repeatedly to perform checks on any number of identifiers.
 *
 * <p>
- * <b>Thread Safety:</b> Thread Safety: The test functions for checking a single identifier, or for testing whether
+ * <b>Thread Safety:</b> The test functions for checking a single identifier, or for testing whether
 * two identifiers are possibly confusable, are thread safe. They may be called concurrently, from multiple threads,
 * using the same USpoofChecker instance.
 *
 * <p>
 * More generally, the standard ICU thread safety rules apply: functions that take a const USpoofChecker parameter are
- * thread safe. Those that take a non-const USpoofChecier are not thread safe..
+ * thread safe. Those that take a non-const USpoofChecker are not thread safe.
 *
 * @stable ICU 4.6
 */
@ -419,13 +424,9 @@ typedef enum USpoofChecks {
     * the checks to some subset of SINGLE_SCRIPT_CONFUSABLE, MIXED_SCRIPT_CONFUSABLE, or WHOLE_SCRIPT_CONFUSABLE to
     * make {@link uspoof_areConfusable} return only those types of confusables.
     *
     * <p>Note: if you wish to use {@link uspoof_getSkeleton}, it is required that you enable at least one of the
     * CONFUSABLE flags.
     *
     * @see uspoof_areConfusable
     * @see uspoof_getSkeleton
     * @draft ICU 58
     * @provisional This API might change or be removed in a future release.
     */
    USPOOF_CONFUSABLE               =   USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE,
@ -471,7 +472,7 @@ typedef enum USpoofChecks {
    USPOOF_INVISIBLE                =  32,
    /** Check that an identifier contains only characters from a specified set
-      * of acceptable characters.  See {@link uspoof_setAllowedChars}
+      * of acceptable characters.  See {@link uspoof_setAllowedChars} and
      * {@link uspoof_setAllowedLocales}.  Note that a string that fails this check
      * will also fail the {@link USPOOF_RESTRICTION_LEVEL} check.
      */
@ -754,6 +755,8 @@ uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
 * {@link USPOOF_HIGHLY_RESTRICTIVE}. Calling this method enables the {@link USPOOF_RESTRICTION_LEVEL} and
 * {@link USPOOF_MIXED_NUMBERS} checks, corresponding to Sections 5.1 and 5.2 of UTS 39. To customize which checks are
 * to be performed by {@link uspoof_check}, see {@link uspoof_setChecks}.
 *
 * @param sc       The USpoofChecker
 * @param restrictionLevel The loosest restriction level allowed.
 * @see URestrictionLevel
 * @stable ICU 51
@ -1059,6 +1062,8 @@ uspoof_checkUnicodeString(const USpoofChecker *sc,
 * @param sc      The USpoofChecker
 * @param id      The identifier to be checked for possible security issues,
 *                in UTF-16 format.
 * @param length  the length of the string to be checked, or -1 if the string is
 *                zero terminated.
 * @param checkResult  An instance of USpoofCheckResult to be filled with
 *                details about the identifier.  Can be NULL.
 * @param status  The error code, set if an error occurred while attempting to
@ -1259,7 +1264,7 @@ uspoof_getCheckResultNumerics(const USpoofCheckResult *checkResult, UErrorCode *
 *
 * <ul>
 *   <li>{@link USPOOF_SINGLE_SCRIPT_CONFUSABLE}</li>
- *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE</li>
+ *   <li>{@link USPOOF_MIXED_SCRIPT_CONFUSABLE}</li>
 *   <li>{@link USPOOF_WHOLE_SCRIPT_CONFUSABLE}</li>
 * </ul>
 *
--- a/icu4c/source/i18n/uspoof_impl.cpp
+++ b/icu4c/source/i18n/uspoof_impl.cpp
@ -62,13 +62,13 @@ void SpoofImpl::construct(UErrorCode& status) {
    if (U_FAILURE(status)) { return; }
    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    allowedCharsSet->freeze();
    fAllowedCharsSet = allowedCharsSet;
    fAllowedLocales  = uprv_strdup("");
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    allowedCharsSet->freeze();
 }
@ -85,10 +85,10 @@ SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status)  :
        fSpoofData = src.fSpoofData->addReference();
    }
    fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
-    if (fAllowedCharsSet == NULL) {
+    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    fAllowedLocales = uprv_strdup(src.fAllowedLocales);
    fRestrictionLevel = src.fRestrictionLevel;
 }
--- a/icu4c/source/i18n/uspoof_impl.h
+++ b/icu4c/source/i18n/uspoof_impl.h
@ -123,7 +123,7 @@ public:
    // Used to convert this CheckResult to the older int32_t return value API
    int32_t toCombinedBitmask(int32_t expectedChecks);
-    // Data Members (all stack-allocated)
+    // Data Members
    int32_t fMagic;                        // Internal sanity check.
    int32_t fChecks;                       // Bit vector of checks that were failed.
    UnicodeSet fNumerics;                  // Set of numerics found in the string.
--- a/icu4c/source/i18n/uspoof_wsconf.cpp
+++ b/icu4c/source/i18n/uspoof_wsconf.cpp
@ -1,438 +0,0 @@
 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
 *   Copyright (C) 2008-2013, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  uspoof_wsconf.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2009Jan05  (refactoring earlier files)
 *   created by: Andy Heninger
 *
 *   Internal functions for compiling Whole Script confusable source data
 *   into its binary (runtime) form.  The binary data format is described
 *   in uspoof_impl.h
 */
 #include "unicode/utypes.h"
 #include "unicode/uspoof.h"
 #if !UCONFIG_NO_NORMALIZATION
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
 #include "unicode/unorm.h"
 #include "unicode/uregex.h"
 #include "unicode/ustring.h"
 #include "cmemory.h"
 #include "scriptset.h"
 #include "uspoof_impl.h"
 #include "uhash.h"
 #include "uvector.h"
 #include "uassert.h"
 #include "uspoof_wsconf.h"
 U_NAMESPACE_USE
 // Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
 // Example Lines:
 //   006F          ; Latn; Deva; A #      (o)  LATIN SMALL LETTER O
 //   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
 //    |               |     |    |
 //    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
 //    |               |     |----------Target script.   We need this.
 //    |               |----------------Src script.  Should match the script of the source
 //    |                                code points.  Beyond checking that, we don't keep it.
 //    |--------------------------------Source code points or range.
 //
 // The expression will match _all_ lines, including erroneous lines.
 // The result of the parse is returned via the contents of the (match) groups:
 // exactly one of group 1 (blank/comment), groups 2-7 (data line) or group 8 (error)
 // participates in any given match.
 //
 // NOTE(review): the ".." range separator below is not escaped, so it matches any two
 //               characters rather than two literal dots - confirm whether that is intended.
 static const char *parseExp = 
        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //    Any line not matching the preceding
                                                       //    parts of the expression will match
                                                       //    this, and thus be flagged as an error.
 // Copy the text of one regular expression match group into a char * buffer.
 //    The group must contain only invariant characters (it is converted with US_INV).
 //    Used for script names.
 //
 //    destBuf is always set to an empty string first.  It is left empty when status
 //    is (or becomes) a failure, when uregex_group() reports -1, or when the group
 //    text would not fit in destCapacity together with its terminating NUL.
 //
 static void extractGroup(
    URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
    UChar groupText[50];
    groupText[0] = 0;
    destBuf[0] = 0;
    const int32_t groupLen = uregex_group(e, group, groupText, 50, &status);
    if (U_SUCCESS(status) && groupLen != -1 && groupLen < destCapacity) {
        // Aliasing constructor: wraps groupText without copying it.
        UnicodeString groupAlias(FALSE, groupText, groupLen);
        groupAlias.extract(0, groupLen, destBuf, destCapacity, US_INV);
    }
 }
 U_NAMESPACE_BEGIN
 //  Build the Whole Script Confusable data.
 //
 //  Parses the UTF-8 source text in confusablesWS (one record per line, in the syntax
 //  described at parseExp), builds the "any case" and "lower case" tries plus the table
 //  of script sets they index, and appends the serialized form of all three to
 //  spImpl->fSpoofData.
 //
 //  @param spImpl            Spoof checker whose fSpoofData receives the built data.
 //  @param confusablesWS     The source data, in UTF-8.
 //  @param confusablesWSLen  Length of confusablesWS, as passed on to u_strFromUTF8().
 //  @param pe                On failure, pe->line receives the number of input lines
 //                           consumed so far.  Ignored if NULL.
 //  @param status            ICU error code.  Nothing is done if it already indicates a
 //                           failure on entry.  Set to U_PARSE_ERROR for a line that matches
 //                           none of the valid forms, U_INVALID_FORMAT_ERROR for an unknown
 //                           script name or a code point whose script differs from the
 //                           declared source script, U_MEMORY_ALLOCATION_ERROR on
 //                           allocation failure.
 //
 //     TODO:  Reorganize.  Either get rid of the WSConfusableDataBuilder class,
 //                         because everything is local to this one build function anyhow,
 //                           OR
 //                         break this function into more reasonably sized pieces, with
 //                         state in WSConfusableDataBuilder.
 //
 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
 {
    if (U_FAILURE(status)) {
        return;
    }
    // Note: all function-scope locals are declared before the first "goto cleanup"
    //       so that no jump bypasses an initialization.
    URegularExpression *parseRegexp = NULL;
    int32_t             inputLen    = 0;
    UChar              *input       = NULL;
    int32_t             lineNum     = 0;
    UVector            *scriptSets        = NULL;
    uint32_t            rtScriptSetsCount = 2;
    UTrie2             *anyCaseTrie   = NULL;
    UTrie2             *lowerCaseTrie = NULL;
    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);
    UnicodeString pattern(parseExp, -1, US_INV);
    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0:  Code point has no whole script confusables.
    //   1:  Code point is of script Common or Inherited.
    //       These code points do not participate in whole script confusable detection.
    //       (This is logically equivalent to saying that they contain confusables in
    //        all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);
    // Convert the user input data from UTF-8 to UChar (UTF-16).
    // The first call only preflights the required length, so a buffer overflow
    // "error" is the expected outcome.
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
    parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status);
    // Zap any Byte Order Mark at the start of input.  Changing it to a space is benign
    //   given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }
    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        // Pick up the start and optional range end code points from the parsed line.
        UChar32  startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32  endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }
        // Extract the two script names from the source line.  We need these in an 8 bit
        //   default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        //   to the ICU u_getPropertyValueEnum() function.  Ugh.
        char  srcScriptName[20];
        char  targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript  =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }
        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }
        // Build the set of scripts containing confusable characters for
        //   the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        //   as the source script indicated in the input file.  Failure of this check is
        //   an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->sset = new ScriptSet();
                if (bsset->sset == NULL) {
                    // bsset is not yet in scriptSets, so cleanup would not free it.
                    delete bsset;
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                setIndex = scriptSets->size();
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->index = setIndex;
                bsset->rindex = 0;
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->set(targScript, status);
            bsset->sset->set(srcScript, status);
            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }
    // Eliminate duplicate script sets.  At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuilderScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                //   It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    // Share the outer set's ScriptSet; the inner wrapper no longer owns one.
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                }
                // But this doesn't get all.  We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }
    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
    //    (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    //     are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }
    // For code points with script==Common or script==Inherited,
    //   Set the reserved value of 1 into both Tries.  These characters do not participate
    //   in Whole Script Confusable detection; this reserved value is the means
    //   by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd   = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie,   rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }
    // Serialize the data to the Spoof Detector
    {
        // Preflight the serialized size; a buffer overflow "error" is the expected outcome.
        utrie2_freeze(anyCaseTrie,   UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        // NOTE(review): fSpoofData->fAnyCaseTrie / fLowerCaseTrie temporarily point at the
        //   builder tries, which are closed in cleanup.  If we bail out before they are
        //   re-opened from the serialized data below, these pointers are left dangling -
        //   confirm that callers discard fSpoofData on failure.
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);
        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);
        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets =  static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset;   // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }
    // Open new utrie2s from the serialized data.  We don't want to keep the ones
    //   we just built because we would then have two copies of the data, one internal to
    //   the utries that we have already constructed, and one in the serialized data area.
    //   An alternative would be to not pre-serialize the Trie data, but that makes the
    //   spoof detector data different, depending on how the detector was constructed.
    //   It's simpler to keep the data always the same.
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
            spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
            NULL,
            &status);
    // Each trie must be opened with its own serialized length.
    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
            UTRIE2_16_VALUE_BITS,
            (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
            spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
            NULL,
            &status);
 cleanup:
    // Report how far parsing got, but only if the caller supplied somewhere to put it.
    if (U_FAILURE(status) && pe != NULL) {
        pe->line = lineNum;
    }
    uregex_close(parseRegexp);
    uprv_free(input);
    int32_t i;
    if (scriptSets != NULL) {
        // Slots 0 and 1 hold NULL placeholders; deleting NULL is harmless.
        for (i=0; i<scriptSets->size(); i++) {
            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            delete bsset;
        }
        delete scriptSets;
    }
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
 }
 U_NAMESPACE_END
 // Create an empty builder entry: no code point (-1), no trie, no script set and
 // zero build-time / run-time indexes.  A new entry is marked as owning its sset.
 BuilderScriptSet::BuilderScriptSet()
    : codePoint(-1),
      trie(NULL),
      sset(NULL),
      index(0),
      rindex(0),
      scriptSetOwned(TRUE) {
 }
 // Free the underlying ScriptSet, but only while this entry is still its owner.
 BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
 }
 #endif //  !UCONFIG_NO_REGULAR_EXPRESSIONS
 #endif //  !UCONFIG_NO_NORMALIZATION
--- a/icu4c/source/i18n/uspoof_wsconf.h
+++ b/icu4c/source/i18n/uspoof_wsconf.h
@ -1,72 +0,0 @@
 // Copyright (C) 2016 and later: Unicode, Inc. and others.
 // License & terms of use: http://www.unicode.org/copyright.html
 /*
 ******************************************************************************
 *
 *   Copyright (C) 2008-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  uspoof_wsconf.h
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2009Jan19
 *   created by: Andy Heninger
 *
 *   Internal classes and functions
 *   for compiling whole script confusable data into its binary (runtime) form.
 */
 #ifndef __USPOOF_BUILDWSCONF_H__
 #define __USPOOF_BUILDWSCONF_H__
 #include "unicode/utypes.h"
 #if !UCONFIG_NO_NORMALIZATION
 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 
 #include "uspoof_impl.h"
 #include "utrie2.h"
 U_NAMESPACE_BEGIN
 //
 // class BuilderScriptSet.   Represents the set of scripts (Script Codes)
 //             containing characters that are confusable with one specific
 //             code point.
 //
 class BuilderScriptSet: public UMemory {
  public:
    UChar32      codePoint;       // The source code point.
    UTrie2      *trie;            // Any-case or Lower-case Trie.
                                  //   These Trie tables are the final result of the
                                  //   build.  This pointer indicates which of the two
                                  //   this set of data is for.  Not owned by this object.
    ScriptSet   *sset;            // The set of scripts itself.
                                  //   May be shared with another BuilderScriptSet once
                                  //   duplicate sets are merged; see scriptSetOwned.
    uint32_t     index;           // Index of this set in the Build Time vector
                                  //   of script sets.
    uint32_t     rindex;          // Index of this set in the final (runtime)
                                  //   array of sets.
    UBool        scriptSetOwned;  // True if this BuilderScriptSet owns (should delete)
                                  //   its underlying sset.
    BuilderScriptSet();           // Initializes codePoint to -1, pointers to NULL, indexes
                                  //   to 0 and scriptSetOwned to TRUE.
    ~BuilderScriptSet();          // Deletes sset only if scriptSetOwned is true.
 };
 void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status); 
 U_NAMESPACE_END
 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS 
 #endif // !UCONFIG_NO_NORMALIZATION 
 #endif
--- a/icu4c/source/test/cintltst/spooftest.c
+++ b/icu4c/source/test/cintltst/spooftest.c
@ -478,7 +478,7 @@ static void TestUSpoofCAPI(void) {
        const UChar* tests[] = { goodLatin, scMixed, scLatin,
                goodCyrl, goodGreek, lll_Latin_a, lll_Latin_b, han_Hiragana };
-        for (int32_t i=0; i<sizeof(tests)/sizeof(UChar*); i++) {
+        for (int32_t i=0; i<UPRV_LENGTHOF(tests); i++) {
            const UChar* str = tests[i];
            // Basic test