scuffed-code/icu4c/source/test/intltest/itspoof.cpp

/*
**********************************************************************
* Copyright (C) 2009, International Business Machines Corporation 
* and others.  All Rights Reserved.
**********************************************************************
*/
/**
 * IntlTestSpoof tests for USpoofDetector
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO

#include "itspoof.h"
#include "unicode/uspoof.h"
#include "unicode/unistr.h"
#include "unicode/regex.h"
#include "unicode/normlzr.h"
#include "cstring.h"
#include <stdlib.h>
#include <stdio.h>

// TEST_ASSERT_SUCCESS - report (via errcheckln) when an ICU UErrorCode holds a failure.
//                       The test keeps running; callers decide whether to bail out.
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
    errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}

// TEST_ASSERT - report (via errln) when a boolean expression evaluates to FALSE.
//               The expression text itself is included in the message.
#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
    errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}

// TEST_ASSERT_EQ - report when two int-valued expressions differ.
//                  Each argument is evaluated exactly once and captured in a local, so
//                  an argument with side effects (such as a call to the API under test)
//                  is not run a second time while the failure message is formatted, and
//                  the values printed are the values that were actually compared.
#define TEST_ASSERT_EQ(a, b) { const int testAssertLhs_ = (a); const int testAssertRhs_ = (b); \
    if (testAssertLhs_ != testAssertRhs_) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
             __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

// TEST_ASSERT_NE - report when two int-valued expressions are equal.
//                  Arguments are evaluated exactly once, as for TEST_ASSERT_EQ.
#define TEST_ASSERT_NE(a, b) { const int testAssertLhs_ = (a); const int testAssertRhs_ = (b); \
    if (testAssertLhs_ == testAssertRhs_) { \
    errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
             __FILE__, __LINE__, #a, testAssertLhs_, #b, testAssertRhs_); }}

/*
 *   TEST_SETUP and TEST_TEARDOWN
 *         Macros to handle the boilerplate around setting up a test case.
 *         Put arbitrary test code between SETUP and TEARDOWN.
 *
 *         TEST_SETUP opens a new brace scope that declares
 *             "status" - a UErrorCode initialized to U_ZERO_ERROR, and
 *             "sc"     - the ready-to-go SpoofChecker for use in the tests,
 *         and then opens an inner "if" block, so the test code that follows
 *         only runs when uspoof_open() succeeded.
 *
 *         TEST_TEARDOWN closes the inner "if" block, reports any failure left
 *         in "status", closes "sc", and closes the scope opened by TEST_SETUP.
 *         The two macros must therefore always be used as a matched pair
 *         within the same function.
 */
#define TEST_SETUP {  \
    UErrorCode status = U_ZERO_ERROR; \
    USpoofChecker *sc;     \
    sc = uspoof_open(&status);  \
    TEST_ASSERT_SUCCESS(status);   \
    if (U_SUCCESS(status)){

#define TEST_TEARDOWN  \
    }  \
    TEST_ASSERT_SUCCESS(status);  \
    uspoof_close(sc);  \
}


// Test dispatcher.  Maps a test index to its name and, when "exec" is true,
// runs the corresponding test function.  An index outside the table yields
// an empty name.
void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    typedef void (IntlTestSpoof::*TestMethod)();
    struct TestEntry {
        const char *label;     // Name reported for the test.
        TestMethod  method;    // Member function that runs it.
    };
    // The position of an entry in this table is the test's index.
    static const TestEntry kTests[] = {
        { "TestSpoofAPI",      &IntlTestSpoof::testSpoofAPI },
        { "TestSkeleton",      &IntlTestSpoof::testSkeleton },
        { "TestAreConfusable", &IntlTestSpoof::testAreConfusable },
        { "TestInvisible",     &IntlTestSpoof::testInvisible },
        { "testConfData",      &IntlTestSpoof::testConfData }
    };
    static const int32_t kTestCount = (int32_t)(sizeof(kTests) / sizeof(kTests[0]));

    if (exec) {
        logln("TestSuite spoof: ");
    }

    if (index < 0 || index >= kTestCount) {
        name = "";
        return;
    }

    const TestEntry &entry = kTests[index];
    name = entry.label;
    if (exec) {
        (this->*(entry.method))();
    }
}

// testSpoofAPI.  Smoke test of the UnicodeString flavors of the spoof checker API:
//                check, areConfusable, and getSkeleton.  Each section gets its own
//                freshly opened checker from TEST_SETUP.
void IntlTestSpoof::testSpoofAPI() {

    // Check of a plain ASCII string: expects a result of 0, and expects the
    // caller's position value (666) to be left unchanged.
    TEST_SETUP
        UnicodeString s("uvw");
        int32_t position = 666;
        int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(0, checkResults);
        TEST_ASSERT_EQ(666, position);
    TEST_TEARDOWN;
    
    // Latin "cxs" against its Cyrillic look-alike: expects both the mixed script
    // and whole script confusable flags.  The status from this call is checked
    // by TEST_TEARDOWN.
    TEST_SETUP
        UnicodeString s1("cxs");
        UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
        int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);

    TEST_TEARDOWN;

    // Skeleton in any-case mode: "I1l0O" is expected to become "11100", and the
    // reference returned by the function must be the destination string itself.
    TEST_SETUP
        UnicodeString s("I1l0O");
        UnicodeString dest;
        UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(UnicodeString("11100") == dest);
        TEST_ASSERT(&dest == &retStr);
    TEST_TEARDOWN;
}


// CHECK_SKELETON - run one skeleton test case through checkSkeleton().
//                  Uses the variable "sc" that is in scope at the point of use, and
//                  passes along the invoking source line for use in failure messages.
//                  Expands to a complete brace-enclosed block, so a trailing
//                  semicolon after the invocation is optional.
#define CHECK_SKELETON(type, input, expected) { \
    checkSkeleton(sc, type, input, expected, __LINE__); \
    }


// testSkeleton.   Spot check a number of confusable skeleton substitutions from the 
//                 Unicode data file confusables.txt
//                 Test cases chosen for substitutions of various lengths, and 
//                 membership in different mapping tables.
//                 Each CHECK_SKELETON line gives the skeleton type flags, the input
//                 (with \u escapes) and the expected skeleton (with \u escapes).
void IntlTestSpoof::testSkeleton() {
    // Skeleton type flag combinations, named after the table tags (ML, SL, MA, SA)
    // that appear in the confusables.txt lines quoted in the comments below.
    const uint32_t ML = 0;
    const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
    const uint32_t MA = USPOOF_ANY_CASE;
    const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;

    TEST_SETUP
        // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
        CHECK_SKELETON(SL, " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",

               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations.")

        // FC5F ;	FE74 0651 ;   ML  #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
        //                                ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA	
        //    This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping 
        //    is never used in creating a skeleton.
        CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");

        // Assorted spot checks: no substitution at all, single character substitutions,
        // and substitutions that expand one character into several.
        CHECK_SKELETON(SL, "nochange", "nochange");
        CHECK_SKELETON(MA, "love", "1ove");   // lower case l to digit 1
        CHECK_SKELETON(ML, "OOPS", "OOPS");
        CHECK_SKELETON(MA, "OOPS", "00PS");   // Letter O to digit 0 in any case mode only
        CHECK_SKELETON(SL, "\\u059c", "\\u0301");
        CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
        CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029");
        CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");

        // This mapping exists in the ML and MA tables, does not exist in SL, SA
        //0C83 ;	0C03 ;	ML	# ( ಃ → ః ) KANNADA SIGN VISARGA → TELUGU SIGN VISARGA	# {source:513}
        CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
        CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
        CHECK_SKELETON(ML, "\\u0C83", "\\u0C03");
        CHECK_SKELETON(MA, "\\u0C83", "\\u0C03");
        
        // 0391 ; 0041 ; MA # ( Α → A ) GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A 
        // This mapping exists only in the MA table.
        CHECK_SKELETON(MA, "\\u0391", "A");
        CHECK_SKELETON(SA, "\\u0391", "\\u0391");
        CHECK_SKELETON(ML, "\\u0391", "\\u0391");
        CHECK_SKELETON(SL, "\\u0391", "\\u0391");

        // 13CF ;  0062 ;  MA  #  CHEROKEE LETTER SI to LATIN SMALL LETTER B  
        // This mapping exists in the ML and MA tables
        CHECK_SKELETON(ML, "\\u13CF", "b");
        CHECK_SKELETON(MA, "\\u13CF", "b");
        CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
        CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");

        // 0022 ;  02B9 02B9 ;  SA  #*  QUOTATION MARK to MODIFIER LETTER PRIME, MODIFIER LETTER PRIME 
        // all tables.
        CHECK_SKELETON(SL, "\\u0022", "\\u02B9\\u02B9");
        CHECK_SKELETON(SA, "\\u0022", "\\u02B9\\u02B9");
        CHECK_SKELETON(ML, "\\u0022", "\\u02B9\\u02B9");
        CHECK_SKELETON(MA, "\\u0022", "\\u02B9\\u02B9");

    TEST_TEARDOWN;
}


//
//  Run a single confusable skeleton transformation test case.
//
void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type, 
                                  const char *input, const char *expected, int32_t lineNum) {
    UnicodeString uInput = UnicodeString(input).unescape();
    UnicodeString uExpected = UnicodeString(expected).unescape();
    
    UErrorCode status = U_ZERO_ERROR;
    UnicodeString actual;
    uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
    if (U_FAILURE(status)) {
        errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
              u_errorName(status));
        return;
    }
    if (uExpected != actual) {
        errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
               __FILE__, __LINE__, lineNum);
        errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
              UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
    }
}

// testAreConfusable.  Compare two long strings that differ only in "will" versus "wi11"
//                     (letter l replaced by digit 1) and expect them to be reported
//                     as single script confusable.  The strings are long so that,
//                     per their own text, they overflow stack buffers.
void IntlTestSpoof::testAreConfusable() {
    TEST_SETUP
        UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
                         "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
        UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
                         "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
        TEST_ASSERT_SUCCESS(status);

    TEST_TEARDOWN;
}

// testInvisible.  Exercise the USPOOF_INVISIBLE result of the check function using
//                 strings that contain combining marks.
void IntlTestSpoof::testInvisible() {
    TEST_SETUP
        // A single combining acute accent: expects no problem to be reported, and
        // the caller's position value (-42) to be left unchanged.
        UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
        int32_t position = -42;
        TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(position == -42);

        // The same combining mark, \u0301, appearing twice in one mark sequence:
        // expects USPOOF_INVISIBLE, with position set to 7.
        // NOTE(review): 7 is the index just past the three marks in s2 - confirm
        //               against the uspoof_check documentation for "position".
        UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
        TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(7, position);

        // Two acute accents, one from the composed a with acute accent, \u00e1,
        // and one separate.
        position = -42;
        UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
        TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT_EQ(7, position);
    TEST_TEARDOWN;
}


//
// parseHex - convert a series of space separated hex numbers in a UnicodeString
//            into a string containing the corresponding code points.
//            A space ends the number being accumulated.  Any character that is
//            neither a hex digit nor a space is ignored.  A value of zero is
//            never appended.
//
static UnicodeString parseHex(const UnicodeString &in) {
    UnicodeString chars;
    UChar32 codePoint = 0;      // Value of the hex number accumulated so far.
    const int32_t len = in.length();

    for (int32_t idx = 0; idx < len; ++idx) {
        const UChar unit = in.charAt(idx);

        if (unit >= 0x30 && unit <= 0x39) {
            // Decimal digit, 0-9
            codePoint = (codePoint << 4) + (unit - 0x30);
            continue;
        }

        const UBool upperHex = (unit >= 0x41 && unit <= 0x46);   // A-F
        const UBool lowerHex = (unit >= 0x61 && unit <= 0x66);   // a-f
        if (upperHex || lowerHex) {
            // The low four bits of a letter are 1..6, so adding 9 gives 10..15.
            codePoint = (codePoint << 4) + (unit & 0x0f) + 9;
            continue;
        }

        if (unit == 0x20 && codePoint > 0) {
            // Space: the pending number is complete.
            chars.append(codePoint);
            codePoint = 0;
        }
        // Anything else is bad input, and is silently skipped.
    }

    // Flush a final number that was not followed by a space.
    if (codePoint > 0) {
        chars.append(codePoint);
    }
    return chars;
}


//
// appendHexUChar - append the hex form of a UChar32, followed by one space,
//                  to a UnicodeString.  Used in formatting error messages.
//                  The format matches that of the numbers in confusables.txt:
//                  at least 4 digits, with no leading zeroes in positions 5 and up.
//
static void appendHexUChar(UnicodeString &dest, UChar32 c) {
    UBool emitting = FALSE;     // Becomes TRUE once digits are being output.

    // Walk the eight nibbles of the value, most significant first.
    for (int shift = 28; shift >= 0; shift -= 4) {
        const int nibble = (c >> shift) & 0x0f;

        // The low four digits are always produced; higher ones only from
        // the first non-zero digit onwards.
        if (shift <= 12 || nibble != 0) {
            emitting = TRUE;
        }
        if (!emitting) {
            continue;
        }

        UChar digit;
        if (nibble <= 9) {
            digit = (UChar)(nibble + 0x30);         // 0-9
        } else {
            digit = (UChar)(nibble - 10 + 0x41);    // A-F
        }
        dest.append(digit);
    }
    dest.append((UChar)0x20);
}

//  testConfData - Check each data item from the Unicode confusables.txt file,
//                 verify that it transforms correctly in a skeleton.
//
void IntlTestSpoof::testConfData() {
    UErrorCode status = U_ZERO_ERROR;

    const char *testDataDir = IntlTest::getSourceTestData(status);
    TEST_ASSERT_SUCCESS(status);
    char buffer[2000];
    uprv_strcpy(buffer, testDataDir);
    uprv_strcat(buffer, "confusables.txt");

    FILE     *f = NULL;
    f = fopen(buffer, "rb");
    if (f == 0) {
        errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
        return;
    }
    fseek(f, 0, SEEK_END);
    int32_t  fileSize = ftell(f);
    char *fileBuf = new char[fileSize];
    fseek(f, 0, SEEK_SET);
    int32_t amt_read = fread(fileBuf, 1, fileSize, f);
    TEST_ASSERT_EQ(amt_read, fileSize);
    TEST_ASSERT(fileSize>0);
    if (amt_read != fileSize || fileSize <=0) {
        delete [] fileBuf;
        return;
    }
    fclose(f);
    UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf, fileSize));

    USpoofChecker *sc = uspoof_open(&status);
    TEST_ASSERT_SUCCESS(status);

    // Parse lines from the confusables.txt file.  Example Line:
    // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
    // Three fields.  The hex fields can contain more than one character,
    //                and each character may be more than 4 digits (for supplemntals)
    // This regular expression matches lines and splits the fields into capture groups.
    RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
    TEST_ASSERT_SUCCESS(status);
    while (parseLine.find()) {
        UnicodeString from = parseHex(parseLine.group(1, status));
        if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
            // The source character was not NFKD.
            // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
            //  so the mapping in this line of confusables.txt will never be applied.
            continue;
        }

        UnicodeString rawExpected = parseHex(parseLine.group(2, status));
        UnicodeString expected;
        Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
        TEST_ASSERT_SUCCESS(status);

        int32_t skeletonType = 0;
        UnicodeString tableType = parseLine.group(3, status);
        TEST_ASSERT_SUCCESS(status);
        if (tableType.indexOf("SL") >= 0) {
            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
        } else if (tableType.indexOf("SA") >= 0) {
            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
        } else if (tableType.indexOf("ML") >= 0) {
            skeletonType = 0;
        } else if (tableType.indexOf("MA") >= 0) {
            skeletonType = USPOOF_ANY_CASE;
        }

        UnicodeString actual;
        uspoof_getSkeletonUnicodeString(sc, skeletonType, from, actual, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(actual == expected);
        if (actual != expected) {
            errln(parseLine.group(0, status));
            UnicodeString line = "Actual: ";
            int i = 0;
            while (i < actual.length()) {
                appendHexUChar(line, actual.char32At(i));
                i = actual.moveIndex32(i, 1);
            }
            errln(line);
        }
        if (U_FAILURE(status)) {
            break;
        }
    }

}
#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
+								/*
 								**********************************************************************
 								* Copyright (C) 2009, International Business Machines Corporation
 								* and others.  All Rights Reserved.
 								**********************************************************************
 								*/
 								/**
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								 * IntlTestSpoof tests for USpoofDetector
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
+								 */
 								#include "unicode/utypes.h"
-												ICU-7068 Add statements to ensure that uconfigtest passes

X-SVN-Rev: 26466
											
										
										
											2009-08-04 21:09:17 +00:00
+								#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_NORMALIZATION && !UCONFIG_NO_FILE_IO
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
 								#include "itspoof.h"
 								#include "unicode/uspoof.h"
-												ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.

X-SVN-Rev: 26628
											
										
										
											2009-09-14 04:02:38 +00:00
+								#include "unicode/unistr.h"
 								#include "unicode/regex.h"
 								#include "unicode/normlzr.h"
-												ICU-7087 Include missing header file for strcpy and strcat.

X-SVN-Rev: 26630
											
										
										
											2009-09-14 17:39:35 +00:00
+								#include "cstring.h"
-												ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.

X-SVN-Rev: 26628
											
										
										
											2009-09-14 04:02:38 +00:00
+								#include <stdlib.h>
 								#include <stdio.h>
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
 								#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
-												ICU-6905 Make intltest pass without any data.

X-SVN-Rev: 26096
											
										
										
											2009-06-12 19:34:21 +00:00
+								    errcheckln(status, "Failure at file %s, line %d, error = %s", __FILE__, __LINE__, u_errorName(status));}}
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
 								#define TEST_ASSERT(expr) {if ((expr)==FALSE) { \
 								    errln("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);};}
 								#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
 								    errln("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
 								             __FILE__, __LINE__, #a, (a), #b, (b)); }}
 								#define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
 								    errln("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
 								             __FILE__, __LINE__, #a, (a), #b, (b)); }}
 								/*
 								 *   TEST_SETUP and TEST_TEARDOWN
 								 *         macros to handle the boilerplate around setting up test case.
 								 *         Put arbitrary test code between SETUP and TEARDOWN.
 								 *         "sc" is the ready-to-go  SpoofChecker for use in the tests.
 								 */
 								#define TEST_SETUP {  \
 								    UErrorCode status = U_ZERO_ERROR; \
 								    USpoofChecker *sc;     \
 								    sc = uspoof_open(&status);  \
 								    TEST_ASSERT_SUCCESS(status);   \
 								    if (U_SUCCESS(status)){
 								#define TEST_TEARDOWN  \
 								    }  \
 								    TEST_ASSERT_SUCCESS(status);  \
 								    uspoof_close(sc);  \
 								}
 								void IntlTestSpoof::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
 								{
 								    if (exec) logln("TestSuite spoof: ");
 								    switch (index) {
 								        case 0:
 								            name = "TestSpoofAPI";
 								            if (exec) {
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								                testSpoofAPI();
 								            }
 								            break;
 								         case 1:
 								            name = "TestSkeleton";
 								            if (exec) {
 								                testSkeleton();
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
+								            }
 								            break;
-												ICU-4790 spoof cleanups

X-SVN-Rev: 25876
											
										
										
											2009-04-23 01:30:02 +00:00
+								         case 2:
 								            name = "TestAreConfusable";
 								            if (exec) {
 								                testAreConfusable();
 								            }
 								            break;
-												ICU-4790 more spoof detection 

X-SVN-Rev: 25921
											
										
										
											2009-04-27 06:53:38 +00:00
+								          case 3:
 								            name = "TestInvisible";
 								            if (exec) {
 								                testInvisible();
 								            }
 								            break;
-												ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.

X-SVN-Rev: 26628
											
										
										
											2009-09-14 04:02:38 +00:00
+								          case 4:
 								            name = "testConfData";
 								            if (exec) {
 								                testConfData();
 								            }
 								            break;
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
+								        default: name=""; break;
 								    }
 								}
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								void IntlTestSpoof::testSpoofAPI() {
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
 								    TEST_SETUP
 								        UnicodeString s("uvw");
 								        int32_t position = 666;
 								        int32_t checkResults = uspoof_checkUnicodeString(sc, s, &position, &status);
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT_EQ(0, checkResults);
 								        TEST_ASSERT_EQ(666, position);
 								    TEST_TEARDOWN;
 								    TEST_SETUP
 								        UnicodeString s1("cxs");
 								        UnicodeString s2 = UnicodeString("\\u0441\\u0445\\u0455").unescape();  // Cyrillic "cxs"
-												ICU-4790 remove 'position' parameter from areConfusable functions

X-SVN-Rev: 25718
											
										
										
											2009-04-02 06:08:36 +00:00
+								        int32_t checkResults = uspoof_areConfusableUnicodeString(sc, s1, s2, &status);
-												ICU-4790 tweaks to areConfusable function

X-SVN-Rev: 25860
											
										
										
											2009-04-22 02:07:40 +00:00
+								        TEST_ASSERT_EQ(USPOOF_MIXED_SCRIPT_CONFUSABLE | USPOOF_WHOLE_SCRIPT_CONFUSABLE, checkResults);
-												ICU-4790 Spoof detection tests

X-SVN-Rev: 25716
											
										
										
											2009-04-01 23:46:14 +00:00
 								    TEST_TEARDOWN;
 								    TEST_SETUP
 								        UnicodeString s("I1l0O");
 								        UnicodeString dest;
 								        UnicodeString &retStr = uspoof_getSkeletonUnicodeString(sc, USPOOF_ANY_CASE, s, dest, &status);
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT(UnicodeString("11100") == dest);
 								        TEST_ASSERT(&dest == &retStr);
 								    TEST_TEARDOWN;
 								}
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								#define CHECK_SKELETON(type, input, expected) { \
 								    checkSkeleton(sc, type, input, expected, __LINE__); \
 								    }
 								// testSkeleton.   Spot check a number of confusable skeleton substitutions from the
 								//                 Unicode data file confusables.txt
-												ICU-4790 more spoof detection testing, plus a couple of bug fixes.

X-SVN-Rev: 25851
											
										
										
											2009-04-21 04:43:19 +00:00
+								//                 Test cases chosen for substitutions of various lengths, and
 								//                 membership in different mapping tables.
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								void IntlTestSpoof::testSkeleton() {
-												ICU-4790 more spoof detection testing, plus a couple of bug fixes.

X-SVN-Rev: 25851
											
										
										
											2009-04-21 04:43:19 +00:00
+								    const uint32_t ML = 0;
 								    const uint32_t SL = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 								    const uint32_t MA = USPOOF_ANY_CASE;
 								    const uint32_t SA = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								    TEST_SETUP
-												ICU-4790 more spoof detection testing, plus a couple of bug fixes.

X-SVN-Rev: 25851
											
										
										
											2009-04-21 04:43:19 +00:00
+								        // A long "identifier" that will overflow implementation stack buffers, forcing heap allocations.
 								        CHECK_SKELETON(SL, " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 								                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 								                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations."
 								                           " A long 'identifier' that will overflow implementation stack buffers, forcing heap allocations.",
 								               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
 								               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
 								               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations."
 								               " A 1ong \\u02b9identifier\\u02b9 that wi11 overf1ow imp1ementation stack buffers, forcing heap a11ocations.")
 								        // FC5F ;	FE74 0651 ;   ML  #* ARABIC LIGATURE SHADDA WITH KASRATAN ISOLATED FORM to
 								        //                                ARABIC KASRATAN ISOLATED FORM, ARABIC SHADDA
 								        //    This character NFKD normalizes to \u0020 \u064d \u0651, so its confusable mapping
 								        //    is never used in creating a skeleton.
 								        CHECK_SKELETON(SL, "\\uFC5F", " \\u064d\\u0651");
 								        CHECK_SKELETON(SL, "nochange", "nochange");
 								        CHECK_SKELETON(MA, "love", "1ove");   // lower case l to digit 1
 								        CHECK_SKELETON(ML, "OOPS", "OOPS");
 								        CHECK_SKELETON(MA, "OOPS", "00PS");   // Letter O to digit 0 in any case mode only
 								        CHECK_SKELETON(SL, "\\u059c", "\\u0301");
 								        CHECK_SKELETON(SL, "\\u2A74", "\\u003A\\u003A\\u003D");
 								        CHECK_SKELETON(SL, "\\u247E", "\\u0028\\u0031\\u0031\\u0029");
 								        CHECK_SKELETON(SL, "\\uFDFB", "\\u062C\\u0644\\u0020\\u062C\\u0644\\u0627\\u0644\\u0647");
 								        // This mapping exists in the ML and MA tables, does not exist in SL, SA
 								        //0C83 ;	0C03 ;	ML	# ( ಃ → ః ) KANNADA SIGN VISARGA → TELUGU SIGN VISARGA	# {source:513}
 								        CHECK_SKELETON(SL, "\\u0C83", "\\u0C83");
 								        CHECK_SKELETON(SA, "\\u0C83", "\\u0C83");
 								        CHECK_SKELETON(ML, "\\u0C83", "\\u0C03");
 								        CHECK_SKELETON(MA, "\\u0C83", "\\u0C03");
 								        // 0391 ; 0041 ; MA # ( Α → A ) GREEK CAPITAL LETTER ALPHA to LATIN CAPITAL LETTER A
 								        // This mapping exists only in the MA table.
 								        CHECK_SKELETON(MA, "\\u0391", "A");
 								        CHECK_SKELETON(SA, "\\u0391", "\\u0391");
 								        CHECK_SKELETON(ML, "\\u0391", "\\u0391");
 								        CHECK_SKELETON(SL, "\\u0391", "\\u0391");
 								        // 13CF ;  0062 ;  MA  #  CHEROKEE LETTER SI to LATIN SMALL LETTER B
 								        // This mapping exists in the ML and MA tables
 								        CHECK_SKELETON(ML, "\\u13CF", "b");
 								        CHECK_SKELETON(MA, "\\u13CF", "b");
 								        CHECK_SKELETON(SL, "\\u13CF", "\\u13CF");
 								        CHECK_SKELETON(SA, "\\u13CF", "\\u13CF");
 								        // 0022 ;  02B9 02B9 ;  SA  #*  QUOTATION MARK to MODIFIER LETTER PRIME, MODIFIER LETTER PRIME
 								        // all tables.
 								        CHECK_SKELETON(SL, "\\u0022", "\\u02B9\\u02B9");
 								        CHECK_SKELETON(SA, "\\u0022", "\\u02B9\\u02B9");
 								        CHECK_SKELETON(ML, "\\u0022", "\\u02B9\\u02B9");
 								        CHECK_SKELETON(MA, "\\u0022", "\\u02B9\\u02B9");
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								    TEST_TEARDOWN;
 								}
 								//
 								//  Run a single confusable skeleton transformation test case.
 								//
 								void IntlTestSpoof::checkSkeleton(const USpoofChecker *sc, uint32_t type,
 								                                  const char *input, const char *expected, int32_t lineNum) {
 								    UnicodeString uInput = UnicodeString(input).unescape();
 								    UnicodeString uExpected = UnicodeString(expected).unescape();
 								    UErrorCode status = U_ZERO_ERROR;
 								    UnicodeString actual;
 								    uspoof_getSkeletonUnicodeString(sc, type, uInput, actual, &status);
 								    if (U_FAILURE(status)) {
 								        errln("File %s, Line %d, Test case from line %d, status is %s", __FILE__, __LINE__, lineNum,
 								              u_errorName(status));
 								        return;
 								    }
 								    if (uExpected != actual) {
 								        errln("File %s, Line %d, Test case from line %d, Actual and Expected skeletons differ.",
 								               __FILE__, __LINE__, lineNum);
-												ICU-4790 more spoof detection testing, plus a couple of bug fixes.

X-SVN-Rev: 25851
											
										
										
											2009-04-21 04:43:19 +00:00
+								        errln(UnicodeString(" Actual   Skeleton: \"") + actual + UnicodeString("\"\n") +
 								              UnicodeString(" Expected Skeleton: \"") + uExpected + UnicodeString("\""));
-												ICU-4790 add spoof tests

X-SVN-Rev: 25839
											
										
										
											2009-04-20 06:27:34 +00:00
+								    }
 								}
-												ICU-4790 spoof cleanups

X-SVN-Rev: 25876
											
										
										
											2009-04-23 01:30:02 +00:00
+								void IntlTestSpoof::testAreConfusable() {
 								    TEST_SETUP
 								        UnicodeString s1("A long string that will overflow stack buffers.  A long string that will overflow stack buffers. "
 								                         "A long string that will overflow stack buffers.  A long string that will overflow stack buffers. ");
 								        UnicodeString s2("A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. "
 								                         "A long string that wi11 overflow stack buffers.  A long string that will overflow stack buffers. ");
 								        TEST_ASSERT_EQ(USPOOF_SINGLE_SCRIPT_CONFUSABLE, uspoof_areConfusableUnicodeString(sc, s1, s2, &status));
 								        TEST_ASSERT_SUCCESS(status);
 								    TEST_TEARDOWN;
 								}
-												ICU-4790 more spoof detection 

X-SVN-Rev: 25921
											
										
										
											2009-04-27 06:53:38 +00:00
 								//
 								//  testInvisible - Check the USPOOF_INVISIBLE result of uspoof_checkUnicodeString().
 								//
 								//  Three inputs are checked with the default checker created by TEST_SETUP:
 								//    s   a single combining acute accent (U+0301); expected to pass (result 0).
 								//    s2  the mark sequence U+0301 U+0302 U+0301; expected to be flagged.
 								//    s3  precomposed U+00E1 followed by U+0301; expected to be flagged.
 								//
 								void IntlTestSpoof::testInvisible() {
 								    TEST_SETUP
 								        UnicodeString  s = UnicodeString("abcd\\u0301ef").unescape();
 								        // Sentinel value: a passing check is expected to leave position unmodified.
 								        int32_t position = -42;
 								        TEST_ASSERT_EQ(0, uspoof_checkUnicodeString(sc, s, &position, &status));
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT(position == -42);
 								        // The same acute accent appears twice within one run of combining marks.
 								        UnicodeString  s2 = UnicodeString("abcd\\u0301\\u0302\\u0301ef").unescape();
 								        TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s2, &position, &status));
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT_EQ(7, position);
 								        // Two acute accents, one from the composed a with acute accent, \u00e1,
 								        // and one separate.
 								        position = -42;
 								        UnicodeString  s3 = UnicodeString("abcd\\u00e1\\u0301xyz").unescape();
 								        TEST_ASSERT_EQ(USPOOF_INVISIBLE, uspoof_checkUnicodeString(sc, s3, &position, &status));
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT_EQ(7, position);
 								    TEST_TEARDOWN;
 								}
-												ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.

X-SVN-Rev: 26628
											
										
										
											2009-09-14 04:02:38 +00:00
 								//
 								// Convert a series of space-separated hex numbers in a UnicodeString into a string
 								// holding the corresponding code points.
 								//
 								// A space (U+0020) ends the number being accumulated; any character that is neither
 								// a hex digit nor a space is ignored.  A value of zero is never appended.
 								//
 								static UnicodeString parseHex(const UnicodeString &in) {
 								    UnicodeString decoded;
 								    UChar32 pending = 0;     // Code point accumulated from the digits seen so far.
 								    const int32_t inputLength = in.length();
 								    int32_t pos = 0;
 								    while (pos < inputLength) {
 								        const UChar unit = in.charAt(pos);
 								        pos++;
 								        if (unit>=0x30 && unit<=0x39) {
 								            // '0'..'9'
 								            pending = (pending<<4) + (unit - 0x30);
 								            continue;
 								        }
 								        if ((unit>=0x41 && unit<=0x46) || (unit>=0x61 && unit<=0x66)) {
 								            // 'A'..'F' or 'a'..'f': the low nibble is 1..6, so adding 9 gives 10..15.
 								            pending = (pending<<4) + (unit & 0x0f)+9;
 								            continue;
 								        }
 								        if (unit == 0x20 && pending > 0) {
 								            // A space terminates the current number.
 								            decoded.append(pending);
 								            pending = 0;
 								        }
 								        // Anything else is bad input and is skipped.
 								    }
 								    // Flush a trailing number that was not followed by a space.
 								    if (pending > 0) {
 								        decoded.append(pending);
 								    }
 								    return decoded;
 								}
 								//
 								// Append the hexadecimal form of a code point, followed by one space, to a UnicodeString.
 								// Used when formatting error messages.
 								// The output follows the number format of confusables.txt:
 								//   at least 4 digits, upper case, and no leading zeroes in positions 5 and up.
 								//
 								static void appendHexUChar(UnicodeString &dest, UChar32 c) {
 								    UBool emitting = FALSE;      // Becomes TRUE once leading zeroes are over.
 								    int shift = 28;              // Start at the most significant nibble.
 								    while (shift >= 0) {
 								        const int nibble = (c>>shift) & 0x0f;
 								        // The low four nibbles are always written; higher ones only from the
 								        // first nonzero nibble onwards.
 								        if (shift <= 12 || nibble != 0) {
 								            emitting = TRUE;
 								        }
 								        if (emitting) {
 								            UChar digit;
 								            if (nibble <= 9) {
 								                digit = (UChar)(nibble + 0x30);
 								            } else {
 								                digit = (UChar)(nibble -10 + 0x41);
 								            }
 								            dest.append(digit);
 								        }
 								        shift -= 4;
 								    }
 								    dest.append((UChar)0x20);
 								}
 								//  testConfData - Check each data item from the Unicode confusables.txt file,
 								//                 verify that it transforms correctly in a skeleton.
 								//
 								void IntlTestSpoof::testConfData() {
 								    UErrorCode status = U_ZERO_ERROR;
 								    const char *testDataDir = IntlTest::getSourceTestData(status);
 								    TEST_ASSERT_SUCCESS(status);
 								    char buffer[2000];
-												ICU-7087 Include missing header file for strcpy and strcat.

X-SVN-Rev: 26630
											
										
										
											2009-09-14 17:39:35 +00:00
+								    uprv_strcpy(buffer, testDataDir);
 								    uprv_strcat(buffer, "confusables.txt");
-												ICU-7087 fix spoof detection bugs with buffer handling while computing skeletons, and with supplemental character handling.

X-SVN-Rev: 26628
											
										
										
											2009-09-14 04:02:38 +00:00
 								    FILE     *f = NULL;
 								    f = fopen(buffer, "rb");
 								    if (f == 0) {
 								        errln("Skipping test spoof/testConfData.  File confusables.txt not accessible.");
 								        return;
 								    }
 								    fseek(f, 0, SEEK_END);
 								    int32_t  fileSize = ftell(f);
 								    char *fileBuf = new char[fileSize];
 								    fseek(f, 0, SEEK_SET);
 								    int32_t amt_read = fread(fileBuf, 1, fileSize, f);
 								    TEST_ASSERT_EQ(amt_read, fileSize);
 								    TEST_ASSERT(fileSize>0);
 								    if (amt_read != fileSize || fileSize <=0) {
 								        delete [] fileBuf;
 								        return;
 								    }
 								    fclose(f);
 								    UnicodeString confusablesTxt = UnicodeString::fromUTF8(StringPiece(fileBuf, fileSize));
 								    USpoofChecker *sc = uspoof_open(&status);
 								    TEST_ASSERT_SUCCESS(status);
 								    // Parse lines from the confusables.txt file.  Example Line:
 								    // FF44 ;	0064 ;	SL	# ( d -> d ) FULLWIDTH ....
 								    // Three fields.  The hex fields can contain more than one character,
 								    //                and each character may be more than 4 digits (for supplemntals)
 								    // This regular expression matches lines and splits the fields into capture groups.
 								    RegexMatcher parseLine("(?m)^([0-9A-F]{4}[^#;]*?);([^#;]*?);([^#]*)", confusablesTxt, 0, status);
 								    TEST_ASSERT_SUCCESS(status);
 								    while (parseLine.find()) {
 								        UnicodeString from = parseHex(parseLine.group(1, status));
 								        if (!Normalizer::isNormalized(from, UNORM_NFKD, status)) {
 								            // The source character was not NFKD.
 								            // Skip this case; the first step in obtaining a skeleton is to NFKD the input,
 								            //  so the mapping in this line of confusables.txt will never be applied.
 								            continue;
 								        }
 								        UnicodeString rawExpected = parseHex(parseLine.group(2, status));
 								        UnicodeString expected;
 								        Normalizer::decompose(rawExpected, TRUE, 0, expected, status);
 								        TEST_ASSERT_SUCCESS(status);
 								        int32_t skeletonType = 0;
 								        UnicodeString tableType = parseLine.group(3, status);
 								        TEST_ASSERT_SUCCESS(status);
 								        if (tableType.indexOf("SL") >= 0) {
 								            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
 								        } else if (tableType.indexOf("SA") >= 0) {
 								            skeletonType = USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE;
 								        } else if (tableType.indexOf("ML") >= 0) {
 								            skeletonType = 0;
 								        } else if (tableType.indexOf("MA") >= 0) {
 								            skeletonType = USPOOF_ANY_CASE;
 								        }
 								        UnicodeString actual;
 								        uspoof_getSkeletonUnicodeString(sc, skeletonType, from, actual, &status);
 								        TEST_ASSERT_SUCCESS(status);
 								        TEST_ASSERT(actual == expected);
 								        if (actual != expected) {
 								            errln(parseLine.group(0, status));
 								            UnicodeString line = "Actual: ";
 								            int i = 0;
 								            while (i < actual.length()) {
 								                appendHexUChar(line, actual.char32At(i));
 								                i = actual.moveIndex32(i, 1);
 								            }
 								            errln(line);
 								        }
 								        if (U_FAILURE(status)) {
 								            break;
 								        }
 								    }
 								}
-												ICU-4790 Spoof detection build failures with UCONFIG_NO_REGULAR_EXPRESSIONS

X-SVN-Rev: 25981
											
										
										
											2009-05-04 05:45:27 +00:00
+								#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
-												ICU-4790 spoof cleanups

X-SVN-Rev: 25876
											
										
										
											2009-04-23 01:30:02 +00:00