scuffed-code/icu4c/source/test/intltest/transrt.cpp

/*
**********************************************************************
*   Copyright (C) 2000, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   05/23/00    aliu        Creation.
**********************************************************************
*/
#include "transrt.h"
#include "testutil.h"
#include "unicode/utypes.h"
#include "unicode/translit.h"
#include "unicode/rbt.h"
#include "unicode/uniset.h"

// Expands to one `case` label for the switch in runIndexedTest().
// It stores the stringized test name in `name` and, when `exec` is set,
// logs a "<test>---" header plus an empty line and then calls the member
// function `test`.  `name`, `exec` and `logln` are taken from the scope in
// which the macro is expanded.  The expansion ends in `break` with no
// semicolon, so every use must be followed by `;`.
#define CASE(id,test) case id:                          \
                          name = #test;                 \
                          if (exec) {                   \
                              logln(#test "---");       \
                              logln((UnicodeString)""); \
                              test();                   \
                          }                             \
                          break

// Maps a zero-based test index to a test of this class.
//
// index - which test to select.
// exec  - when true the selected test is also run; otherwise only `name`
//         is filled in.
// name  - receives the test's name, or "" when no test has that index.
// par   - not used by this function.
void
TransliteratorRoundTripTest::runIndexedTest(int32_t index, UBool exec,
                                   char* &name, char* par) {
    switch (index) {
        CASE(0,TestHiragana);
        CASE(1,TestKatakana);
        CASE(2,TestArabic);
        CASE(3,TestHebrew);
        CASE(4,TestHangul);
        CASE(5,TestGreek);
        CASE(6,TestCyrillic);
        // TestJamo and TestJamoHangul are defined in this file but are
        // currently disabled; re-enabling them means restoring these cases.
        /*
        CASE(7,TestJamo);
        CASE(8,TestJamoHangul);
        */
        default: name = ""; break;
    }
}

//--------------------------------------------------------------------
// RTTest Interface
//--------------------------------------------------------------------

/*
 * Round-trip test driver for a single transliterator.
 *
 * Given a transliterator ID plus the script codes of its source and target
 * scripts, test() checks that source characters map into the target script
 * and that target characters map into the source script and back again.
 * Subclasses may narrow the set of source characters by overriding
 * isSource(UChar).
 */
class RTTest {

    // PrintWriter out;
    // (HTML log writer; only referenced by the commented-out "//|" code,
    // which appears to be carried over from a Java version of this test.)

    UnicodeString transliteratorID; // ID handed to Transliterator::createInstance()
    int8_t sourceScript;            // script code compared against TestUtility::getScript()
    int8_t targetScript;            // script code compared against TestUtility::getScript()
    int32_t errorLimit;             // test2() stops once errorCount reaches this value
    int32_t errorCount;             // failures logged so far
    int32_t pairLimit;              // max number of leading characters in the target pair pass
    UnicodeSet sourceRange;         // source characters must be in this set
    UnicodeSet targetRange;         // target characters must be in this set, unless it is empty
    IntlTest* log;                  // sink for log and error output; assigned in test()

public:

    /*
     * create a test for the given script transliterator.
     */
    RTTest(const UnicodeString& transliteratorID,
           int8_t sourceScript, int8_t targetScript);

    virtual ~RTTest();

    /* Set the failure count at which the test run is abandoned. */
    void setErrorLimit(int32_t limit);

    /* Set how many leading target characters the pair pass may cover. */
    void setPairLimit(int32_t limit);

    /*
     * Run the round-trip test.
     * sourceRange and targetRange are UnicodeSet patterns.  An empty
     * sourceRange selects the default "[a-zA-Z]"; an empty targetRange
     * leaves the target characters unrestricted.  Results are reported
     * through log.
     */
    void test(const UnicodeString& sourceRange,
              const UnicodeString& targetRange, IntlTest* log);

private:

    /* The actual test passes; called by test() once the ranges are set. */
    void test2();

    /* Report a result that contains characters of an unexpected script. */
    void logWrongScript(const UnicodeString& label,
                        const UnicodeString& from,
                        const UnicodeString& to);
    /* Report a target string that did not survive target->source->target. */
    void logRoundTripFailure(const UnicodeString& from,
                             const UnicodeString& to,
                             const UnicodeString& back);

protected:

    /*
     * Characters to filter for source-target mapping completeness
     * Typically is base alphabet, minus extended characters
     * Default is ASCII letters for Latin
     */
    virtual UBool isSource(UChar c);

    /*
     * Characters accepted in the result of a target-to-source conversion:
     * those whose script is the source script or TestUtility::COMMON_SCRIPT.
     */
    UBool isReceivingSource(UChar c);

    /*
     * Characters to filter for target-source mapping
     * Typically is base alphabet, minus extended characters
     */
    UBool isTarget(UChar c);

    /*
     * Characters accepted in the result of a source-to-target conversion:
     * those whose script is the target script or TestUtility::COMMON_SCRIPT.
     */
    UBool isReceivingTarget(UChar c);

    /*
     * String forms of the predicates above: TRUE when every character of s
     * satisfies the corresponding single-character predicate.
     */
    UBool isSource(const UnicodeString& s);
    UBool isTarget(const UnicodeString& s);
    UBool isReceivingSource(const UnicodeString& s);
    UBool isReceivingTarget(const UnicodeString& s);
};

//--------------------------------------------------------------------
// RTTest Implementation
//--------------------------------------------------------------------

/*
 * Create a test for the given script transliterator.
 *
 * transliteratorID - ID later passed to Transliterator::createInstance().
 * sourceScript     - script code of the transliterator's source side.
 * targetScript     - script code of the transliterator's target side.
 *
 * The error limit starts at the largest int32_t value, the error count at
 * zero and the pair limit at 0x10000.  The log pointer starts out NULL
 * (it used to be left uninitialized) and is assigned by test().
 */
RTTest::RTTest(const UnicodeString& transliteratorID,
               int8_t sourceScript, int8_t targetScript)
    : transliteratorID(transliteratorID),
      sourceScript(sourceScript),
      targetScript(targetScript),
      errorLimit(0x7FFFFFFFL),
      errorCount(0),
      pairLimit(0x10000),
      log(NULL)
{
}

/*
 * The destructor body is empty; in particular the IntlTest pointer stored
 * by test() is not deleted here.
 */
RTTest::~RTTest() {
}

/*
 * Set the number of logged failures at which test2() gives up.
 */
void RTTest::setErrorLimit(int32_t limit) {
    this->errorLimit = limit;
}

/*
 * Set how many leading target characters the final (pair) pass of test2()
 * may process before the run is reported as truncated.
 */
void RTTest::setPairLimit(int32_t limit) {
    this->pairLimit = limit;
}

void RTTest::test(const UnicodeString& sourceRange,
                  const UnicodeString& targetRange, IntlTest* log) {

    UErrorCode status = U_ZERO_ERROR;
    if (sourceRange.length() > 0) {
        this->sourceRange.applyPattern(sourceRange, status);
        if (U_FAILURE(status)) {
            log->errln("FAIL: UnicodeSet::applyPattern(" +
                       sourceRange + ")");
            return;
        }
    } else {
        this->sourceRange.applyPattern("[a-zA-Z]", status);
        if (U_FAILURE(status)) {
            log->errln("FAIL: UnicodeSet::applyPattern([a-z])");
            return;
        }
    }
    this->targetRange.clear();
    if (targetRange.length() > 0) {
        this->targetRange.applyPattern(targetRange, status);
        if (U_FAILURE(status)) {
            log->errln("FAIL: UnicodeSet::applyPattern(" +
                       targetRange + ")");
            return;
        }
    }

    this->log = log;

//|     // make a UTF-8 output file we can read with a browser
//|
//|     // note: check that every transliterator transliterates the null string correctly!
//|
//|     String logFileName = "test_" + transliteratorID + "_"
//|         + sourceScript + "_" + targetScript + ".html";
//|
//|     log.logln("Creating log file " + logFileName);
//|
//|     out = new PrintWriter(new BufferedWriter(new OutputStreamWriter(
//|               new FileOutputStream(logFileName), "UTF8"), 4*1024));
//|     //out.write('\uFFEF');    // BOM
//|     out.println("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">");
//|     out.println("<HTML><HEAD>");
//|     out.println("<META content=\"text/html; charset=utf-8\" http-equiv=Content-Type></HEAD>");
//|     out.println("<BODY>");
//|     out.println("<TABLE>");
//|     try {
        test2();
//|         out.println("</TABLE>");
//|     } catch (TestTruncated e) {
//|         out.println("</TABLE>" + e.getMessage());
//|     }
//|     out.println("</BODY></HTML>");
//|     out.close();

    if (errorCount > 0) {
        log->errln(transliteratorID + " errors: " + errorCount); // + ", see " + logFileName);
    } else {
        log->logln(transliteratorID + " ok");
//|         new File(logFileName).delete();
    }
}

void RTTest::test2() {

    UChar c;
    UnicodeString cs, targ, reverse;

    Transliterator* sourceToTarget = Transliterator::createInstance(transliteratorID);
    if (sourceToTarget == NULL) {
        log->errln("Fail: createInstance(" + transliteratorID +
                   ") returned NULL");
        return;
    }
    Transliterator* targetToSource = sourceToTarget->createInverse();
    if (targetToSource == NULL) {
        log->errln("Fail: " + transliteratorID +
                   ".createInverse() returned NULL");
        delete sourceToTarget;
        return;
    }

    log->logln("Checking that all source characters convert to target - Singles");

    for (c = 0; c < 0xFFFF; ++c) {
        if (Unicode::getType(c) == Unicode::UNASSIGNED ||
            !isSource(c)) continue;
        cs.remove(); cs.append(c);
        targ = cs;
        sourceToTarget->transliterate(targ);
        if (!isReceivingTarget(targ)) {
            logWrongScript("Source-Target", cs, targ);
            if (errorCount >= errorLimit) return;
        }
    }

    log->logln("Checking that all source characters convert to target - Doubles");

    for (c = 0; c < 0xFFFF; ++c) {
        if (Unicode::getType(c) == Unicode::UNASSIGNED ||
            !isSource(c)) continue;
        for (UChar d = 0; d < 0xFFFF; ++d) {
            if (Unicode::getType(d) == Unicode::UNASSIGNED ||
                !isSource(d)) continue;
            cs.remove(); cs.append(c).append(d);
            targ = cs;
            sourceToTarget->transliterate(targ);
            if (!isReceivingTarget(targ)) {
                logWrongScript("Source-Target", cs, targ);
                if (errorCount >= errorLimit) return;
            }
        }
    }

    log->logln("Checking that target characters convert to source and back - Singles");

    for (c = 0; c < 0xFFFF; ++c) {
        if (Unicode::getType(c) == Unicode::UNASSIGNED ||
            !isTarget(c)) continue;
        cs.remove(); cs.append(c);
        targ = cs;
        targetToSource->transliterate(targ);
        reverse = targ;
        sourceToTarget->transliterate(reverse);
        if (!isReceivingSource(targ)) {
            logWrongScript("Target-Source", cs, targ);
            if (errorCount >= errorLimit) return;
        } else if (cs != reverse) {
            logRoundTripFailure(cs, targ, reverse);
            if (errorCount >= errorLimit) return;
        }
    }

    log->logln("Checking that target characters convert to source and back - Doubles");
    int32_t count = 0;
    cs = UNICODE_STRING("aa", 2);
    for (c = 0; c < 0xFFFF; ++c) {
        if (Unicode::getType(c) == Unicode::UNASSIGNED ||
            !isTarget(c)) continue;
        if (++count > pairLimit) {
            //throw new TestTruncated("Test truncated at " + pairLimit + " x 64k pairs");
            log->logln("");
            log->logln((UnicodeString)"Test truncated at " + pairLimit + " x 64k pairs");
            return;
        }
        cs.setCharAt(0, c);
        log->log(TestUtility::hex(c));
        log->log(" ");
        for (UChar d = 0; d < 0xFFFF; ++d) {
            if (Unicode::getType(d) == Unicode::UNASSIGNED ||
                !isTarget(d)) continue;
            cs.setCharAt(1, d);
            targ = cs;
            targetToSource->transliterate(targ);
            reverse = targ;
            sourceToTarget->transliterate(reverse);
            if (!isReceivingSource(targ)) {
                logWrongScript("Target-Source", cs, targ);
                if (errorCount >= errorLimit) return;
            } else if (cs != reverse) {
                logRoundTripFailure(cs, targ, reverse);
                if (errorCount >= errorLimit) return;
            }
        }
    }
    log->logln("");
}

/*
 * Report that converting `from` produced `to`, which contains characters of
 * an unexpected script, and count the failure.
 *
 * label - names the direction being tested; it is placed after "Fail ".
 * Each string is printed followed by its TestUtility::hex() form in
 * parentheses.  The error limit is not checked here; callers compare
 * errorCount with errorLimit themselves.
 */
void RTTest::logWrongScript(const UnicodeString& label,
                            const UnicodeString& from,
                            const UnicodeString& to) {
//|     out.println("<TR><TD>Fail " + label + ":</TD><TD><FONT SIZE=\"6\">" +
//|                 from + "</FONT></TD><TD>(" +
//|                 TestUtility::hex(from) + ") =></TD><TD><FONT SIZE=\"6\">" +
//|                 to + "</FONT></TD><TD>(" +
//|                 TestUtility::hex(to) + ")</TD></TR>" );
//|     if (++errorCount >= errorLimit) {
//|         throw new TestTruncated("Test truncated; too many failures");
//|     }
    const UnicodeString input  = from + "(" + TestUtility::hex(from) + ")";
    const UnicodeString output = to + "(" + TestUtility::hex(to) + ")";

    log->errln((UnicodeString)"Fail " + label + ": " + input + " => " + output);
    errorCount += 1;
}

/*
 * Report that `from` converted to `to`, which converted back to `back`
 * instead of to `from`, and count the failure.
 *
 * Each string is printed followed by its TestUtility::hex() form in
 * parentheses.  The error limit is not checked here; callers compare
 * errorCount with errorLimit themselves.
 */
void RTTest::logRoundTripFailure(const UnicodeString& from,
                                 const UnicodeString& to,
                                 const UnicodeString& back) {
//|     out.println("<TR><TD>Fail Roundtrip:</TD><TD><FONT SIZE=\"6\">" +
//|                 from + "</FONT></TD><TD>(" +
//|                 TestUtility::hex(from) + ") =></TD><TD>" +
//|                 to + "</TD><TD>(" +
//|                 TestUtility::hex(to) + ") =></TD><TD><FONT SIZE=\"6\">" +
//|                 back + "</TD><TD>(" +
//|                 TestUtility::hex(back) + ")</TD></TR>" );
//|     if (++errorCount >= errorLimit) {
//|         throw new TestTruncated("Test truncated; too many failures");
//|     }
    const UnicodeString original  = from + "(" + TestUtility::hex(from) + ")";
    const UnicodeString converted = to + "(" + TestUtility::hex(to) + ")";
    const UnicodeString restored  = back + "(" + TestUtility::hex(back) + ")";

    log->errln((UnicodeString)"Fail Roundtrip: " +
               original + " => " + converted + " => " + restored + " => ");
    errorCount += 1;
}

/*
 * Decide whether c takes part in the source-to-target passes.
 * A character qualifies only if its script is the source script, it is a
 * letter, and it is a member of sourceRange (which test() sets to
 * "[a-zA-Z]" when no pattern is given).
 */
UBool RTTest::isSource(UChar c) {
    const int8_t charScript = TestUtility::getScript(c);
    const UBool accepted =
        (charScript == sourceScript &&
         Unicode::isLetter(c) &&
         sourceRange.contains(c)) ? TRUE : FALSE;
    return accepted;
}

/*
 * Decide whether c is acceptable in the output of a target-to-source
 * conversion: its script must be the source script or
 * TestUtility::COMMON_SCRIPT.
 */
UBool RTTest::isReceivingSource(UChar c) {
    const int8_t charScript = TestUtility::getScript(c);
    if (charScript == sourceScript) {
        return TRUE;
    }
    if (charScript == TestUtility::COMMON_SCRIPT) {
        return TRUE;
    }
    return FALSE;
}

/*
 * Decide whether c takes part in the target-to-source passes.
 * A character qualifies only if its script is the target script and it is
 * a letter; when targetRange is non-empty it must also be a member of it.
 */
UBool RTTest::isTarget(UChar c) {
    const int8_t charScript = TestUtility::getScript(c);
    if (charScript == targetScript && Unicode::isLetter(c)) {
        // An empty targetRange places no further restriction on c.
        return (targetRange.isEmpty() || targetRange.contains(c)) ? TRUE : FALSE;
    }
    return FALSE;
}

/*
 * Decide whether c is acceptable in the output of a source-to-target
 * conversion: its script must be the target script or
 * TestUtility::COMMON_SCRIPT.
 */
UBool RTTest::isReceivingTarget(UChar c) {
    const int8_t charScript = TestUtility::getScript(c);
    if (charScript == targetScript) {
        return TRUE;
    }
    if (charScript == TestUtility::COMMON_SCRIPT) {
        return TRUE;
    }
    return FALSE;
}

/*
 * TRUE when every character of s satisfies isSource(UChar); an empty
 * string therefore yields TRUE.
 */
UBool RTTest::isSource(const UnicodeString& s) {
    int32_t pos = 0;
    while (pos < s.length()) {
        if (!isSource(s.charAt(pos))) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}

/*
 * TRUE when every character of s satisfies isTarget(UChar); an empty
 * string therefore yields TRUE.
 */
UBool RTTest::isTarget(const UnicodeString& s) {
    int32_t pos = 0;
    while (pos < s.length()) {
        if (!isTarget(s.charAt(pos))) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}

/*
 * TRUE when every character of s satisfies isReceivingSource(UChar); an
 * empty string therefore yields TRUE.
 */
UBool RTTest::isReceivingSource(const UnicodeString& s) {
    int32_t pos = 0;
    while (pos < s.length()) {
        if (!isReceivingSource(s.charAt(pos))) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}

/*
 * TRUE when every character of s satisfies isReceivingTarget(UChar); an
 * empty string therefore yields TRUE.
 */
UBool RTTest::isReceivingTarget(const UnicodeString& s) {
    int32_t pos = 0;
    while (pos < s.length()) {
        if (!isReceivingTarget(s.charAt(pos))) {
            return FALSE;
        }
        ++pos;
    }
    return TRUE;
}

//--------------------------------------------------------------------
// RTHangulTest
//--------------------------------------------------------------------

/*
 * Round-trip test for the "Jamo-Hangul" transliterator.  It differs from
 * RTTest only in overriding isSource(UChar) to exclude some code point
 * ranges from the source characters.
 */
class RTHangulTest : public RTTest {
public:
    RTHangulTest();
protected:
    // Overrides RTTest::isSource(UChar).
    virtual UBool isSource(UChar c);
};

/*
 * Configure the base test for "Jamo-Hangul", with JAMO_SCRIPT as the source
 * script and HANGUL_SCRIPT as the target script.
 */
RTHangulTest::RTHangulTest() : RTTest("Jamo-Hangul",
                                  TestUtility::JAMO_SCRIPT,
                                  TestUtility::HANGUL_SCRIPT) {}

/*
 * Source filter for the Jamo-Hangul test: rejects three fixed code point
 * ranges and defers to RTTest::isSource() for everything else.
 * NOTE(review): the excluded ranges are presumably jamo that the
 * transliterator is not expected to handle -- confirm.
 */
UBool RTHangulTest::isSource(UChar c) {
    const UBool excluded =
        ((0x1113 <= c && c <= 0x1160) ||
         (0x1176 <= c && c <= 0x11F9) ||
         (0x3131 <= c && c <= 0x318E)) ? TRUE : FALSE;
    if (excluded) {
        return FALSE;
    }
    return RTTest::isSource(c);
}

//--------------------------------------------------------------------
// Specific Tests
//--------------------------------------------------------------------

void TransliteratorRoundTripTest::TestHiragana() {
    RTTest test("Latin-Kana",
                TestUtility::LATIN_SCRIPT, TestUtility::HIRAGANA_SCRIPT);
    test.test("[a-z]", UnicodeString("[\\u3040-\\u3094]", ""), this);
}

void TransliteratorRoundTripTest::TestKatakana() {
    RTTest test("Latin-Kana",
                TestUtility::LATIN_SCRIPT, TestUtility::KATAKANA_SCRIPT);
    test.test("[A-Z]", UnicodeString("[\\u30A1-\\u30FA]", ""), this);
}

 void TransliteratorRoundTripTest::TestArabic() {
    RTTest test("Latin-Arabic",
                TestUtility::LATIN_SCRIPT, TestUtility::ARABIC_SCRIPT);
    test.test("[a-z]", UnicodeString("[\\u0620-\\u065F-[\\u0640]]", ""), this);
}

void TransliteratorRoundTripTest::TestHebrew() {
    RTTest test("Latin-Hebrew",
                TestUtility::LATIN_SCRIPT, TestUtility::HEBREW_SCRIPT);
    test.test("", UnicodeString("[\\u05D0-\\u05EF]", ""), this);
}

void TransliteratorRoundTripTest::TestHangul() {
    RTHangulTest t;
    t.setPairLimit(30); // Don't run full test -- too long
    t.test("", "", this);
}

void TransliteratorRoundTripTest::TestJamo() {
    RTTest t("Latin-Jamo",
             TestUtility::LATIN_SCRIPT, TestUtility::JAMO_SCRIPT);
    t.setErrorLimit(200); // Don't run full test -- too long
    t.test("", "", this);
}

void TransliteratorRoundTripTest::TestJamoHangul() {
    RTTest t("Latin-Jamo;Jamo-Hangul",
             TestUtility::LATIN_SCRIPT, TestUtility::HANGUL_SCRIPT);
    t.setErrorLimit(50); // Don't run full test -- too long
    t.test("", "", this);
}

void TransliteratorRoundTripTest::TestGreek() {
    RTTest test("Latin-Greek",
                TestUtility::LATIN_SCRIPT, TestUtility::GREEK_SCRIPT);
    test.test("", UnicodeString("[\\u0380-\\u03CF]", ""), this);
}

void TransliteratorRoundTripTest::TestCyrillic() {
    RTTest test("Latin-Cyrillic",
                TestUtility::LATIN_SCRIPT, TestUtility::CYRILLIC_SCRIPT);
    test.test("", UnicodeString("[\\u0401\\u0410-\\u044F\\u0451]", ""), this);
}