/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1997-2001, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/********************************************************************************
 *
 * File CNORMTST.C
 *
 * Modification History:
 *        Name                     Description
 *     Madhu Katragadda            Ported for C API
 *     synwee                      added test for quick check
 *     synwee                      added test for checkFCD
 *********************************************************************************/
/*tests for u_normalization*/
#include /* NOTE(review): the header name is missing here (lost in extraction); malloc/free are used below, so presumably a standard header - confirm against the upstream file */
#include "unicode/utypes.h"
#include "unicode/ucol.h"
#include "unicode/uloc.h"
#include "cintltst.h"
#include "cnormtst.h"
#include "ccolltst.h"
#include "unicode/ustring.h"
#include /* NOTE(review): the header name is missing here as well - restore from the upstream file */

/* Number of elements in a true array; gives a wrong answer if handed a pointer. */
#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))

/* Collator handle; each table-driven test below opens it with ucol_open() and closes it with ucol_close(). */
static UCollator *myCollation;

static void TestAPI(void);
static void TestNormCoverage();

/*
 * Canonical normalization cases. Each row holds three strings:
 *   [0] the input, [1] the expected result for UNORM_NFD, [2] the expected result for UNORM_NFC.
 * The strings are handed to CharsToUChars() before use, so the doubled backslashes keep the
 * \uXXXX sequences as literal text in the C string (presumably CharsToUChars expands them -
 * confirm in the test framework).
 */
const static char* canonTests[][3] = {
    /* Input*/                    /*Decomposed*/                /*Composed*/
    { "cat",                    "cat",                      "cat"                    },
    { "\\u00e0ardvark",            "a\\u0300ardvark",            "\\u00e0ardvark",           },

    { "\\u1e0a",                    "D\\u0307",                    "\\u1e0a"                    }, /* D-dot_above*/
    { "D\\u0307",                    "D\\u0307",                    "\\u1e0a"                    }, /* D dot_above*/

    { "\\u1e0c\\u0307",            "D\\u0323\\u0307",            "\\u1e0c\\u0307"            }, /* D-dot_below dot_above*/
    { "\\u1e0a\\u0323",            "D\\u0323\\u0307",            "\\u1e0c\\u0307"            }, /* D-dot_above dot_below */
    { "D\\u0307\\u0323",            "D\\u0323\\u0307",            "\\u1e0c\\u0307"            }, /* D dot_below dot_above */

    { "\\u1e10\\u0307\\u0323",    "D\\u0327\\u0323\\u0307",    "\\u1e10\\u0323\\u0307"    }, /*D dot_below cedilla dot_above*/
    { "D\\u0307\\u0328\\u0323",    "D\\u0328\\u0323\\u0307",    "\\u1e0c\\u0328\\u0307"    }, /* D dot_above ogonek dot_below*/

    { "\\u1E14",                    "E\\u0304\\u0300",            "\\u1E14"                    }, /* E-macron-grave*/
    { "\\u0112\\u0300",            "E\\u0304\\u0300",            "\\u1E14"                    }, /* E-macron + grave*/
    { "\\u00c8\\u0304",            "E\\u0300\\u0304",            "\\u00c8\\u0304"            }, /* E-grave + macron*/

    { "\\u212b",                    "A\\u030a",                    "\\u00c5"                    }, /*
angstrom_sign*/
    { "\\u00c5",                    "A\\u030a",                    "\\u00c5"                    }, /* A-ring*/

    { "\\u00C4ffin",                "A\\u0308ffin",                "\\u00C4ffin"                },
    { "\\u00C4\\uFB03n",            "A\\u0308\\uFB03n",            "\\u00C4\\uFB03n"            },

    { "Henry IV",                "Henry IV",                    "Henry IV"                },
    { "Henry \\u2163",            "Henry \\u2163",                "Henry \\u2163"            },

    { "\\u30AC",                    "\\u30AB\\u3099",            "\\u30AC"                    }, /* ga (Katakana)*/
    { "\\u30AB\\u3099",            "\\u30AB\\u3099",            "\\u30AC"                    }, /*ka + ten*/
    { "\\uFF76\\uFF9E",            "\\uFF76\\uFF9E",            "\\uFF76\\uFF9E"            }, /* hw_ka + hw_ten*/
    { "\\u30AB\\uFF9E",            "\\u30AB\\uFF9E",            "\\u30AB\\uFF9E"            }, /* ka + hw_ten*/
    { "\\uFF76\\u3099",            "\\uFF76\\u3099",            "\\uFF76\\u3099"            }, /* hw_ka + ten*/
    { "A\\u0300\\u0316",            "A\\u0316\\u0300",            "\\u00C0\\u0316"            }  /* A grave_above grave_below (this row was previously labelled "hw_ka + ten", which does not match its data) */
};

/*
 * Compatibility normalization cases, same row layout as canonTests:
 *   [0] the input, [1] the expected result for UNORM_NFKD, [2] the expected result for UNORM_NFKC.
 */
const static char* compatTests[][3] = {
    /* Input*/                        /*Decomposed    */                /*Composed*/
    { "cat",                        "cat",                            "cat"                },
    { "\\uFB4f",                        "\\u05D0\\u05DC",                "\\u05D0\\u05DC"    }, /* Alef-Lamed vs. Alef, Lamed*/

    { "\\u00C4ffin",                    "A\\u0308ffin",                    "\\u00C4ffin"        },
    { "\\u00C4\\uFB03n",                "A\\u0308ffin",                    "\\u00C4ffin"        }, /* ffi ligature -> f + f + i*/

    { "Henry IV",                    "Henry IV",                        "Henry IV"            },
    { "Henry \\u2163",                "Henry IV",                        "Henry IV"            },

    { "\\u30AC",                        "\\u30AB\\u3099",                "\\u30AC"            }, /* ga (Katakana)*/
    { "\\u30AB\\u3099",                "\\u30AB\\u3099",                "\\u30AC"            }, /*ka + ten*/

    { "\\uFF76\\u3099",                "\\u30AB\\u3099",                "\\u30AC"            }, /* hw_ka + ten*/

    /*These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
    { "\\uFF76\\uFF9E",                "\\u30AB\\u3099",                "\\u30AC"            }, /* hw_ka + hw_ten*/
    { "\\u30AB\\uFF9E",                "\\u30AB\\u3099",                "\\u30AC"            }  /* ka + hw_ten*/
};

/*
 * Registers every normalization test of this file under the "tscoll/cnormtst/" path
 * of the test tree rooted at *root.
 */
void addNormTest(TestNode** root)
{
    addTest(root, &TestAPI, "tscoll/cnormtst/TestAPI");
    addTest(root, &TestDecomp, "tscoll/cnormtst/TestDecomp");
    addTest(root, &TestCompatDecomp, "tscoll/cnormtst/TestCompatDecomp");
    addTest(root, &TestCanonDecompCompose, "tscoll/cnormtst/TestCanonDecompCompose");
    /* The registered name below has no "Test" prefix, unlike its siblings. */
    addTest(root, &TestCompatDecompCompose, "tscoll/cnormtst/CompatDecompCompose");
    addTest(root, &TestNull, "tscoll/cnormtst/TestNull");
    addTest(root, &TestQuickCheck,
"tscoll/cnormtst/TestQuickCheck"); addTest(root, &TestCheckFCD, "tscoll/cnormtst/TestCheckFCD"); addTest(root, &TestNormCoverage, "tscoll/cnormtst/TestNormCoverage"); } void TestDecomp() { UErrorCode status = U_ZERO_ERROR; int32_t x, neededLen, resLen; UChar *source=NULL, *result=NULL; status = U_ZERO_ERROR; myCollation = ucol_open("en_US", &status); if(U_FAILURE(status)){ log_err("ERROR: in creation of rule based collator: %s\n", myErrorName(status)); return; } resLen=0; log_verbose("Testing unorm_normalize with Decomp canonical\n"); for(x=0; x < ARRAY_LENGTH(canonTests); x++) { source=CharsToUChars(canonTests[x][0]); neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFD, UCOL_IGNORE_HANGUL, NULL, 0, &status); if(status==U_BUFFER_OVERFLOW_ERROR) { status=U_ZERO_ERROR; resLen=neededLen+1; result=(UChar*)malloc(sizeof(UChar*) * resLen); unorm_normalize(source, u_strlen(source), UNORM_NFD, UCOL_IGNORE_HANGUL, result, resLen, &status); } if(U_FAILURE(status)){ log_err("ERROR in unorm_normalize at %s: %s\n", austrdup(source), myErrorName(status) ); } assertEqual(result, canonTests[x][1], x); free(result); free(source); } ucol_close(myCollation); } void TestCompatDecomp() { UErrorCode status = U_ZERO_ERROR; int32_t x, neededLen, resLen; UChar *source=NULL, *result=NULL; status = U_ZERO_ERROR; myCollation = ucol_open("en_US", &status); if(U_FAILURE(status)){ log_err("ERROR: in creation of rule based collator: %s\n", myErrorName(status)); return; } resLen=0; log_verbose("Testing unorm_normalize with Decomp compat\n"); for(x=0; x < ARRAY_LENGTH(compatTests); x++) { source=CharsToUChars(compatTests[x][0]); neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFKD, UCOL_IGNORE_HANGUL, NULL, 0, &status); if(status==U_BUFFER_OVERFLOW_ERROR) { status=U_ZERO_ERROR; resLen=neededLen+1; result=(UChar*)malloc(sizeof(UChar*) * resLen); unorm_normalize(source, u_strlen(source), UNORM_NFKD,UCOL_IGNORE_HANGUL, result, resLen, &status); } if(U_FAILURE(status)){ 
log_err("ERROR in unorm_normalize at %s: %s\n", austrdup(source), myErrorName(status) ); } assertEqual(result, compatTests[x][1], x); free(result); free(source); } ucol_close(myCollation); } void TestCanonDecompCompose() { UErrorCode status = U_ZERO_ERROR; int32_t x, neededLen, resLen; UChar *source=NULL, *result=NULL; status = U_ZERO_ERROR; myCollation = ucol_open("en_US", &status); if(U_FAILURE(status)){ log_err("ERROR: in creation of rule based collator: %s\n", myErrorName(status)); return; } resLen=0; log_verbose("Testing unorm_normalize with Decomp can compose compat\n"); for(x=0; x < ARRAY_LENGTH(canonTests); x++) { source=CharsToUChars(canonTests[x][0]); neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFC, UCOL_IGNORE_HANGUL, NULL, 0, &status); if(status==U_BUFFER_OVERFLOW_ERROR) { status=U_ZERO_ERROR; resLen=neededLen+1; result=(UChar*)malloc(sizeof(UChar*) * resLen); unorm_normalize(source, u_strlen(source), UNORM_NFC, UCOL_IGNORE_HANGUL, result, resLen, &status); } if(U_FAILURE(status)){ log_err("ERROR in unorm_normalize at %s: %s\n", austrdup(source),myErrorName(status) ); } assertEqual(result, canonTests[x][2], x); free(result); free(source); } ucol_close(myCollation); } void TestCompatDecompCompose() { UErrorCode status = U_ZERO_ERROR; int32_t x, neededLen, resLen; UChar *source=NULL, *result=NULL; status = U_ZERO_ERROR; myCollation = ucol_open("en_US", &status); if(U_FAILURE(status)){ log_err("ERROR: in creation of rule based collator: %s\n", myErrorName(status)); return; } resLen=0; log_verbose("Testing unorm_normalize with compat decomp compose can\n"); for(x=0; x < ARRAY_LENGTH(compatTests); x++) { source=CharsToUChars(compatTests[x][0]); neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFKC, UCOL_IGNORE_HANGUL, NULL, 0, &status); if(status==U_BUFFER_OVERFLOW_ERROR) { status=U_ZERO_ERROR; resLen=neededLen+1; result=(UChar*)malloc(sizeof(UChar*) * resLen); unorm_normalize(source, u_strlen(source), UNORM_NFKC, UCOL_IGNORE_HANGUL, 
result, resLen, &status); } if(U_FAILURE(status)){ log_err("ERROR in unorm_normalize at %s: %s\n", austrdup(source), myErrorName(status) ); } assertEqual(result, compatTests[x][2], x); free(result); free(source); } ucol_close(myCollation); } /* static void assertEqual(const UChar* result, const UChar* expected, int32_t index) { if(u_strcmp(result, expected)!=0){ log_err("ERROR in decomposition at index = %d. EXPECTED: %s , GOT: %s\n", index, austrdup(expected), austrdup(result) ); } } */ static void assertEqual(const UChar* result, const char* expected, int32_t index) { UChar *expectedUni = CharsToUChars(expected); if(u_strcmp(result, expectedUni)!=0){ log_err("ERROR in decomposition at index = %d. EXPECTED: %s , GOT: %s\n", index, expected, austrdup(result) ); } free(expectedUni); } static void TestNull_check(UChar *src, int32_t srcLen, UChar *exp, int32_t expLen, UNormalizationMode mode, const char *name) { UErrorCode status = U_ZERO_ERROR; int32_t len, i; UChar result[50]; status = U_ZERO_ERROR; for(i=0;i<50;i++) { result[i] = 0xFFFD; } len = unorm_normalize(src, srcLen, mode, 0, result, 50, &status); if(U_FAILURE(status)) { log_err("unorm_normalize(%s) with 0x0000 failed: %s\n", name, u_errorName(status)); } else if (len != expLen) { log_err("unorm_normalize(%s) with 0x0000 failed: Expected len %d, got %d\n", name, expLen, len); } { for(i=0;i)=%ld failed with out[]=U+%04x U+%04x U+%04x U+%04x\n", length, out[0], out[1], out[2], out[3]); return; } } /* test cases to improve test code coverage */ enum { HANGUL_K_KIYEOK=0x3131, /* NFKD->Jamo L U+1100 */ HANGUL_K_WEO=0x315d, /* NFKD->Jamo V U+116f */ HANGUL_K_KIYEOK_SIOS=0x3133, /* NFKD->Jamo T U+11aa */ HANGUL_KIYEOK=0x1100, /* Jamo L U+1100 */ HANGUL_WEO=0x116f, /* Jamo V U+116f */ HANGUL_KIYEOK_SIOS=0x11aa, /* Jamo T U+11aa */ HANGUL_AC00=0xac00, /* Hangul syllable = Jamo LV U+ac00 */ HANGUL_SYLLABLE=0xac00+14*28+3, /* Hangul syllable = U+1100 * U+116f * U+11aa */ MUSICAL_VOID_NOTEHEAD=0x1d157, 
MUSICAL_HALF_NOTE=0x1d15e, /* NFC/NFD->Notehead+Stem */ MUSICAL_STEM=0x1d165, /* cc=216 */ MUSICAL_STACCATO=0x1d17c /* cc=220 */ }; static void TestNormCoverage() { static UChar input[2000], expect[3000], output[3000]; UErrorCode errorCode; int32_t i, length, inLength, expectLength, hangulPrefixLength, preflightLength; /* create a long and nasty string with NFKC-unsafe characters */ inLength=0; /* 3 Jamos L/V/T, all 8 combinations normal/compatibility */ input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* Hangul LV with normal/compatibility Jamo T */ input[inLength++]=HANGUL_AC00; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_AC00; input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* compatibility Jamo L, V */ input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; hangulPrefixLength=inLength; input[inLength++]=UTF16_LEAD(MUSICAL_HALF_NOTE); input[inLength++]=UTF16_TRAIL(MUSICAL_HALF_NOTE); for(i=0; i<200; ++i) { input[inLength++]=UTF16_LEAD(MUSICAL_STACCATO); input[inLength++]=UTF16_TRAIL(MUSICAL_STACCATO); input[inLength++]=UTF16_LEAD(MUSICAL_STEM); input[inLength++]=UTF16_TRAIL(MUSICAL_STEM); } /* (compatibility) Jamo L, T do not compose */ input[inLength++]=HANGUL_K_KIYEOK; 
input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* quick checks */ errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFD, &errorCode) || U_FAILURE(errorCode)) { log_err("error unorm_quickCheck(long input, UNORM_NFD)!=NO (%s)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFKD, &errorCode) || U_FAILURE(errorCode)) { log_err("error unorm_quickCheck(long input, UNORM_NFKD)!=NO (%s)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFC, &errorCode) || U_FAILURE(errorCode)) { log_err("error unorm_quickCheck(long input, UNORM_NFC)!=NO (%s)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFKC, &errorCode) || U_FAILURE(errorCode)) { log_err("error unorm_quickCheck(long input, UNORM_NFKC)!=NO (%s)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_FCD, &errorCode) || U_FAILURE(errorCode)) { log_err("error unorm_quickCheck(long input, UNORM_FCD)!=NO (%s)\n", u_errorName(errorCode)); } /* NFKC */ expectLength=0; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_AC00+3; expect[expectLength++]=HANGUL_AC00+3; expect[expectLength++]=HANGUL_AC00+14*28; expect[expectLength++]=UTF16_LEAD(MUSICAL_VOID_NOTEHEAD); expect[expectLength++]=UTF16_TRAIL(MUSICAL_VOID_NOTEHEAD); expect[expectLength++]=UTF16_LEAD(MUSICAL_STEM); expect[expectLength++]=UTF16_TRAIL(MUSICAL_STEM); for(i=0; i<200; ++i) { expect[expectLength++]=UTF16_LEAD(MUSICAL_STEM); expect[expectLength++]=UTF16_TRAIL(MUSICAL_STEM); } for(i=0; i<200; ++i) { 
expect[expectLength++]=UTF16_LEAD(MUSICAL_STACCATO); expect[expectLength++]=UTF16_TRAIL(MUSICAL_STACCATO); } expect[expectLength++]=HANGUL_KIYEOK; expect[expectLength++]=HANGUL_KIYEOK_SIOS; /* try destination overflow first */ errorCode=U_ZERO_ERROR; preflightLength=unorm_normalize(input, inLength, UNORM_NFKC, 0, output, 100, /* too short */ &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { log_err("error unorm_normalize(long input, output too short, UNORM_NFKC) did not overflow but %s\n", u_errorName(errorCode)); } /* real NFKC */ errorCode=U_ZERO_ERROR; length=unorm_normalize(input, inLength, UNORM_NFKC, 0, output, sizeof(output)/U_SIZEOF_UCHAR, &errorCode); if(U_FAILURE(errorCode)) { log_err("error unorm_normalize(long input, UNORM_NFKC) failed with %s\n", u_errorName(errorCode)); } else if(length!=expectLength || u_memcmp(output, expect, length)!=0) { log_err("error unorm_normalize(long input, UNORM_NFKC) produced wrong result\n"); for(i=0; i