/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1997-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/********************************************************************************
*
* File CNORMTST.C
*
* Modification History:
*        Name                     Description
*     Madhu Katragadda            Ported for C API
*     synwee                      added test for quick check
*     synwee                      added test for checkFCD
*********************************************************************************/
/*tests for u_normalization*/
#include "unicode/utypes.h"
#include "unicode/unorm.h"
#include "unicode/utf16.h"
#include "cintltst.h"
#include "cmemory.h"

#if !UCONFIG_NO_NORMALIZATION

/* NOTE(review): the next two #include directives have no header name in this copy of the file - restore them from the original source. */
#include
#include
#include "unicode/uchar.h"
#include "unicode/ustring.h"
#include "unicode/unorm.h"
#include "cnormtst.h"

/* Forward declarations of the file-local test functions. */
static void TestAPI(void);
static void TestNormCoverage(void);
static void TestConcatenate(void);
static void TestNextPrevious(void);
static void TestIsNormalized(void);
static void TestFCNFKCClosure(void);
static void TestQuickCheckPerCP(void);
static void TestComposition(void);
static void TestFCD(void);
static void TestGetDecomposition(void);
static void TestGetRawDecomposition(void);
static void TestAppendRestoreMiddle(void);
static void TestGetEasyToUseInstance(void);

/*
 * Test data for canonical normalization.
 * Each row is { input, expected decomposed form, expected composed form }.
 * The strings are kept in escaped form ("\\uXXXX") and are converted with
 * CharsToUChars() before use.
 */
static const char* const canonTests[][3] = {
    /* Input*/ /*Decomposed*/ /*Composed*/
    { "cat", "cat", "cat" },
    { "\\u00e0ardvark", "a\\u0300ardvark", "\\u00e0ardvark", },
    { "\\u1e0a", "D\\u0307", "\\u1e0a" }, /* D-dot_above*/
    { "D\\u0307", "D\\u0307", "\\u1e0a" }, /* D dot_above*/
    { "\\u1e0c\\u0307", "D\\u0323\\u0307", "\\u1e0c\\u0307" }, /* D-dot_below dot_above*/
    { "\\u1e0a\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" }, /* D-dot_above dot_below */
    { "D\\u0307\\u0323", "D\\u0323\\u0307", "\\u1e0c\\u0307" }, /* D dot_below dot_above */
    { "\\u1e10\\u0307\\u0323", "D\\u0327\\u0323\\u0307", "\\u1e10\\u0323\\u0307" }, /*D dot_below cedilla dot_above*/
    { "D\\u0307\\u0328\\u0323", "D\\u0328\\u0323\\u0307", "\\u1e0c\\u0328\\u0307" }, /* D dot_above ogonek dot_below*/
    { "\\u1E14", "E\\u0304\\u0300", "\\u1E14" }, /* E-macron-grave*/
    { "\\u0112\\u0300", "E\\u0304\\u0300", "\\u1E14" }, /* E-macron + grave*/
    { "\\u00c8\\u0304", "E\\u0300\\u0304", "\\u00c8\\u0304" }, /* E-grave + macron*/
    { "\\u212b", "A\\u030a", "\\u00c5" }, /* angstrom_sign*/
    { "\\u00c5", "A\\u030a", "\\u00c5" }, /* A-ring*/
    { "\\u00C4ffin", "A\\u0308ffin", "\\u00C4ffin" },
    { "\\u00C4\\uFB03n", "A\\u0308\\uFB03n", "\\u00C4\\uFB03n" },
    { "Henry IV", "Henry IV", "Henry IV" },
    { "Henry \\u2163", "Henry \\u2163", "Henry \\u2163" },
    { "\\u30AC", "\\u30AB\\u3099", "\\u30AC" }, /* ga (Katakana)*/
    { "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" }, /*ka + ten*/
    { "\\uFF76\\uFF9E", "\\uFF76\\uFF9E", "\\uFF76\\uFF9E" }, /* hw_ka + hw_ten*/
    { "\\u30AB\\uFF9E", "\\u30AB\\uFF9E", "\\u30AB\\uFF9E" }, /* ka + hw_ten*/
    { "\\uFF76\\u3099", "\\uFF76\\u3099", "\\uFF76\\u3099" }, /* hw_ka + ten*/
    { "A\\u0300\\u0316", "A\\u0316\\u0300", "\\u00C0\\u0316" }, /* A grave grave_below: the marks are reordered */
    { "", "", "" }
};

static const char* const compatTests[][3] = {
    /* Input*/ /*Decomposed */ /*Composed*/
    { "cat", "cat", "cat" },
    { "\\uFB4f", "\\u05D0\\u05DC", "\\u05D0\\u05DC" }, /* Alef-Lamed vs. 
Alef, Lamed*/ { "\\u00C4ffin", "A\\u0308ffin", "\\u00C4ffin" }, { "\\u00C4\\uFB03n", "A\\u0308ffin", "\\u00C4ffin" }, /* ffi ligature -> f + f + i*/ { "Henry IV", "Henry IV", "Henry IV" }, { "Henry \\u2163", "Henry IV", "Henry IV" }, { "\\u30AC", "\\u30AB\\u3099", "\\u30AC" }, /* ga (Katakana)*/ { "\\u30AB\\u3099", "\\u30AB\\u3099", "\\u30AC" }, /*ka + ten*/ { "\\uFF76\\u3099", "\\u30AB\\u3099", "\\u30AC" }, /* hw_ka + ten*/ /*These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ { "\\uFF76\\uFF9E", "\\u30AB\\u3099", "\\u30AC" }, /* hw_ka + hw_ten*/ { "\\u30AB\\uFF9E", "\\u30AB\\u3099", "\\u30AC" }, /* ka + hw_ten*/ { "", "", "" } }; static const char* const fcdTests[][3] = { /* Added for testing the below-U+0300 prefix of a NUL-terminated string. */ { "\\u010e\\u0327", "D\\u0327\\u030c", NULL }, /* D-caron + cedilla */ { "\\u010e", "\\u010e", NULL } /* D-caron */ }; void addNormTest(TestNode** root); void addNormTest(TestNode** root) { addTest(root, &TestAPI, "tsnorm/cnormtst/TestAPI"); addTest(root, &TestDecomp, "tsnorm/cnormtst/TestDecomp"); addTest(root, &TestCompatDecomp, "tsnorm/cnormtst/TestCompatDecomp"); addTest(root, &TestCanonDecompCompose, "tsnorm/cnormtst/TestCanonDecompCompose"); addTest(root, &TestCompatDecompCompose, "tsnorm/cnormtst/TestCompatDecompCompose"); addTest(root, &TestFCD, "tsnorm/cnormtst/TestFCD"); addTest(root, &TestNull, "tsnorm/cnormtst/TestNull"); addTest(root, &TestQuickCheck, "tsnorm/cnormtst/TestQuickCheck"); addTest(root, &TestQuickCheckPerCP, "tsnorm/cnormtst/TestQuickCheckPerCP"); addTest(root, &TestIsNormalized, "tsnorm/cnormtst/TestIsNormalized"); addTest(root, &TestCheckFCD, "tsnorm/cnormtst/TestCheckFCD"); addTest(root, &TestNormCoverage, "tsnorm/cnormtst/TestNormCoverage"); addTest(root, &TestConcatenate, "tsnorm/cnormtst/TestConcatenate"); addTest(root, &TestNextPrevious, "tsnorm/cnormtst/TestNextPrevious"); addTest(root, &TestFCNFKCClosure, "tsnorm/cnormtst/TestFCNFKCClosure"); addTest(root, 
&TestComposition, "tsnorm/cnormtst/TestComposition"); addTest(root, &TestGetDecomposition, "tsnorm/cnormtst/TestGetDecomposition"); addTest(root, &TestGetRawDecomposition, "tsnorm/cnormtst/TestGetRawDecomposition"); addTest(root, &TestAppendRestoreMiddle, "tsnorm/cnormtst/TestAppendRestoreMiddle"); addTest(root, &TestGetEasyToUseInstance, "tsnorm/cnormtst/TestGetEasyToUseInstance"); } static const char* const modeStrings[]={ "UNORM_NONE", "UNORM_NFD", "UNORM_NFKD", "UNORM_NFC", "UNORM_NFKC", "UNORM_FCD", "UNORM_MODE_COUNT" }; static void TestNormCases(UNormalizationMode mode, const char* const cases[][3], int32_t lengthOfCases) { int32_t x, neededLen, length2; int32_t expIndex= (mode==UNORM_NFC || mode==UNORM_NFKC) ? 2 : 1; UChar *source=NULL; UChar result[16]; log_verbose("Testing unorm_normalize(%s)\n", modeStrings[mode]); for(x=0; x < lengthOfCases; x++) { UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR; source=CharsToUChars(cases[x][0]); neededLen= unorm_normalize(source, u_strlen(source), mode, 0, NULL, 0, &status); length2= unorm_normalize(source, -1, mode, 0, NULL, 0, &status2); if(neededLen!=length2) { log_err("ERROR in unorm_normalize(%s)[%d]: " "preflight length/NUL %d!=%d preflight length/srcLength\n", modeStrings[mode], (int)x, (int)neededLen, (int)length2); } if(status==U_BUFFER_OVERFLOW_ERROR) { status=U_ZERO_ERROR; } length2=unorm_normalize(source, u_strlen(source), mode, 0, result, UPRV_LENGTHOF(result), &status); if(U_FAILURE(status) || neededLen!=length2) { log_data_err("ERROR in unorm_normalize(%s/NUL) at %s: %s - (Are you missing data?)\n", modeStrings[mode], austrdup(source), myErrorName(status)); } else { assertEqual(result, cases[x][expIndex], x); } length2=unorm_normalize(source, -1, mode, 0, result, UPRV_LENGTHOF(result), &status); if(U_FAILURE(status) || neededLen!=length2) { log_data_err("ERROR in unorm_normalize(%s/srcLength) at %s: %s - (Are you missing data?)\n", modeStrings[mode], austrdup(source), myErrorName(status)); } 
else { assertEqual(result, cases[x][expIndex], x); } free(source); } } void TestDecomp() { TestNormCases(UNORM_NFD, canonTests, UPRV_LENGTHOF(canonTests)); } void TestCompatDecomp() { TestNormCases(UNORM_NFKD, compatTests, UPRV_LENGTHOF(compatTests)); } void TestCanonDecompCompose() { TestNormCases(UNORM_NFC, canonTests, UPRV_LENGTHOF(canonTests)); } void TestCompatDecompCompose() { TestNormCases(UNORM_NFKC, compatTests, UPRV_LENGTHOF(compatTests)); } void TestFCD() { TestNormCases(UNORM_FCD, fcdTests, UPRV_LENGTHOF(fcdTests)); } static void assertEqual(const UChar* result, const char* expected, int32_t index) { UChar *expectedUni = CharsToUChars(expected); if(u_strcmp(result, expectedUni)!=0){ log_err("ERROR in decomposition at index = %d. EXPECTED: %s , GOT: %s\n", index, expected, austrdup(result) ); } free(expectedUni); } static void TestNull_check(UChar *src, int32_t srcLen, UChar *exp, int32_t expLen, UNormalizationMode mode, const char *name) { UErrorCode status = U_ZERO_ERROR; int32_t len, i; UChar result[50]; status = U_ZERO_ERROR; for(i=0;i<50;i++) { result[i] = 0xFFFD; } len = unorm_normalize(src, srcLen, mode, 0, result, 50, &status); if(U_FAILURE(status)) { log_data_err("unorm_normalize(%s) with 0x0000 failed: %s - (Are you missing data?)\n", name, u_errorName(status)); } else if (len != expLen) { log_err("unorm_normalize(%s) with 0x0000 failed: Expected len %d, got %d\n", name, expLen, len); } { for(i=0;i=0 (length -1 used for special cases below) */ errorCode=U_ZERO_ERROR; if(!unorm_isNormalized(notNFC[0]+2, 1, UNORM_NFC, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error: !isNormalized(, NFC) (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } /* incoming U_FAILURE */ errorCode=U_TRUNCATED_CHAR_FOUND; (void)unorm_isNormalized(notNFC[0]+2, 1, UNORM_NFC, &errorCode); if(errorCode!=U_TRUNCATED_CHAR_FOUND) { log_err("error: isNormalized(U_TRUNCATED_CHAR_FOUND) changed the error code to %s\n", u_errorName(errorCode)); } /* NULL source 
*/ errorCode=U_ZERO_ERROR; (void)unorm_isNormalized(NULL, 1, UNORM_NFC, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_data_err("error: isNormalized(NULL) did not set U_ILLEGAL_ARGUMENT_ERROR but %s - (Are you missing data?)\n", u_errorName(errorCode)); } /* bad length */ errorCode=U_ZERO_ERROR; (void)unorm_isNormalized(notNFC[0]+2, -2, UNORM_NFC, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_data_err("error: isNormalized([-2]) did not set U_ILLEGAL_ARGUMENT_ERROR but %s - (Are you missing data?)\n", u_errorName(errorCode)); } /* specific cases */ for(i=0; i)=%ld failed with out[]=U+%04x U+%04x U+%04x U+%04x\n", length, out[0], out[1], out[2], out[3]); return; } length=unorm_normalize(NULL, 0, UNORM_NFC, 0, NULL, 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("unorm_normalize(src NULL[0], NFC, dest NULL[0])=%ld failed with %s\n", (long)length, u_errorName(errorCode)); return; } length=unorm_normalize(NULL, 0, UNORM_NFC, 0, out, 20, &errorCode); if(U_FAILURE(errorCode)) { log_err("unorm_normalize(src NULL[0], NFC, dest out[20])=%ld failed with %s\n", (long)length, u_errorName(errorCode)); return; } } /* test cases to improve test code coverage */ enum { HANGUL_K_KIYEOK=0x3131, /* NFKD->Jamo L U+1100 */ HANGUL_K_WEO=0x315d, /* NFKD->Jamo V U+116f */ HANGUL_K_KIYEOK_SIOS=0x3133, /* NFKD->Jamo T U+11aa */ HANGUL_KIYEOK=0x1100, /* Jamo L U+1100 */ HANGUL_WEO=0x116f, /* Jamo V U+116f */ HANGUL_KIYEOK_SIOS=0x11aa, /* Jamo T U+11aa */ HANGUL_AC00=0xac00, /* Hangul syllable = Jamo LV U+ac00 */ HANGUL_SYLLABLE=0xac00+14*28+3, /* Hangul syllable = U+1100 * U+116f * U+11aa */ MUSICAL_VOID_NOTEHEAD=0x1d157, MUSICAL_HALF_NOTE=0x1d15e, /* NFC/NFD->Notehead+Stem */ MUSICAL_STEM=0x1d165, /* cc=216 */ MUSICAL_STACCATO=0x1d17c /* cc=220 */ }; static void TestNormCoverage() { UChar input[1000], expect[1000], output[1000]; UErrorCode errorCode; int32_t i, length, inLength, expectLength, hangulPrefixLength, preflightLength; /* create a long and nasty string 
with NFKC-unsafe characters */ inLength=0; /* 3 Jamos L/V/T, all 8 combinations normal/compatibility */ input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* Hangul LV with normal/compatibility Jamo T */ input[inLength++]=HANGUL_AC00; input[inLength++]=HANGUL_KIYEOK_SIOS; input[inLength++]=HANGUL_AC00; input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* compatibility Jamo L, V */ input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_WEO; hangulPrefixLength=inLength; input[inLength++]=U16_LEAD(MUSICAL_HALF_NOTE); input[inLength++]=U16_TRAIL(MUSICAL_HALF_NOTE); for(i=0; i<200; ++i) { input[inLength++]=U16_LEAD(MUSICAL_STACCATO); input[inLength++]=U16_TRAIL(MUSICAL_STACCATO); input[inLength++]=U16_LEAD(MUSICAL_STEM); input[inLength++]=U16_TRAIL(MUSICAL_STEM); } /* (compatibility) Jamo L, T do not compose */ input[inLength++]=HANGUL_K_KIYEOK; input[inLength++]=HANGUL_K_KIYEOK_SIOS; /* quick checks */ errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFD, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error unorm_quickCheck(long input, UNORM_NFD)!=NO (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, 
inLength, UNORM_NFKD, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error unorm_quickCheck(long input, UNORM_NFKD)!=NO (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFC, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error unorm_quickCheck(long input, UNORM_NFC)!=NO (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_NFKC, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error unorm_quickCheck(long input, UNORM_NFKC)!=NO (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; if(UNORM_NO!=unorm_quickCheck(input, inLength, UNORM_FCD, &errorCode) || U_FAILURE(errorCode)) { log_data_err("error unorm_quickCheck(long input, UNORM_FCD)!=NO (%s) - (Are you missing data?)\n", u_errorName(errorCode)); } /* NFKC */ expectLength=0; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_SYLLABLE; expect[expectLength++]=HANGUL_AC00+3; expect[expectLength++]=HANGUL_AC00+3; expect[expectLength++]=HANGUL_AC00+14*28; expect[expectLength++]=U16_LEAD(MUSICAL_VOID_NOTEHEAD); expect[expectLength++]=U16_TRAIL(MUSICAL_VOID_NOTEHEAD); expect[expectLength++]=U16_LEAD(MUSICAL_STEM); expect[expectLength++]=U16_TRAIL(MUSICAL_STEM); for(i=0; i<200; ++i) { expect[expectLength++]=U16_LEAD(MUSICAL_STEM); expect[expectLength++]=U16_TRAIL(MUSICAL_STEM); } for(i=0; i<200; ++i) { expect[expectLength++]=U16_LEAD(MUSICAL_STACCATO); expect[expectLength++]=U16_TRAIL(MUSICAL_STACCATO); } expect[expectLength++]=HANGUL_KIYEOK; expect[expectLength++]=HANGUL_KIYEOK_SIOS; /* try destination overflow first */ errorCode=U_ZERO_ERROR; 
preflightLength=unorm_normalize(input, inLength, UNORM_NFKC, 0, output, 100, /* too short */ &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR) { log_data_err("error unorm_normalize(long input, output too short, UNORM_NFKC) did not overflow but %s - (Are you missing data?)\n", u_errorName(errorCode)); } /* real NFKC */ errorCode=U_ZERO_ERROR; length=unorm_normalize(input, inLength, UNORM_NFKC, 0, output, sizeof(output)/U_SIZEOF_UCHAR, &errorCode); if(U_FAILURE(errorCode)) { log_data_err("error unorm_normalize(long input, UNORM_NFKC) failed with %s - (Are you missing data?)\n", u_errorName(errorCode)); } else if(length!=expectLength || u_memcmp(output, expect, length)!=0) { log_err("error unorm_normalize(long input, UNORM_NFKC) produced wrong result\n"); for(i=0; ihasNext(iter)) { return; } length=unorm_next(iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, mode, 0, (UBool)(out!=NULL), &neededToNormalize, &errorCode); expectIndex=srcIndexes[i+1]; in=src+prevIndex; inLength=expectIndex-prevIndex; if(out!=NULL) { /* get output piece from between plus signs */ expectLength=0; while((expect+expectLength)!=outLimit && expect[expectLength]!=_PLUS) { ++expectLength; } expectNeeded=(UBool)(0!=u_memcmp(buffer, in, inLength)); } else { expect=in; expectLength=inLength; expectNeeded=FALSE; } } else { if(!iter->hasPrevious(iter)) { return; } length=unorm_previous(iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, mode, 0, (UBool)(out!=NULL), &neededToNormalize, &errorCode); expectIndex=srcIndexes[i]; in=src+expectIndex; inLength=prevIndex-expectIndex; if(out!=NULL) { /* get output piece from between plus signs */ expectLength=0; while(expect!=out && expect[-1]!=_PLUS) { ++expectLength; --expect; } expectNeeded=(UBool)(0!=u_memcmp(buffer, in, inLength)); } else { expect=in; expectLength=inLength; expectNeeded=FALSE; } } index=iter->getIndex(iter, UITER_CURRENT); if(U_FAILURE(errorCode)) { log_data_err("error unorm iteration (next/previous %d %s)[%d]: %s - (Are you missing data?)\n", 
forward, _modeString[mode], i, u_errorName(errorCode)); return; } if(expectIndex!=index) { log_err("error unorm iteration (next/previous %d %s): index[%d] wrong, got %d expected %d\n", forward, _modeString[mode], i, index, expectIndex); return; } if(expectLength!=length) { log_err("error unorm iteration (next/previous %d %s): length[%d] wrong, got %d expected %d\n", forward, _modeString[mode], i, length, expectLength); return; } if(0!=u_memcmp(expect, buffer, length)) { log_err("error unorm iteration (next/previous %d %s): output string[%d] wrong\n", forward, _modeString[mode], i); return; } if(neededToNormalize!=expectNeeded) { } if(forward) { expect+=expectLength+1; /* go after the + */ ++i; } else { --expect; /* go before the + */ --i; } } } static void TestNextPrevious() { static const UChar src[]={ /* input string */ 0xa0, 0xe4, 0x63, 0x302, 0x327, 0xac00, 0x3133 }, nfd[]={ /* + separates expected output pieces */ 0xa0, _PLUS, 0x61, 0x308, _PLUS, 0x63, 0x327, 0x302, _PLUS, 0x1100, 0x1161, _PLUS, 0x3133 }, nfkd[]={ 0x20, _PLUS, 0x61, 0x308, _PLUS, 0x63, 0x327, 0x302, _PLUS, 0x1100, 0x1161, _PLUS, 0x11aa }, nfc[]={ 0xa0, _PLUS, 0xe4, _PLUS, 0xe7, 0x302, _PLUS, 0xac00, _PLUS, 0x3133 }, nfkc[]={ 0x20, _PLUS, 0xe4, _PLUS, 0xe7, 0x302, _PLUS, 0xac03 }, fcd[]={ 0xa0, _PLUS, 0xe4, _PLUS, 0x63, 0x327, 0x302, _PLUS, 0xac00, _PLUS, 0x3133 }; /* expected iterator indexes in the source string for each iteration piece */ static const int32_t nfdIndexes[]={ 0, 1, 2, 5, 6, 7 }, nfkdIndexes[]={ 0, 1, 2, 5, 6, 7 }, nfcIndexes[]={ 0, 1, 2, 5, 6, 7 }, nfkcIndexes[]={ 0, 1, 2, 5, 7 }, fcdIndexes[]={ 0, 1, 2, 5, 6, 7 }; UCharIterator iter; UChar buffer[4]; int32_t length; UBool neededToNormalize; UErrorCode errorCode; uiter_setString(&iter, src, sizeof(src)/U_SIZEOF_UCHAR); /* test iteration with doNormalize */ iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFD, TRUE, nfd, sizeof(nfd)/U_SIZEOF_UCHAR, nfdIndexes, sizeof(nfdIndexes)/4); iter.index=0; 
_testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKD, TRUE, nfkd, sizeof(nfkd)/U_SIZEOF_UCHAR, nfkdIndexes, sizeof(nfkdIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFC, TRUE, nfc, sizeof(nfc)/U_SIZEOF_UCHAR, nfcIndexes, sizeof(nfcIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKC, TRUE, nfkc, sizeof(nfkc)/U_SIZEOF_UCHAR, nfkcIndexes, sizeof(nfkcIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_FCD, TRUE, fcd, sizeof(fcd)/U_SIZEOF_UCHAR, fcdIndexes, sizeof(fcdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFD, FALSE, nfd, sizeof(nfd)/U_SIZEOF_UCHAR, nfdIndexes, sizeof(nfdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKD, FALSE, nfkd, sizeof(nfkd)/U_SIZEOF_UCHAR, nfkdIndexes, sizeof(nfkdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFC, FALSE, nfc, sizeof(nfc)/U_SIZEOF_UCHAR, nfcIndexes, sizeof(nfcIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKC, FALSE, nfkc, sizeof(nfkc)/U_SIZEOF_UCHAR, nfkcIndexes, sizeof(nfkcIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_FCD, FALSE, fcd, sizeof(fcd)/U_SIZEOF_UCHAR, fcdIndexes, sizeof(fcdIndexes)/4); /* test iteration without doNormalize */ iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFD, TRUE, NULL, 0, nfdIndexes, sizeof(nfdIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKD, TRUE, NULL, 0, nfkdIndexes, sizeof(nfkdIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFC, TRUE, NULL, 0, nfcIndexes, sizeof(nfcIndexes)/4); iter.index=0; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKC, TRUE, NULL, 0, nfkcIndexes, sizeof(nfkcIndexes)/4); iter.index=0; _testIter(src, 
sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_FCD, TRUE, NULL, 0, fcdIndexes, sizeof(fcdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFD, FALSE, NULL, 0, nfdIndexes, sizeof(nfdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKD, FALSE, NULL, 0, nfkdIndexes, sizeof(nfkdIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFC, FALSE, NULL, 0, nfcIndexes, sizeof(nfcIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_NFKC, FALSE, NULL, 0, nfkcIndexes, sizeof(nfkcIndexes)/4); iter.index=iter.length; _testIter(src, sizeof(src)/U_SIZEOF_UCHAR, &iter, UNORM_FCD, FALSE, NULL, 0, fcdIndexes, sizeof(fcdIndexes)/4); /* try without neededToNormalize */ errorCode=U_ZERO_ERROR; buffer[0]=5; iter.index=1; length=unorm_next(&iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, UNORM_NFD, 0, TRUE, NULL, &errorCode); if(U_FAILURE(errorCode) || length!=2 || buffer[0]!=nfd[2] || buffer[1]!=nfd[3]) { log_data_err("error unorm_next(without needed) %s - (Are you missing data?)\n", u_errorName(errorCode)); return; } /* preflight */ neededToNormalize=9; iter.index=1; length=unorm_next(&iter, NULL, 0, UNORM_NFD, 0, TRUE, &neededToNormalize, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR || neededToNormalize!=FALSE || length!=2) { log_err("error unorm_next(pure preflighting) %s\n", u_errorName(errorCode)); return; } errorCode=U_ZERO_ERROR; buffer[0]=buffer[1]=5; neededToNormalize=9; iter.index=1; length=unorm_next(&iter, buffer, 1, UNORM_NFD, 0, TRUE, &neededToNormalize, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR || neededToNormalize!=FALSE || length!=2 || buffer[1]!=5) { log_err("error unorm_next(preflighting) %s\n", u_errorName(errorCode)); return; } /* no iterator */ errorCode=U_ZERO_ERROR; buffer[0]=buffer[1]=5; neededToNormalize=9; iter.index=1; length=unorm_next(NULL, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, 
UNORM_NFD, 0, TRUE, &neededToNormalize, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("error unorm_next(no iterator) %s\n", u_errorName(errorCode)); return; } /* illegal mode */ buffer[0]=buffer[1]=5; neededToNormalize=9; iter.index=1; length=unorm_next(&iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, (UNormalizationMode)0, 0, TRUE, &neededToNormalize, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("error unorm_next(illegal mode) %s\n", u_errorName(errorCode)); return; } /* error coming in */ errorCode=U_MISPLACED_QUANTIFIER; buffer[0]=5; iter.index=1; length=unorm_next(&iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR, UNORM_NFD, 0, TRUE, NULL, &errorCode); if(errorCode!=U_MISPLACED_QUANTIFIER) { log_err("error unorm_next(U_MISPLACED_QUANTIFIER) %s\n", u_errorName(errorCode)); return; } } static void TestFCNFKCClosure(void) { static const struct { UChar32 c; const UChar s[6]; } tests[]={ { 0x00C4, { 0 } }, { 0x00E4, { 0 } }, { 0x037A, { 0x0020, 0x03B9, 0 } }, { 0x03D2, { 0x03C5, 0 } }, { 0x20A8, { 0x0072, 0x0073, 0 } }, { 0x210B, { 0x0068, 0 } }, { 0x210C, { 0x0068, 0 } }, { 0x2121, { 0x0074, 0x0065, 0x006C, 0 } }, { 0x2122, { 0x0074, 0x006D, 0 } }, { 0x2128, { 0x007A, 0 } }, { 0x1D5DB, { 0x0068, 0 } }, { 0x1D5ED, { 0x007A, 0 } }, { 0x0061, { 0 } } }; UChar buffer[8]; UErrorCode errorCode; int32_t i, length; for(i=0; i=0) { log_err("unorm2_getDecomposition(fcc, space) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getDecomposition(n2, 0xe4, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x61 || decomp[1]!=0x308 || decomp[2]!=0) { log_err("unorm2_getDecomposition(fcc, a-umlaut) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getDecomposition(n2, 0xac01, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=3 || decomp[0]!=0x1100 || decomp[1]!=0x1161 || decomp[2]!=0x11a8 || decomp[3]!=0) { log_err("unorm2_getDecomposition(fcc, Hangul syllable U+AC01) 
failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getDecomposition(n2, 0xac01, NULL, 0, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3) { log_err("unorm2_getDecomposition(fcc, Hangul syllable U+AC01) overflow failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getDecomposition(n2, 0xac01, decomp, -1, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("unorm2_getDecomposition(fcc, capacity<0) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getDecomposition(n2, 0xac01, NULL, 4, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("unorm2_getDecomposition(fcc, decomposition=NULL) failed\n"); } } static void TestGetRawDecomposition() { UChar decomp[32]; int32_t length; UErrorCode errorCode=U_ZERO_ERROR; const UNormalizer2 *n2=unorm2_getNFKCInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFKCInstance() failed: %s\n", u_errorName(errorCode)); return; } /* * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, * without recursive decomposition. 
*/ length=unorm2_getRawDecomposition(n2, 0x20, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length>=0) { log_err("unorm2_getDecomposition(nfkc, space) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xe4, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x61 || decomp[1]!=0x308 || decomp[2]!=0) { log_err("unorm2_getDecomposition(nfkc, a-umlaut) failed\n"); } /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0x1e08, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0xc7 || decomp[1]!=0x301 || decomp[2]!=0) { log_err("unorm2_getDecomposition(nfkc, c-cedilla-acute) failed\n"); } /* U+212B ANGSTROM SIGN */ errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0x212b, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=1 || decomp[0]!=0xc5 || decomp[1]!=0) { log_err("unorm2_getDecomposition(nfkc, angstrom sign) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xac00, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0x1100 || decomp[1]!=0x1161 || decomp[2]!=0) { log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC00) failed\n"); } /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. 
*/ errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xac01, decomp, UPRV_LENGTHOF(decomp), &errorCode); if(U_FAILURE(errorCode) || length!=2 || decomp[0]!=0xac00 || decomp[1]!=0x11a8 || decomp[2]!=0) { log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC01) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xac01, NULL, 0, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=2) { log_err("unorm2_getDecomposition(nfkc, Hangul syllable U+AC01) overflow failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xac01, decomp, -1, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("unorm2_getDecomposition(nfkc, capacity<0) failed\n"); } errorCode=U_ZERO_ERROR; length=unorm2_getRawDecomposition(n2, 0xac01, NULL, 4, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) { log_err("unorm2_getDecomposition(nfkc, decomposition=NULL) failed\n"); } } static void TestAppendRestoreMiddle() { UChar a[20]={ 0x61, 0x62, 0x63, 0x41, 0x327, 0 }; /* last chars are 'A' and 'cedilla' NFC */ static const UChar b[]={ 0x30A, 0x64, 0x65, 0x66, 0 }; /* first char is 'ring above' NFC */ /* NFC: C5 is 'A with ring above' */ static const UChar expected[]={ 0x61, 0x62, 0x63, 0xC5, 0x327, 0x64, 0x65, 0x66 }; int32_t length; UErrorCode errorCode=U_ZERO_ERROR; const UNormalizer2 *n2=unorm2_getNFCInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFCInstance() failed: %s\n", u_errorName(errorCode)); return; } /* * Use length=-1 to fool the estimate of the ReorderingBuffer capacity. * Use a capacity of 6 or 7 so that the middle sequence <41 327 30A> * still fits into a[] but the full result still overflows this capacity. * (Let it modify the destination buffer before reallocating internally.) 
*/ length=unorm2_append(n2, a, -1, 6, b, -1, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=UPRV_LENGTHOF(expected)) { log_err("unorm2_append(preflight) returned wrong length of %d\n", (int)length); return; } /* Verify that the middle is unchanged or restored. (ICU ticket #7848) */ if(a[0]!=0x61 || a[1]!=0x62 || a[2]!=0x63 || a[3]!=0x41 || a[4]!=0x327 || a[5]!=0) { log_err("unorm2_append(overflow) modified the first string\n"); return; } errorCode=U_ZERO_ERROR; length=unorm2_append(n2, a, -1, UPRV_LENGTHOF(a), b, -1, &errorCode); if(U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(expected) || 0!=u_memcmp(a, expected, length)) { log_err("unorm2_append(real) failed - %s, length %d\n", u_errorName(errorCode), (int)length); return; } } static void TestGetEasyToUseInstance() { static const UChar in[]={ 0xA0, /* -> 0020 */ 0xC7, 0x301 /* = 1E08 = 0043 0327 0301 */ }; UChar out[32]; int32_t length; UErrorCode errorCode=U_ZERO_ERROR; const UNormalizer2 *n2=unorm2_getNFCInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFCInstance() failed: %s\n", u_errorName(errorCode)); return; } length=unorm2_normalize(n2, in, UPRV_LENGTHOF(in), out, UPRV_LENGTHOF(out), &errorCode); if(U_FAILURE(errorCode) || length!=2 || out[0]!=0xa0 || out[1]!=0x1e08) { log_err("unorm2_getNFCInstance() did not return an NFC instance (normalized length=%d; %s)\n", (int)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; n2=unorm2_getNFDInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFDInstance() failed: %s\n", u_errorName(errorCode)); return; } length=unorm2_normalize(n2, in, UPRV_LENGTHOF(in), out, UPRV_LENGTHOF(out), &errorCode); if(U_FAILURE(errorCode) || length!=4 || out[0]!=0xa0 || out[1]!=0x43 || out[2]!=0x327 || out[3]!=0x301) { log_err("unorm2_getNFDInstance() did not return an NFD instance (normalized length=%d; %s)\n", (int)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; 
n2=unorm2_getNFKCInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFKCInstance() failed: %s\n", u_errorName(errorCode)); return; } length=unorm2_normalize(n2, in, UPRV_LENGTHOF(in), out, UPRV_LENGTHOF(out), &errorCode); if(U_FAILURE(errorCode) || length!=2 || out[0]!=0x20 || out[1]!=0x1e08) { log_err("unorm2_getNFKCInstance() did not return an NFKC instance (normalized length=%d; %s)\n", (int)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; n2=unorm2_getNFKDInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFKDInstance() failed: %s\n", u_errorName(errorCode)); return; } length=unorm2_normalize(n2, in, UPRV_LENGTHOF(in), out, UPRV_LENGTHOF(out), &errorCode); if(U_FAILURE(errorCode) || length!=4 || out[0]!=0x20 || out[1]!=0x43 || out[2]!=0x327 || out[3]!=0x301) { log_err("unorm2_getNFKDInstance() did not return an NFKD instance (normalized length=%d; %s)\n", (int)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; n2=unorm2_getNFKCCasefoldInstance(&errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "unorm2_getNFKCCasefoldInstance() failed: %s\n", u_errorName(errorCode)); return; } length=unorm2_normalize(n2, in, UPRV_LENGTHOF(in), out, UPRV_LENGTHOF(out), &errorCode); if(U_FAILURE(errorCode) || length!=2 || out[0]!=0x20 || out[1]!=0x1e09) { log_err("unorm2_getNFKCCasefoldInstance() did not return an NFKC_Casefold instance (normalized length=%d; %s)\n", (int)length, u_errorName(errorCode)); } } #endif /* #if !UCONFIG_NO_NORMALIZATION */