/********************************************************************
 * COPYRIGHT:
 * Copyright (c) 1997-1999, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/
/********************************************************************************
*
* File CREGRTST.C
*
* Modification History:
*        Name                     Description
*     Madhu Katragadda            Ported for C API, added extra functions and tests
*********************************************************************************
*/

/* C FUNCTIONALITY AND REGRESSION TEST FOR BREAKITERATOR */

#include <stdlib.h>
#include <string.h>

#include "unicode/utypes.h"
#include "unicode/uloc.h"
#include "unicode/ubrk.h"
#include "unicode/uchar.h"
#include "unicode/ustring.h"

#include "cintltst.h"
#include "cregrtst.h"
#include "ccolltst.h"

/* -------------------------------------------------------------------------------------- */

/**
 * "Vector" structure for holding test tables
 * (this structure is actually a linked list, but we use the name and API of the
 * java.util.Vector class to keep as much of our test code as possible the same.)
*/ struct Vector1 { UChar *text; struct Vector1 *link; }; typedef struct Vector1 Vector; void addElement(Vector *q, const char* string) { Vector *p; p=(Vector*)malloc(sizeof(Vector)); p->text=(UChar*)malloc(sizeof(UChar) * (strlen(string)+1)); u_uastrcpy(p->text, string); p->link=NULL; while(q->link!=NULL) q=q->link; q->link=p; } void addElement2(Vector *q, const UChar* string) { Vector *p; p=(Vector*)malloc(sizeof(Vector)); p->text=(UChar*)malloc(sizeof(UChar) * (u_strlen(string)+1)); u_strcpy(p->text, string); p->link=NULL; while(q->link!=NULL) q=q->link; q->link=p; } int32_t Count(Vector *q) { int32_t c=0; while(q!=NULL){ q=q->link; c++; } return c; } UChar* elementAt(Vector *q, int32_t pos) { int32_t i=0; if(q==NULL) return NULL; for(i=0;ilink; return (q->text); } /* Just to make it easier to use with UChar array.*/ UChar* UCharToUCharArray(const UChar uchar) { UChar *buffer; UChar *alias; buffer=(UChar*)malloc(sizeof(uchar) * 2); alias=buffer; *alias=uchar; alias++; *alias=0x0000; return buffer; } UChar* extractBetween(UTextOffset start, UTextOffset end, UChar* text) { UChar* result; UChar* temp; temp=(UChar*)malloc(sizeof(UChar) * ((u_strlen(text)-start)+1)); result=(UChar*)malloc(sizeof(UChar) * ((end-start)+1)); u_strcpy(temp, &text[start]); u_strncpy(result, temp, end-start); return result; } /* -------------------------------------------------------------------------------------- */ /** * BrealIterator Regression Test is medium top level test class for everything in the C BreakIterator API * (ubrk.h and ubrk.c). 
*/

/*
 * Code units used as the base text of the invariant tests. The list ends with
 * 0x0000, so the array is used below as a NUL-terminated UChar string
 * (u_strlen / u_strcat).
 */
const UChar cannedTestArray[] = {
    0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, 0x005c, 0x0022,
    0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d,
    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e,
    0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005B, 0x005d, 0x005e,
    0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
    0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2, 0x00a3, 0x00a4, 0x00a5,
    0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af,
    0x00b0, 0x00b2, 0x00b3, 0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd,
    0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302,
    0x0303, 0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903,
    0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000, 0x2001,
    0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012,
    0x2028, 0x2029, 0x202a, 0x203e, 0x203f, 0x2040, 0x20dd, 0x20de,
    0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
};

/*--------------------------------------------- */
/* setup methods */
/*--------------------------------------------- */

/*
 * Allocates cannedTestChars and fills it with the six characters "0x0000"
 * followed by the contents of cannedTestArray.
 * NOTE(review): the literal text "0x0000" is what gets copied, not a single
 * U+0000 code unit - confirm that this prefix is intended.
 */
void AllocateTextBoundary()
{
    /* the +10 leaves room for the 6-character prefix and the terminator */
    cannedTestChars=(UChar*)malloc(sizeof(UChar) * (u_strlen(cannedTestArray) + 10));
    u_uastrcpy(cannedTestChars,"");
    /* overwrites the empty string written just above */
    u_uastrcpy(cannedTestChars,"0x0000");
    u_strcat(cannedTestChars, cannedTestArray);
}

/* Releases the buffer allocated by AllocateTextBoundary(). */
void FreeTextBoundary()
{
    free(cannedTestChars);
}

/*
 * Add Word Data.
 * Builds wordSelectionData, the ordered list of text pieces used by the word
 * tests, and stores the value returned by createTestData() for that list in
 * testWordText. The first piece ("12,34") lives in the head node; every
 * other piece is appended in the order written here.
 */
void addTestWordData()
{
    int32_t elems;

    /* head node: 6 UChars hold "12,34" plus the terminator */
    wordSelectionData=(Vector*)malloc(sizeof(Vector));
    wordSelectionData->text=(UChar*)malloc(sizeof(UChar) * 6);
    u_uastrcpy(wordSelectionData->text, "12,34");
    wordSelectionData->link=NULL;

    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, UCharToUCharArray((UChar)(0x00A2))); /*cent sign */
    addElement2(wordSelectionData, UCharToUCharArray((UChar)(0x00A3))); /*pound sign */
    addElement2(wordSelectionData, UCharToUCharArray((UChar)(0x00A4))); /*currency sign */
    addElement2(wordSelectionData, UCharToUCharArray((UChar)(0x00A5))); /*yen sign */
    addElement(wordSelectionData, "alpha-beta-gamma");
    addElement(wordSelectionData, ".");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "Badges");
    addElement(wordSelectionData, "?");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "BADGES");
    addElement(wordSelectionData, "!");
    addElement(wordSelectionData, "?");
    addElement(wordSelectionData, "!");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "We");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "don't");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "need");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "no");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "STINKING");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "BADGES");
    addElement(wordSelectionData, "!");
    addElement(wordSelectionData, "!");
    addElement(wordSelectionData, "!");

    /* numbers with various separators, percent and currency signs */
    addElement(wordSelectionData, "012.566,5");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "123.3434,900");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "1000,233,456.000");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "1,23.322%");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "123.1222");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "$123,000.20");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "179.01%");

    addElement(wordSelectionData, "Hello");
    addElement(wordSelectionData, ",");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "how");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "are");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "you");
    addElement(wordSelectionData, " ");
    addElement(wordSelectionData, "X");
    addElement(wordSelectionData, " ");

    /* words separated by CR, LF and CR LF in several combinations */
    addElement(wordSelectionData, "Now");
    addElement(wordSelectionData, "\r");
    addElement(wordSelectionData, "is");
    addElement(wordSelectionData, "\n");
    addElement(wordSelectionData, "the");
    addElement(wordSelectionData, "\r\n");
    addElement(wordSelectionData, "time");
    addElement(wordSelectionData, "\n");
    addElement(wordSelectionData, "\r");
    addElement(wordSelectionData, "for");
    addElement(wordSelectionData, "\r");
    addElement(wordSelectionData, "\r");
    addElement(wordSelectionData, "all");
    addElement(wordSelectionData, " ");

    /* to test for bug #4097779 */
    addElement2(wordSelectionData, CharsToUChars("aa\\u0300a"));
    addElement(wordSelectionData, " ");

    /* to test for bug #4098467
       What follows is a string of Korean characters (I found it in the Yellow Pages
       ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
       it correctly), first as precomposed syllables, and then as conjoining jamo.
       Both sequences should be semantically identical and break the same way.
       precomposed syllables... */
    addElement2(wordSelectionData, CharsToUChars("\\uc0c1\\ud56d"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\ud55c\\uc778"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\uc5f0\\ud569"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\uc7a5\\ub85c\\uad50\\ud68c"));
    addElement(wordSelectionData, " ");
    /* conjoining jamo... 
*/
    /* (remainder of addTestWordData) the same Korean text as above, written with conjoining jamo */
    addElement2(wordSelectionData, CharsToUChars("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8"));
    addElement(wordSelectionData, " ");
    addElement2(wordSelectionData, CharsToUChars("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
    addElement(wordSelectionData, " ");

    /* this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
       count as a Kanji character for the purposes of word breaking */
    addElement(wordSelectionData, "abc");
    addElement2(wordSelectionData, CharsToUChars("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
    addElement(wordSelectionData, "abc");

    /* hand the finished list and its node count to createTestData() */
    elems= Count(wordSelectionData);
    log_verbose("In word, the no: of words are: %d\n", elems);
    testWordText = createTestData(wordSelectionData, elems);
}

/* Separator code units appended to some of the sentence pieces below. */
const UChar kParagraphSeparator = 0x2029;
const UChar kLineSeparator = 0x2028;

/**
 * Add Sentence Data.
 * Builds sentenceSelectionData, the ordered list of text pieces used by the
 * sentence tests, and stores the value returned by createTestData() for that
 * list in testSentenceText. Pieces that end in a separator character are
 * assembled in the scratch buffer temp before being appended.
 */
void addTestSentenceData()
{
    int32_t elems;
    UChar temp[100];   /* scratch buffer for pieces built with u_uastrcpy + u_strcat */

    /* head node holds the first sentence */
    sentenceSelectionData=(Vector*)malloc(sizeof(Vector));
    sentenceSelectionData->text=(UChar*)malloc(sizeof(UChar) * (strlen("This is a simple sample sentence. ")+1));
    u_uastrcpy(sentenceSelectionData->text, "This is a simple sample sentence. ");
    sentenceSelectionData->link=NULL;

    /* addElement(sentenceSelectionData, "This is a simple sample sentence. "); */
    addElement(sentenceSelectionData, "(This is it.) ");
    addElement(sentenceSelectionData, "This is a simple sample sentence. ");
    addElement(sentenceSelectionData, "\"This isn\'t it.\" ");
    addElement(sentenceSelectionData, "Hi! ");
    addElement(sentenceSelectionData, "This is a simple sample sentence. ");
    /* NOTE(review): in this copy of the file the next literal contains a line break
       inside the quotes; it is kept exactly as found - confirm against the original. */
    addElement(sentenceSelectionData, "It does not have to make any sense as you can see. 
");
    addElement(sentenceSelectionData, "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
    addElement(sentenceSelectionData, "Che la dritta via aveo smarrita. ");
    addElement(sentenceSelectionData, "He said, that I said, that you said!! ");

    u_uastrcpy(temp, "Don't rock the boat");
    u_strcat(temp, UCharToUCharArray(kParagraphSeparator));
    addElement2(sentenceSelectionData, temp);

    addElement(sentenceSelectionData, "Because I am the daddy, that is why. ");
    addElement(sentenceSelectionData, "Not on my time (el timo.)! ");

    u_uastrcpy(temp, "So what!!");
    u_strcat(temp, UCharToUCharArray(kParagraphSeparator));
    addElement2(sentenceSelectionData, temp);

    addElement(sentenceSelectionData, "\"But now,\" he said, \"I know!\" ");
    addElement(sentenceSelectionData, "Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
    addElement(sentenceSelectionData, "One species, B. anthracis, is highly virulent.\n");
    addElement(sentenceSelectionData, "Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
    addElement(sentenceSelectionData, "Have you ever said, \"This is where\tI shall live\"? ");
    addElement(sentenceSelectionData, "He answered, \"You may not!\" ");
    addElement(sentenceSelectionData, "Another popular saying is: \"How do you do?\". ");
    addElement(sentenceSelectionData, "Yet another popular saying is: \'I\'m fine thanks.\' ");
    addElement(sentenceSelectionData, "What is the proper use of the abbreviation pp.? ");
    addElement(sentenceSelectionData, "Yes, I am definatelly 12\" tall!!");

    /* test for bug #4113835: \n and \r count as spaces, not as paragraph breaks */
    u_uastrcpy(temp, "Now\ris\nthe\r\ntime\n\rfor\r\rall");
    u_strcat(temp, UCharToUCharArray(kParagraphSeparator));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4117554: Treat fullwidth variants of .!? the same as their
       normal counterparts */
    addElement2(sentenceSelectionData, CharsToUChars("I know I'm right\\uff0e "));
    addElement2(sentenceSelectionData, CharsToUChars("Right\\uff1f "));
    addElement2(sentenceSelectionData, CharsToUChars("Right\\uff01 "));

    /* test for bug #4117554: Break sentence between a sentence terminator and
       opening punctuation */
    addElement(sentenceSelectionData, "no?");
    u_uastrcpy(temp, "(yes)");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4158381: Don't break sentence after period if it isn't
       followed by a space */
    addElement(sentenceSelectionData, "Test Flags.Flag class. ");
    u_uastrcpy(temp, "Another test.");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4158381: No breaks when there are no terminators around */
    /* NOTE(review): the literal below looks damaged in this copy (line breaks and
       unescaped quotes inside the string); it is kept exactly as found - restore it
       from the original source. */
    addElement(sentenceSelectionData, "

Provides a set of "lightweight" (all-javaTM language) components that, to the maximum degree possible, work the same on all platforms. ");
    u_uastrcpy(temp, "Another test.");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4143071: Make sure sentences that end with digits work right */
    addElement(sentenceSelectionData, "Today is the 27th of May, 1998. ");
    addElement(sentenceSelectionData, "Tomorrow with be 28 May 1998. ");
    u_uastrcpy(temp, "The day after will be the 30th.");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4152416: Make sure sentences ending with a capital letter are
       treated correctly */
    addElement(sentenceSelectionData, "The type of all primitive boolean values accessed in the target VM. ");
    u_uastrcpy(temp, "Calls to xxx will return an implementor of this interface.");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* test for bug #4152117: Make sure sentence breaking is handling punctuation
       correctly */
    addElement(sentenceSelectionData, "Constructs a randomly generated BigInteger, uniformly distributed over the range 0 to (2numBits - 1), inclusive. ");
    /* NOTE(review): this literal also contains a line break inside the quotes in this copy */
    addElement(sentenceSelectionData, "The uniformity of the distribution assumes that a fair source of random bits is provided in rnd. 
");
    u_uastrcpy(temp, "Note that this constructor always constructs a non-negative BigInteger.");
    u_strcat(temp, CharsToUChars("\\u2029"));
    addElement2(sentenceSelectionData, temp);

    /* hand the finished list and its node count to createTestData() */
    elems = Count(sentenceSelectionData);
    log_verbose("In sentence: the no: of sentences are %d\n", elems);
    testSentenceText = createTestData(sentenceSelectionData, elems);
}

/**
 * Add Line Data.
 * Builds lineSelectionData, the ordered list of text pieces used by the line
 * tests, and stores the value returned by createTestData() for that list in
 * testLineText.
 */
void addTestLineData()
{
    int32_t elems;

    /* head node: 7 UChars hold "Multi-" plus the terminator */
    lineSelectionData=(Vector*)malloc(sizeof(Vector));
    lineSelectionData->text=(UChar*)malloc(sizeof(UChar) * 7);
    u_uastrcpy(lineSelectionData->text, "Multi-");
    lineSelectionData->link=NULL;

    /* lineSelectionData->addElement("Multi-"); */
    addElement(lineSelectionData, "Level ");
    addElement(lineSelectionData, "example ");
    addElement(lineSelectionData, "of ");
    addElement(lineSelectionData, "a ");
    addElement(lineSelectionData, "semi-");
    addElement(lineSelectionData, "idiotic ");
    addElement(lineSelectionData, "non-");
    addElement(lineSelectionData, "sensical ");
    addElement(lineSelectionData, "(non-");
    addElement(lineSelectionData, "important) ");
    addElement(lineSelectionData, "sentence. ");

    addElement(lineSelectionData, "Hi ");
    addElement(lineSelectionData, "Hello ");
    addElement(lineSelectionData, "How\n");
    addElement(lineSelectionData, "are\r");
    addElement2(lineSelectionData, CharsToUChars("you\\u2028")); /* line separator */
    addElement(lineSelectionData, "fine.\t");
    /* NOTE(review): this literal contains a line break inside the quotes in this copy */
    addElement(lineSelectionData, "good. 
");

    addElement(lineSelectionData, "Now\r");
    addElement(lineSelectionData, "is\n");
    addElement(lineSelectionData, "the\r\n");
    addElement(lineSelectionData, "time\n");
    addElement(lineSelectionData, "\r");
    addElement(lineSelectionData, "for\r");
    addElement(lineSelectionData, "\r");
    addElement(lineSelectionData, "all ");

    /* to test for bug #4068133 */
    addElement2(lineSelectionData, CharsToUChars("\\u96f6"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e00\\u3002"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e8c\\u3001"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e09\\u3002\\u3001"));
    addElement2(lineSelectionData, CharsToUChars("\\u56db\\u3001\\u3002\\u3001"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e94,"));
    addElement2(lineSelectionData, CharsToUChars("\\u516d."));
    addElement2(lineSelectionData, CharsToUChars("\\u4e03.\\u3001,\\u3002"));
    addElement2(lineSelectionData, CharsToUChars("\\u516b"));

    /* to test for bug #4086052 */
    addElement2(lineSelectionData, CharsToUChars("foo\\u00a0bar "));

    /* to test for bug #4097920 */
    addElement(lineSelectionData, "dog,");
    addElement(lineSelectionData, "cat,");
    addElement(lineSelectionData, "mouse ");
    addElement(lineSelectionData, "(one)");
    addElement(lineSelectionData, "(two)\n");

    /* to test for bug #4035266 */
    addElement(lineSelectionData, "The ");
    addElement(lineSelectionData, "balance ");
    addElement(lineSelectionData, "is ");
    addElement(lineSelectionData, "$-23,456.78, ");
    addElement(lineSelectionData, "not ");
    addElement(lineSelectionData, "-$32,456.78!\n");

    /* to test for bug #4098467
       What follows is a string of Korean characters (I found it in the Yellow Pages
       ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
       it correctly), first as precomposed syllables, and then as conjoining jamo.
       Both sequences should be semantically identical and break the same way.
       precomposed syllables... 
*/
    /* (remainder of addTestLineData) each Korean piece except the last ends in a space */
    addElement2(lineSelectionData, CharsToUChars("\\uc0c1\\ud56d "));
    addElement2(lineSelectionData, CharsToUChars("\\ud55c\\uc778 "));
    addElement2(lineSelectionData, CharsToUChars("\\uc5f0\\ud569 "));
    addElement2(lineSelectionData, CharsToUChars("\\uc7a5\\ub85c\\uad50\\ud68c "));
    /* conjoining jamo... */
    addElement2(lineSelectionData, CharsToUChars("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));
    addElement2(lineSelectionData, CharsToUChars("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));
    addElement2(lineSelectionData, CharsToUChars("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));
    addElement2(lineSelectionData, CharsToUChars("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));

    /* to test for bug #4117554: Fullwidth .!? should be treated as postJwrd */
    addElement2(lineSelectionData, CharsToUChars("\\u4e01\\uff0e"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e02\\uff01"));
    addElement2(lineSelectionData, CharsToUChars("\\u4e03\\uff1f"));

    /* hand the finished list and its node count to createTestData() */
    elems = Count(lineSelectionData);
    log_verbose("In line: the no: of lines are %d\n", elems);
    testLineText = createTestData(lineSelectionData, elems);
}

/*
   Named combinations kept from an earlier form of this test (not compiled):
const UChar* graveS = "S" + (UniChar)0x0300;
const UChar* acuteBelowI = "i" + UCharToUCharArray(0x0317);
const UChar* acuteE = "e" + UCharToUCharArray(0x0301);
const UChar* circumflexA = "a" + UCharToUCharArray(0x0302);
const UChar* tildeE = "e" + UCharToUCharArray(0x0303);
*/

/**
 * Add Character Data.
 * Builds characterSelectionData, the ordered list of text pieces used by the
 * character tests, and stores the value returned by createTestData() for
 * that list in testCharacterText. Pieces made of a letter plus one further
 * code unit are assembled in the scratch buffer temp.
 * The trailing labels (graveS, acuteBelowI, ...) are the names from the
 * commented-out list above; they do not all match the code units that are
 * actually appended here.
 */
void addTestCharacterData()
{
    int32_t elems;
    UChar temp[10];   /* scratch buffer: one letter, one appended code unit, terminator */

    /* head node: 2 UChars hold "B" plus the terminator */
    characterSelectionData=(Vector*)malloc(sizeof(Vector));
    characterSelectionData->text=(UChar*)malloc(sizeof(UChar) * 2);
    u_uastrcpy(characterSelectionData->text, "B");
    characterSelectionData->link=NULL;

    u_uastrcpy(temp, "S");
    u_strcat(temp, UCharToUCharArray(0x0317));
    addElement2(characterSelectionData, temp); /* labelled graveS; the code appends 0x0317 */

    u_uastrcpy(temp, "i");
    u_strcat(temp, UCharToUCharArray(0x0301));
    addElement2(characterSelectionData, temp); /* labelled acuteBelowI; the code appends 0x0301 */

    addElement(characterSelectionData, "m");
    addElement(characterSelectionData, "p");
    addElement(characterSelectionData, "l");

    u_uastrcpy(temp, "e");
    u_strcat(temp, UCharToUCharArray(0x0301));
    addElement2(characterSelectionData, temp);/* acuteE */

    addElement(characterSelectionData, " ");
    addElement(characterSelectionData, "s");

    u_uastrcpy(temp, "a");
    u_strcat(temp, UCharToUCharArray(0x0302));
    addElement2(characterSelectionData, temp);/* circumflexA */

    addElement(characterSelectionData, "m");
    addElement(characterSelectionData, "p");
    addElement(characterSelectionData, "l");

    u_uastrcpy(temp, "e");
    u_strcat(temp, UCharToUCharArray(0x0303));
    addElement2(characterSelectionData, temp); /* tildeE */

    addElement(characterSelectionData, ".");
    addElement(characterSelectionData, "w");

    u_uastrcpy(temp, "a");
    u_strcat(temp, UCharToUCharArray(0x0302));
    addElement2(characterSelectionData, temp);/* circumflexA */

    addElement(characterSelectionData, "w");
    addElement(characterSelectionData, "a");
    addElement(characterSelectionData, "f");
    addElement(characterSelectionData, "q");
    /* "\r\n" is added as a single piece */
    addElement(characterSelectionData, "\n");
    addElement(characterSelectionData, "\r");
    addElement(characterSelectionData, "\r\n");
    addElement(characterSelectionData, "\n");
    addElement(characterSelectionData, "E");

    /* to test for bug #4098467
       What follows is a string of Korean characters (I found it in the Yellow Pages
       ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
       it correctly), first as precomposed syllables, and then as conjoining jamo.
       Both sequences should be semantically identical and break the same way.
       precomposed syllables... */
    addElement2(characterSelectionData, CharsToUChars("\\uc0c1"));
    addElement2(characterSelectionData, CharsToUChars("\\ud56d"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\ud55c"));
    addElement2(characterSelectionData, CharsToUChars("\\uc778"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\uc5f0"));
    addElement2(characterSelectionData, CharsToUChars("\\ud569"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\uc7a5"));
    addElement2(characterSelectionData, CharsToUChars("\\ub85c"));
    addElement2(characterSelectionData, CharsToUChars("\\uad50"));
    addElement2(characterSelectionData, CharsToUChars("\\ud68c"));
    addElement(characterSelectionData, " ");
    /* conjoining jamo... each multi-unit jamo sequence is added as one piece */
    addElement2(characterSelectionData, CharsToUChars("\\u1109\\u1161\\u11bc"));
    addElement2(characterSelectionData, CharsToUChars("\\u1112\\u1161\\u11bc"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\u1112\\u1161\\u11ab"));
    addElement2(characterSelectionData, CharsToUChars("\\u110b\\u1175\\u11ab"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\u110b\\u1167\\u11ab"));
    addElement2(characterSelectionData, CharsToUChars("\\u1112\\u1161\\u11b8"));
    addElement(characterSelectionData, " ");
    addElement2(characterSelectionData, CharsToUChars("\\u110c\\u1161\\u11bc"));
    addElement2(characterSelectionData, CharsToUChars("\\u1105\\u1169"));
    addElement2(characterSelectionData, CharsToUChars("\\u1100\\u116d"));
    addElement2(characterSelectionData, CharsToUChars("\\u1112\\u116c"));

    /* hand the finished list and its node count to createTestData() */
    elems = Count(characterSelectionData);
    log_verbose("In character: the no: of characters are %d", elems);
    testCharacterText = createTestData(characterSelectionData, elems);
}

/*
 * Called by the addTest*Data() functions with a piece list and its node count.
 * NOTE(review): most of this function's body is missing from this copy of the
 * file, so its exact behaviour cannot be described from here.
 */
UChar* createTestData(Vector *select, int32_t e)
{
    int32_t i, len;
    UChar* result;
    result=(UChar*)malloc(sizeof(UChar) * 2);
/* NOTE(review): the text from here to the first ubrk_close(e) is damaged in this copy:
   "while (i= 75) return;" is not a complete statement and the closing braces after it
   belong to code that is no longer present. It is reproduced unchanged - restore the
   missing code from the original source. */
u_uastrcpy(result, "");
i=0;
while (i= 75) return; } } } }
    ubrk_close(e);

    /* it does break after hyphens (unless they're followed by a digit, a non-spacing mark,
       a currency symbol, a non-breaking space, or a line or paragraph separator) */
    u_strcpy(dashes, CharsToUChars("-\\u00ad\\u2010\\u2012\\u2013\\u2014"));

    /* try every (character from s, dash, character from s) triple */
    for (i = 0; i < u_strlen(s); i++) {
        work[0] = s[i];
        for (j = 0; j < u_strlen(dashes); j++) {
            work[1] = dashes[j];
            for (k = 0; k < u_strlen(s); k++) {
                c = s[k];
                /* following characters of these kinds are skipped, no break is demanded for them */
                if (u_charType(c) == U_DECIMAL_DIGIT_NUMBER ||
                    u_charType(c) == U_OTHER_NUMBER ||
                    u_charType(c) == U_NON_SPACING_MARK ||
                    u_charType(c) == U_ENCLOSING_MARK ||
                    u_charType(c) == U_CURRENCY_SYMBOL ||
                    u_charType(c) == U_SPACE_SEPARATOR ||
                    u_charType(c) == U_DASH_PUNCTUATION ||
                    u_charType(c) == U_CONTROL_CHAR ||
                    u_charType(c) == U_FORMAT_CHAR ||
                    c == '\n' || c == '\r' || c == 0x2028 || c == 0x2029 ||
                    c == 0x0003 || c == 0x00a0 || c == 0x2007 || c == 0x2011 ||
                    c == 0xfeff)
                    continue;
                work[2] = c;
                /* NOTE(review): a new iterator is opened on every pass but ubrk_close(e)
                   is only called once, after the loops - looks like a leak, confirm. */
                e = ubrk_open(UBRK_LINE, "en_US", work, u_strlen(work), &status);
                if(U_FAILURE(status)){
                    log_err("FAIL: Error in opening the word break Iterator in testLineInvaiants:\n %s \n", myErrorName(status));
                    return;
                }
                /* a break must be reported at offset 2, i.e. right after the dash */
                saw2 = FALSE;
                for (l = ubrk_first(e); l != UBRK_DONE; l = ubrk_next(e))
                    if (l == 2)
                        saw2 = TRUE;
                if (!saw2) {
                    log_err("Didn't get break between U+%s and U+%s\n", austrdup(UCharToUCharArray(work[1])),
                        austrdup(UCharToUCharArray(work[2])) );
                    errorCount++;
                    /* give up after 75 reported errors */
                    if (errorCount >= 75)
                        return;
                }
            }
        }
    }
    ubrk_close(e);
    free(s);
    FreeTextBoundary();
}

/*---------------------------------------------
   CharacterBreak tests
 --------------------------------------------- */

/*
 * Builds the character test data, opens a character break iterator over
 * testCharacterText and runs doForwardSelectionTest() on it.
 * Only the head node of characterSelectionData is released afterwards.
 */
void TestForwardCharacterSelection()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *e;
    addTestCharacterData();
    e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status));
        return;
    }
    /* NOTE(review): the literal below contains a line break inside the quotes in this copy; kept as found */
    log_verbose("Testing forward 
character selection.....\n");
    doForwardSelectionTest(e, testCharacterText, characterSelectionData);
    ubrk_close(e);
    free(characterSelectionData);
}

/*
 * Builds the character test data, opens a character break iterator over
 * testCharacterText and runs doFirstSelectionTest() on it.
 */
void TestFirstCharacterSelection()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *e;
    addTestCharacterData();
    e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status));
        return;
    }
    log_verbose("Testing first character selection.....\n");
    doFirstSelectionTest(e, testCharacterText, characterSelectionData);
    ubrk_close(e);
    free(characterSelectionData);
}

/*
 * Builds the character test data, opens a character break iterator over
 * testCharacterText and runs doLastSelectionTest() on it.
 */
void TestLastCharacterSelection()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *e;
    addTestCharacterData();
    e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status));
        return;
    }
    log_verbose("Testing last character selection.....\n");
    doLastSelectionTest(e, testCharacterText, characterSelectionData);
    ubrk_close(e);
    free(characterSelectionData);
}

/*
 * Builds the character test data, opens a character break iterator over
 * testCharacterText and runs doBackwardSelectionTest() on it.
 */
void TestBackwardCharacterSelection()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *e;
    addTestCharacterData();
    e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status));
        return;
    }
    log_verbose("Testing backward character selection.....\n");
    doBackwardSelectionTest(e, testCharacterText, characterSelectionData);
    ubrk_close(e);
    free(characterSelectionData);
}

/*
 * Builds the character test data, opens a character break iterator over
 * testCharacterText and runs doForwardIndexSelectionTest() on it.
 */
void TestForwardCharacterIndexSelection()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *e;
    addTestCharacterData();
    e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status));
        return;
    }
log_verbose("Testing forward index character selection.....\n"); doForwardIndexSelectionTest(e, testCharacterText, characterSelectionData); ubrk_close(e); free(characterSelectionData); } void TestBackwardCharacterIndexSelection() { UErrorCode status = U_ZERO_ERROR; UBreakIterator *e; addTestCharacterData(); e = ubrk_open(UBRK_CHARACTER, "en_US", testCharacterText, u_strlen(testCharacterText), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status)); return; } log_verbose("Testing backward character index selection.....\n"); doBackwardSelectionTest(e, testCharacterText, characterSelectionData); ubrk_close(e); free(characterSelectionData); } void TestCharacterInvariants() { UChar *s; AllocateTextBoundary(); s=(UChar*)malloc(sizeof(UChar) * (u_strlen(cannedTestChars) + 15)); u_strcpy(s, cannedTestChars); u_strcat(s, CharsToUChars("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa")); log_verbose("Testing character break invariant.....\n"); doBreakInvariantTest(UBRK_CHARACTER, s); u_strcpy(s, cannedTestChars); u_strcat(s, CharsToUChars("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa")); log_verbose("Testing character other invariant.....\n"); doOtherInvariantTest(UBRK_CHARACTER, s); free(s); FreeTextBoundary(); } /*--------------------------------------------- other tests --------------------------------------------- */ void TestPreceding() { UTextOffset p1, p2, p3, p4, f, p; UBreakIterator *e; UChar words3[15]; UErrorCode status = U_ZERO_ERROR; u_uastrcpy(words3, "aaa bbb ccc"); log_verbose("Testting preceding...\n"); e = ubrk_open(UBRK_WORD, "en_US", words3, u_strlen(words3), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status)); } ubrk_first(e); p1 = ubrk_next(e); p2 = ubrk_next(e); p3 = ubrk_next(e); p4 = ubrk_next(e); f = ubrk_following(e, p2+1); p = ubrk_preceding(e, p2+1); if (f!=p3) log_err("Error in 
TestPreceding: %d!=%d\n", (int32_t)f, (int32_t)p3); if (p!=p2) log_err("Error in TestPreceding: %d!=%d\n", (int32_t)p, (int32_t)p2); ubrk_close(e); } /** * @bug 4068137 */ void TestEndBehaviour() { int32_t end, previous; UErrorCode status = U_ZERO_ERROR; UBreakIterator* wb; UChar testString[5]; u_uastrcpy(testString, "boo"); log_verbose("Testing end behaviour\n"); wb = ubrk_open(UBRK_WORD, "en_US", testString, u_strlen(testString), &status); if(U_FAILURE(status)){ log_err("FAIL: Error in opening the word break Iterator: %s\n", myErrorName(status)); return; } end=ubrk_last(wb); previous=ubrk_previous(wb); log_verbose("end= %d and previous=%d %d\n", end, previous, ubrk_previous(wb)); ubrk_close(wb); } /*--------------------------------------------- Test implementation routines --------------------------------------------- */ void doForwardSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { UChar *expectedResult, *selectionResult; int32_t lastOffset, offset; int32_t forwardSelectionCounter = 0; int32_t forwardSelectionOffset = 0; log_verbose("doForwardSelectionTest text of length: %d\n", u_strlen(testText)); lastOffset = ubrk_first(iterator); offset = ubrk_next(iterator); while(offset!=UBRK_DONE && forwardSelectionCounter < Count(result)) { if (offset != ubrk_current(iterator)){ log_err("current() failed: it returned %d and offset was %d\n", ubrk_current(iterator), offset); } expectedResult =elementAt(result, forwardSelectionCounter); forwardSelectionOffset += u_strlen(expectedResult); selectionResult=extractBetween(lastOffset, offset, testText); if (offset != forwardSelectionOffset) { log_err("\n*** Selection #%d\n expected : %s - length %d\n\rselected : %s - length %d\n", forwardSelectionCounter, austrdup(expectedResult), u_strlen(expectedResult), austrdup(selectionResult), u_strlen(selectionResult) ); } log_verbose("#%d [\"%d\",\"%d\"] : %s\n", forwardSelectionCounter, lastOffset, offset, austrdup(selectionResult)); forwardSelectionCounter++; 
lastOffset = offset; offset = ubrk_next(iterator); } if (forwardSelectionCounter < Count(result) - 1){ log_err("\n*** Selection #%d not found at offset %d !!!\n", forwardSelectionCounter, offset); } else if (forwardSelectionCounter >= Count(result) && offset != UBRK_DONE){ log_err("\n*** Selection #%d should not exist at offset %d !!!\n", forwardSelectionCounter, offset); } } void doBackwardSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { UChar* expectedResult; UChar* selectionResult; int32_t backwardSelectionCounter, neededOffset, lastOffset, offset; backwardSelectionCounter = (Count(result) - 1); neededOffset = u_strlen(testText); lastOffset = ubrk_last(iterator); offset = ubrk_previous(iterator); log_verbose("doBackwardSelectionTest text of length: %d\n", u_strlen(testText)); while(offset != UBRK_DONE) { expectedResult = elementAt(result, backwardSelectionCounter); neededOffset -= u_strlen(expectedResult); selectionResult=extractBetween(offset, lastOffset, testText); if(offset != neededOffset) { log_err("\n*** Selection #%d\nExpected : %d > %s < \n\rSelected : %d > %s < \n", backwardSelectionCounter, neededOffset, austrdup(expectedResult), offset, austrdup(selectionResult) ); } log_verbose("#%d : %s\n", backwardSelectionCounter, selectionResult); backwardSelectionCounter--; lastOffset = offset; offset = ubrk_previous(iterator); } if (backwardSelectionCounter >= 0 && offset != UBRK_DONE){ log_err("*** Selection #%d not found!!!\n", backwardSelectionCounter); } } void doFirstSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { int32_t selectionStart, selectionEnd; UChar* expectedFirstSelection; UChar* tempFirst; bool_t success = TRUE; log_verbose("doFirstSelectionTest.......\n"); selectionStart = ubrk_first(iterator); selectionEnd = ubrk_next(iterator); if(selectionEnd != UBRK_DONE) { tempFirst=extractBetween(selectionStart, selectionEnd, testText); expectedFirstSelection = elementAt(result,0); 
if(u_strcmp(tempFirst,expectedFirstSelection)!=0) { log_err("\n### Error in doFirstSelectionTest. First selection not equal to what expected\n"); log_err("Expected: %s - length %d\n\rSelected: %s - length %d\n", austrdup(expectedFirstSelection), u_strlen(expectedFirstSelection), austrdup(tempFirst), u_strlen(tempFirst)); success = FALSE; } } else if (selectionStart != 0 || u_strlen(testText)!= 0) { log_err("\n### Error in doFirstSelectionTest. Could not get first selection.\n\r start= %d end= %d\n", selectionStart, selectionEnd); success = FALSE; } if(success) { log_verbose("doFirstSelectionTest\n\nExpexcted first selection: %s\nCalculated first selection: %s is correct\n", austrdup(expectedFirstSelection), austrdup(tempFirst) ); } } void doLastSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { int32_t selectionEnd, selectionStart; UChar *expectedLastSelection; UChar *tempLast; bool_t success = TRUE; log_verbose("doLastSelectionTest.......\n"); selectionEnd = ubrk_last(iterator); selectionStart = ubrk_previous(iterator); if(selectionStart != UBRK_DONE) { tempLast=extractBetween(selectionStart, selectionEnd, testText); expectedLastSelection = elementAt(result,Count(result)-1); if(u_strcmp(tempLast,expectedLastSelection)!=0) { log_err("\n\n### Error in doLastSelectionTest. Last selection not equal to what expected.\n"); log_err("Expected: %s - length %d\n\r Selected: %s - length %d\n", austrdup(expectedLastSelection), u_strlen(expectedLastSelection), austrdup(tempLast), u_strlen(tempLast) ); success = FALSE; } } else if (selectionEnd != 0 || u_strlen(testText)!= 0) { log_err("\n### Error in doLastSelectionTest. Could not get last selection. 
[%d,%d]\n", selectionStart, selectionEnd); success = FALSE; } if(success) { log_verbose("doLastSelectionTest\n\nExpected Last selection: %s \n", austrdup(expectedLastSelection)); log_verbose("Calculated last Selection: %s is correct\n", austrdup(tempLast) ); } } /** * @bug 4052418 4068139 */ void doForwardIndexSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { int32_t arrayCount, textLength; int32_t selBegin, selEnd, current, entry, pos; UTextOffset offset; log_verbose("doForwardIndexSelectionTest text of length: %d\n", u_strlen(testText)); arrayCount = Count(result); textLength = u_strlen(testText); for(offset = 0; offset < textLength; offset++) { selBegin = ubrk_preceding(iterator, offset); selEnd = ubrk_following(iterator, offset); entry = 0; pos = 0; if (selBegin != UBRK_DONE) { while (pos < selBegin && entry < arrayCount) { pos += u_strlen(elementAt(result, entry)); ++entry; } if (pos != selBegin) { log_err("With offset = %d, got back spurious %d from preceding\n", offset, selBegin); continue; } else { pos += u_strlen(elementAt(result, entry)); ++entry; } } current=ubrk_current(iterator); if(pos==current){ if (pos != selEnd) { log_err("With offset = %d, got back erroneous %d from follwoing\n", offset, selEnd); continue; } } } } /** * @bug 4052418 4068139 */ void doBackwardIndexSelectionTest(UBreakIterator* iterator, UChar* testText, Vector* result) { int32_t arrayCount, textLength; int32_t selBegin, selEnd, current, entry, pos; UTextOffset offset; log_verbose("doBackwardIndexSelectionTest text of length: %d\n", u_strlen(testText)); arrayCount = Count(result); textLength = u_strlen(testText); for(offset = textLength-1; offset >= 0; offset--) { selBegin = ubrk_preceding(iterator, offset); selEnd = ubrk_following(iterator, offset); entry = 0; pos = 0; if (selBegin != UBRK_DONE) { while (pos < selBegin && entry < arrayCount) { pos += u_strlen(elementAt(result, entry)); ++entry; } if (pos != selBegin) { log_err("With offset = %d, got back 
spurious %d from preceding\n", offset, selBegin); continue; } else { pos += u_strlen(elementAt(result, entry)); ++entry; } } current=ubrk_current(iterator); if(pos==current){ if (pos != selEnd) { log_err("With offset = %d, got back erroneous %d from following\n", offset, selEnd); continue; } } } } void doBreakInvariantTest(UBreakIteratorType type, UChar* testChars) { int l,k; UBreakIterator *tb; UTextOffset i, j; UErrorCode status = U_ZERO_ERROR; UChar work[3]; UChar breaks[10]; UChar c; bool_t seen2; int errorCount = 0; status=U_ZERO_ERROR; u_uastrcpy(work, "aaa"); log_verbose("doBreakInvariantTest text of length: %d\n", u_strlen(testChars)); /* a break should always occur after CR (unless followed by LF), LF, PS, and LS */ u_strcpy(breaks, CharsToUChars("\r\n\\u2029\\u2028")); tb = ubrk_open(type, "en_US", work, u_strlen(work), &status); for (i = 0; i < u_strlen(breaks); i++) { work[1] = breaks[i]; for (j = 0; j < u_strlen(testChars); j++) { work[0] = testChars[j]; for (k = 0; k < u_strlen(testChars); k++) { c = testChars[k]; /* if a cr is followed by lf, ps, ls or etx, don't do the check (that's not supposed to work) */ if (work[1] == '\r' && (c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003)) continue; work[2] = testChars[k]; ubrk_setText(tb, work, u_strlen(work), &status); if(U_FAILURE(status)){ log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status)); } seen2 = FALSE; for (l = ubrk_first(tb); l != UBRK_DONE; l = ubrk_next(tb)) { if (l == 2) seen2 = TRUE; } if (!seen2) { log_err("No break between U+%s and U+%s\n", austrdup(UCharToUCharArray(work[1])), austrdup(UCharToUCharArray(work[2])) ); errorCount++; if (errorCount >= 75) return; } } } } ubrk_close(tb); } void doOtherInvariantTest(UBreakIteratorType type , UChar* testChars) { int32_t k; UBreakIterator *tb; UTextOffset i, j; UErrorCode status = U_ZERO_ERROR; UChar work[5]; UChar c; int32_t errorCount = 0; status=U_ZERO_ERROR; u_uastrcpy(work, "a\r\na"); 
log_verbose("doOtherInvariantTest text of length: %d\n", u_strlen(testChars)); tb = ubrk_open(type, "en_us", work, u_strlen(work), &status); /* a break should never occur between CR and LF */ for (i = 0; i < u_strlen(testChars); i++) { work[0] = testChars[i]; for (j = 0; j < u_strlen(testChars); j++) { work[3] = testChars[j]; ubrk_setText(tb, work, u_strlen(work), &status); if(U_FAILURE(status)){ log_err("ERROR in opening the breakIterator in doVariant Function: %s\n", myErrorName(status)); } for ( k = ubrk_first(tb); k != UBRK_DONE; k = ubrk_next(tb)) if (k == 2) { log_err("Break between CR and LF in string U+%s, U+d U+a U+%s\n", austrdup(UCharToUCharArray(work[0])), austrdup(UCharToUCharArray(work[3])) ); errorCount++; if (errorCount >= 75) return; } } } /* a break should never occur before a non-spacing mark, unless the preceding character is CR, LF, PS, or LS */ u_uastrcpy(work,"aaaa"); for (i = 0; i < u_strlen(testChars); i++) { c = testChars[i]; if (c == '\n' || c == '\r' || c == 0x2029 || c == 0x2028 || c == 0x0003) continue; work[1] = c; for (j = 0; j < u_strlen(testChars); j++) { c = testChars[j]; if ((u_charType(c) != U_NON_SPACING_MARK) && (u_charType(c) != U_ENCLOSING_MARK)) continue; work[2] = c; ubrk_setText(tb, work, u_strlen(work), &status); if(U_FAILURE(status)){ log_err("ERROR in opening the breakIterator in doOtherVariant Function %s\n", myErrorName(status)); } for (k = ubrk_first(tb); k != UBRK_DONE; k = ubrk_next(tb)) if (k == 2) { log_err("Break between U+%s and U+%s\n", austrdup(UCharToUCharArray(work[1])), austrdup(UCharToUCharArray(work[2])) ); errorCount++; if (errorCount >= 75) return; } } } ubrk_close(tb); } void sample(UBreakIterator* tb, UChar* text) { int32_t start, end; UChar* substring; log_verbose("-------------------------\n"); log_verbose("%s of length %d\n", austrdup(text), u_strlen(text)); start = ubrk_first(tb); for (end = ubrk_next(tb); end != UBRK_DONE; end = ubrk_next(tb)) { substring=extractBetween(start, end, text); 
log_err("[%d,%d] \"%s\" \n", start, end, austrdup(substring) ); start = end; } } void addBrkIterRegrTest(TestNode** root) { addTest(root, &TestForwardWordSelection, "tstxtbd/cregrtst/TestForwardWordSelection" ); addTest(root, &TestBackwardWordSelection, "tstxtbd/cregrtst/TestBackwardWordSelection" ); addTest(root, &TestFirstWordSelection, "tstxtbd/cregrtst/TestFirstWordSelection" ); addTest(root, &TestLastWordSelection, "tstxtbd/cregrtst/TestLastWordSelection" ); addTest(root, &TestForwardWordIndexSelection, "tstxtbd/cregrtst/TestForwardWordIndexSelection"); addTest(root, &TestBackwardWordIndexSelection, "tstxtbd/cregrtst/TestBackwardWordIndexSelection"); addTest(root, &TestForwardSentenceSelection, "tstxtbd/cregrtst/TestForwardSentenceSelection"); addTest(root, &TestBackwardSentenceSelection, "tstxtbd/cregrtst/TestBackwardSentenceSelection"); addTest(root, &TestFirstSentenceSelection, "tstxtbd/cregrtst/TestFirstSentenceSelection"); addTest(root, &TestLastSentenceSelection, "tstxtbd/cregrtst/TestLastSentenceSelection"); addTest(root, &TestForwardSentenceIndexSelection, "tstxtbd/cregrtst/TestForwardSentenceIndexSelection"); addTest(root, &TestBackwardSentenceIndexSelection, "tstxtbd/cregrtst/TestBackwardSentenceIndexSelection"); addTest(root, &TestForwardLineSelection, "tstxtbd/cregrtst/TestForwardLineSelection"); addTest(root, &TestBackwardLineSelection, "tstxtbd/cregrtst/TestBackwardLineSelection"); addTest(root, &TestFirstLineSelection, "tstxtbd/cregrtst/TestFirstLineSelection"); addTest(root, &TestLastLineSelection, "tstxtbd/cregrtst/TestLastLineSelection"); addTest(root, &TestForwardLineIndexSelection, "tstxtbd/cregrtst/TestForwardLineIndexSelection"); addTest(root, &TestBackwardLineIndexSelection, "tstxtbd/cregrtst/TestBackwardLineIndexSelection"); addTest(root, &TestForwardCharacterSelection, "tstxtbd/cregrtst/TestForwardCharacterSelection"); addTest(root, &TestBackwardCharacterSelection, "tstxtbd/cregrtst/TestBackwardCharacterSelection"); addTest(root, 
&TestFirstCharacterSelection, "tstxtbd/cregrtst/TestFirstCharacterSelection"); addTest(root, &TestLastCharacterSelection, "tstxtbd/cregrtst/TestLastCharacterSelection"); addTest(root, &TestForwardCharacterIndexSelection, "tstxtbd/cregrtst/TestForwardCharacterIndexSelection"); addTest(root, &TestBackwardCharacterIndexSelection, "tstxtbd/cregrtst/TestBackwardCharacterIndexSelection"); addTest(root, &TestPreceding, "tstxtbd/cregrtst/TestPreceding"); addTest(root, &TestEndBehaviour, "tstxtbd/cregrtst/TestEndBehaviour"); addTest(root, &TestWordInvariants, "tstxtbd/cregrtst/TestWordInvariants"); addTest(root, &TestSentenceInvariants, "tstxtbd/cregrtst/TestSentenceInvariants"); addTest(root, &TestCharacterInvariants, "tstxtbd/cregrtst/TestCharacterInvariants"); addTest(root, &TestLineInvariants, "tstxtbd/cregrtst/TestLineInvariants"); }