32c09250b7
X-SVN-Rev: 8939
1667 lines
71 KiB
C++
1667 lines
71 KiB
C++
/********************************************************************
|
|
* COPYRIGHT:
|
|
* Copyright (c) 1997-2001, International Business Machines Corporation and
|
|
* others. All Rights Reserved.
|
|
********************************************************************/
|
|
|
|
#include "intltest.h"
|
|
#include "unicode/brkiter.h"
|
|
#include "unicode/unicode.h"
|
|
#include "unicode/uchar.h"
|
|
#include <stdio.h>
|
|
//#include "txbdapi.h" // BreakIteratorAPIC
|
|
|
|
//--------------------------------------------------------------------------------------
|
|
/**
|
|
* "Vector" class for holding test tables
|
|
* (this class is actually a linked list, but we use the name and API of the
|
|
* java.util.Vector class to keep as much of our test code as possible the same.)
|
|
*/
|
|
class Enumeration { // text enumeration
|
|
public:
|
|
virtual UBool hasMoreElements() = 0;
|
|
virtual UnicodeString nextElement() = 0;
|
|
};
|
|
|
|
class Vector { // text vector
|
|
public:
|
|
|
|
class TextLink {
|
|
public:
|
|
TextLink() : fLink(0), fText() {}
|
|
TextLink(TextLink* link, UnicodeString text) : fLink(link), fText(text) {}
|
|
|
|
TextLink* fLink;
|
|
UnicodeString fText;
|
|
};
|
|
|
|
public:
|
|
TextLink fBase;
|
|
TextLink* fEnd;
|
|
int32_t fSize;
|
|
|
|
public:
|
|
class VectorEnumeration : public Enumeration {
|
|
public:
|
|
VectorEnumeration(Vector* vector) : fVector(vector), fPos(&vector->fBase) {}
|
|
|
|
UBool hasMoreElements() { return fPos->fLink != &fVector->fBase; }
|
|
UnicodeString nextElement() { fPos = fPos->fLink; return fPos->fText; }
|
|
|
|
Vector* fVector;
|
|
TextLink* fPos;
|
|
};
|
|
|
|
Vector() : fBase(), fEnd(&fBase), fSize(0) { fBase.fLink = &fBase; }
|
|
|
|
~Vector() {
|
|
while (fBase.fLink != &fBase) {
|
|
TextLink* link = fBase.fLink;
|
|
fBase.fLink = link->fLink;
|
|
delete link;
|
|
}
|
|
}
|
|
|
|
void addElement(UnicodeString text) { fEnd->fLink = new TextLink(&fBase, text); fEnd = fEnd->fLink; ++fSize; }
|
|
void insertElementAt(UnicodeString text, int pos) {
|
|
if(pos >= fSize || pos < 0)
|
|
;
|
|
else if(pos == 0){
|
|
TextLink* insert = new TextLink(&fBase, text);
|
|
insert->fLink=fBase.fLink;
|
|
++fSize;
|
|
fBase.fLink=insert;
|
|
}
|
|
else{
|
|
TextLink* link = fBase.fLink;
|
|
while(--pos > 0)
|
|
link=link->fLink;
|
|
TextLink* insert = new TextLink(&fBase, text);
|
|
insert->fLink =link->fLink;
|
|
link->fLink=insert;
|
|
++fSize;
|
|
|
|
}
|
|
|
|
}
|
|
UnicodeString elementAt(int32_t pos) {
|
|
if (pos >= fSize)
|
|
return UnicodeString();
|
|
|
|
TextLink* link = fBase.fLink;
|
|
while (pos-- > 0) link = link->fLink;
|
|
return link->fText;
|
|
}
|
|
UnicodeString lastElement() { return fEnd == &fBase ? UnicodeString() : fEnd->fText; }
|
|
int32_t size() { return fSize; }
|
|
|
|
Enumeration* elements() { return new VectorEnumeration(this); }
|
|
|
|
};
|
|
|
|
//--------------------------------------------------------------------------------------
|
|
/**
|
|
* IntlTestTextBoundary is the medium top-level test class for everything in the directory "findword".
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "ittxtbd.h"
|
|
|
|
#include <string.h>
|
|
#include "unicode/schriter.h"
|
|
|
|
// Table of UTF-16 code units from which the constructor builds *cannedTestChars.
// The list ends with 0x0000; the constructor passes the array to a UnicodeString
// constructor without a length, so that trailing zero presumably acts as the
// terminator (NOTE(review): confirm against the UnicodeString(const UChar*) contract).
const UChar IntlTestTextBoundary::cannedTestArray[] = {
|
|
0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
|
|
0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
|
|
0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
|
|
0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
|
|
0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
|
|
0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
|
|
0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
|
|
0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
|
|
};
|
|
|
|
// Class-wide string of canned test characters; allocated by the constructor
// and deleted by the destructor.
UnicodeString* IntlTestTextBoundary::cannedTestChars = 0;
|
|
|
|
//---------------------------------------------
|
|
// setup methods
|
|
//---------------------------------------------
|
|
|
|
// Builds the canned character string (a leading U+0000 followed by the
// contents of cannedTestArray) and then fills the four selection tables.
IntlTestTextBoundary::IntlTestTextBoundary()
{
    const UnicodeString tableChars(cannedTestArray);

    // The leading U+0000 is appended explicitly as a single code unit.
    UnicodeString* assembled = new UnicodeString();
    *assembled += (UChar)0x0000;
    *assembled += tableChars;
    cannedTestChars = assembled;

    addTestWordData();
    addTestSentenceData();
    addTestLineData();
    addTestCharacterData();
}
|
|
|
|
// Releases the four selection tables and the canned character string.
IntlTestTextBoundary::~IntlTestTextBoundary()
{
    delete wordSelectionData;
    delete sentenceSelectionData;
    delete lineSelectionData;
    delete characterSelectionData;

    // cannedTestChars is a static member shared by every instance. Reset it
    // after freeing so that no dangling pointer is left behind and the
    // destructor of another instance does not delete the same string twice
    // (deleting a null pointer is a no-op).
    delete cannedTestChars;
    cannedTestChars = 0;
}
|
|
|
|
/**
|
|
* @bug 4097779 4098467 4117554
|
|
*/
|
|
void IntlTestTextBoundary::addTestWordData()
|
|
{
|
|
wordSelectionData = new Vector();
|
|
|
|
wordSelectionData->addElement("12,34");
|
|
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A2))); //cent sign
|
|
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A3))); //pound sign
|
|
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A4))); //currency sign
|
|
wordSelectionData->addElement(UCharToUnicodeString((UChar)(0x00A5))); //yen sign
|
|
wordSelectionData->addElement(CharsToUnicodeString("alpha\\u00adbeta\\u00adgamma"));
|
|
wordSelectionData->addElement(".");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("Badges");
|
|
wordSelectionData->addElement("?");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("BADGES");
|
|
wordSelectionData->addElement("!");
|
|
wordSelectionData->addElement("?");
|
|
wordSelectionData->addElement("!");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("We");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("don't");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("need");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("no");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("STINKING");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("BADGES");
|
|
wordSelectionData->addElement("!");
|
|
wordSelectionData->addElement("!");
|
|
wordSelectionData->addElement("!");
|
|
|
|
wordSelectionData->addElement("012.566,5");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("123.3434,900");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("1000,233,456.000");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("1,23.322%");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("123.1222");
|
|
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("$123,000.20");
|
|
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("179.01%");
|
|
|
|
wordSelectionData->addElement("Hello");
|
|
wordSelectionData->addElement(",");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("how");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("are");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("you");
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement("X");
|
|
wordSelectionData->addElement(" ");
|
|
|
|
wordSelectionData->addElement("Now");
|
|
wordSelectionData->addElement("\r");
|
|
wordSelectionData->addElement("is");
|
|
wordSelectionData->addElement("\n");
|
|
wordSelectionData->addElement("the");
|
|
wordSelectionData->addElement("\r\n");
|
|
wordSelectionData->addElement("time");
|
|
wordSelectionData->addElement("\n");
|
|
wordSelectionData->addElement("\r");
|
|
wordSelectionData->addElement("for");
|
|
wordSelectionData->addElement("\r");
|
|
wordSelectionData->addElement("\r");
|
|
wordSelectionData->addElement("all");
|
|
wordSelectionData->addElement(" ");
|
|
|
|
// to test for bug #4097779
|
|
wordSelectionData->addElement(CharsToUnicodeString("aa\\u0300a"));
|
|
wordSelectionData->addElement(" ");
|
|
|
|
// to test for bug #4098467
|
|
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
|
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
|
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
|
// Both sequences should be semantically identical and break the same way.
|
|
// precomposed syllables...
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c"));
|
|
wordSelectionData->addElement(" ");
|
|
// conjoining jamo...
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8"));
|
|
wordSelectionData->addElement(" ");
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
|
|
wordSelectionData->addElement(" ");
|
|
|
|
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
|
|
// count as a Kanji character for the purposes of word breaking
|
|
wordSelectionData->addElement("abc");
|
|
// Unicode TR29: Ideographs do NOT group together into words.
|
|
//wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u4e01"));
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u4e02"));
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u3005"));
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u4e03"));
|
|
wordSelectionData->addElement(CharsToUnicodeString("\\u4e03"));
|
|
wordSelectionData->addElement("abc");
|
|
|
|
|
|
|
|
}
|
|
|
|
// U+2029 PARAGRAPH SEPARATOR; appended to some sentence test elements below.
const UChar kParagraphSeparator = 0x2029;
|
|
// U+2028 LINE SEPARATOR; appended to one of the line test elements below.
const UChar kLineSeparator = 0x2028;
|
|
|
|
/**
|
|
* @bug 4111338 4117554 4113835
|
|
*/
|
|
void IntlTestTextBoundary::addTestSentenceData()
|
|
{
|
|
sentenceSelectionData = new Vector();
|
|
sentenceSelectionData->addElement("This is a simple sample sentence. ");
|
|
sentenceSelectionData->addElement("(This is it.) ");
|
|
sentenceSelectionData->addElement("This is a simple sample sentence. ");
|
|
sentenceSelectionData->addElement("\"This isn\'t it.\" ");
|
|
sentenceSelectionData->addElement("Hi! ");
|
|
sentenceSelectionData->addElement("This is a simple sample sentence. ");
|
|
sentenceSelectionData->addElement("It does not have to make any sense as you can see. ");
|
|
sentenceSelectionData->addElement("Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ");
|
|
sentenceSelectionData->addElement("Che la dritta via aveo smarrita. ");
|
|
sentenceSelectionData->addElement("He said, that I said, that you said!! ");
|
|
|
|
sentenceSelectionData->addElement("Don't rock the boat." + UCharToUnicodeString(kParagraphSeparator));
|
|
|
|
sentenceSelectionData->addElement("Because I am the daddy, that is why. ");
|
|
sentenceSelectionData->addElement("Not on my time (el timo.)! ");
|
|
|
|
sentenceSelectionData->addElement("So what!!" + UCharToUnicodeString(kParagraphSeparator));
|
|
|
|
sentenceSelectionData->addElement("\"But now,\" he said, \"I know!\" ");
|
|
sentenceSelectionData->addElement("Harris thumbed down several, including \"Away We Go\" (which became the huge success Oklahoma!). ");
|
|
sentenceSelectionData->addElement("One species, B. anthracis, is highly virulent.\n");
|
|
sentenceSelectionData->addElement("Wolf said about Sounder:\"Beautifully thought-out and directed.\" ");
|
|
sentenceSelectionData->addElement("Have you ever said, \"This is where\tI shall live\"? ");
|
|
sentenceSelectionData->addElement("He answered, \"You may not!\" ");
|
|
sentenceSelectionData->addElement("Another popular saying is: \"How do you do?\". ");
|
|
sentenceSelectionData->addElement("Yet another popular saying is: \'I\'m fine thanks.\' ");
|
|
sentenceSelectionData->addElement("What is the proper use of the abbreviation pp.? ");
|
|
sentenceSelectionData->addElement("Yes, I am definatelly 12\" tall!!");
|
|
|
|
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("Now\ris\nthe\r\ntime\n\rfor\r\rall\\u037e"));
|
|
|
|
// test for bug #4111338: Don't break sentences at the boundary between CJK
|
|
// and other letters
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165:\"JAVA\\u821c")
|
|
+ CharsToUnicodeString("\\u8165\\u7fc8\\u51ce\\u306d,\\u2494\\u56d8\\u4ec0\\u60b1\\u8560\\u51ba")
|
|
+ CharsToUnicodeString("\\u611d\\u57b6\\u2510\\u5d46\".\\u2029"));
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
|
+ CharsToUnicodeString("\\u97e4JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
|
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u3002"));
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8\\u97e4")
|
|
+ CharsToUnicodeString("\\u6470\\u8790JAVA\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8")
|
|
+ CharsToUnicodeString("\\u4ec0\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751\\u2048"));
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("He said, \"I can go there.\"\\u2029"));
|
|
|
|
// test for bug #4117554: Treat fullwidth variants of .!? the same as their
|
|
// normal counterparts
|
|
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("I know I'm right\\uff0e "));
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff1f "));
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("Right\\uff01 "));
|
|
#endif
|
|
|
|
// test for bug #4117554: Don't break sentences at boundary between CJK and digits
|
|
sentenceSelectionData->addElement(CharsToUnicodeString("\\u5487\\u67ff\\ue591\\u5017\\u61b3\\u60a1\\u9510\\u8165\\u9de8")
|
|
+ CharsToUnicodeString("\\u97e48888\\u821c\\u8165\\u7fc8\\u51ce\\u306d\\ue30b\\u2494\\u56d8\\u4ec0")
|
|
+ CharsToUnicodeString("\\u60b1\\u8560\\u51ba\\u611d\\u57b6\\u2510\\u5d46\\u97e5\\u7751.\\u2029"));
|
|
|
|
// test for bug #4117554: Break sentence between a sentence terminator and
|
|
// opening punctuation
|
|
sentenceSelectionData->addElement("Say no?");
|
|
sentenceSelectionData->addElement("(yes)." + CharsToUnicodeString("\\u2029"));
|
|
|
|
// test for bug #4158381: Don't break sentence after period if it isn't
|
|
// followed by a space
|
|
sentenceSelectionData->addElement("Test <code>Flags.Flag</code> class. ");
|
|
sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));
|
|
|
|
// test for bug #4158381: No breaks when there are no terminators around
|
|
sentenceSelectionData->addElement("<P>Provides a set of "lightweight" (all-java<FONT SIZE=\"-2\"><SUP>TM</SUP></FONT> language) components that, to the maximum degree possible, work the same on all platforms. ");
|
|
sentenceSelectionData->addElement("Another test." + CharsToUnicodeString("\\u2029"));
|
|
|
|
// test for bug #4143071: Make sure sentences that end with digits
|
|
// work right
|
|
sentenceSelectionData->addElement("Today is the 27th of May, 1998. ");
|
|
sentenceSelectionData->addElement("Tomorrow with be 28 May 1998. ");
|
|
sentenceSelectionData->addElement("The day after will be the 30th."
|
|
+ CharsToUnicodeString("\\u2029"));
|
|
|
|
// test for bug #4152416: Make sure sentences ending with a capital
|
|
// letter are treated correctly
|
|
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
|
|
sentenceSelectionData->addElement("The type of all primitive <code>boolean</code> values accessed in the target VM. "
|
|
"Calls to xxx will return an implementor of this interface. " + CharsToUnicodeString("\\u2029"));
|
|
|
|
// test for bug #4152117: Make sure sentence breaking is handling
|
|
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
|
|
// HERE TO MAKE SURE IT DOESN'T CROP UP]
|
|
sentenceSelectionData->addElement("Constructs a randomly generated BigInteger, uniformly distributed over the range <tt>0</tt> to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. ");
|
|
sentenceSelectionData->addElement("The uniformity of the distribution assumes that a fair source of random bits is provided in <tt>rnd</tt>. ");
|
|
sentenceSelectionData->addElement("Note that this constructor always constructs a non-negative BigInteger." + CharsToUnicodeString("\\u2029"));
|
|
|
|
}
|
|
|
|
/**
|
|
* @bug 4068133 4086052 4035266 4097920 4098467 4117554
|
|
*/
|
|
void IntlTestTextBoundary::addTestLineData()
|
|
{
|
|
lineSelectionData = new Vector();
|
|
lineSelectionData->addElement("Multi-");
|
|
lineSelectionData->addElement("Level ");
|
|
lineSelectionData->addElement("example ");
|
|
lineSelectionData->addElement("of ");
|
|
lineSelectionData->addElement("a ");
|
|
lineSelectionData->addElement("semi-");
|
|
lineSelectionData->addElement("idiotic ");
|
|
lineSelectionData->addElement("non-");
|
|
lineSelectionData->addElement("sensical ");
|
|
lineSelectionData->addElement("(non-");
|
|
lineSelectionData->addElement("important) ");
|
|
lineSelectionData->addElement("sentence. ");
|
|
|
|
lineSelectionData->addElement("Hi ");
|
|
lineSelectionData->addElement("Hello ");
|
|
lineSelectionData->addElement("How\n");
|
|
lineSelectionData->addElement("are\r");
|
|
lineSelectionData->addElement("you" + UCharToUnicodeString(kLineSeparator));
|
|
lineSelectionData->addElement("fine.\t");
|
|
lineSelectionData->addElement("good. ");
|
|
|
|
lineSelectionData->addElement("Now\r");
|
|
lineSelectionData->addElement("is\n");
|
|
lineSelectionData->addElement("the\r\n");
|
|
lineSelectionData->addElement("time\n");
|
|
lineSelectionData->addElement("\r");
|
|
lineSelectionData->addElement("for\r");
|
|
lineSelectionData->addElement("\r");
|
|
lineSelectionData->addElement("all");
|
|
|
|
// to test for bug #4068133
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u96f6"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e00\\u3002"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e8c\\u3001"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e09\\u3002\\u3001"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u56db\\u3001\\u3002\\u3001"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e94,"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u516d."));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e03.\\u3001,\\u3002"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u516b"));
|
|
|
|
// to test for bug #4086052
|
|
lineSelectionData->addElement(CharsToUnicodeString("foo\\u00a0bar "));
|
|
// lineSelectionData->addElement("foo\\ufeffbar");
|
|
|
|
// to test for bug #4097920
|
|
lineSelectionData->addElement("dog,");
|
|
lineSelectionData->addElement("cat,");
|
|
lineSelectionData->addElement("mouse ");
|
|
lineSelectionData->addElement("(one)");
|
|
lineSelectionData->addElement("(two)\n");
|
|
|
|
// to test for bug #4035266
|
|
lineSelectionData->addElement("The ");
|
|
lineSelectionData->addElement("balance ");
|
|
lineSelectionData->addElement("is ");
|
|
lineSelectionData->addElement("$-23,456.78, ");
|
|
lineSelectionData->addElement("not ");
|
|
// lineSelectionData->addElement("-$32,456.78!\n"); // Doesn't break this way according to TR29
|
|
lineSelectionData->addElement("-");
|
|
lineSelectionData->addElement("$32,456.78!\n");
|
|
|
|
// to test for bug #4098467
|
|
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
|
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
|
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
|
// Both sequences should be semantically identical and break the same way.
|
|
// precomposed syllables...
|
|
|
|
// By TR14, precomposed Hangul syllables should not be grouped together.
|
|
// Also, identical test is in rbbitst.cpp.
|
|
#if 0
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\uc0c1\\ud56d "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\ud55c\\uc778 "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\uc5f0\\ud569 "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\uc7a5\\ub85c\\uad50\\ud68c "));
|
|
|
|
// conjoining jamo...
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc\\u1112\\u1161\\u11bc "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab\\u110b\\u1175\\u11ab "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab\\u1112\\u1161\\u11b8 "));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc\\u1105\\u1169\\u1100\\u116d\\u1112\\u116c"));
|
|
#endif
|
|
|
|
// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e01\\uff0e"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e02\\uff01"));
|
|
lineSelectionData->addElement(CharsToUnicodeString("\\u4e03\\uff1f"));
|
|
|
|
}
|
|
|
|
/*
|
|
const UnicodeString graveS = "S" + (UChar)0x0300;
|
|
const UnicodeString acuteBelowI = "i" + UCharToUnicodeString(0x0317);
|
|
const UnicodeString acuteE = "e" + UCharToUnicodeString(0x0301);
|
|
const UnicodeString circumflexA = "a" + UCharToUnicodeString(0x0302);
|
|
const UnicodeString tildeE = "e" + UCharToUnicodeString(0x0303);
|
|
*/
|
|
|
|
/**
|
|
* @bug 4098467
|
|
*/
|
|
void IntlTestTextBoundary::addTestCharacterData()
|
|
{
|
|
characterSelectionData = new Vector();
|
|
characterSelectionData->addElement("S" + UCharToUnicodeString(0x0300)); //graveS
|
|
characterSelectionData->addElement("i" + UCharToUnicodeString(0x0301)); // acuteBelowI
|
|
characterSelectionData->addElement("m");
|
|
characterSelectionData->addElement("p");
|
|
characterSelectionData->addElement("l");
|
|
characterSelectionData->addElement("e" + UCharToUnicodeString(0x0301)); // acuteE
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement("s");
|
|
characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA
|
|
characterSelectionData->addElement("m");
|
|
characterSelectionData->addElement("p");
|
|
characterSelectionData->addElement("l");
|
|
characterSelectionData->addElement("e" + UCharToUnicodeString(0x0303)); // tildeE
|
|
characterSelectionData->addElement(".");
|
|
characterSelectionData->addElement("w");
|
|
characterSelectionData->addElement("a" + UCharToUnicodeString(0x0302)); // circumflexA
|
|
characterSelectionData->addElement("w");
|
|
characterSelectionData->addElement("a");
|
|
characterSelectionData->addElement("f");
|
|
characterSelectionData->addElement("q");
|
|
characterSelectionData->addElement("\n");
|
|
characterSelectionData->addElement("\r");
|
|
characterSelectionData->addElement("\r\n");
|
|
characterSelectionData->addElement("\n");
|
|
|
|
// to test for bug #4098467
|
|
// What follows is a string of Korean characters (I found it in the Yellow Pages
|
|
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
|
|
// it correctly), first as precomposed syllables, and then as conjoining jamo.
|
|
// Both sequences should be semantically identical and break the same way.
|
|
// precomposed syllables...
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\uc0c1"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\ud56d"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\ud55c"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\uc778"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\uc5f0"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\ud569"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\uc7a5"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\ub85c"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\uad50"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\ud68c"));
|
|
characterSelectionData->addElement(" ");
|
|
// conjoining jamo...
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1109\\u1161\\u11bc"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11bc"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11ab"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1175\\u11ab"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u110b\\u1167\\u11ab"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u1161\\u11b8"));
|
|
characterSelectionData->addElement(" ");
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u110c\\u1161\\u11bc"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1105\\u1169"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1100\\u116d"));
|
|
characterSelectionData->addElement(CharsToUnicodeString("\\u1112\\u116c"));
|
|
|
|
}
|
|
|
|
// Concatenates every remaining element of the enumeration, in order, into
// one string. The enumeration is consumed but not deleted here.
UnicodeString IntlTestTextBoundary::createTestData(Enumeration* e)
{
    UnicodeString joined;

    for (; e->hasMoreElements(); ) {
        joined += e->nextElement();
    }
    return joined;
}
|
|
|
|
//---------------------------------------------
|
|
// SentenceBreak tests
|
|
//---------------------------------------------
|
|
|
|
void IntlTestTextBoundary::TestSentenceIteration()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator* e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestSentenceIteration.\n");
|
|
return;
|
|
}
|
|
generalIteratorTest(*e, sentenceSelectionData);
|
|
delete e;
|
|
}
|
|
|
|
void IntlTestTextBoundary::TestSentenceInvariants()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator *e = BreakIterator::createSentenceInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestSentenceInvariant.\n");
|
|
return;
|
|
}
|
|
UnicodeString s = *cannedTestChars + CharsToUnicodeString(".,\\u3001\\u3002\\u3041\\u3042\\u3043\\ufeff");
|
|
doOtherInvariantTest(*e, s);
|
|
delete e;
|
|
}
|
|
//---------------------------------------------
|
|
// WordBreak tests
|
|
//---------------------------------------------
|
|
void IntlTestTextBoundary::TestWordIteration()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestWordIteration.\n");
|
|
return;
|
|
}
|
|
generalIteratorTest(*e, wordSelectionData);
|
|
delete e;
|
|
}
|
|
void IntlTestTextBoundary::TestWordInvariants()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator *e = BreakIterator::createWordInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestWordInvariants.\n");
|
|
return;
|
|
}
|
|
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
|
|
doBreakInvariantTest(*e, s);
|
|
s = *cannedTestChars + CharsToUnicodeString("\',.\\u3041\\u3042\\u3043\\u309b\\u309c\\u30a1\\u30a2\\u30a3\\u4e00\\u4e01\\u4e02");
|
|
doOtherInvariantTest(*e, s);
|
|
delete e;
|
|
}
|
|
//---------------------------------------------
|
|
// CharacterBreak tests
|
|
//---------------------------------------------
|
|
void IntlTestTextBoundary::TestCharacterIteration()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator* e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestCharacterIteration.\n");
|
|
return;
|
|
}
|
|
// generalIteratorTest(*e, testCharacterText, characterSelectionData);
|
|
generalIteratorTest(*e, characterSelectionData);
|
|
delete e;
|
|
}
|
|
void IntlTestTextBoundary::TestCharacterInvariants()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator *e = BreakIterator::createCharacterInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestCharacterInvariants.\n");
|
|
return;
|
|
}
|
|
UnicodeString s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
|
|
doBreakInvariantTest(*e, s);
|
|
s = *cannedTestChars + CharsToUnicodeString("\\u1100\\u1101\\u1102\\u1160\\u1161\\u1162\\u11a8\\u11a9\\u11aa");
|
|
doOtherInvariantTest(*e, s);
|
|
delete e;
|
|
}
|
|
//---------------------------------------------
|
|
// LineBreak tests
|
|
//---------------------------------------------
|
|
void IntlTestTextBoundary::TestLineIteration()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator* e = BreakIterator::createLineInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestLineIteration.\n");
|
|
return;
|
|
}
|
|
generalIteratorTest(*e, lineSelectionData);
|
|
delete e;
|
|
}
|
|
void IntlTestTextBoundary::TestLineInvariants()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator *e = BreakIterator::createLineInstance(Locale::US, status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestLineInvariants.\n");
|
|
return;
|
|
}
|
|
UnicodeString s = CharsToUnicodeString(".,;:\\u3001\\u3002\\u3041\\u3042\\u3043\\u3044\\u3045\\u30a3\\u4e00\\u4e01\\u4e02");
|
|
UnicodeString testChars = *cannedTestChars + s;
|
|
doBreakInvariantTest(*e, testChars);
|
|
doOtherInvariantTest(*e, testChars);
|
|
|
|
int32_t errCount = 0, testCharsLen, noBreakLen, dashesLen;
|
|
int32_t i, j, k;
|
|
|
|
// in addition to the other invariants, a line-break iterator should make sure that:
|
|
// it doesn't break around the non-breaking characters,
|
|
// EXCEPT breaking after a space takes precedence over not breaking before
|
|
// an non-breaking char. So says TR 14.
|
|
UnicodeString noBreak = CharsToUnicodeString("\\u00a0\\u2007\\u2011\\ufeff");
|
|
UnicodeString work("aaa");
|
|
testCharsLen = testChars.length();
|
|
noBreakLen = noBreak.length();
|
|
for (i = 0; i < testCharsLen; i++) {
|
|
UChar c = testChars[i];
|
|
if (c == '\r' || c == '\n' || c == 0x2029 || c == 0x2028 || c == 0x0003 ||
|
|
u_charType(c) == U_CONTROL_CHAR) {
|
|
continue;
|
|
}
|
|
work[0] = c;
|
|
for (j = 0; j < noBreakLen; j++) {
|
|
work[1] = noBreak[j];
|
|
for (k = 0; k < testCharsLen; k++) {
|
|
work[2] = testChars[k];
|
|
e->setText(work);
|
|
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
|
|
UChar c1 = work[l - 1];
|
|
UChar c2 = work[l];
|
|
if (c1 == 0x20 && l == 1) {
|
|
continue;
|
|
}
|
|
if (l == 1 || l == 2) {
|
|
errln("Got break between U+" + UCharToUnicodeString(c1) +
|
|
" and U+" + UCharToUnicodeString(c2));
|
|
errCount++;
|
|
if (errCount >= 75)
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// it does break after hyphens (Rule 15B from TR 14
|
|
// (unless they're followed by a digit, a non-spacing mark,
|
|
// a currency symbol, a non-breaking space, or a line or paragraph separator
|
|
// or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
|
|
|
|
// This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH
|
|
//
|
|
UnicodeString dashes = CharsToUnicodeString("-\\u00ad\\u2010\\u2012\\u2013\\u2014");
|
|
dashesLen = dashes.length();
|
|
for (i = 0; i < testCharsLen; i++) {
|
|
work[0] = testChars[i];
|
|
for (j = 0; j < dashesLen; j++) {
|
|
UChar c1 = work[1] = dashes[j];
|
|
for (k = 0; k < testCharsLen; k++) {
|
|
UChar c2 = work[2] = testChars[k];
|
|
int8_t type = Unicode::getType(c2);
|
|
if (type == Unicode::DECIMAL_DIGIT_NUMBER ||
|
|
type == Unicode::OTHER_NUMBER ||
|
|
type == Unicode::NON_SPACING_MARK ||
|
|
type == Unicode::ENCLOSING_MARK ||
|
|
type == Unicode::CURRENCY_SYMBOL ||
|
|
type == Unicode::SPACE_SEPARATOR ||
|
|
type == Unicode::DASH_PUNCTUATION ||
|
|
type == Unicode::CONTROL ||
|
|
type == Unicode::FORMAT ||
|
|
c2 == '\n' || c2 == '\r' || c2 == 0x2028 || c2 == 0x2029 ||
|
|
c2 == 0x0003 || c2 == 0x00a0 || c2 == 0x2007 || c2 == 0x2011 ||
|
|
c2 == 0xfeff)
|
|
{
|
|
continue;
|
|
}
|
|
// If c1 == hyphen-minus, and ...
|
|
if (c1 == 0x002d && (
|
|
c2 == 0x0021 || // !
|
|
c2 == 0x002c || // ,
|
|
c2 == 0x002d || // -
|
|
c2 == 0x002e || // . (TR 14 class IS)
|
|
c2 == 0x0029 || // )
|
|
c2 == 0x003a || // :
|
|
c2 == 0x003b || // ; (TR 14 class IS)
|
|
c2 == 0x005d || // ]
|
|
c2 == 0x007c || // | (TR 14 class BA, rule 15)
|
|
c2 == 0x007d || // }
|
|
c2 == 0x0903 || // Devanagari sign visarga, combining, what's it doing in this test?
|
|
c2 == 0x093E || // Devanagari , combining, what's it doing in this test?
|
|
c2 == 0x093F || // Devanagari , combining, what's it doing in this test?
|
|
c2 == 0x0940 || // Devanagari , combining, what's it doing in this test?
|
|
c2 == 0x0949 || // Devanagari , combining, what's it doing in this test?
|
|
c2 == 0x0f3b || // Tibetan closing bracket
|
|
c2 == 0x3001 || // CJK closing bracket
|
|
c2 == 0x3002 // CJK closing bracket
|
|
)) {
|
|
continue;
|
|
}
|
|
|
|
e->setText(work);
|
|
UBool saw2 = FALSE;
|
|
for (int l = e->first(); l != BreakIterator::DONE; l = e->next()) {
|
|
if (l == 2) {
|
|
saw2 = TRUE;
|
|
break;
|
|
}
|
|
}
|
|
if (!saw2) {
|
|
// TODO: This test is completely out of sync with the spec. Fix it.
|
|
// errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
|
|
// " and U+" + UCharToUnicodeString(work[2]));
|
|
// errCount++;
|
|
// if (errCount >= 75)
|
|
// return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
delete e;
|
|
}
|
|
|
|
void IntlTestTextBoundary::TestThaiLineBreak() {
|
|
Vector* thaiLineSelection = new Vector();
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
// \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
|
|
// represents elided letters at the end of a long word. It should be bound to
|
|
// the end of the word and not treated as an independent punctuation mark.
|
|
|
|
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e30"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e21"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e08\\u0e49\\u0e32"));
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32"));
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48"));
|
|
// the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e2d\\u0e01"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e23\\u0e48\\u0e07"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e15\\u0e47\\u0e21"));
|
|
|
|
// the one time where the paiyannoi occurs somewhere other than at the end
|
|
// of a word is in the Thai abbrevation for "etc.", which both begins and
|
|
// ends with a paiyannoi
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2f\\u0e25\\u0e2f"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19"));
|
|
|
|
BreakIterator* e = BreakIterator::createLineInstance(
|
|
Locale("th"), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for Thai locale in TestThaiLineBreak.\n");
|
|
return;
|
|
}
|
|
|
|
generalIteratorTest(*e, thaiLineSelection);
|
|
delete e;
|
|
delete thaiLineSelection;
|
|
}
|
|
|
|
void IntlTestTextBoundary::TestMixedThaiLineBreak()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
Vector* thaiLineSelection= new Vector();
|
|
|
|
// Arabic numerals should always be separated from surrounding Thai text
|
|
/*
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));
|
|
thaiLineSelection->addElement("39");
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));
|
|
|
|
// words in non-Thai scripts should always be separated from surrounding Thai text
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e14"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2a\\u0e2d\\u0e1a"));
|
|
thaiLineSelection->addElement("Java");
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e19"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e04\\u0e23\\u0e37\\u0e48\\u0e2d\\u0e07"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21 "));
|
|
|
|
// Thai numerals should always be separated from the text surrounding them
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e04\\u0e48\\u0e32"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e07\\u0e34\\u0e19"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e15\\u0e30"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e14\\u0e31\\u0e1a"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e53\\u0e59"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e1a\\u0e32\\u0e17 "));
|
|
|
|
// Thai text should interact correctly with punctuation and symbols
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e2d\\u0e1a\\u0e35\\u0e40\\u0e2d\\u0e47\\u0e21"));
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28"));
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e17\\u0e22)"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28\\u0e44\\u0e17\\u0e22)"));
|
|
// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e08\\u0e33\\u0e01\\u0e31\\u0e14"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e1b\\u0e34\\u0e14"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e15\\u0e31\\u0e27\""));
|
|
*/
|
|
|
|
// The Unicode Linebreak TR says do not break before or after quotes.
|
|
// So this test is changed ot not break around the quote.
|
|
// TODO: should Thai break around the around the quotes, like the original behavior here?
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));
|
|
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""
|
|
"\\u0e23\\u0e38\\u0e48\\u0e19"));
|
|
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e14\\u0e37\\u0e2d\\u0e19\\u0e21\\u0e34."));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e22."));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e35\\u0e49"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e32\\u0e04\\u0e32"));
|
|
thaiLineSelection->addElement("$200");
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e17\\u0e48\\u0e32"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e19\\u0e31\\u0e49\\u0e19 "));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("(\"\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\")."));
|
|
|
|
BreakIterator* e = BreakIterator::createLineInstance(Locale("th"), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak.\n");
|
|
return;
|
|
}
|
|
|
|
|
|
generalIteratorTest(*e, thaiLineSelection);
|
|
delete e;
|
|
delete thaiLineSelection;
|
|
}
|
|
|
|
|
|
void IntlTestTextBoundary::TestMaiyamok()
|
|
{
|
|
Vector* thaiLineSelection= new Vector();
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
// the Thai maiyamok character is a shorthand symbol that means "repeat the previous
|
|
// word". Instead of appearing as a word unto itself, however, it's kept together
|
|
// with the word before it
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e1b\\u0e46"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e21\\u0e32\\u0e46"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e01\\u0e23\\u0e38\\u0e07\\u0e40\\u0e17\\u0e1e"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e41\\u0e25\\u0e30"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e40\\u0e03\\u0e35\\u0e22\\u0e07"));
|
|
thaiLineSelection->addElement(CharsToUnicodeString("\\u0e43\\u0e2b\\u0e21\\u0e48"));
|
|
|
|
BreakIterator* e = BreakIterator::createLineInstance(
|
|
Locale("th"), status);
|
|
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for Thai locale in TestMaiyamok.\n");
|
|
return;
|
|
}
|
|
generalIteratorTest(*e, thaiLineSelection);
|
|
delete e;
|
|
delete thaiLineSelection;
|
|
}
|
|
|
|
void IntlTestTextBoundary::TestThaiWordBreak() {
|
|
Vector* thaiWordSelection = new Vector();
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1A\\u0E17")); //2
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E35\\u0E48")); //5
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E51")); //6
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E1E\\u0E32\\u0E22\\u0E38")); //10
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E0B\\u0E42\\u0E04\\u0E25\\u0E19")); //16
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u000D\\u000A")); //18
|
|
|
|
// This is the correct result
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35")); //24
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29
|
|
|
|
// and this is what the dictionary does...
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14")); // 20
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E23\\u0E18\\u0E35\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29
|
|
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E22\\u0E39\\u0E48")); //33
|
|
|
|
// This is the correct result
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21")); //37
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E25\\u0E32\\u0E07")); //41
|
|
|
|
// and this is what the dictionary does
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21\\u0E01\\u0E25\\u0E32\\u0E07")); //41
|
|
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E38\\u0E48\\u0E07")); //45
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E2B\\u0E0D\\u0E48")); //49
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E43\\u0E19")); //51
|
|
|
|
// This is the correct result
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A")); //57
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E31\\u0E1A")); //60
|
|
|
|
// and this is what the dictionary does
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19")); // 54
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0B\\u0E31\\u0E2A\\u0E01\\u0E31\\u0E1A")); //60
|
|
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E25\\u0E38\\u0E07")); //63
|
|
|
|
// This is the correct result
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35")); //68
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0A\\u0E32\\u0E27")); //71
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E23\\u0E48")); //74
|
|
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E25\\u0E30")); //77
|
|
|
|
// and this is what the dictionary does
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E")); // 65
|
|
thaiWordSelection->addElement(CharsToUnicodeString("\\u0E19\\u0E23\\u0E35\\u0E0A\\u0E32\\u0E27\\u0E44\\u0E23\\u0E48\\u0E41\\u0E25\\u0E30")); //77
|
|
|
|
BreakIterator* e = BreakIterator::createWordInstance(
|
|
Locale("th"), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for Thai locale in TestThaiWordBreak.\n");
|
|
return;
|
|
}
|
|
|
|
generalIteratorTest(*e, thaiWordSelection);
|
|
delete e;
|
|
delete thaiWordSelection;
|
|
}
|
|
|
|
/**
|
|
* Test Japanese Line Break
|
|
* @bug 4095322
|
|
*/
|
|
void IntlTestTextBoundary::TestJapaneseLineBreak()
|
|
{
|
|
// Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
|
|
// as opening and closing punctuation for line breaking.
|
|
// Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
|
|
// from these tests. 6-13-2002
|
|
//
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
|
|
UnicodeString precedingChars = CharsToUnicodeString(
|
|
//"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
|
|
"([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
|
|
UnicodeString followingChars = CharsToUnicodeString(
|
|
// ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
|
|
")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
|
|
// ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
|
|
":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
|
|
"\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
|
|
BreakIterator *iter = BreakIterator::createLineInstance(Locale::JAPAN, status);
|
|
|
|
int32_t i;
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
|
|
return;
|
|
}
|
|
|
|
for (i = 0; i < precedingChars.length(); i++) {
|
|
testString[1] = precedingChars[i];
|
|
iter->setText(testString);
|
|
int32_t j = iter->first();
|
|
if (j != 0)
|
|
errln("ja line break failure: failed to start at 0");
|
|
j = iter->next();
|
|
if (j != 1)
|
|
errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
|
|
+ "' (" + ((int)(precedingChars[i])) + ")");
|
|
j = iter->next();
|
|
if (j != 3)
|
|
errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
|
|
+ "' (" + ((int)(precedingChars[i])) + ")");
|
|
}
|
|
|
|
for (i = 0; i < followingChars.length(); i++) {
|
|
testString[1] = followingChars[i];
|
|
iter->setText(testString);
|
|
int j = iter->first();
|
|
if (j != 0)
|
|
errln("ja line break failure: failed to start at 0");
|
|
j = iter->next();
|
|
if (j != 2)
|
|
errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
|
|
+ "' (" + ((int)(followingChars[i])) + ")");
|
|
j = iter->next();
|
|
if (j != 3)
|
|
errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
|
|
+ "' (" + ((int)(followingChars[i])) + ")");
|
|
}
|
|
delete iter;
|
|
}
|
|
|
|
//---------------------------------------------
|
|
// other tests
|
|
//---------------------------------------------
|
|
|
|
void IntlTestTextBoundary::TestEmptyString()
|
|
{
|
|
UnicodeString text = "";
|
|
Vector x;
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
x.addElement(text);
|
|
BreakIterator* bi = BreakIterator::createLineInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestEmptyString.\n");
|
|
return;
|
|
}
|
|
generalIteratorTest(*bi, &x);
|
|
|
|
delete bi;
|
|
}
|
|
|
|
void IntlTestTextBoundary::TestGetAvailableLocales()
|
|
{
|
|
int32_t locCount = 0;
|
|
const Locale* locList = BreakIterator::getAvailableLocales(locCount);
|
|
|
|
if (locCount == 0)
|
|
errln("getAvailableLocales() returned an empty list!");
|
|
// Just make sure that it's returning good memory.
|
|
for (int32_t i = 0; i < locCount; ++i) {
|
|
logln(locList[i].getName());
|
|
}
|
|
}
|
|
|
|
//Testing the BreakIterator::getDisplayName() function
|
|
void IntlTestTextBoundary::TestGetDisplayName()
|
|
{
|
|
UnicodeString result;
|
|
|
|
BreakIterator::getDisplayName(Locale::getUS(), result);
|
|
if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
|
|
errln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
|
|
+ result);
|
|
|
|
BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
|
|
if (result != "French (France)")
|
|
errln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
|
|
+ result);
|
|
}
|
|
/**
|
|
* Test End Behaviour
|
|
* @bug 4068137
|
|
*/
|
|
void IntlTestTextBoundary::TestEndBehaviour()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UnicodeString testString("boo.");
|
|
BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestEndBehaviour.\n");
|
|
return;
|
|
}
|
|
wb->setText(testString);
|
|
|
|
if (wb->first() != 0)
|
|
errln("Didn't get break at beginning of string.");
|
|
if (wb->next() != 3)
|
|
errln("Didn't get break before period in \"boo.\"");
|
|
if (wb->current() != 4 && wb->next() != 4)
|
|
errln("Didn't get break at end of string.");
|
|
delete wb;
|
|
}
|
|
/*
|
|
* @bug 4153072
|
|
*/
|
|
void IntlTestTextBoundary::TestBug4153072() {
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestBug4153072\n");
|
|
return;
|
|
}
|
|
UnicodeString str("...Hello, World!...");
|
|
int32_t begin = 3;
|
|
int32_t end = str.length() - 3;
|
|
UBool dummy;
|
|
|
|
StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
|
|
iter->adoptText(textIterator);
|
|
for (int index = -1; index < begin + 1; ++index) {
|
|
dummy = iter->isBoundary(index);
|
|
if (index < begin && dummy == TRUE) {
|
|
errln((UnicodeString)"Didn't handle preceeding correctly with offset = " + index +
|
|
" and begin index = " + begin);
|
|
}
|
|
}
|
|
delete iter;
|
|
}
|
|
|
|
/*
|
|
* Test Preceding()
|
|
*/
|
|
void IntlTestTextBoundary::TestPreceding()
|
|
{
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
UnicodeString words3("aaa bbb ccc");
|
|
BreakIterator* e = BreakIterator::createWordInstance(Locale::getDefault(), status);
|
|
if (U_FAILURE(status))
|
|
{
|
|
errln("Failed to create the BreakIterator for default locale in TestPreceeding.\n");
|
|
return;
|
|
}
|
|
|
|
e->setText( words3 );
|
|
e->first();
|
|
int32_t p1 = e->next();
|
|
int32_t p2 = e->next();
|
|
int32_t p3 = e->next();
|
|
int32_t p4 = e->next();
|
|
|
|
int32_t f = e->following(p2+1);
|
|
int32_t p = e->preceding(p2+1);
|
|
if (f!=p3)
|
|
errln("IntlTestTextBoundary::TestPreceding: f!=p3");
|
|
if (p!=p2)
|
|
errln("IntlTestTextBoundary::TestPreceding: p!=p2");
|
|
|
|
if (p1+1!=p2)
|
|
errln("IntlTestTextBoundary::TestPreceding: p1+1!=p2");
|
|
|
|
if (p3+1!=p4)
|
|
errln("IntlTestTextBoundary::TestPreceding: p3+1!=p4");
|
|
|
|
if (!e->isBoundary(p2) || e->isBoundary(p2+1) || !e->isBoundary(p3))
|
|
{
|
|
errln("IntlTestTextBoundary::TestPreceding: isBoundary err");
|
|
}
|
|
delete e;
|
|
}
|
|
//---------------------------------------------
|
|
// runIndexedTest
|
|
//---------------------------------------------
|
|
|
|
/**
 * Test-suite dispatcher: maps a test index to its name and, when exec is
 * true, runs the corresponding test method.
 *
 * @param index  zero-based test index
 * @param exec   if true, log the suite banner and run the selected test
 * @param name   receives the test's name, or "" when index is out of range
 *               (the empty name is what ends the caller's enumeration loop)
 */
void IntlTestTextBoundary::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    typedef void (IntlTestTextBoundary::*TestMethod)();
    struct TestEntry {
        const char* fName;
        TestMethod  fMethod;
    };

    // Order matters: the position in this table is the test index.
    static const TestEntry kTests[] = {
        { "TestSentenceIteration",   &IntlTestTextBoundary::TestSentenceIteration },
        { "TestWordIteration",       &IntlTestTextBoundary::TestWordIteration },
        { "TestLineIteration",       &IntlTestTextBoundary::TestLineIteration },
        { "TestCharacterIteration",  &IntlTestTextBoundary::TestCharacterIteration },
        { "TestSentenceInvariants",  &IntlTestTextBoundary::TestSentenceInvariants },
        { "TestWordInvariants",      &IntlTestTextBoundary::TestWordInvariants },
        { "TestLineInvariants",      &IntlTestTextBoundary::TestLineInvariants },
        { "TestCharacterInvariants", &IntlTestTextBoundary::TestCharacterInvariants },

        { "TestEmptyString",         &IntlTestTextBoundary::TestEmptyString },
        { "TestGetAvailableLocales", &IntlTestTextBoundary::TestGetAvailableLocales },
        { "TestGetDisplayName",      &IntlTestTextBoundary::TestGetDisplayName },
        { "TestPreceding",           &IntlTestTextBoundary::TestPreceding },
        { "TestBug4153072",          &IntlTestTextBoundary::TestBug4153072 },
        { "TestEndBehaviour",        &IntlTestTextBoundary::TestEndBehaviour },

        { "TestJapaneseLineBreak",   &IntlTestTextBoundary::TestJapaneseLineBreak },
        { "TestThaiLineBreak",       &IntlTestTextBoundary::TestThaiLineBreak },
        { "TestMixedThaiLineBreak",  &IntlTestTextBoundary::TestMixedThaiLineBreak },
        { "TestMaiyamok",            &IntlTestTextBoundary::TestMaiyamok },
        { "TestThaiWordBreak",       &IntlTestTextBoundary::TestThaiWordBreak }
    };
    const int32_t kTestCount = (int32_t)(sizeof(kTests) / sizeof(kTests[0]));

    if (exec) logln("TestSuite TextBoundary: ");

    if (index < 0 || index >= kTestCount) {
        name = "";      // needed to end loop
        return;
    }

    name = kTests[index].fName;
    if (exec) {
        (this->*(kTests[index].fMethod))();
    }
}
|
|
|
|
//---------------------------------------------
|
|
// Test implementation routines
|
|
//---------------------------------------------
|
|
|
|
// general test Implementation subroutines
|
|
/**
 * Runs the general battery of iterator checks for one expected segmentation.
 *
 * Builds the test text from the expected fragments via createTestData(),
 * collects the fragments produced by forward and backward iteration, compares
 * them with each other and (if they agree) with the expected list, then
 * exercises following(), preceding(), isBoundary() and next(n) against the
 * boundary offsets derived from the expected fragments.
 *
 * @param bi              iterator under test; its text is replaced
 * @param expectedResult  expected fragments, in order; not modified or freed
 */
void IntlTestTextBoundary::generalIteratorTest(BreakIterator& bi, Vector* expectedResult)
{
    Enumeration *elems = expectedResult->elements();
    UnicodeString text = createTestData(elems);
    delete elems;

    logln("comparing forward and backward...");
    bi.setText(text);

    Vector *nextResults = testFirstAndNext(bi, text);
    if (nextResults == NULL) {
        errln("Couldn't get nextResults!");
        return;
    }

    Vector *previousResults = testLastAndPrevious(bi, text);

    if (previousResults == NULL) {
        errln("Couldn't get previousResults!");
        delete nextResults;     // don't leak the forward results on this exit path
        return;
    }

    // Only compare against the expected list if forward and backward
    // iteration agreed, i.e. the first comparison added no new errors.
    int errs = getErrors();
    UnicodeString str1="forward iteration";
    UnicodeString str2="backward iteration";
    compareFragmentLists(str1, str2, nextResults,
                    previousResults);
    if (getErrors() == errs) {
        logln("comparing expected and actual...");
        str1="expected result";
        str2="actual result";
        compareFragmentLists(str1, str2, expectedResult,
                        nextResults);
    }

    // Boundary table layout: [DONE, 0, cumulative fragment ends..., DONE].
    // The leading and trailing DONE entries are the sentinels the
    // following/preceding helpers expect at either end.
    int32_t *boundaries = new int32_t[expectedResult->size() + 3];
    boundaries[0] = BreakIterator::DONE;
    boundaries[1] = 0;
    for (int i = 0; i < expectedResult->size(); i++)
        boundaries[i + 2] = boundaries[i + 1] + ((UnicodeString)expectedResult->elementAt(i)).
                        length();

    int len = expectedResult->size() + 3 -1;
    boundaries[len] = BreakIterator::DONE;

    testFollowing(bi, text, boundaries);
    testPreceding(bi, text, boundaries);
    testIsBoundary(bi, text, boundaries);

    doMultipleSelectionTest(bi, text);

    delete nextResults;
    delete previousResults;
    delete []boundaries;
}
|
|
|
|
/**
 * Walks the iterator forward with first()/next() and collects the text
 * between consecutive boundaries.
 *
 * Reports an error if first() is not 0, if next() fails to advance, or if
 * DONE is returned before the end of the text.
 *
 * @param bi    iterator under test (text already set by the caller)
 * @param text  the text the iterator was set to
 * @return a newly allocated Vector of fragments owned by the caller, or NULL
 *         if next() did not move forward (nothing is leaked in that case)
 */
Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString& text)
{
    int32_t p = bi.first();
    int32_t lastP = p;
    Vector *result = new Vector();
    UnicodeString selection;

    if (p != 0)
        errln((UnicodeString)"first() returned " + p + (UnicodeString)" instead of 0");
    while (p != BreakIterator::DONE) {
        p = bi.next();
        if (p != BreakIterator::DONE) {
            if (p <= lastP) {
                errln((UnicodeString)"next() failed to move forward: next() on position "
                                + lastP + (UnicodeString)" yielded " + p);
                errln("Are the *.brk files corrupt?");
                delete result;  // release the partial result before bailing out
                return NULL;
            }

            text.extractBetween(lastP, p, selection);
            result->addElement(selection);
        }
        else {
            // Iteration finished: the last real boundary must be the text end.
            if (lastP != text.length())
                errln((UnicodeString)"next() returned DONE prematurely: offset was "
                                + lastP + (UnicodeString)" instead of " + text.length());
        }
        lastP = p;
    }
    return result;
}
|
|
|
|
/**
 * Walks the iterator backward with last()/previous() and collects the text
 * between consecutive boundaries, in reading order.
 *
 * Reports an error if last() is not the text length, if previous() fails to
 * move backward, or if DONE is returned before offset 0 is reached.
 *
 * @param bi    iterator under test (text already set by the caller)
 * @param text  the text the iterator was set to
 * @return a newly allocated Vector of fragments; the caller owns it
 */
Vector* IntlTestTextBoundary::testLastAndPrevious(BreakIterator& bi, UnicodeString& text)
{
    int32_t pos = bi.last();
    int32_t prevPos = pos;
    Vector *fragments = new Vector();
    UnicodeString piece;

    if (pos != text.length()) {
        errln((UnicodeString)"last() returned " + pos + (UnicodeString)" instead of " + text.length());
    }

    while (pos != BreakIterator::DONE) {
        pos = bi.previous();
        if (pos == BreakIterator::DONE) {
            // Iteration finished: the last real boundary must be offset 0.
            if (prevPos != 0) {
                errln((UnicodeString)"previous() returned DONE prematurely: offset was "
                                + prevPos + (UnicodeString)" instead of 0");
            }
        }
        else {
            if (pos >= prevPos) {
                errln((UnicodeString)"previous() failed to move backward: previous() on position "
                                + prevPos + (UnicodeString)" yielded " + pos);
            }
            // Insert at the front so the fragments end up in forward order.
            text.extractBetween(pos, prevPos, piece);
            fragments->insertElementAt(piece, 0);
        }
        prevPos = pos;
    }
    return fragments;
}
|
|
|
|
/**
 * Compares two fragment lists element by element, logging matching fragments
 * and, on a mismatch, logging the differing fragments from both lists and
 * reporting an error.
 *
 * @param f1Name  label for the first list, used in log/error output
 * @param f2Name  label for the second list, used in log/error output
 * @param f1      first fragment list
 * @param f2      second fragment list
 */
void IntlTestTextBoundary::compareFragmentLists(UnicodeString& f1Name, UnicodeString& f2Name, Vector* f1, Vector* f2)
{
    int32_t idx1 = 0;       // next fragment to examine in f1
    int32_t idx2 = 0;       // next fragment to examine in f2
    UnicodeString frag1;
    UnicodeString frag2;
    int32_t total1 = 0;     // running character total for f1
    int32_t total2 = 0;     // running character total for f2
    UnicodeString target;

    while (idx1 < f1->size() && idx2 < f2->size()) {
        frag1 = (UnicodeString)f1->elementAt(idx1);
        frag2 = (UnicodeString)f2->elementAt(idx2);
        total1 += frag1.length();
        total2 += frag2.length();

        if (frag1.compare(frag2) == 0) {
            logln(prettify((UnicodeString)" >" + frag1 + (UnicodeString)"<", target));
            ++idx1;
            ++idx2;
            continue;
        }

        // Mismatch: scan ahead in whichever list is behind until the running
        // totals line up again (or one list runs out).
        int32_t scanTotal1 = total1;
        int32_t scanTotal2 = total2;
        int32_t scanIdx1 = idx1;
        int32_t scanIdx2 = idx2;

        while (scanTotal1 != scanTotal2 && scanIdx1 < f1->size() && scanIdx2 < f2->size()) {
            for (; scanTotal1 < scanTotal2 && scanIdx1 < f1->size(); ++scanIdx1) {
                scanTotal1 += ((UnicodeString)f1->elementAt(scanIdx1)).length();
            }
            for (; scanTotal2 < scanTotal1 && scanIdx2 < f2->size(); ++scanIdx2) {
                scanTotal2 += ((UnicodeString)f2->elementAt(scanIdx2)).length();
            }
        }

        // Dump the disagreeing stretch of each list.
        // NOTE(review): the length of the first mismatching fragment was
        // already added to the running totals above and is added again here;
        // looks like double counting -- confirm before relying on the totals.
        logln((UnicodeString)"*** " + f1Name + (UnicodeString)" has:");
        for (; idx1 <= scanIdx1 && idx1 < f1->size(); ++idx1) {
            frag1 = (UnicodeString)f1->elementAt(idx1);
            total1 += frag1.length();
            logln(prettify((UnicodeString)" *** >" + frag1 + (UnicodeString)"<", target));
        }

        logln("***** " + f2Name + " has:");
        for (; idx2 <= scanIdx2 && idx2 < f2->size(); ++idx2) {
            frag2 = (UnicodeString)f2->elementAt(idx2);
            total2 += frag2.length();
            logln(prettify(" ***** >" + frag2 + "<", target));
        }

        errln((UnicodeString)"Discrepancy between " + f1Name + (UnicodeString)" and " + f2Name);
    }
}
|
|
|
|
/**
 * Checks bi.following(i) for every offset 0..text.length() against the
 * expected boundary table.
 *
 * @param bi          iterator under test
 * @param text        the text the iterator was set to
 * @param boundaries  expected boundaries, bracketed by DONE at both ends
 */
void IntlTestTextBoundary::testFollowing(BreakIterator& bi, UnicodeString& text, int32_t *boundaries)
{
    logln("testFollowing():");

    // 'next' indexes the first expected boundary strictly after the offset
    // being probed; slot 0 is the DONE sentinel and slot 1 is offset 0.
    int next = 2;
    int32_t limit = text.length();

    int offset = 0;
    while (offset <= limit) {
        if (boundaries[next] == offset) {
            ++next;
        }

        int32_t actual = bi.following(offset);
        logln((UnicodeString)"bi.following(" + offset + ") -> " + actual);
        if (actual != boundaries[next]) {
            errln((UnicodeString)"Wrong result from following() for " + offset + (UnicodeString)": expected " + boundaries[next]
                            + (UnicodeString)", got " + actual);
        }
        ++offset;
    }
}
|
|
|
|
/**
 * Checks bi.preceding(i) for every offset 0..text.length() against the
 * expected boundary table.
 *
 * @param bi          iterator under test
 * @param text        the text the iterator was set to
 * @param boundaries  expected boundaries, bracketed by DONE at both ends
 */
void IntlTestTextBoundary::testPreceding(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
    logln("testPreceding():");

    // 'prev' indexes the last expected boundary strictly before the offset
    // being probed; it starts on the leading DONE sentinel.
    int prev = 0;
    int32_t limit = text.length();

    int offset = 0;
    while (offset <= limit) {
        int32_t actual = bi.preceding(offset);
        logln((UnicodeString)"bi.preceding(" + offset + ") -> " + actual);
        if (actual != boundaries[prev]) {
            errln((UnicodeString)"Wrong result from preceding() for " + offset + (UnicodeString)": expected " + boundaries[prev]
                            + (UnicodeString)", got " + actual);
        }

        // Once the probe reaches the next boundary, that boundary becomes
        // the expected answer for the following offsets.
        if (boundaries[prev + 1] == offset) {
            ++prev;
        }
        ++offset;
    }
}
|
|
|
|
/**
 * Checks bi.isBoundary(i) for every offset 0..text.length()-1 against the
 * expected boundary table.
 *
 * @param bi          iterator under test
 * @param text        the text the iterator was set to
 * @param boundaries  expected boundaries, bracketed by DONE at both ends
 */
void IntlTestTextBoundary::testIsBoundary(BreakIterator& bi, UnicodeString& text, int32_t *boundaries) {
    logln("testIsBoundary():");

    int expectedIdx = 1;    // slot 0 is the DONE sentinel
    UBool reported;
    int32_t limit = text.length();

    int offset = 0;
    while (offset < limit) {
        reported = bi.isBoundary(offset);
        logln((UnicodeString)"bi.isBoundary(" + offset + ") -> " + reported);

        if (offset != boundaries[expectedIdx]) {
            // Not an expected boundary: the iterator must agree.
            if (reported) {
                errln((UnicodeString)"Wrong result from isBoundary() for " + offset + (UnicodeString)": expected false, got true");
            }
        }
        else {
            if (!reported) {
                errln((UnicodeString)"Wrong result from isBoundary() for " + offset + (UnicodeString)": expected true, got false");
            }
            expectedIdx++;
        }
        ++offset;
    }
}
|
|
|
|
/**
 * Cross-checks single-step iteration against the multi-step next(n) call,
 * using the supplied iterator and a clone of it over the same text.
 *
 * Forward pass: the supplied iterator walks the text with next() while the
 * clone is reset with first() and asked for next(count); both must land on
 * the same offset at every step. Along the way the clone is checked to
 * compare equal to the original right after cloning, and unequal once the
 * original has been stepped past it.
 *
 * Backward pass: the supplied iterator walks back from last() with
 * previous() while the clone is reset with last() and asked for next(count)
 * with a zero or negative count; again both must agree.
 *
 * All failures are reported through errln().
 *
 * @param iterator  iterator under test; its text is replaced with testText
 * @param testText  text to iterate over
 */
void IntlTestTextBoundary::doMultipleSelectionTest(BreakIterator& iterator,
                                                   UnicodeString& testText)
{
    iterator.setText(testText);

    BreakIterator* testIterator = iterator.clone();
    if (testIterator == 0) {
        // Nothing below can run without the second iterator; report the
        // failure instead of dereferencing a null pointer.
        errln("clone() failed: returned a null iterator");
        return;
    }

    int32_t offset = iterator.first();
    int32_t testOffset;
    int32_t count = 0;

    logln("doMultipleSelectionTest text of length: %d", testText.length());

    if (*testIterator != iterator)
        errln("clone() or operator!= failed: two clones compared unequal");

    do {
        // first() is called only to reposition the clone; its result is
        // replaced by the next(count) result that is actually compared.
        testOffset = testIterator->first();
        testOffset = testIterator->next(count);
        if (offset != testOffset)
            errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);

        if (offset != BreakIterator::DONE) {
            count++;
            offset = iterator.next();

            // The original has now moved one step beyond the clone, so the
            // two must no longer compare equal.
            if (offset != BreakIterator::DONE && *testIterator == iterator)
                errln("operator== failed: Two unequal iterators compared equal.");
        }
    } while (offset != BreakIterator::DONE);

    // now do it backwards...
    offset = iterator.last();
    count = 0;

    do {
        // last() repositions the clone; next(count) with count <= 0 then
        // steps it backwards by -count boundaries.
        testOffset = testIterator->last();
        testOffset = testIterator->next(count);
        if (offset != testOffset)
            errln(UnicodeString("next(n) and previous() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and previous() had " + offset);

        if (offset != BreakIterator::DONE) {
            count--;
            offset = iterator.previous();
        }
    } while (offset != BreakIterator::DONE);

    delete testIterator;
}
|
|
|
|
/**
 * Invariant check: a break must be reported at offset 2 of every
 * three-character string whose middle character is CR, LF, U+2029 or U+2028,
 * for every combination of first and last character drawn from testChars.
 *
 * Two kinds of combination are skipped rather than checked: CR followed by
 * LF, U+2029, U+2028 or U+0003, and a control character followed by a
 * non-spacing, enclosing or combining-spacing mark.
 *
 * Each missing break is reported through errln(); the function gives up
 * after 75 reported errors.
 *
 * @param tb         iterator under test; its text is replaced repeatedly
 * @param testChars  characters to use as the first and last character
 */
void IntlTestTextBoundary::doBreakInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    // a break should always occur after CR (unless followed by LF), LF, PS, and LS
    UnicodeString breaks = CharsToUnicodeString("\r\n\\u2029\\u2028");
    UnicodeString work("aaa");

    const int32_t breaksLen = breaks.length();
    const int32_t testCharsLen = testChars.length();
    int32_t errCount = 0;

    for (int32_t breakIdx = 0; breakIdx < breaksLen; breakIdx++) {
        UChar c1 = work[1] = breaks[breakIdx];

        for (int32_t leadIdx = 0; leadIdx < testCharsLen; leadIdx++) {
            work[0] = testChars[leadIdx];

            for (int trailIdx = 0; trailIdx < testCharsLen; trailIdx++) {
                UChar c2 = work[2] = testChars[trailIdx];

                // if a cr is followed by lf, ps, ls or etx, don't do the check (that's
                // not supposed to work)
                if (c1 == '\r') {
                    if (c2 == '\n' || c2 == 0x2029 || c2 == 0x2028 || c2 == 0x0003) {
                        continue;
                    }
                }

                // Combining marks don't combine with controls.
                // TODO: enhance test to verify that the break actually occurs,
                //       not just ignore the case.
                if (u_charType(c1) == U_CONTROL_CHAR) {
                    if (u_charType(c2) == U_NON_SPACING_MARK ||
                        u_charType(c2) == U_ENCLOSING_MARK ||
                        u_charType(c2) == U_COMBINING_SPACING_MARK) {
                        continue;
                    }
                }

                tb.setText(work);

                // Walk the boundaries until offset 2 turns up or the
                // iterator runs out.
                int32_t pos = tb.first();
                while (pos != BreakIterator::DONE && pos != 2) {
                    pos = tb.next();
                }

                if (pos != 2) {
                    errln("No break between U+" + UCharToUnicodeString(c1)
                          + " and U+" + UCharToUnicodeString(c2));
                    errCount++;
                    if (errCount >= 75) {
                        return;
                    }
                }
            }
        }
    }
}
|
|
|
|
/**
 * Invariant checks for places where a break must NOT be reported.
 *
 * Part 1: for every pair of characters (x, y) drawn from testChars, the
 * string x CR LF y must not have a boundary at offset 2, i.e. between the
 * CR and the LF.
 *
 * Part 2: for every character c1 from testChars that is not CR, LF, U+2029,
 * U+2028, U+0003 or a control character, and every c2 from testChars that is
 * a non-spacing or enclosing mark, the string "a" c1 c2 "a" must not have a
 * boundary at offset 2, i.e. between c1 and the mark.
 *
 * Each violation is reported through errln(); the function gives up after 75
 * reported errors (counted across both parts).
 *
 * @param tb         iterator under test; its text is replaced repeatedly
 * @param testChars  characters to combine
 */
void IntlTestTextBoundary::doOtherInvariantTest(BreakIterator& tb, UnicodeString& testChars)
{
    UnicodeString work("a\r\na");
    int32_t errCount = 0, testCharsLen = testChars.length();
    int32_t i, j;

    // a break should never occur between CR and LF
    for (i = 0; i < testCharsLen; i++) {
        work[0] = testChars[i];
        for (j = 0; j < testCharsLen; j++) {
            work[3] = testChars[j];
            tb.setText(work);
            for (int32_t k = tb.first(); k != BreakIterator::DONE; k = tb.next()) {
                if (k == 2) {
                    errln("Break between CR and LF in string U+" + UCharToUnicodeString(work[0]) +
                          ", U+d U+a U+" + UCharToUnicodeString(work[3]));
                    errCount++;
                    if (errCount >= 75)
                        return;
                }
            }
        }
    }

    // a break should never occur before a non-spacing mark, unless the preceding
    // character is CR, LF, PS, or LS
    // Or the general category == Control.
    work.remove();
    work += "aaaa";
    for (i = 0; i < testCharsLen; i++) {
        UChar c1 = testChars[i];
        if (c1 == '\n' || c1 == '\r' || c1 == 0x2029 || c1 == 0x2028 || c1 == 0x0003 ||
            u_charType(c1) == U_CONTROL_CHAR) {
            continue;
        }
        work[1] = c1;
        for (j = 0; j < testCharsLen; j++) {
            UChar c2 = testChars[j];

            // Classify through u_charType()/U_* like the control-character
            // test above, instead of the legacy Unicode:: wrapper class.
            int8_t type = u_charType(c2);
            if ((type != U_NON_SPACING_MARK) &&
                (type != U_ENCLOSING_MARK)) {
                continue;
            }
            work[2] = c2;
            tb.setText(work);
            for (int k = tb.first(); k != BreakIterator::DONE; k = tb.next()) {
                if (k == 2) {
                    errln("Break between U+" + UCharToUnicodeString(work[1])
                          + " and U+" + UCharToUnicodeString(work[2]));
                    errCount++;
                    if (errCount >= 75)
                        return;
                }
            }
        }
    }
}
|
|
|
|
/**
 * Logs every segment the iterator finds in the text, one line per segment,
 * in the form [start,end] "substring", preceded by a header line carrying
 * the title and the text length. The verbose flag is forced on for the
 * duration of the call and restored before returning.
 *
 * @param tb     iterator to run; its text is replaced with text
 * @param text   text to segment
 * @param title  label included in the header line
 */
void IntlTestTextBoundary::sample(BreakIterator& tb,
                                  UnicodeString& text,
                                  UnicodeString& title)
{
    // Remember the caller's setting so it can be put back at the end.
    UBool savedVerbose = verbose;
    verbose = TRUE;

    logln("-------------------------"+title+" length = "+text.length());
    tb.setText(text);

    UnicodeString piece;
    int32_t segStart = tb.first();
    int32_t segEnd = tb.next();
    while (segEnd != BreakIterator::DONE) {
        text.extractBetween(segStart, segEnd, piece);
        logln(UnicodeString("[")+segStart+","+segEnd+"] \""+piece+"\"");
        segStart = segEnd;
        segEnd = tb.next();
    }

    verbose = savedVerbose;
}
|
|
|
|
|
|
|