2000-01-15 02:00:06 +00:00
/********************************************************************
* COPYRIGHT :
2001-03-21 20:09:56 +00:00
* Copyright ( c ) 1997 - 2001 , International Business Machines Corporation and
2000-01-15 02:00:06 +00:00
* others . All Rights Reserved .
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
1999-08-16 21:50:52 +00:00
# include "intltest.h"
1999-12-28 23:57:50 +00:00
# include "unicode/brkiter.h"
2002-06-25 17:23:07 +00:00
# include "unicode/uchar.h"
1999-08-16 21:50:52 +00:00
# include <stdio.h>
//#include "txbdapi.h" // BreakIteratorAPIC
//--------------------------------------------------------------------------------------
/**
* " Vector " class for holding test tables
* ( this class is actually a linked list , but we use the name and API of the
* java . util . Vector class to keep as much of our test code as possible the same . )
*/
class Enumeration { // text enumeration
public :
2000-05-18 22:08:39 +00:00
virtual UBool hasMoreElements ( ) = 0 ;
1999-08-16 21:50:52 +00:00
virtual UnicodeString nextElement ( ) = 0 ;
} ;
class Vector { // text vector
public :
class TextLink {
public :
TextLink ( ) : fLink ( 0 ) , fText ( ) { }
TextLink ( TextLink * link , UnicodeString text ) : fLink ( link ) , fText ( text ) { }
TextLink * fLink ;
UnicodeString fText ;
} ;
public :
TextLink fBase ;
TextLink * fEnd ;
int32_t fSize ;
public :
class VectorEnumeration : public Enumeration {
public :
VectorEnumeration ( Vector * vector ) : fVector ( vector ) , fPos ( & vector - > fBase ) { }
2000-05-18 22:08:39 +00:00
UBool hasMoreElements ( ) { return fPos - > fLink ! = & fVector - > fBase ; }
1999-08-16 21:50:52 +00:00
UnicodeString nextElement ( ) { fPos = fPos - > fLink ; return fPos - > fText ; }
Vector * fVector ;
TextLink * fPos ;
} ;
Vector ( ) : fBase ( ) , fEnd ( & fBase ) , fSize ( 0 ) { fBase . fLink = & fBase ; }
~ Vector ( ) {
while ( fBase . fLink ! = & fBase ) {
TextLink * link = fBase . fLink ;
fBase . fLink = link - > fLink ;
delete link ;
}
}
void addElement ( UnicodeString text ) { fEnd - > fLink = new TextLink ( & fBase , text ) ; fEnd = fEnd - > fLink ; + + fSize ; }
2000-01-08 02:05:05 +00:00
void insertElementAt ( UnicodeString text , int pos ) {
2000-08-10 00:28:31 +00:00
if ( pos > = fSize | | pos < 0 )
;
else if ( pos = = 0 ) {
2000-01-08 02:05:05 +00:00
TextLink * insert = new TextLink ( & fBase , text ) ;
2000-08-10 00:28:31 +00:00
insert - > fLink = fBase . fLink ;
+ + fSize ;
fBase . fLink = insert ;
}
else {
2000-01-08 02:05:05 +00:00
TextLink * link = fBase . fLink ;
2000-08-10 00:28:31 +00:00
while ( - - pos > 0 )
2000-01-08 02:05:05 +00:00
link = link - > fLink ;
TextLink * insert = new TextLink ( & fBase , text ) ;
2000-08-10 00:28:31 +00:00
insert - > fLink = link - > fLink ;
2000-01-08 02:05:05 +00:00
link - > fLink = insert ;
2000-08-10 00:28:31 +00:00
+ + fSize ;
2000-01-08 02:05:05 +00:00
2000-08-10 00:28:31 +00:00
}
1999-08-16 21:50:52 +00:00
2000-01-08 02:05:05 +00:00
}
UnicodeString elementAt ( int32_t pos ) {
1999-08-16 21:50:52 +00:00
if ( pos > = fSize )
return UnicodeString ( ) ;
TextLink * link = fBase . fLink ;
while ( pos - - > 0 ) link = link - > fLink ;
return link - > fText ;
}
UnicodeString lastElement ( ) { return fEnd = = & fBase ? UnicodeString ( ) : fEnd - > fText ; }
int32_t size ( ) { return fSize ; }
Enumeration * elements ( ) { return new VectorEnumeration ( this ) ; }
} ;
//--------------------------------------------------------------------------------------
/**
 * IntlTestTextBoundary is a medium top-level test class for everything in the "findword" directory.
 */
1999-12-28 23:57:50 +00:00
# include "unicode/utypes.h"
1999-08-16 21:50:52 +00:00
# include "ittxtbd.h"
# include <string.h>
1999-12-28 23:57:50 +00:00
# include "unicode/schriter.h"
1999-08-16 21:50:52 +00:00
// Table of UChar values from which the constructor builds cannedTestChars.
// The final 0x0000 ends the table; the constructor passes the array to a
// UnicodeString constructor without a length, presumably relying on that
// terminator (confirm against the UnicodeString API).
const UChar IntlTestTextBoundary : : cannedTestArray [ ] = {
2000-06-28 22:34:25 +00:00
0x0001 , 0x0002 , 0x0003 , 0x0004 , 0x0020 , 0x0021 , ' \\ ' , 0x0022 , 0x0023 , 0x0024 , 0x0025 , 0x0026 , 0x0028 , 0x0029 , 0x002b , 0x002d , 0x0030 , 0x0031 ,
0x0032 , 0x0033 , 0x0034 , 0x003c , 0x003d , 0x003e , 0x0041 , 0x0042 , 0x0043 , 0x0044 , 0x0045 , 0x005b , 0x005d , 0x005e , 0x005f , 0x0060 , 0x0061 , 0x0062 , 0x0063 , 0x0064 , 0x0065 , 0x007b ,
0x007d , 0x007c , 0x002c , 0x00a0 , 0x00a2 ,
1999-08-16 21:50:52 +00:00
0x00a3 , 0x00a4 , 0x00a5 , 0x00a6 , 0x00a7 , 0x00a8 , 0x00a9 , 0x00ab , 0x00ad , 0x00ae , 0x00af , 0x00b0 , 0x00b2 , 0x00b3 ,
0x00b4 , 0x00b9 , 0x00bb , 0x00bc , 0x00bd , 0x02b0 , 0x02b1 , 0x02b2 , 0x02b3 , 0x02b4 , 0x0300 , 0x0301 , 0x0302 , 0x0303 ,
0x0304 , 0x05d0 , 0x05d1 , 0x05d2 , 0x05d3 , 0x05d4 , 0x0903 , 0x093e , 0x093f , 0x0940 , 0x0949 , 0x0f3a , 0x0f3b , 0x2000 ,
0x2001 , 0x2002 , 0x200c , 0x200d , 0x200e , 0x200f , 0x2010 , 0x2011 , 0x2012 , 0x2028 , 0x2029 , 0x202a , 0x203e , 0x203f ,
0x2040 , 0x20dd , 0x20de , 0x20df , 0x20e0 , 0x2160 , 0x2161 , 0x2162 , 0x2163 , 0x2164 , 0x0000
} ;
// Static string built from cannedTestArray; allocated in the constructor and
// deleted in the destructor. Starts out null.
UnicodeString * IntlTestTextBoundary : : cannedTestChars = 0 ;
//---------------------------------------------
// setup methods
//---------------------------------------------
/**
 * Builds the canned character string and the four selection-data tables.
 *
 * cannedTestChars is a static member, so it is shared by every instance of
 * this class.  Any string left behind by an earlier instance is released
 * before a fresh one is allocated; together with the destructor resetting
 * the pointer to null this keeps the static from leaking or dangling when
 * instances are created one after another.
 */
IntlTestTextBoundary::IntlTestTextBoundary()
{
    // Safe on the first construction as well: the static starts out null.
    delete cannedTestChars;
    cannedTestChars = 0;

    UnicodeString temp(cannedTestArray);
    cannedTestChars = new UnicodeString();
    // U+0000 is appended explicitly ahead of the table contents.
    *cannedTestChars += (UChar)0x0000;
    *cannedTestChars += temp;

    addTestWordData();
    addTestSentenceData();
    addTestLineData();
    addTestCharacterData();
}

/**
 * Releases the four selection-data tables and the shared canned string.
 */
IntlTestTextBoundary::~IntlTestTextBoundary()
{
    delete wordSelectionData;
    delete sentenceSelectionData;
    delete lineSelectionData;
    delete characterSelectionData;

    // Reset the static after freeing it so that a later constructor or
    // destructor run never deletes or dereferences a stale pointer.
    delete cannedTestChars;
    cannedTestChars = 0;
}
/**
 * Allocates wordSelectionData and fills it, in order, with the text pieces
 * that TestWordIteration passes to generalIteratorTest.
 * NOTE(review): each element is presumably one expected word-break segment of
 * the concatenated text - confirm against generalIteratorTest.
 * The order of the addElement calls is significant; do not reorder them.
 *
 * @bug 4097779 4098467 4117554
 */
void IntlTestTextBoundary : : addTestWordData ( )
{
wordSelectionData = new Vector ( ) ;
wordSelectionData - > addElement ( " 12,34 " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( UCharToUnicodeString ( ( UChar ) ( 0x00A2 ) ) ) ; //cent sign
wordSelectionData - > addElement ( UCharToUnicodeString ( ( UChar ) ( 0x00A3 ) ) ) ; //pound sign
wordSelectionData - > addElement ( UCharToUnicodeString ( ( UChar ) ( 0x00A4 ) ) ) ; //currency sign
wordSelectionData - > addElement ( UCharToUnicodeString ( ( UChar ) ( 0x00A5 ) ) ) ; //yen sign
2002-06-25 17:23:07 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " alpha \\ u00adbeta \\ u00adgamma " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " . " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " Badges " ) ;
wordSelectionData - > addElement ( " ? " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " BADGES " ) ;
wordSelectionData - > addElement ( " ! " ) ;
wordSelectionData - > addElement ( " ? " ) ;
wordSelectionData - > addElement ( " ! " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " We " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " don't " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " need " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " no " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " STINKING " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " BADGES " ) ;
wordSelectionData - > addElement ( " ! " ) ;
wordSelectionData - > addElement ( " ! " ) ;
wordSelectionData - > addElement ( " ! " ) ;
// numbers with embedded '.' and ',' separators
wordSelectionData - > addElement ( " 012.566,5 " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " 123.3434,900 " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " 1000,233,456.000 " ) ;
wordSelectionData - > addElement ( " " ) ;
2002-08-09 03:14:43 +00:00
wordSelectionData - > addElement ( " 1,23.322 " ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " 123.1222 " ) ;
wordSelectionData - > addElement ( " " ) ;
2002-08-09 03:14:43 +00:00
wordSelectionData - > addElement ( " $ " ) ;
wordSelectionData - > addElement ( " 123,000.20 " ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2002-08-09 03:14:43 +00:00
wordSelectionData - > addElement ( " 179.01 " ) ;
wordSelectionData - > addElement ( " % " ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " Hello " ) ;
wordSelectionData - > addElement ( " , " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " how " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " are " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " you " ) ;
wordSelectionData - > addElement ( " " ) ;
wordSelectionData - > addElement ( " X " ) ;
wordSelectionData - > addElement ( " " ) ;
// words separated by CR, LF and CR LF combinations
wordSelectionData - > addElement ( " Now " ) ;
wordSelectionData - > addElement ( " \r " ) ;
wordSelectionData - > addElement ( " is " ) ;
wordSelectionData - > addElement ( " \n " ) ;
wordSelectionData - > addElement ( " the " ) ;
wordSelectionData - > addElement ( " \r \n " ) ;
wordSelectionData - > addElement ( " time " ) ;
wordSelectionData - > addElement ( " \n " ) ;
wordSelectionData - > addElement ( " \r " ) ;
wordSelectionData - > addElement ( " for " ) ;
wordSelectionData - > addElement ( " \r " ) ;
wordSelectionData - > addElement ( " \r " ) ;
wordSelectionData - > addElement ( " all " ) ;
wordSelectionData - > addElement ( " " ) ;
// to test for bug #4097779
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " aa \\ u0300a " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
// to test for bug #4098467
// What follows is a string of Korean characters (I found it in the Yellow Pages
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
// it correctly), first as precomposed syllables, and then as conjoining jamo.
// Both sequences should be semantically identical and break the same way.
// precomposed syllables...
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc0c1 \\ ud56d " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud55c \\ uc778 " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc5f0 \\ ud569 " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc7a5 \\ ub85c \\ uad50 \\ ud68c " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
// conjoining jamo...
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1109 \\ u1161 \\ u11bc \\ u1112 \\ u1161 \\ u11bc " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u1161 \\ u11ab \\ u110b \\ u1175 \\ u11ab " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110b \\ u1167 \\ u11ab \\ u1112 \\ u1161 \\ u11b8 " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110c \\ u1161 \\ u11bc \\ u1105 \\ u1169 \\ u1100 \\ u116d \\ u1112 \\ u116c " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " " ) ;
// this is a test for bug #4117554: the ideographic iteration mark (U+3005) should
// count as a Kanji character for the purposes of word breaking
wordSelectionData - > addElement ( " abc " ) ;
2002-06-25 17:23:07 +00:00
// Unicode TR29: Ideographs do NOT group together into words.
// The single combined element is therefore kept only as a comment and each
// ideograph is added as its own element instead.
//wordSelectionData->addElement(CharsToUnicodeString("\\u4e01\\u4e02\\u3005\\u4e03\\u4e03"));
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e01 " ) ) ;
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e02 " ) ) ;
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u3005 " ) ) ;
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e03 " ) ) ;
wordSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e03 " ) ) ;
1999-08-16 21:50:52 +00:00
wordSelectionData - > addElement ( " abc " ) ;
2002-06-25 17:23:07 +00:00
2000-01-08 02:05:05 +00:00
1999-08-16 21:50:52 +00:00
}
// U+2029 PARAGRAPH SEPARATOR; appended to some of the sentence test elements below.
const UChar kParagraphSeparator = 0x2029 ;
// U+2028 LINE SEPARATOR; appended to one of the line-break test elements below.
const UChar kLineSeparator = 0x2028 ;
/**
 * Allocates sentenceSelectionData and fills it, in order, with the text
 * pieces that TestSentenceIteration passes to generalIteratorTest.
 * NOTE(review): each element is presumably one expected sentence of the
 * concatenated text - confirm against generalIteratorTest.
 * The order of the addElement calls is significant; do not reorder them.
 * Blocks wrapped in "#if 0" are disabled test cases kept for reference.
 *
 * @bug 4111338 4117554 4113835
 */
void IntlTestTextBoundary : : addTestSentenceData ( )
{
sentenceSelectionData = new Vector ( ) ;
sentenceSelectionData - > addElement ( " This is a simple sample sentence. " ) ;
sentenceSelectionData - > addElement ( " (This is it.) " ) ;
sentenceSelectionData - > addElement ( " This is a simple sample sentence. " ) ;
sentenceSelectionData - > addElement ( " \" This isn \' t it. \" " ) ;
sentenceSelectionData - > addElement ( " Hi! " ) ;
sentenceSelectionData - > addElement ( " This is a simple sample sentence. " ) ;
sentenceSelectionData - > addElement ( " It does not have to make any sense as you can see. " ) ;
sentenceSelectionData - > addElement ( " Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. " ) ;
sentenceSelectionData - > addElement ( " Che la dritta via aveo smarrita. " ) ;
sentenceSelectionData - > addElement ( " He said, that I said, that you said!! " ) ;
sentenceSelectionData - > addElement ( " Don't rock the boat. " + UCharToUnicodeString ( kParagraphSeparator ) ) ;
sentenceSelectionData - > addElement ( " Because I am the daddy, that is why. " ) ;
sentenceSelectionData - > addElement ( " Not on my time (el timo.)! " ) ;
sentenceSelectionData - > addElement ( " So what!! " + UCharToUnicodeString ( kParagraphSeparator ) ) ;
sentenceSelectionData - > addElement ( " \" But now, \" he said, \" I know! \" " ) ;
sentenceSelectionData - > addElement ( " Harris thumbed down several, including \" Away We Go \" (which became the huge success Oklahoma!). " ) ;
sentenceSelectionData - > addElement ( " One species, B. anthracis, is highly virulent. \n " ) ;
sentenceSelectionData - > addElement ( " Wolf said about Sounder: \" Beautifully thought-out and directed. \" " ) ;
sentenceSelectionData - > addElement ( " Have you ever said, \" This is where \t I shall live \" ? " ) ;
sentenceSelectionData - > addElement ( " He answered, \" You may not! \" " ) ;
sentenceSelectionData - > addElement ( " Another popular saying is: \" How do you do? \" . " ) ;
sentenceSelectionData - > addElement ( " Yet another popular saying is: \' I \' m fine thanks. \' " ) ;
sentenceSelectionData - > addElement ( " What is the proper use of the abbreviation pp.? " ) ;
sentenceSelectionData - > addElement ( " Yes, I am definatelly 12 \" tall!! " ) ;
// test for bug #4113835: \n and \r count as spaces, not as paragraph breaks
2002-06-25 17:23:07 +00:00
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " Now \r is \n the \r \n time \n \r for \r \r all \\ u037e " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4111338: Don't break sentences at the boundary between CJK
// and other letters
2002-06-25 17:23:07 +00:00
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " \\ u5487 \\ u67ff \\ ue591 \\ u5017 \\ u61b3 \\ u60a1 \\ u9510 \\ u8165: \" JAVA \\ u821c " )
2000-03-22 23:17:42 +00:00
+ CharsToUnicodeString ( " \\ u8165 \\ u7fc8 \\ u51ce \\ u306d, \\ u2494 \\ u56d8 \\ u4ec0 \\ u60b1 \\ u8560 \\ u51ba " )
+ CharsToUnicodeString ( " \\ u611d \\ u57b6 \\ u2510 \\ u5d46 \" . \\ u2029 " ) ) ;
2002-06-25 17:23:07 +00:00
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " \\ u5487 \\ u67ff \\ ue591 \\ u5017 \\ u61b3 \\ u60a1 \\ u9510 \\ u8165 \\ u9de8 " )
2000-03-22 23:17:42 +00:00
+ CharsToUnicodeString ( " \\ u97e4JAVA \\ u821c \\ u8165 \\ u7fc8 \\ u51ce \\ u306d \\ ue30b \\ u2494 \\ u56d8 \\ u4ec0 " )
2002-06-25 17:23:07 +00:00
+ CharsToUnicodeString ( " \\ u60b1 \\ u8560 \\ u51ba \\ u611d \\ u57b6 \\ u2510 \\ u5d46 \\ u97e5 \\ u7751 \\ u3002 " ) ) ;
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " \\ u5487 \\ u67ff \\ ue591 \\ u5017 \\ u61b3 \\ u60a1 \\ u9510 \\ u8165 \\ u9de8 \\ u97e4 " )
2000-03-22 23:17:42 +00:00
+ CharsToUnicodeString ( " \\ u6470 \\ u8790JAVA \\ u821c \\ u8165 \\ u7fc8 \\ u51ce \\ u306d \\ ue30b \\ u2494 \\ u56d8 " )
2002-06-25 17:23:07 +00:00
+ CharsToUnicodeString ( " \\ u4ec0 \\ u60b1 \\ u8560 \\ u51ba \\ u611d \\ u57b6 \\ u2510 \\ u5d46 \\ u97e5 \\ u7751 \\ u2048 " ) ) ;
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " He said, \" I can go there. \" \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4117554: Treat fullwidth variants of .!? the same as their
// normal counterparts
2002-06-25 17:23:07 +00:00
#if 0 // Not according to TR29. TODO: what is the right thing for these chars?
2000-03-22 23:17:42 +00:00
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " I know I'm right \\ uff0e " ) ) ;
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " Right \\ uff1f " ) ) ;
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " Right \\ uff01 " ) ) ;
2002-06-25 17:23:07 +00:00
# endif
1999-08-16 21:50:52 +00:00
// test for bug #4117554: Don't break sentences at boundary between CJK and digits
2000-03-22 23:17:42 +00:00
sentenceSelectionData - > addElement ( CharsToUnicodeString ( " \\ u5487 \\ u67ff \\ ue591 \\ u5017 \\ u61b3 \\ u60a1 \\ u9510 \\ u8165 \\ u9de8 " )
+ CharsToUnicodeString ( " \\ u97e48888 \\ u821c \\ u8165 \\ u7fc8 \\ u51ce \\ u306d \\ ue30b \\ u2494 \\ u56d8 \\ u4ec0 " )
2002-06-25 17:23:07 +00:00
+ CharsToUnicodeString ( " \\ u60b1 \\ u8560 \\ u51ba \\ u611d \\ u57b6 \\ u2510 \\ u5d46 \\ u97e5 \\ u7751. \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4117554: Break sentence between a sentence terminator and
// opening punctuation
2002-06-25 17:23:07 +00:00
sentenceSelectionData - > addElement ( " Say no? " ) ;
sentenceSelectionData - > addElement ( " (yes). " + CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4158381: Don't break sentence after period if it isn't
// followed by a space
sentenceSelectionData - > addElement ( " Test <code>Flags.Flag</code> class. " ) ;
2000-03-22 23:17:42 +00:00
sentenceSelectionData - > addElement ( " Another test. " + CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4158381: No breaks when there are no terminators around
// NOTE(review): in this copy the quotes around the word lightweight are not
// escaped, which ends the string literal early. The text presumably carried
// HTML &quot; entities or \" escapes there - confirm against the upstream file
// before changing the literal.
sentenceSelectionData - > addElement ( " <P>Provides a set of "lightweight" (all-java<FONT SIZE= \" -2 \" ><SUP>TM</SUP></FONT> language) components that, to the maximum degree possible, work the same on all platforms. " ) ;
2000-03-22 23:17:42 +00:00
sentenceSelectionData - > addElement ( " Another test. " + CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4143071: Make sure sentences that end with digits
// work right
sentenceSelectionData - > addElement ( " Today is the 27th of May, 1998. " ) ;
sentenceSelectionData - > addElement ( " Tomorrow with be 28 May 1998. " ) ;
sentenceSelectionData - > addElement ( " The day after will be the 30th. "
2000-03-22 23:17:42 +00:00
+ CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4152416: Make sure sentences ending with a capital
// letter are treated correctly
2002-06-25 17:23:07 +00:00
// Unicode TR29 reverses above bug: Don't break a sentence if the last word begins with an upper case letter.
// The two adjacent literals below are joined by the compiler into one element.
sentenceSelectionData - > addElement ( " The type of all primitive <code>boolean</code> values accessed in the target VM. "
" Calls to xxx will return an implementor of this interface. " + CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
// test for bug #4152117: Make sure sentence breaking is handling
// punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
// HERE TO MAKE SURE IT DOESN'T CROP UP]
sentenceSelectionData - > addElement ( " Constructs a randomly generated BigInteger, uniformly distributed over the range <tt>0</tt> to <tt>(2<sup>numBits</sup> - 1)</tt>, inclusive. " ) ;
sentenceSelectionData - > addElement ( " The uniformity of the distribution assumes that a fair source of random bits is provided in <tt>rnd</tt>. " ) ;
2000-03-22 23:17:42 +00:00
sentenceSelectionData - > addElement ( " Note that this constructor always constructs a non-negative BigInteger. " + CharsToUnicodeString ( " \\ u2029 " ) ) ;
1999-08-16 21:50:52 +00:00
}
/**
 * Allocates lineSelectionData and fills it, in order, with the line-break
 * test text pieces.
 * NOTE(review): each element is presumably one expected line-break segment;
 * the consumer of lineSelectionData is not visible in this part of the file.
 * The order of the addElement calls is significant; do not reorder them.
 * The block wrapped in "#if 0" is a disabled test case kept for reference.
 *
 * @bug 4068133 4086052 4035266 4097920 4098467 4117554
 */
void IntlTestTextBoundary : : addTestLineData ( )
{
lineSelectionData = new Vector ( ) ;
lineSelectionData - > addElement ( " Multi- " ) ;
lineSelectionData - > addElement ( " Level " ) ;
lineSelectionData - > addElement ( " example " ) ;
lineSelectionData - > addElement ( " of " ) ;
lineSelectionData - > addElement ( " a " ) ;
lineSelectionData - > addElement ( " semi- " ) ;
lineSelectionData - > addElement ( " idiotic " ) ;
lineSelectionData - > addElement ( " non- " ) ;
lineSelectionData - > addElement ( " sensical " ) ;
lineSelectionData - > addElement ( " (non- " ) ;
lineSelectionData - > addElement ( " important) " ) ;
lineSelectionData - > addElement ( " sentence. " ) ;
lineSelectionData - > addElement ( " Hi " ) ;
lineSelectionData - > addElement ( " Hello " ) ;
lineSelectionData - > addElement ( " How \n " ) ;
lineSelectionData - > addElement ( " are \r " ) ;
lineSelectionData - > addElement ( " you " + UCharToUnicodeString ( kLineSeparator ) ) ;
lineSelectionData - > addElement ( " fine. \t " ) ;
lineSelectionData - > addElement ( " good. " ) ;
lineSelectionData - > addElement ( " Now \r " ) ;
lineSelectionData - > addElement ( " is \n " ) ;
lineSelectionData - > addElement ( " the \r \n " ) ;
lineSelectionData - > addElement ( " time \n " ) ;
lineSelectionData - > addElement ( " \r " ) ;
lineSelectionData - > addElement ( " for \r " ) ;
lineSelectionData - > addElement ( " \r " ) ;
lineSelectionData - > addElement ( " all " ) ;
// to test for bug #4068133
2000-03-22 23:17:42 +00:00
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u96f6 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e00 \\ u3002 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e8c \\ u3001 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e09 \\ u3002 \\ u3001 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u56db \\ u3001 \\ u3002 \\ u3001 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e94, " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u516d. " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e03. \\ u3001, \\ u3002 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u516b " ) ) ;
1999-08-16 21:50:52 +00:00
// to test for bug #4086052
2000-03-22 23:17:42 +00:00
lineSelectionData - > addElement ( CharsToUnicodeString ( " foo \\ u00a0bar " ) ) ;
1999-08-16 21:50:52 +00:00
// lineSelectionData->addElement("foo\\ufeffbar");
// to test for bug #4097920
lineSelectionData - > addElement ( " dog, " ) ;
lineSelectionData - > addElement ( " cat, " ) ;
lineSelectionData - > addElement ( " mouse " ) ;
lineSelectionData - > addElement ( " (one) " ) ;
lineSelectionData - > addElement ( " (two) \n " ) ;
// to test for bug #4035266
lineSelectionData - > addElement ( " The " ) ;
lineSelectionData - > addElement ( " balance " ) ;
lineSelectionData - > addElement ( " is " ) ;
lineSelectionData - > addElement ( " $-23,456.78, " ) ;
lineSelectionData - > addElement ( " not " ) ;
2002-06-25 17:23:07 +00:00
// lineSelectionData->addElement("-$32,456.78!\n"); // Doesn't break this way according to TR29
lineSelectionData - > addElement ( " - " ) ;
lineSelectionData - > addElement ( " $32,456.78! \n " ) ;
1999-08-16 21:50:52 +00:00
// to test for bug #4098467
// What follows is a string of Korean characters (I found it in the Yellow Pages
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
// it correctly), first as precomposed syllables, and then as conjoining jamo.
// Both sequences should be semantically identical and break the same way.
// precomposed syllables...
2002-06-25 17:23:07 +00:00
// By TR14, precomposed Hangul syllables should not be grouped together.
// Also, identical test is in rbbitst.cpp.
#if 0
2000-03-22 23:17:42 +00:00
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc0c1 \\ ud56d " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud55c \\ uc778 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc5f0 \\ ud569 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc7a5 \\ ub85c \\ uad50 \\ ud68c " ) ) ;
2002-06-25 17:23:07 +00:00
1999-08-16 21:50:52 +00:00
// conjoining jamo...
2000-03-22 23:17:42 +00:00
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1109 \\ u1161 \\ u11bc \\ u1112 \\ u1161 \\ u11bc " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u1161 \\ u11ab \\ u110b \\ u1175 \\ u11ab " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110b \\ u1167 \\ u11ab \\ u1112 \\ u1161 \\ u11b8 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110c \\ u1161 \\ u11bc \\ u1105 \\ u1169 \\ u1100 \\ u116d \\ u1112 \\ u116c " ) ) ;
2002-06-25 17:23:07 +00:00
# endif
1999-08-16 21:50:52 +00:00
// to test for bug #4117554: Fullwidth .!? should be treated as postJwrd
2000-03-22 23:17:42 +00:00
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e01 \\ uff0e " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e02 \\ uff01 " ) ) ;
lineSelectionData - > addElement ( CharsToUnicodeString ( " \\ u4e03 \\ uff1f " ) ) ;
1999-08-16 21:50:52 +00:00
}
/*
const UnicodeString graveS = " S " + ( UChar ) 0x0300 ;
const UnicodeString acuteBelowI = " i " + UCharToUnicodeString ( 0x0317 ) ;
const UnicodeString acuteE = " e " + UCharToUnicodeString ( 0x0301 ) ;
const UnicodeString circumflexA = " a " + UCharToUnicodeString ( 0x0302 ) ;
const UnicodeString tildeE = " e " + UCharToUnicodeString ( 0x0303 ) ;
*/
/**
 * Allocates characterSelectionData and fills it, in order, with the
 * character-break test text pieces (base letters with combining marks,
 * CR/LF combinations, Hangul syllables and conjoining jamo).
 * NOTE(review): each element is presumably one expected character-break
 * segment; the consumer of characterSelectionData is not visible in this
 * part of the file.
 * The order of the addElement calls is significant; do not reorder them.
 *
 * @bug 4098467
 */
void IntlTestTextBoundary : : addTestCharacterData ( )
{
characterSelectionData = new Vector ( ) ;
characterSelectionData - > addElement ( " S " + UCharToUnicodeString ( 0x0300 ) ) ; //graveS
// NOTE(review): the trailing comment says acuteBelowI, and the commented-out
// acuteBelowI constant earlier in this file uses 0x0317, but the value added
// here is 0x0301 - confirm which one is intended.
characterSelectionData - > addElement ( " i " + UCharToUnicodeString ( 0x0301 ) ) ; // acuteBelowI
characterSelectionData - > addElement ( " m " ) ;
characterSelectionData - > addElement ( " p " ) ;
characterSelectionData - > addElement ( " l " ) ;
characterSelectionData - > addElement ( " e " + UCharToUnicodeString ( 0x0301 ) ) ; // acuteE
characterSelectionData - > addElement ( " " ) ;
characterSelectionData - > addElement ( " s " ) ;
characterSelectionData - > addElement ( " a " + UCharToUnicodeString ( 0x0302 ) ) ; // circumflexA
characterSelectionData - > addElement ( " m " ) ;
characterSelectionData - > addElement ( " p " ) ;
characterSelectionData - > addElement ( " l " ) ;
characterSelectionData - > addElement ( " e " + UCharToUnicodeString ( 0x0303 ) ) ; // tildeE
characterSelectionData - > addElement ( " . " ) ;
characterSelectionData - > addElement ( " w " ) ;
characterSelectionData - > addElement ( " a " + UCharToUnicodeString ( 0x0302 ) ) ; // circumflexA
characterSelectionData - > addElement ( " w " ) ;
characterSelectionData - > addElement ( " a " ) ;
characterSelectionData - > addElement ( " f " ) ;
characterSelectionData - > addElement ( " q " ) ;
characterSelectionData - > addElement ( " \n " ) ;
characterSelectionData - > addElement ( " \r " ) ;
characterSelectionData - > addElement ( " \r \n " ) ;
characterSelectionData - > addElement ( " \n " ) ;
// to test for bug #4098467
// What follows is a string of Korean characters (I found it in the Yellow Pages
// ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed
// it correctly), first as precomposed syllables, and then as conjoining jamo.
// Both sequences should be semantically identical and break the same way.
// precomposed syllables...
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc0c1 " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud56d " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud55c " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc778 " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc5f0 " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud569 " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ uc7a5 " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ ub85c " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ uad50 " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ ud68c " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
// conjoining jamo...
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1109 \\ u1161 \\ u11bc " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u1161 \\ u11bc " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u1161 \\ u11ab " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110b \\ u1175 \\ u11ab " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110b \\ u1167 \\ u11ab " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u1161 \\ u11b8 " ) ) ;
1999-08-16 21:50:52 +00:00
characterSelectionData - > addElement ( " " ) ;
2000-03-22 23:17:42 +00:00
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u110c \\ u1161 \\ u11bc " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1105 \\ u1169 " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1100 \\ u116d " ) ) ;
characterSelectionData - > addElement ( CharsToUnicodeString ( " \\ u1112 \\ u116c " ) ) ;
2000-01-08 02:05:05 +00:00
1999-08-16 21:50:52 +00:00
}
/**
 * Concatenates everything the enumeration still has to offer.
 *
 * @param e enumeration to drain; it is advanced to its end but not deleted here.
 * @return the seed literal followed by every remaining element, in order.
 */
UnicodeString IntlTestTextBoundary::createTestData(Enumeration* e)
{
    // NOTE(review): the seed literal is reproduced exactly as it appears in
    // this copy of the file - confirm whether it is meant to be empty.
    UnicodeString joined = " ";

    for (; e->hasMoreElements(); ) {
        joined += e->nextElement();
    }
    return joined;
}
//---------------------------------------------
// SentenceBreak tests
//---------------------------------------------
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestSentenceIteration ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createSentenceInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestSentenceIteration. \n " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
generalIteratorTest ( * e , sentenceSelectionData ) ;
1999-08-16 21:50:52 +00:00
delete e ;
}
void IntlTestTextBoundary : : TestSentenceInvariants ( )
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createSentenceInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestSentenceInvariant. \n " ) ;
return ;
}
2000-03-22 23:17:42 +00:00
UnicodeString s = * cannedTestChars + CharsToUnicodeString ( " ., \\ u3001 \\ u3002 \\ u3041 \\ u3042 \\ u3043 \\ ufeff " ) ;
1999-08-16 21:50:52 +00:00
doOtherInvariantTest ( * e , s ) ;
delete e ;
}
//---------------------------------------------
// WordBreak tests
//---------------------------------------------
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestWordIteration ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createWordInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestWordIteration. \n " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
generalIteratorTest ( * e , wordSelectionData ) ;
1999-08-16 21:50:52 +00:00
delete e ;
}
void IntlTestTextBoundary : : TestWordInvariants ( )
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createWordInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestWordInvariants. \n " ) ;
return ;
}
2000-03-22 23:17:42 +00:00
UnicodeString s = * cannedTestChars + CharsToUnicodeString ( " \' ,. \\ u3041 \\ u3042 \\ u3043 \\ u309b \\ u309c \\ u30a1 \\ u30a2 \\ u30a3 \\ u4e00 \\ u4e01 \\ u4e02 " ) ;
1999-08-16 21:50:52 +00:00
doBreakInvariantTest ( * e , s ) ;
2000-03-22 23:17:42 +00:00
s = * cannedTestChars + CharsToUnicodeString ( " \' ,. \\ u3041 \\ u3042 \\ u3043 \\ u309b \\ u309c \\ u30a1 \\ u30a2 \\ u30a3 \\ u4e00 \\ u4e01 \\ u4e02 " ) ;
1999-08-16 21:50:52 +00:00
doOtherInvariantTest ( * e , s ) ;
delete e ;
}
//---------------------------------------------
2000-01-08 02:05:05 +00:00
// CharacterBreak tests
1999-08-16 21:50:52 +00:00
//---------------------------------------------
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestCharacterIteration ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createCharacterInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestCharacterIteration. \n " ) ;
return ;
}
2000-08-10 00:28:31 +00:00
// generalIteratorTest(*e, testCharacterText, characterSelectionData);
generalIteratorTest ( * e , characterSelectionData ) ;
1999-08-16 21:50:52 +00:00
delete e ;
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestCharacterInvariants ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createCharacterInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestCharacterInvariants. \n " ) ;
return ;
}
2000-03-22 23:17:42 +00:00
UnicodeString s = * cannedTestChars + CharsToUnicodeString ( " \\ u1100 \\ u1101 \\ u1102 \\ u1160 \\ u1161 \\ u1162 \\ u11a8 \\ u11a9 \\ u11aa " ) ;
2000-01-08 02:05:05 +00:00
doBreakInvariantTest ( * e , s ) ;
2000-03-22 23:17:42 +00:00
s = * cannedTestChars + CharsToUnicodeString ( " \\ u1100 \\ u1101 \\ u1102 \\ u1160 \\ u1161 \\ u1162 \\ u11a8 \\ u11a9 \\ u11aa " ) ;
2000-01-08 02:05:05 +00:00
doOtherInvariantTest ( * e , s ) ;
1999-08-16 21:50:52 +00:00
delete e ;
}
2000-01-08 02:05:05 +00:00
//---------------------------------------------
// LineBreak tests
//---------------------------------------------
void IntlTestTextBoundary : : TestLineIteration ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * e = BreakIterator : : createLineInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestLineIteration. \n " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
generalIteratorTest ( * e , lineSelectionData ) ;
1999-08-16 21:50:52 +00:00
delete e ;
}
void IntlTestTextBoundary : : TestLineInvariants ( )
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
2002-08-21 00:16:30 +00:00
BreakIterator * e = BreakIterator : : createLineInstance ( Locale : : getUS ( ) , status ) ;
2000-01-14 00:13:59 +00:00
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestLineInvariants. \n " ) ;
return ;
}
2000-03-22 23:17:42 +00:00
UnicodeString s = CharsToUnicodeString ( " .,;: \\ u3001 \\ u3002 \\ u3041 \\ u3042 \\ u3043 \\ u3044 \\ u3045 \\ u30a3 \\ u4e00 \\ u4e01 \\ u4e02 " ) ;
1999-08-16 21:50:52 +00:00
UnicodeString testChars = * cannedTestChars + s ;
doBreakInvariantTest ( * e , testChars ) ;
doOtherInvariantTest ( * e , testChars ) ;
2001-03-13 22:54:54 +00:00
int32_t errCount = 0 , testCharsLen , noBreakLen , dashesLen ;
2002-03-12 01:32:42 +00:00
int32_t i , j , k ;
1999-08-16 21:50:52 +00:00
// in addition to the other invariants, a line-break iterator should make sure that:
2002-06-25 17:23:07 +00:00
// it doesn't break around the non-breaking characters,
// EXCEPT breaking after a space takes precedence over not breaking before
// an non-breaking char. So says TR 14.
2000-03-22 23:17:42 +00:00
UnicodeString noBreak = CharsToUnicodeString ( " \\ u00a0 \\ u2007 \\ u2011 \\ ufeff " ) ;
1999-08-16 21:50:52 +00:00
UnicodeString work ( " aaa " ) ;
2001-03-13 22:54:54 +00:00
testCharsLen = testChars . length ( ) ;
noBreakLen = noBreak . length ( ) ;
for ( i = 0 ; i < testCharsLen ; i + + ) {
1999-08-16 21:50:52 +00:00
UChar c = testChars [ i ] ;
2002-06-25 17:23:07 +00:00
if ( c = = ' \r ' | | c = = ' \n ' | | c = = 0x2029 | | c = = 0x2028 | | c = = 0x0003 | |
u_charType ( c ) = = U_CONTROL_CHAR ) {
1999-08-16 21:50:52 +00:00
continue ;
2002-06-25 17:23:07 +00:00
}
1999-08-16 21:50:52 +00:00
work [ 0 ] = c ;
2001-03-13 22:54:54 +00:00
for ( j = 0 ; j < noBreakLen ; j + + ) {
1999-08-16 21:50:52 +00:00
work [ 1 ] = noBreak [ j ] ;
2001-03-13 22:54:54 +00:00
for ( k = 0 ; k < testCharsLen ; k + + ) {
1999-08-16 21:50:52 +00:00
work [ 2 ] = testChars [ k ] ;
2001-03-07 22:42:46 +00:00
e - > setText ( work ) ;
2002-06-25 17:23:07 +00:00
for ( int l = e - > first ( ) ; l ! = BreakIterator : : DONE ; l = e - > next ( ) ) {
UChar c1 = work [ l - 1 ] ;
UChar c2 = work [ l ] ;
if ( c1 = = 0x20 & & l = = 1 ) {
continue ;
}
1999-08-16 21:50:52 +00:00
if ( l = = 1 | | l = = 2 ) {
2002-06-25 17:23:07 +00:00
errln ( " Got break between U+ " + UCharToUnicodeString ( c1 ) +
" and U+ " + UCharToUnicodeString ( c2 ) ) ;
2000-12-04 23:17:28 +00:00
errCount + + ;
if ( errCount > = 75 )
1999-08-16 21:50:52 +00:00
return ;
}
2002-06-25 17:23:07 +00:00
}
1999-08-16 21:50:52 +00:00
}
}
}
2002-06-25 17:23:07 +00:00
// it does break after hyphens (Rule 15B from TR 14
// (unless they're followed by a digit, a non-spacing mark,
// a currency symbol, a non-breaking space, or a line or paragraph separator
// or something of class BA, HY, NS, QU, GL, CL, EX, IS or SY from TR14 when the hyphen is /u002d
// This test is sufficiently screwed up that I'm largely disabling it. TODO: fix it. 06/12/2002 AGH
//
2000-03-22 23:17:42 +00:00
UnicodeString dashes = CharsToUnicodeString ( " - \\ u00ad \\ u2010 \\ u2012 \\ u2013 \\ u2014 " ) ;
2001-03-13 22:54:54 +00:00
dashesLen = dashes . length ( ) ;
for ( i = 0 ; i < testCharsLen ; i + + ) {
1999-08-16 21:50:52 +00:00
work [ 0 ] = testChars [ i ] ;
2001-03-13 22:54:54 +00:00
for ( j = 0 ; j < dashesLen ; j + + ) {
2002-06-25 17:23:07 +00:00
UChar c1 = work [ 1 ] = dashes [ j ] ;
2001-03-13 22:54:54 +00:00
for ( k = 0 ; k < testCharsLen ; k + + ) {
2002-06-25 17:23:07 +00:00
UChar c2 = work [ 2 ] = testChars [ k ] ;
2002-08-21 19:09:33 +00:00
int8_t type = u_charType ( c2 ) ;
if ( type = = U_DECIMAL_DIGIT_NUMBER | |
type = = U_OTHER_NUMBER | |
type = = U_NON_SPACING_MARK | |
type = = U_ENCLOSING_MARK | |
type = = U_CURRENCY_SYMBOL | |
type = = U_SPACE_SEPARATOR | |
type = = U_DASH_PUNCTUATION | |
type = = U_CONTROL_CHAR | |
type = = U_FORMAT_CHAR | |
2002-06-25 17:23:07 +00:00
c2 = = ' \n ' | | c2 = = ' \r ' | | c2 = = 0x2028 | | c2 = = 0x2029 | |
c2 = = 0x0003 | | c2 = = 0x00a0 | | c2 = = 0x2007 | | c2 = = 0x2011 | |
c2 = = 0xfeff )
2001-03-13 22:54:54 +00:00
{
1999-08-16 21:50:52 +00:00
continue ;
2001-03-13 22:54:54 +00:00
}
2002-06-25 17:23:07 +00:00
// If c1 == hyphen-minus, and ...
if ( c1 = = 0x002d & & (
c2 = = 0x0021 | | // !
c2 = = 0x002c | | // ,
c2 = = 0x002d | | // -
c2 = = 0x002e | | // . (TR 14 class IS)
c2 = = 0x0029 | | // )
c2 = = 0x003a | | // :
c2 = = 0x003b | | // ; (TR 14 class IS)
c2 = = 0x005d | | // ]
c2 = = 0x007c | | // | (TR 14 class BA, rule 15)
c2 = = 0x007d | | // }
c2 = = 0x0903 | | // Devanagari sign visarga, combining, what's it doing in this test?
c2 = = 0x093E | | // Devanagari , combining, what's it doing in this test?
c2 = = 0x093F | | // Devanagari , combining, what's it doing in this test?
c2 = = 0x0940 | | // Devanagari , combining, what's it doing in this test?
c2 = = 0x0949 | | // Devanagari , combining, what's it doing in this test?
c2 = = 0x0f3b | | // Tibetan closing bracket
c2 = = 0x3001 | | // CJK closing bracket
c2 = = 0x3002 // CJK closing bracket
) ) {
continue ;
}
2001-03-07 22:42:46 +00:00
e - > setText ( work ) ;
2000-05-18 22:08:39 +00:00
UBool saw2 = FALSE ;
2001-03-13 22:54:54 +00:00
for ( int l = e - > first ( ) ; l ! = BreakIterator : : DONE ; l = e - > next ( ) ) {
if ( l = = 2 ) {
1999-08-16 21:50:52 +00:00
saw2 = TRUE ;
2001-03-13 22:54:54 +00:00
break ;
}
}
1999-08-16 21:50:52 +00:00
if ( ! saw2 ) {
2002-06-25 17:23:07 +00:00
// TODO: This test is completely out of sync with the spec. Fix it.
// errln("Didn't get break between U+" + UCharToUnicodeString(work[1]) +
// " and U+" + UCharToUnicodeString(work[2]));
// errCount++;
// if (errCount >= 75)
// return;
1999-08-16 21:50:52 +00:00
}
}
}
}
2001-03-14 01:38:58 +00:00
delete e ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestThaiLineBreak ( ) {
2001-03-13 22:54:54 +00:00
Vector * thaiLineSelection = new Vector ( ) ;
UErrorCode status = U_ZERO_ERROR ;
// \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
// represents elided letters at the end of a long word. It should be bound to
// the end of the word and not treated as an independent punctuation mark.
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2a \\ u0e16 \\ u0e32 \\ u0e19 \\ u0e35 \\ u0e2f " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e08 \\ u0e30 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e30 \\ u0e14 \\ u0e21 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e08 \\ u0e49 \\ u0e32 " ) ) ;
2000-03-22 23:17:42 +00:00
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2b\\u0e19\\u0e49\\u0e32"));
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e17\\u0e35\\u0e48"));
2001-03-13 22:54:54 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2b \\ u0e19 \\ u0e49 \\ u0e32 \\ u0e17 \\ u0e35 \\ u0e48 " ) ) ;
// the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2d \\ u0e2d \\ u0e01 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e21 \\ u0e32 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e23 \\ u0e48 \\ u0e07 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e30 \\ u0e1a \\ u0e32 \\ u0e22 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2d \\ u0e22 \\ u0e48 \\ u0e32 \\ u0e07 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e15 \\ u0e47 \\ u0e21 " ) ) ;
// the one time where the paiyannoi occurs somewhere other than at the end
// of a word is in the Thai abbrevation for "etc.", which both begins and
// ends with a paiyannoi
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2f \\ u0e25 \\ u0e2f " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e17 \\ u0e35 \\ u0e48 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e19 \\ u0e31 \\ u0e49 \\ u0e19 " ) ) ;
BreakIterator * e = BreakIterator : : createLineInstance (
Locale ( " th " ) , status ) ;
if ( U_FAILURE ( status ) )
{
2001-05-02 22:41:02 +00:00
errln ( " Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. \n " ) ;
2001-03-13 22:54:54 +00:00
return ;
2000-01-08 02:05:05 +00:00
}
1999-08-16 21:50:52 +00:00
2001-03-13 22:54:54 +00:00
generalIteratorTest ( * e , thaiLineSelection ) ;
delete e ;
delete thaiLineSelection ;
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestMixedThaiLineBreak ( )
1999-08-16 21:50:52 +00:00
{
2001-03-13 22:54:54 +00:00
UErrorCode status = U_ZERO_ERROR ;
Vector * thaiLineSelection = new Vector ( ) ;
// Arabic numerals should always be separated from surrounding Thai text
2000-01-08 02:05:05 +00:00
/*
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e04 \\ u0e48 \\ u0e32 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e07 \\ u0e34 \\ u0e19 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e1a \\ u0e32 \\ u0e17 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e41 \\ u0e15 \\ u0e30 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e30 \\ u0e14 \\ u0e31 \\ u0e1a " ) ) ;
2000-01-08 02:05:05 +00:00
thaiLineSelection - > addElement ( " 39 " ) ;
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e1a \\ u0e32 \\ u0e17 " ) ) ;
2000-01-08 02:05:05 +00:00
// words in non-Thai scripts should always be separated from surrounding Thai text
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e17 \\ u0e14 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2a \\ u0e2d \\ u0e1a " ) ) ;
2000-01-08 02:05:05 +00:00
thaiLineSelection - > addElement ( " Java " ) ;
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e1a \\ u0e19 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e04 \\ u0e23 \\ u0e37 \\ u0e48 \\ u0e2d \\ u0e07 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e44 \\ u0e2d \\ u0e1a \\ u0e35 \\ u0e40 \\ u0e2d \\ u0e47 \\ u0e21 " ) ) ;
2000-01-08 02:05:05 +00:00
// Thai numerals should always be separated from the text surrounding them
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e04 \\ u0e48 \\ u0e32 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e07 \\ u0e34 \\ u0e19 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e1a \\ u0e32 \\ u0e17 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e41 \\ u0e15 \\ u0e30 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e30 \\ u0e14 \\ u0e31 \\ u0e1a " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e53 \\ u0e59 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e1a \\ u0e32 \\ u0e17 " ) ) ;
2000-01-08 02:05:05 +00:00
// Thai text should interact correctly with punctuation and symbols
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e44 \\ u0e2d \\ u0e1a \\ u0e35 \\ u0e40 \\ u0e2d \\ u0e47 \\ u0e21 " ) ) ;
// thaiLineSelection->addElement(CharsToUnicodeString("(\\u0e1b\\u0e23\\u0e30\\u0e40\\u0e17\\u0e28"));
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e44\\u0e17\\u0e22)"));
thaiLineSelection - > addElement ( CharsToUnicodeString ( " ( \\ u0e1b \\ u0e23 \\ u0e30 \\ u0e40 \\ u0e17 \\ u0e28 \\ u0e44 \\ u0e17 \\ u0e22) " ) ) ;
2000-01-08 02:05:05 +00:00
// I believe the commented-out reading above to be the correct one, but this is what passes with our current dictionary
2000-03-22 23:17:42 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e08 \\ u0e33 \\ u0e01 \\ u0e31 \\ u0e14 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e1b \\ u0e34 \\ u0e14 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e15 \\ u0e31 \\ u0e27 \" " ) ) ;
2000-01-08 02:05:05 +00:00
*/
2002-06-25 17:23:07 +00:00
// The Unicode Linebreak TR says do not break before or after quotes.
// So this test is changed ot not break around the quote.
// TODO: should Thai break around the around the quotes, like the original behavior here?
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e2e\\u0e32\\u0e23\\u0e4c\\u0e14\\u0e14\\u0e34\\u0e2a\\u0e01\\u0e4c\""));
// thaiLineSelection->addElement(CharsToUnicodeString("\\u0e23\\u0e38\\u0e48\\u0e19"));
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e2e \\ u0e32 \\ u0e23 \\ u0e4c \\ u0e14 \\ u0e14 \\ u0e34 \\ u0e2a \\ u0e01 \\ u0e4c \" "
" \\ u0e23 \\ u0e38 \\ u0e48 \\ u0e19 " ) ) ;
2001-03-13 22:54:54 +00:00
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e43 \\ u0e2b \\ u0e21 \\ u0e48 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e14 \\ u0e37 \\ u0e2d \\ u0e19 \\ u0e21 \\ u0e34. " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e22. " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e19 \\ u0e35 \\ u0e49 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e32 \\ u0e04 \\ u0e32 " ) ) ;
thaiLineSelection - > addElement ( " $200 " ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e17 \\ u0e48 \\ u0e32 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e19 \\ u0e31 \\ u0e49 \\ u0e19 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " ( \" \\ u0e2e \\ u0e32 \\ u0e23 \\ u0e4c \\ u0e14 \\ u0e14 \\ u0e34 \\ u0e2a \\ u0e01 \\ u0e4c \" ). " ) ) ;
BreakIterator * e = BreakIterator : : createLineInstance ( Locale ( " th " ) , status ) ;
if ( U_FAILURE ( status ) )
{
2001-05-02 22:41:02 +00:00
errln ( " Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. \n " ) ;
2001-03-13 22:54:54 +00:00
return ;
}
2000-08-10 00:28:31 +00:00
2001-03-13 22:54:54 +00:00
generalIteratorTest ( * e , thaiLineSelection ) ;
delete e ;
delete thaiLineSelection ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestMaiyamok ( )
1999-08-16 21:50:52 +00:00
{
2001-03-13 22:54:54 +00:00
Vector * thaiLineSelection = new Vector ( ) ;
UErrorCode status = U_ZERO_ERROR ;
// the Thai maiyamok character is a shorthand symbol that means "repeat the previous
// word". Instead of appearing as a word unto itself, however, it's kept together
// with the word before it
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e44 \\ u0e1b \\ u0e46 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e21 \\ u0e32 \\ u0e46 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e23 \\ u0e30 \\ u0e2b \\ u0e27 \\ u0e48 \\ u0e32 \\ u0e07 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e01 \\ u0e23 \\ u0e38 \\ u0e07 \\ u0e40 \\ u0e17 \\ u0e1e " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e41 \\ u0e25 \\ u0e30 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e40 \\ u0e03 \\ u0e35 \\ u0e22 \\ u0e07 " ) ) ;
thaiLineSelection - > addElement ( CharsToUnicodeString ( " \\ u0e43 \\ u0e2b \\ u0e21 \\ u0e48 " ) ) ;
BreakIterator * e = BreakIterator : : createLineInstance (
Locale ( " th " ) , status ) ;
if ( U_FAILURE ( status ) )
{
2001-05-02 22:41:02 +00:00
errln ( " Failed to create the BreakIterator for Thai locale in TestMaiyamok. \n " ) ;
2001-03-13 22:54:54 +00:00
return ;
}
generalIteratorTest ( * e , thaiLineSelection ) ;
delete e ;
delete thaiLineSelection ;
1999-08-16 21:50:52 +00:00
}
2001-05-02 22:41:02 +00:00
void IntlTestTextBoundary : : TestThaiWordBreak ( ) {
Vector * thaiWordSelection = new Vector ( ) ;
UErrorCode status = U_ZERO_ERROR ;
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E1A \\ u0E17 " ) ) ; //2
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E17 \\ u0E35 \\ u0E48 " ) ) ; //5
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E51 " ) ) ; //6
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E1E \\ u0E32 \\ u0E22 \\ u0E38 " ) ) ; //10
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E44 \\ u0E0B \\ u0E42 \\ u0E04 \\ u0E25 \\ u0E19 " ) ) ; //16
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u000D \\ u000A " ) ) ; //18
// This is the correct result
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E42\\u0E14\\u0E42\\u0E23\\u0E18\\u0E35")); //24
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E2D\\u0E32\\u0E28\\u0E31\\u0E22")); //29
// and this is what the dictionary does...
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E42 \\ u0E14 " ) ) ; // 20
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E42 \\ u0E23 \\ u0E18 \\ u0E35 \\ u0E2D \\ u0E32 \\ u0E28 \\ u0E31 \\ u0E22 " ) ) ; //29
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E2D \\ u0E22 \\ u0E39 \\ u0E48 " ) ) ; //33
// This is the correct result
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E17\\u0E48\\u0E32\\u0E21")); //37
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E25\\u0E32\\u0E07")); //41
// and this is what the dictionary does
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E17 \\ u0E48 \\ u0E32 \\ u0E21 \\ u0E01 \\ u0E25 \\ u0E32 \\ u0E07 " ) ) ; //41
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E17 \\ u0E38 \\ u0E48 \\ u0E07 " ) ) ; //45
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E43 \\ u0E2B \\ u0E0D \\ u0E48 " ) ) ; //49
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E43 \\ u0E19 " ) ) ; //51
// This is the correct result
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E04\\u0E19\\u0E0B\\u0E31\\u0E2A")); //57
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E01\\u0E31\\u0E1A")); //60
// and this is what the dictionary does
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E41 \\ u0E04 \\ u0E19 " ) ) ; // 54
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E0B \\ u0E31 \\ u0E2A \\ u0E01 \\ u0E31 \\ u0E1A " ) ) ; //60
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E25 \\ u0E38 \\ u0E07 " ) ) ; //63
// This is the correct result
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E40\\u0E2E\\u0E19\\u0E23\\u0E35")); //68
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E0A\\u0E32\\u0E27")); //71
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E44\\u0E23\\u0E48")); //74
//thaiWordSelection->addElement(CharsToUnicodeString("\\u0E41\\u0E25\\u0E30")); //77
// and this is what the dictionary does
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E40 \\ u0E2E " ) ) ; // 65
thaiWordSelection - > addElement ( CharsToUnicodeString ( " \\ u0E19 \\ u0E23 \\ u0E35 \\ u0E0A \\ u0E32 \\ u0E27 \\ u0E44 \\ u0E23 \\ u0E48 \\ u0E41 \\ u0E25 \\ u0E30 " ) ) ; //77
BreakIterator * e = BreakIterator : : createWordInstance (
Locale ( " th " ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for Thai locale in TestThaiWordBreak. \n " ) ;
return ;
}
generalIteratorTest ( * e , thaiWordSelection ) ;
delete e ;
delete thaiWordSelection ;
}
1999-08-16 21:50:52 +00:00
/**
2000-01-08 02:05:05 +00:00
* Test Japanese Line Break
1999-08-16 21:50:52 +00:00
* @ bug 4095322
*/
void IntlTestTextBoundary : : TestJapaneseLineBreak ( )
{
2002-06-25 17:23:07 +00:00
// Change for Unicode TR 14: Punctuation characters with categories Pi and Pf do not count
// as opening and closing punctuation for line breaking.
// Also, \u30fc and \u30fe are not counted as hyphens. Remove these chars
// from these tests. 6-13-2002
//
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
2000-03-22 23:17:42 +00:00
UnicodeString testString = CharsToUnicodeString ( " \\ u4e00x \\ u4e8c " ) ;
2002-06-25 17:23:07 +00:00
UnicodeString precedingChars = CharsToUnicodeString (
//"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
" ([{$ \\ u00a5 \\ u00a3 \\ u00a4 \\ u201a \\ u201e " ) ;
UnicodeString followingChars = CharsToUnicodeString (
// ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
" )]}!%,. \\ u3001 \\ u3002 \\ u3063 \\ u3083 \\ u3085 \\ u3087 \\ u30c3 \\ u30e3 \\ u30e5 \\ u30e7 "
// ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
" :; \\ u309b \\ u309c \\ u3005 \\ u309d \\ u309e \\ u30fd \\ u00b0 \\ u2032 \\ u2033 \\ u2034 "
" \\ u2030 \\ u2031 \\ u2103 \\ u2109 \\ u00a2 \\ u0300 \\ u0301 \\ u0302 " ) ;
2002-08-21 00:16:30 +00:00
BreakIterator * iter = BreakIterator : : createLineInstance ( Locale : : getJapan ( ) , status ) ;
1999-08-16 21:50:52 +00:00
2002-03-12 01:32:42 +00:00
int32_t i ;
2000-01-14 00:13:59 +00:00
if ( U_FAILURE ( status ) )
{
2001-05-02 22:41:02 +00:00
errln ( " Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak. \n " ) ;
2000-01-14 00:13:59 +00:00
return ;
}
1999-08-16 21:50:52 +00:00
1999-12-08 02:11:04 +00:00
for ( i = 0 ; i < precedingChars . length ( ) ; i + + ) {
1999-08-16 21:50:52 +00:00
testString [ 1 ] = precedingChars [ i ] ;
2001-03-07 22:42:46 +00:00
iter - > setText ( testString ) ;
1999-08-16 21:50:52 +00:00
int32_t j = iter - > first ( ) ;
if ( j ! = 0 )
errln ( " ja line break failure: failed to start at 0 " ) ;
j = iter - > next ( ) ;
if ( j ! = 1 )
errln ( " ja line break failure: failed to stop before ' " + UCharToUnicodeString ( precedingChars [ i ] )
+ " ' ( " + ( ( int ) ( precedingChars [ i ] ) ) + " ) " ) ;
j = iter - > next ( ) ;
if ( j ! = 3 )
errln ( " ja line break failure: failed to skip position after ' " + UCharToUnicodeString ( precedingChars [ i ] )
+ " ' ( " + ( ( int ) ( precedingChars [ i ] ) ) + " ) " ) ;
}
1999-12-08 02:11:04 +00:00
for ( i = 0 ; i < followingChars . length ( ) ; i + + ) {
1999-08-16 21:50:52 +00:00
testString [ 1 ] = followingChars [ i ] ;
2001-03-07 22:42:46 +00:00
iter - > setText ( testString ) ;
1999-08-16 21:50:52 +00:00
int j = iter - > first ( ) ;
if ( j ! = 0 )
errln ( " ja line break failure: failed to start at 0 " ) ;
j = iter - > next ( ) ;
if ( j ! = 2 )
errln ( " ja line break failure: failed to skip position before ' " + UCharToUnicodeString ( followingChars [ i ] )
+ " ' ( " + ( ( int ) ( followingChars [ i ] ) ) + " ) " ) ;
j = iter - > next ( ) ;
if ( j ! = 3 )
errln ( " ja line break failure: failed to stop after ' " + UCharToUnicodeString ( followingChars [ i ] )
+ " ' ( " + ( ( int ) ( followingChars [ i ] ) ) + " ) " ) ;
}
delete iter ;
}
2000-01-08 02:05:05 +00:00
//---------------------------------------------
// other tests
//---------------------------------------------/
1999-08-16 21:50:52 +00:00
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestEmptyString ( )
{
UnicodeString text = " " ;
Vector x ;
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
2000-01-08 02:05:05 +00:00
x . addElement ( text ) ;
2000-01-14 00:13:59 +00:00
BreakIterator * bi = BreakIterator : : createLineInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestEmptyString. \n " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
generalIteratorTest ( * bi , & x ) ;
delete bi ;
}
void IntlTestTextBoundary : : TestGetAvailableLocales ( )
{
int32_t locCount = 0 ;
const Locale * locList = BreakIterator : : getAvailableLocales ( locCount ) ;
if ( locCount = = 0 )
errln ( " getAvailableLocales() returned an empty list! " ) ;
2001-03-28 18:50:17 +00:00
// Just make sure that it's returning good memory.
for ( int32_t i = 0 ; i < locCount ; + + i ) {
logln ( locList [ i ] . getName ( ) ) ;
}
2000-01-08 02:05:05 +00:00
}
//Testing the BreakIterator::getDisplayName() function
1999-08-16 21:50:52 +00:00
void IntlTestTextBoundary : : TestGetDisplayName ( )
{
UnicodeString result ;
2002-03-26 23:18:57 +00:00
BreakIterator : : getDisplayName ( Locale : : getUS ( ) , result ) ;
if ( Locale : : getDefault ( ) = = Locale : : getUS ( ) & & result ! = " English (United States) " )
1999-08-16 21:50:52 +00:00
errln ( " BreakIterator::getDisplayName() failed: expected \" English (United States) \" , got \" "
+ result ) ;
2002-03-26 23:18:57 +00:00
BreakIterator : : getDisplayName ( Locale : : getFrance ( ) , Locale : : getUS ( ) , result ) ;
1999-08-16 21:50:52 +00:00
if ( result ! = " French (France) " )
errln ( " BreakIterator::getDisplayName() failed: expected \" French (France) \" , got \" "
+ result ) ;
}
/**
2000-01-08 02:05:05 +00:00
* Test End Behaviour
1999-08-16 21:50:52 +00:00
* @ bug 4068137
*/
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : TestEndBehaviour ( )
1999-08-16 21:50:52 +00:00
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
1999-08-16 21:50:52 +00:00
UnicodeString testString ( " boo. " ) ;
2000-01-14 00:13:59 +00:00
BreakIterator * wb = BreakIterator : : createWordInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestEndBehaviour. \n " ) ;
return ;
}
2001-03-07 22:42:46 +00:00
wb - > setText ( testString ) ;
1999-08-16 21:50:52 +00:00
if ( wb - > first ( ) ! = 0 )
errln ( " Didn't get break at beginning of string. " ) ;
if ( wb - > next ( ) ! = 3 )
errln ( " Didn't get break before period in \" boo. \" " ) ;
if ( wb - > current ( ) ! = 4 & & wb - > next ( ) ! = 4 )
errln ( " Didn't get break at end of string. " ) ;
delete wb ;
}
2000-01-08 02:05:05 +00:00
/*
* @ bug 4153072
*/
void IntlTestTextBoundary : : TestBug4153072 ( ) {
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
BreakIterator * iter = BreakIterator : : createWordInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestBug4153072 \n " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
UnicodeString str ( " ...Hello, World!... " ) ;
int32_t begin = 3 ;
int32_t end = str . length ( ) - 3 ;
2000-05-18 22:08:39 +00:00
UBool dummy ;
1999-08-16 21:50:52 +00:00
2000-01-08 02:05:05 +00:00
StringCharacterIterator * textIterator = new StringCharacterIterator ( str , begin , end , begin ) ;
iter - > adoptText ( textIterator ) ;
for ( int index = - 1 ; index < begin + 1 ; + + index ) {
dummy = iter - > isBoundary ( index ) ;
if ( index < begin & & dummy = = TRUE ) {
errln ( ( UnicodeString ) " Didn't handle preceeding correctly with offset = " + index +
" and begin index = " + begin ) ;
}
}
delete iter ;
}
2000-12-04 23:17:28 +00:00
2000-01-08 02:05:05 +00:00
/*
* Test Preceding ( )
*/
void IntlTestTextBoundary : : TestPreceding ( )
{
2000-01-14 00:13:59 +00:00
UErrorCode status = U_ZERO_ERROR ;
2000-01-08 02:05:05 +00:00
UnicodeString words3 ( " aaa bbb ccc " ) ;
2000-01-14 00:13:59 +00:00
BreakIterator * e = BreakIterator : : createWordInstance ( Locale : : getDefault ( ) , status ) ;
if ( U_FAILURE ( status ) )
{
errln ( " Failed to create the BreakIterator for default locale in TestPreceeding. \n " ) ;
return ;
}
2001-03-07 22:42:46 +00:00
e - > setText ( words3 ) ;
2000-01-08 02:05:05 +00:00
e - > first ( ) ;
2002-03-12 01:32:42 +00:00
int32_t p1 = e - > next ( ) ;
int32_t p2 = e - > next ( ) ;
int32_t p3 = e - > next ( ) ;
int32_t p4 = e - > next ( ) ;
2001-03-28 18:50:17 +00:00
2002-03-12 01:32:42 +00:00
int32_t f = e - > following ( p2 + 1 ) ;
int32_t p = e - > preceding ( p2 + 1 ) ;
2000-12-04 23:17:28 +00:00
if ( f ! = p3 )
errln ( " IntlTestTextBoundary::TestPreceding: f!=p3 " ) ;
if ( p ! = p2 )
errln ( " IntlTestTextBoundary::TestPreceding: p!=p2 " ) ;
2001-03-28 18:50:17 +00:00
if ( p1 + 1 ! = p2 )
errln ( " IntlTestTextBoundary::TestPreceding: p1+1!=p2 " ) ;
if ( p3 + 1 ! = p4 )
errln ( " IntlTestTextBoundary::TestPreceding: p3+1!=p4 " ) ;
2000-01-08 02:05:05 +00:00
if ( ! e - > isBoundary ( p2 ) | | e - > isBoundary ( p2 + 1 ) | | ! e - > isBoundary ( p3 ) )
{
errln ( " IntlTestTextBoundary::TestPreceding: isBoundary err " ) ;
}
delete e ;
}
1999-08-16 21:50:52 +00:00
//---------------------------------------------
// runIndexedTest
//---------------------------------------------
2000-08-23 19:11:16 +00:00
void IntlTestTextBoundary : : runIndexedTest ( int32_t index , UBool exec , const char * & name , char * /*par*/ )
1999-08-16 21:50:52 +00:00
{
if ( exec ) logln ( " TestSuite TextBoundary: " ) ;
switch ( index ) {
2001-03-13 22:54:54 +00:00
case 0 : name = " TestSentenceIteration " ; if ( exec ) TestSentenceIteration ( ) ; break ;
case 1 : name = " TestWordIteration " ; if ( exec ) TestWordIteration ( ) ; break ;
case 2 : name = " TestLineIteration " ; if ( exec ) TestLineIteration ( ) ; break ;
case 3 : name = " TestCharacterIteration " ; if ( exec ) TestCharacterIteration ( ) ; break ;
case 4 : name = " TestSentenceInvariants " ; if ( exec ) TestSentenceInvariants ( ) ; break ;
case 5 : name = " TestWordInvariants " ; if ( exec ) TestWordInvariants ( ) ; break ;
case 6 : name = " TestLineInvariants " ; if ( exec ) TestLineInvariants ( ) ; break ;
case 7 : name = " TestCharacterInvariants " ; if ( exec ) TestCharacterInvariants ( ) ; break ;
case 8 : name = " TestEmptyString " ; if ( exec ) TestEmptyString ( ) ; break ;
case 9 : name = " TestGetAvailableLocales " ; if ( exec ) TestGetAvailableLocales ( ) ; break ;
case 10 : name = " TestGetDisplayName " ; if ( exec ) TestGetDisplayName ( ) ; break ;
case 11 : name = " TestPreceding " ; if ( exec ) TestPreceding ( ) ; break ;
case 12 : name = " TestBug4153072 " ; if ( exec ) TestBug4153072 ( ) ; break ;
case 13 : name = " TestEndBehaviour " ; if ( exec ) TestEndBehaviour ( ) ; break ;
case 14 : name = " TestJapaneseLineBreak " ; if ( exec ) TestJapaneseLineBreak ( ) ; break ;
case 15 : name = " TestThaiLineBreak " ; if ( exec ) TestThaiLineBreak ( ) ; break ;
case 16 : name = " TestMixedThaiLineBreak " ; if ( exec ) TestMixedThaiLineBreak ( ) ; break ;
case 17 : name = " TestMaiyamok " ; if ( exec ) TestMaiyamok ( ) ; break ;
2001-05-02 22:41:02 +00:00
case 18 : name = " TestThaiWordBreak " ; if ( exec ) TestThaiWordBreak ( ) ; break ;
2001-03-13 22:54:54 +00:00
default : name = " " ; break ; //needed to end loop
1999-08-16 21:50:52 +00:00
}
}
//---------------------------------------------
// Test implementation routines
//---------------------------------------------
2000-01-08 02:05:05 +00:00
// general test Implementation subroutines
void IntlTestTextBoundary : : generalIteratorTest ( BreakIterator & bi , Vector * expectedResult )
1999-08-16 21:50:52 +00:00
{
2000-01-08 02:05:05 +00:00
Enumeration * elems = expectedResult - > elements ( ) ;
2000-08-10 00:28:31 +00:00
UnicodeString text = createTestData ( elems ) ;
delete elems ;
2000-01-08 02:05:05 +00:00
2001-05-05 01:30:29 +00:00
logln ( " comparing forward and backward... " ) ;
2001-03-07 22:42:46 +00:00
bi . setText ( text ) ;
2000-01-08 02:05:05 +00:00
Vector * nextResults = testFirstAndNext ( bi , text ) ;
2001-05-05 01:30:29 +00:00
if ( nextResults = = NULL ) {
errln ( " Couldn't get nextResults! " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
Vector * previousResults = testLastAndPrevious ( bi , text ) ;
2001-05-05 01:30:29 +00:00
if ( previousResults = = NULL ) {
errln ( " Couldn't get previousResults! " ) ;
return ;
}
2000-01-08 02:05:05 +00:00
int errs = getErrors ( ) ;
2000-08-10 00:28:31 +00:00
UnicodeString str1 = " forward iteration " ;
UnicodeString str2 = " backward iteration " ;
2000-01-13 19:25:32 +00:00
compareFragmentLists ( str1 , str2 , nextResults ,
2000-01-08 02:05:05 +00:00
previousResults ) ;
if ( getErrors ( ) = = errs ) {
logln ( " comparing expected and actual... " ) ;
2000-08-10 00:28:31 +00:00
str1 = " expected result " ;
str2 = " actual result " ;
2000-01-13 19:25:32 +00:00
compareFragmentLists ( str1 , str2 , expectedResult ,
2000-01-08 02:05:05 +00:00
nextResults ) ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
int32_t * boundaries = new int32_t [ expectedResult - > size ( ) + 3 ] ;
2000-01-11 01:50:24 +00:00
boundaries [ 0 ] = BreakIterator : : DONE ;
2000-01-08 02:05:05 +00:00
boundaries [ 1 ] = 0 ;
for ( int i = 0 ; i < expectedResult - > size ( ) ; i + + )
boundaries [ i + 2 ] = boundaries [ i + 1 ] + ( ( UnicodeString ) expectedResult - > elementAt ( i ) ) .
length ( ) ;
2000-08-10 00:28:31 +00:00
2000-01-08 02:05:05 +00:00
int len = expectedResult - > size ( ) + 3 - 1 ;
2000-08-10 00:28:31 +00:00
boundaries [ len ] = BreakIterator : : DONE ;
2000-01-08 02:05:05 +00:00
testFollowing ( bi , text , boundaries ) ;
testPreceding ( bi , text , boundaries ) ;
testIsBoundary ( bi , text , boundaries ) ;
doMultipleSelectionTest ( bi , text ) ;
2000-08-14 23:14:23 +00:00
delete nextResults ;
delete previousResults ;
2001-03-14 01:38:58 +00:00
delete [ ] boundaries ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
/**
 * Walks the iterator forward with first()/next() and collects the text
 * between consecutive boundaries.
 *
 * Errors are reported if first() does not return 0, if next() fails to
 * advance, or if next() returns DONE before the end of the text is reached.
 *
 * @param bi   iterator under test; its text must already be set
 * @param text the text the iterator is positioned over
 * @return a newly allocated Vector of fragments that the caller must delete,
 *         or NULL if next() failed to move forward
 */
Vector* IntlTestTextBoundary::testFirstAndNext(BreakIterator& bi, UnicodeString& text)
{
    int32_t p = bi.first();
    int32_t lastP = p;
    Vector *result = new Vector();
    UnicodeString selection;

    if (p != 0)
        errln((UnicodeString)" first() returned " + p + (UnicodeString)" instead of 0 ");

    while (p != BreakIterator::DONE) {
        p = bi.next();
        if (p != BreakIterator::DONE) {
            if (p <= lastP) {
                errln((UnicodeString)" next() failed to move forward: next() on position "
                      + lastP + (UnicodeString)" yielded " + p);
                errln(" Are the *.brk files corrupt? ");
                // Release the partially built list; nothing else owns it once
                // NULL is returned.
                delete result;
                return NULL;
            }

            text.extractBetween(lastP, p, selection);
            result->addElement(selection);
        }
        else {
            if (lastP != text.length())
                errln((UnicodeString)" next() returned DONE prematurely: offset was "
                      + lastP + (UnicodeString)" instead of " + text.length());
        }
        lastP = p;
    }
    return result;
}
2000-01-08 02:05:05 +00:00
/**
 * Walks the iterator backward with last()/previous() and collects the text
 * between consecutive boundaries, in text order.
 *
 * Errors are reported if last() does not return the text length, if
 * previous() fails to move backward, or if previous() returns DONE before
 * offset 0 is reached.
 *
 * @param bi   iterator under test; its text must already be set
 * @param text the text the iterator is positioned over
 * @return a newly allocated Vector of fragments that the caller must delete,
 *         or NULL if previous() failed to move backward
 */
Vector* IntlTestTextBoundary::testLastAndPrevious(BreakIterator& bi, UnicodeString& text)
{
    int32_t p = bi.last();
    int32_t lastP = p;
    Vector *result = new Vector();
    UnicodeString selection;

    if (p != text.length())
        errln((UnicodeString)" last() returned " + p + (UnicodeString)" instead of " + text.length());

    while (p != BreakIterator::DONE) {
        p = bi.previous();
        if (p != BreakIterator::DONE) {
            if (p >= lastP) {
                errln((UnicodeString)" previous() failed to move backward: previous() on position "
                      + lastP + (UnicodeString)" yielded " + p);
                // An iterator that does not move backward would keep this loop
                // running indefinitely. Stop here and signal failure with NULL,
                // the same way the forward walk does.
                delete result;
                return NULL;
            }
            text.extractBetween(p, lastP, selection);
            // Insert at the front so the fragments end up in text order.
            result->insertElementAt(selection, 0);
        }
        else {
            if (lastP != 0)
                errln((UnicodeString)" previous() returned DONE prematurely: offset was "
                      + lastP + (UnicodeString)" instead of 0 ");
        }
        lastP = p;
    }
    return result;
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : compareFragmentLists ( UnicodeString & f1Name , UnicodeString & f2Name , Vector * f1 , Vector * f2 )
1999-08-16 21:50:52 +00:00
{
2000-01-08 02:05:05 +00:00
int32_t p1 = 0 ;
int32_t p2 = 0 ;
UnicodeString s1 ;
UnicodeString s2 ;
int32_t t1 = 0 ;
int32_t t2 = 0 ;
2000-08-10 00:28:31 +00:00
UnicodeString target ;
2000-01-08 02:05:05 +00:00
while ( p1 < f1 - > size ( ) & & p2 < f2 - > size ( ) ) {
s1 = ( UnicodeString ) f1 - > elementAt ( p1 ) ;
s2 = ( UnicodeString ) f2 - > elementAt ( p2 ) ;
t1 + = s1 . length ( ) ;
t2 + = s2 . length ( ) ;
if ( s1 . compare ( s2 ) = = 0 ) {
logln ( prettify ( ( UnicodeString ) " > " + s1 + ( UnicodeString ) " < " , target ) ) ;
+ + p1 ;
+ + p2 ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
else {
int32_t tempT1 = t1 ;
int32_t tempT2 = t2 ;
int32_t tempP1 = p1 ;
int32_t tempP2 = p2 ;
while ( tempT1 ! = tempT2 & & tempP1 < f1 - > size ( ) & & tempP2 < f2 - > size ( ) ) {
while ( tempT1 < tempT2 & & tempP1 < f1 - > size ( ) ) {
tempT1 + = ( ( UnicodeString ) f1 - > elementAt ( tempP1 ) ) . length ( ) ;
+ + tempP1 ;
}
while ( tempT2 < tempT1 & & tempP2 < f2 - > size ( ) ) {
tempT2 + = ( ( UnicodeString ) f2 - > elementAt ( tempP2 ) ) . length ( ) ;
+ + tempP2 ;
}
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
logln ( ( UnicodeString ) " *** " + f1Name + ( UnicodeString ) " has: " ) ;
while ( p1 < = tempP1 & & p1 < f1 - > size ( ) ) {
s1 = ( UnicodeString ) f1 - > elementAt ( p1 ) ;
t1 + = s1 . length ( ) ;
logln ( prettify ( ( UnicodeString ) " *** > " + s1 + ( UnicodeString ) " < " , target ) ) ;
+ + p1 ;
1999-08-16 21:50:52 +00:00
}
2000-01-08 02:05:05 +00:00
logln ( " ***** " + f2Name + " has: " ) ;
while ( p2 < = tempP2 & & p2 < f2 - > size ( ) ) {
s2 = ( UnicodeString ) f2 - > elementAt ( p2 ) ;
t2 + = s2 . length ( ) ;
logln ( prettify ( " ***** > " + s2 + " < " , target ) ) ;
+ + p2 ;
}
errln ( ( UnicodeString ) " Discrepancy between " + f1Name + ( UnicodeString ) " and " + f2Name ) ;
1999-08-16 21:50:52 +00:00
}
}
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : testFollowing ( BreakIterator & bi , UnicodeString & text , int32_t * boundaries )
1999-08-16 21:50:52 +00:00
{
2000-01-08 02:05:05 +00:00
logln ( " testFollowing(): " ) ;
int p = 2 ;
2001-03-13 22:54:54 +00:00
int32_t textLen = text . length ( ) ;
for ( int i = 0 ; i < = textLen ; i + + ) {
2000-01-08 02:05:05 +00:00
if ( i = = boundaries [ p ] )
+ + p ;
int32_t b = bi . following ( i ) ;
logln ( ( UnicodeString ) " bi.following( " + i + " ) -> " + b ) ;
if ( b ! = boundaries [ p ] )
errln ( ( UnicodeString ) " Wrong result from following() for " + i + ( UnicodeString ) " : expected " + boundaries [ p ]
+ ( UnicodeString ) " , got " + b ) ;
1999-08-16 21:50:52 +00:00
}
}
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : testPreceding ( BreakIterator & bi , UnicodeString & text , int32_t * boundaries ) {
logln ( " testPreceding(): " ) ;
int p = 0 ;
2001-03-13 22:54:54 +00:00
int32_t textLen = text . length ( ) ;
for ( int i = 0 ; i < = textLen ; i + + ) {
2000-01-08 02:05:05 +00:00
int32_t b = bi . preceding ( i ) ;
logln ( ( UnicodeString ) " bi.preceding( " + i + " ) -> " + b ) ;
if ( b ! = boundaries [ p ] )
errln ( ( UnicodeString ) " Wrong result from preceding() for " + i + ( UnicodeString ) " : expected " + boundaries [ p ]
+ ( UnicodeString ) " , got " + b ) ;
if ( i = = boundaries [ p + 1 ] )
+ + p ;
}
}
1999-08-16 21:50:52 +00:00
2000-01-08 02:05:05 +00:00
void IntlTestTextBoundary : : testIsBoundary ( BreakIterator & bi , UnicodeString & text , int32_t * boundaries ) {
logln ( " testIsBoundary(): " ) ;
int p = 1 ;
2000-05-18 22:08:39 +00:00
UBool isB ;
2001-03-13 22:54:54 +00:00
int32_t textLen = text . length ( ) ;
for ( int i = 0 ; i < textLen ; i + + ) {
2000-01-08 02:05:05 +00:00
isB = bi . isBoundary ( i ) ;
logln ( ( UnicodeString ) " bi.isBoundary( " + i + " ) -> " + isB ) ;
if ( i = = boundaries [ p ] ) {
if ( ! isB )
errln ( ( UnicodeString ) " Wrong result from isBoundary() for " + i + ( UnicodeString ) " : expected true, got false " ) ;
p + + ;
}
else {
if ( isB )
errln ( ( UnicodeString ) " Wrong result from isBoundary() for " + i + ( UnicodeString ) " : expected false, got true " ) ;
1999-08-16 21:50:52 +00:00
}
}
}
void IntlTestTextBoundary : : doMultipleSelectionTest ( BreakIterator & iterator ,
UnicodeString & testText )
{
2001-03-07 22:42:46 +00:00
iterator . setText ( testText ) ;
1999-08-16 21:50:52 +00:00
BreakIterator * testIterator = iterator . clone ( ) ;
int32_t offset = iterator . first ( ) ;
int32_t testOffset ;
int32_t count = 0 ;
2001-03-14 01:38:58 +00:00
logln ( " doMultipleSelectionTest text of length: %d " , testText . length ( ) ) ;
1999-08-16 21:50:52 +00:00
if ( * testIterator ! = iterator )
errln ( " clone() or operator!= failed: two clones compared unequal " ) ;
do {
testOffset = testIterator - > first ( ) ;
testOffset = testIterator - > next ( count ) ;
if ( offset ! = testOffset )
errln ( UnicodeString ( " next(n) and next() not returning consistent results: for step " ) + count + " , next(n) returned " + testOffset + " and next() had " + offset ) ;
if ( offset ! = BreakIterator : : DONE ) {
count + + ;
offset = iterator . next ( ) ;
if ( offset ! = BreakIterator : : DONE & & * testIterator = = iterator )
errln ( " operator== failed: Two unequal iterators compared equal. " ) ;
}
} while ( offset ! = BreakIterator : : DONE ) ;
// now do it backwards...
offset = iterator . last ( ) ;
count = 0 ;
do {
testOffset = testIterator - > last ( ) ;
testOffset = testIterator - > next ( count ) ;
if ( offset ! = testOffset )
errln ( UnicodeString ( " next(n) and next() not returning consistent results: for step " ) + count + " , next(n) returned " + testOffset + " and next() had " + offset ) ;
if ( offset ! = BreakIterator : : DONE ) {
count - - ;
offset = iterator . previous ( ) ;
}
} while ( offset ! = BreakIterator : : DONE ) ;
delete testIterator ;
}
void IntlTestTextBoundary : : doBreakInvariantTest ( BreakIterator & tb , UnicodeString & testChars )
{
UnicodeString work ( " aaa " ) ;
2001-03-13 22:54:54 +00:00
int32_t errCount = 0 , testCharsLen = testChars . length ( ) , breaksLen ;
1999-08-16 21:50:52 +00:00
// a break should always occur after CR (unless followed by LF), LF, PS, and LS
2000-03-22 23:17:42 +00:00
UnicodeString breaks = CharsToUnicodeString ( " \r \n \\ u2029 \\ u2028 " ) ;
2002-03-12 01:32:42 +00:00
int32_t i , j ;
1999-08-16 21:50:52 +00:00
2001-03-13 22:54:54 +00:00
breaksLen = breaks . length ( ) ;
for ( i = 0 ; i < breaksLen ; i + + ) {
2002-06-25 17:23:07 +00:00
UChar c1 = work [ 1 ] = breaks [ i ] ;
2001-03-13 22:54:54 +00:00
for ( j = 0 ; j < testCharsLen ; j + + ) {
2002-06-25 17:23:07 +00:00
UChar c0 = work [ 0 ] = testChars [ j ] ;
2001-03-13 22:54:54 +00:00
for ( int k = 0 ; k < testCharsLen ; k + + ) {
2002-06-25 17:23:07 +00:00
UChar c2 = work [ 2 ] = testChars [ k ] ;
1999-08-16 21:50:52 +00:00
// if a cr is followed by lf, ps, ls or etx, don't do the check (that's
// not supposed to work)
2002-06-25 17:23:07 +00:00
if ( c1 = = ' \r ' & & ( c2 = = ' \n ' | | c2 = = 0x2029
| | c2 = = 0x2028 | | c2 = = 0x0003 ) )
1999-08-16 21:50:52 +00:00
continue ;
2002-06-25 17:23:07 +00:00
if ( u_charType ( c1 ) = = U_CONTROL_CHAR & &
( u_charType ( c2 ) = = U_NON_SPACING_MARK | |
u_charType ( c2 ) = = U_ENCLOSING_MARK | |
u_charType ( c2 ) = = U_COMBINING_SPACING_MARK )
) {
// Combining marks don't combine with controls.
// TODO: enhance test to verify that the break actually occurs,
// not just ignore the case.
continue ;
}
2001-03-07 22:42:46 +00:00
tb . setText ( work ) ;
2000-05-18 22:08:39 +00:00
UBool seen2 = FALSE ;
1999-08-16 21:50:52 +00:00
for ( int l = tb . first ( ) ; l ! = BreakIterator : : DONE ; l = tb . next ( ) ) {
2001-03-13 22:54:54 +00:00
if ( l = = 2 ) {
1999-08-16 21:50:52 +00:00
seen2 = TRUE ;
2001-03-13 22:54:54 +00:00
break ;
}
1999-08-16 21:50:52 +00:00
}
if ( ! seen2 ) {
2002-06-25 17:23:07 +00:00
errln ( " No break between U+ " + UCharToUnicodeString ( c1 )
+ " and U+ " + UCharToUnicodeString ( c2 ) ) ;
2000-12-04 23:17:28 +00:00
errCount + + ;
if ( errCount > = 75 )
1999-08-16 21:50:52 +00:00
return ;
}
}
}
}
}
void IntlTestTextBoundary : : doOtherInvariantTest ( BreakIterator & tb , UnicodeString & testChars )
{
UnicodeString work ( " a \r \n a " ) ;
2001-03-13 22:54:54 +00:00
int32_t errCount = 0 , testCharsLen = testChars . length ( ) ;
2002-03-12 01:32:42 +00:00
int32_t i , j ;
2001-03-13 22:54:54 +00:00
int8_t type ;
1999-08-16 21:50:52 +00:00
// a break should never occur between CR and LF
2001-03-13 22:54:54 +00:00
for ( i = 0 ; i < testCharsLen ; i + + ) {
1999-08-16 21:50:52 +00:00
work [ 0 ] = testChars [ i ] ;
2001-03-13 22:54:54 +00:00
for ( j = 0 ; j < testCharsLen ; j + + ) {
1999-08-16 21:50:52 +00:00
work [ 3 ] = testChars [ j ] ;
2001-03-07 22:42:46 +00:00
tb . setText ( work ) ;
1999-08-16 21:50:52 +00:00
for ( int32_t k = tb . first ( ) ; k ! = BreakIterator : : DONE ; k = tb . next ( ) )
if ( k = = 2 ) {
errln ( " Break between CR and LF in string U+ " + UCharToUnicodeString ( work [ 0 ] ) +
" , U+d U+a U+ " + UCharToUnicodeString ( work [ 3 ] ) ) ;
2000-12-04 23:17:28 +00:00
errCount + + ;
if ( errCount > = 75 )
1999-08-16 21:50:52 +00:00
return ;
}
}
}
// a break should never occur before a non-spacing mark, unless the preceding
// character is CR, LF, PS, or LS
2002-06-25 17:23:07 +00:00
// Or the general category == Control.
1999-08-16 21:50:52 +00:00
work . remove ( ) ;
work + = " aaaa " ;
2001-03-13 22:54:54 +00:00
for ( i = 0 ; i < testCharsLen ; i + + ) {
2002-06-25 17:23:07 +00:00
UChar c1 = testChars [ i ] ;
if ( c1 = = ' \n ' | | c1 = = ' \r ' | | c1 = = 0x2029 | | c1 = = 0x2028 | | c1 = = 0x0003 | |
2002-08-09 03:14:43 +00:00
u_charType ( c1 ) = = U_CONTROL_CHAR | | u_charType ( c1 ) = = U_FORMAT_CHAR ) {
1999-08-16 21:50:52 +00:00
continue ;
2002-06-25 17:23:07 +00:00
}
work [ 1 ] = c1 ;
2001-03-13 22:54:54 +00:00
for ( j = 0 ; j < testCharsLen ; j + + ) {
2002-06-25 17:23:07 +00:00
UChar c2 = testChars [ j ] ;
2002-08-21 19:09:33 +00:00
type = u_charType ( c2 ) ;
if ( ( type ! = U_NON_SPACING_MARK ) & &
( type ! = U_ENCLOSING_MARK ) ) {
1999-08-16 21:50:52 +00:00
continue ;
2002-06-25 17:23:07 +00:00
}
work [ 2 ] = c2 ;
2001-03-07 22:42:46 +00:00
tb . setText ( work ) ;
1999-08-16 21:50:52 +00:00
for ( int k = tb . first ( ) ; k ! = BreakIterator : : DONE ; k = tb . next ( ) )
if ( k = = 2 ) {
errln ( " Break between U+ " + UCharToUnicodeString ( work [ 1 ] )
+ " and U+ " + UCharToUnicodeString ( work [ 2 ] ) ) ;
2000-12-04 23:17:28 +00:00
errCount + + ;
if ( errCount > = 75 )
1999-08-16 21:50:52 +00:00
return ;
}
}
}
}
void IntlTestTextBoundary : : sample ( BreakIterator & tb ,
UnicodeString & text ,
UnicodeString & title )
{
UnicodeString substring ;
2000-05-18 22:08:39 +00:00
UBool verboseWas = verbose ;
1999-08-16 21:50:52 +00:00
verbose = TRUE ;
1999-12-08 02:11:04 +00:00
logln ( " ------------------------- " + title + " length = " + text . length ( ) ) ;
2001-03-07 22:42:46 +00:00
tb . setText ( text ) ;
1999-08-16 21:50:52 +00:00
int32_t start = tb . first ( ) ;
int32_t end ;
for ( end = tb . next ( ) ; end ! = BreakIterator : : DONE ; end = tb . next ( ) ) {
text . extractBetween ( start , end , substring ) ;
logln ( UnicodeString ( " [ " ) + start + " , " + end + " ] \" " + substring + " \" " ) ;
start = end ;
}
verbose = verboseWas ;
}
2000-01-08 02:05:05 +00:00
1999-08-16 21:50:52 +00:00