2002-08-08 15:38:16 +00:00
/ * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright ( C ) 1996 - 2001 , International Business Machines Corporation and *
* others . All Rights Reserved . *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
* $Source : / xsrl / Nsvn / icu / unicodetools / com / ibm / text / UCD / GenerateBreakTest . java , v $
2003-02-25 23:38:23 +00:00
* $Date : 2003 / 02 / 25 23 : 38 : 23 $
* $Revision : 1 . 3 $
2002-08-08 15:38:16 +00:00
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* /
package com.ibm.text.UCD ;
import java.util.* ;
import java.io.* ;
import com.ibm.text.utility.* ;
import com.ibm.icu.text.UTF16 ;
import com.ibm.icu.text.UnicodeSet ;
abstract public class GenerateBreakTest implements UCD_Types {
static boolean DEBUG = false ;
// COMMON STUFF for Hangul
static final byte hNot = - 1 , hL = 0 , hV = 1 , hT = 2 , hLV = 3 , hLVT = 4 , hLIMIT = 5 ;
static final String [ ] hNames = { " L " , " V " , " T " , " LV " , " LVT " } ;
static byte getHangulType ( int cp ) {
if ( Default . ucd . isLeadingJamo ( cp ) ) return hL ;
if ( Default . ucd . isVowelJamo ( cp ) ) return hV ;
if ( Default . ucd . isTrailingJamo ( cp ) ) return hT ;
if ( Default . ucd . isHangulSyllable ( cp ) ) {
if ( Default . ucd . isDoubleHangul ( cp ) ) return hLV ;
return hLVT ;
}
return hNot ;
}
public static boolean onCodepointBoundary ( String s , int offset ) {
if ( offset < 0 | | offset > s . length ( ) ) return false ;
if ( offset = = 0 | | offset = = s . length ( ) ) return true ;
if ( UTF16 . isLeadSurrogate ( s . charAt ( offset - 1 ) )
& & UTF16 . isTrailSurrogate ( s . charAt ( offset ) ) ) return false ;
return true ;
}
// finds the first base character, or the first character if there is no base
public static int findFirstBase ( String source , int start , int limit ) {
int cp ;
for ( int i = start ; i < limit ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
byte cat = Default . ucd . getCategory ( cp ) ;
if ( ( ( 1 < < cat ) & MARK_MASK ) ! = 0 ) continue ;
return cp ;
}
return UTF16 . charAt ( source , start ) ;
}
2002-08-09 23:56:24 +00:00
// quick & dirty routine
String insertEverywhere ( String source , String insertion , GenerateBreakTest breaker ) {
String result = insertion ;
for ( int i = 0 ; i < source . length ( ) ; + + i ) {
result + = source . charAt ( i ) ;
if ( breaker . isBreak ( source , i , true ) ) {
result + = insertion ;
}
}
return result + insertion ;
}
2002-08-08 15:38:16 +00:00
static UnicodeSet midLetterSet = new UnicodeSet ( " [ \ u0027 \ u002E \ u003A \ u00AD \ u05F3 \ u05F4 \ u2019 \ uFE52 \ uFE55 \ uFF07 \ uFF0E \ uFF1A] " ) ;
static UnicodeSet ambigSentPunct = new UnicodeSet ( " [ \ u002E \ u0589 \ u06D4] " ) ;
static UnicodeSet sentPunct = new UnicodeSet ( " [ \ u0021 \ u003F \ u0387 \ u061F \ u0964 \ u203C \ u203D \ u2048 \ u2049 "
+ " \ u3002 \ ufe52 \ ufe57 \ uff01 \ uff0e \ uff1f \ uff61] " ) ;
2002-08-09 23:56:24 +00:00
static {
Default . setUCD ( ) ;
}
2003-02-25 23:38:23 +00:00
static UnicodeSet extraAlpha = new UnicodeSet ( " [ \\ u02B9- \\ u02BA \\ u02C2- \\ u02CF \\ u02D2- \\ u02DF \\ u02E5- \\ u02ED \\ u05F3] " ) ;
2002-08-09 23:56:24 +00:00
static UnicodeSet alphabeticSet = UnifiedBinaryProperty . make ( DERIVED | PropAlphabetic ) . getSet ( )
. addAll ( extraAlpha ) ;
static UnicodeSet ideographicSet = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Ideographic ) . getSet ( ) ;
static {
2003-02-25 23:38:23 +00:00
if ( false ) System . out . println ( " alphabetic: " + alphabeticSet . toPattern ( true ) ) ;
2002-08-09 23:56:24 +00:00
}
// ====================== Main ===========================
static final boolean SHOW_TYPE = false ;
2002-08-08 15:38:16 +00:00
public static void main ( String [ ] args ) throws IOException {
System . out . println ( " Remember to add length marks (half & full) and other punctuation for sentence, with FF61 " ) ;
Default . setUCD ( ) ;
2002-08-09 23:56:24 +00:00
2002-08-08 15:38:16 +00:00
if ( DEBUG ) {
2002-08-09 23:56:24 +00:00
checkDecomps ( ) ;
2002-08-08 15:38:16 +00:00
Utility . showSetNames ( " " , new UnicodeSet ( " [ \ u034F \ u00AD \ u1806[:DI:]-[:Cs:]-[:Cn:]] " ) , true , Default . ucd ) ;
System . out . println ( " *** Extend - Cf " ) ;
generateTerminalClosure ( ) ;
GenerateWordBreakTest gwb = new GenerateWordBreakTest ( ) ;
PrintWriter systemPrintWriter = new PrintWriter ( System . out ) ;
gwb . printLine ( systemPrintWriter , " n \ u0308't " , true , true , false ) ;
systemPrintWriter . flush ( ) ;
2002-08-09 23:56:24 +00:00
showSet ( " sepSet " , GenerateSentenceBreakTest . sepSet ) ;
showSet ( " atermSet " , GenerateSentenceBreakTest . atermSet ) ;
showSet ( " termSet " , GenerateSentenceBreakTest . termSet ) ;
}
2003-02-25 23:38:23 +00:00
if ( true ) {
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest ( ) ;
//foo.isBreak("(\"Go.\") (He did)", 5, true);
foo . isBreak ( " 3.4 " , 2 , true ) ;
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
new GenerateSentenceBreakTest ( ) . run ( ) ;
//if (true) return; // cut short for now
2002-08-08 15:38:16 +00:00
new GenerateLineBreakTest ( ) . run ( ) ;
new GenerateGraphemeBreakTest ( ) . run ( ) ;
new GenerateWordBreakTest ( ) . run ( ) ;
}
static void checkDecomps ( ) {
UnicodeProperty [ ] INFOPROPS = { UnifiedProperty . make ( CATEGORY ) , UnifiedProperty . make ( LINE_BREAK ) } ;
GenerateBreakTest [ ] tests = {
new GenerateGraphemeBreakTest ( ) ,
new GenerateWordBreakTest ( ) ,
new GenerateLineBreakTest ( ) ,
} ;
tests [ 0 ] . isBreak ( " \ u0300 \ u0903 " , 1 , true ) ;
System . out . println ( " Check Decomps " ) ;
System . out . println ( " otherExtendSet: " + ( ( GenerateGraphemeBreakTest ) tests [ 0 ] ) . otherExtendSet . toPattern ( true ) ) ;
Utility . showSetNames ( " " , ( ( GenerateGraphemeBreakTest ) tests [ 0 ] ) . otherExtendSet , false , Default . ucd ) ;
for ( int k = 0 ; k < tests . length ; + + k ) {
for ( int i = 0 ; i < 0x10FFFF ; + + i ) {
if ( ! Default . ucd . isAllocated ( i ) ) continue ;
if ( Default . ucd . isHangulSyllable ( i ) ) continue ;
if ( Default . nfd . isNormalized ( i ) ) continue ;
String decomp = Default . nfd . normalize ( i ) ;
boolean shown = false ;
String test = decomp ;
for ( int j = 1 ; j < test . length ( ) ; + + j ) {
if ( tests [ k ] . isBreak ( test , j , true ) ) {
if ( ! shown ) {
System . out . println ( showData ( UTF16 . valueOf ( i ) , INFOPROPS , " \ r \ n \ t " ) ) ;
System . out . println ( " => " + showData ( decomp , INFOPROPS , " \ r \ n \ t " ) ) ;
shown = true ;
}
System . out . println ( j + " : " + tests [ k ] . fileName ) ;
}
}
}
}
}
static String showData ( String source , UnicodeProperty [ ] props , String separator ) {
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < source . length ( ) ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
if ( i ! = 0 ) result . append ( separator ) ;
result . append ( Default . ucd . getCodeAndName ( cp ) ) ;
for ( int j = 0 ; j < props . length ; + + j ) {
result . append ( " , " ) ;
result . append ( props [ j ] . getProperty ( SHORT ) ) . append ( '=' ) . append ( props [ j ] . getValue ( cp , SHORT ) ) ;
}
}
return result . toString ( ) ;
}
2002-08-09 23:56:24 +00:00
static void showSet ( String title , UnicodeSet set ) {
System . out . println ( title + " : " + set . toPattern ( true ) ) ;
Utility . showSetNames ( " " , set , false , Default . ucd ) ;
}
2002-08-08 15:38:16 +00:00
// determines if string is of form Base NSM*
static boolean isBaseNSMStar ( String source ) {
int cp ;
int status = 0 ;
for ( int i = 0 ; i < source . length ( ) ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
byte cat = Default . ucd . getCategory ( cp ) ;
int catMask = 1 < < cat ;
switch ( status ) {
case 0 : if ( ( catMask & BASE_MASK ) = = 0 ) return false ;
status = 1 ;
break ;
case 1 : if ( ( catMask & NONSPACING_MARK_MASK ) = = 0 ) return false ;
break ;
}
}
return true ;
}
static UnicodeSet getClosure ( UnicodeSet source ) {
UnicodeSet result = new UnicodeSet ( source ) ;
for ( int i = 0 ; i < 0x10FFFF ; + + i ) {
if ( ! Default . ucd . isAllocated ( i ) ) continue ;
if ( Default . nfkd . isNormalized ( i ) ) continue ;
String decomp = Default . nfkd . normalize ( i ) ;
if ( source . containsAll ( decomp ) ) result . add ( i ) ;
}
return result ;
}
static void generateTerminalClosure ( ) {
UnicodeSet terminals = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Terminal_Punctuation ) . getSet ( ) ;
UnicodeSet extras = getClosure ( terminals ) . removeAll ( terminals ) ;
System . out . println ( " Current Terminal_Punctuation " ) ;
Utility . showSetNames ( " " , terminals , true , Default . ucd ) ;
System . out . println ( " Missing Terminal_Punctuation " ) ;
Utility . showSetNames ( " " , extras , true , Default . ucd ) ;
System . out . println ( " midLetterSet " ) ;
System . out . println ( midLetterSet . toPattern ( true ) ) ;
Utility . showSetNames ( " " , midLetterSet , true , Default . ucd ) ;
System . out . println ( " ambigSentPunct " ) ;
System . out . println ( ambigSentPunct . toPattern ( true ) ) ;
Utility . showSetNames ( " " , ambigSentPunct , true , Default . ucd ) ;
System . out . println ( " sentPunct " ) ;
System . out . println ( sentPunct . toPattern ( true ) ) ;
Utility . showSetNames ( " " , sentPunct , true , Default . ucd ) ;
/ *
UnicodeSet sentencePunctuation = new UnicodeSet ( " [ \ u0021 \ 003F ; Terminal_Punctuation # Po QUESTION MARK
037E ; Terminal_Punctuation # Po GREEK QUESTION MARK
061F ; Terminal_Punctuation # Po ARABIC QUESTION MARK
06D4 ; Terminal_Punctuation # Po ARABIC FULL STOP
203C . . 203D ; Terminal_Punctuation # Po [ 2 ] DOUBLE EXCLAMATION MARK . . INTERROBANG
3002 ; Terminal_Punctuation # Po IDEOGRAPHIC FULL STOP
2048 . . 2049 ; Terminal_Punctuation # Po [ 2 ] QUESTION EXCLAMATION MARK . . EXCLAMATION QUESTION MARK
* /
}
//============================
protected String rule ;
protected String fileName ;
protected String [ ] samples = new String [ 100 ] ;
protected String [ ] extraSamples = new String [ 0 ] ;
protected String [ ] extraSingleSamples = new String [ 0 ] ;
protected int sampleLimit = 0 ;
protected int tableLimit = - 1 ;
public void run ( ) throws IOException {
findSamples ( ) ;
// test individual cases
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
PrintWriter out = Utility . openPrintWriter ( fileName + " BreakTest.html " , Utility . UTF8_WINDOWS ) ;
out . println ( " <html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title> "
2003-02-25 23:38:23 +00:00
+ fileName + " Break Chart</title></head> " ) ;
2002-08-09 23:56:24 +00:00
out . println ( " <body bgcolor='#FFFFFF'><h3>Current:</h3> " ) ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
if ( recommendedDiffers ( ) ) {
generateTable ( out , false ) ;
out . println ( " <h3>Recommended:</h3> " ) ;
generateTable ( out , true ) ;
out . println ( " </body></html> " ) ;
} else {
generateTable ( out , true ) ;
}
2002-08-08 15:38:16 +00:00
out . close ( ) ;
2002-08-09 23:56:24 +00:00
if ( recommendedDiffers ( ) ) {
generateTest ( false , false ) ;
}
generateTest ( false , true ) ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
}
public void generateTest ( boolean shortVersion , boolean recommended ) throws IOException {
2002-08-08 15:38:16 +00:00
String [ ] testCase = new String [ 50 ] ;
// do main test
2002-08-09 23:56:24 +00:00
PrintWriter out = Utility . openPrintWriter ( fileName + " BreakTest "
+ ( recommended & recommendedDiffers ( ) ? " _NEW " : " " )
+ ( shortVersion ? " _SHORT " : " " )
2003-02-25 23:38:23 +00:00
+ " .txt " , Utility . UTF8_WINDOWS ) ;
2002-08-09 23:56:24 +00:00
int counter = 0 ;
out . println ( " # Default " + fileName + " Break Test " ) ;
out . println ( " # Generated: " + Default . getDate ( ) + " , MED " ) ;
out . println ( " # " ) ;
out . println ( " # Format: " ) ;
out . println ( " # <string> (# <comment>)? " ) ;
out . println ( " # <string> contains hex Unicode code points, with " ) ;
out . println ( " # \ t " + BREAK + " wherever there is a break opportunity, and " ) ;
out . println ( " # \ t " + NOBREAK + " wherever there is not. " ) ;
out . println ( " # <comment> the format can change, but currently it shows: " ) ;
out . println ( " # \ t- the sample character name " ) ;
out . println ( " # \ t- (x) the line_break property* for the sample character " ) ;
out . println ( " # \ t- [x] the rule that determines whether there is a break or not " ) ;
out . println ( " # " ) ;
sampleDescription ( out ) ;
out . println ( " # These samples may be extended or changed in the future. " ) ;
out . println ( " # " ) ;
for ( int ii = 0 ; ii < sampleLimit ; + + ii ) {
String before = samples [ ii ] ;
for ( int jj = 0 ; jj < sampleLimit ; + + jj ) {
Utility . dot ( counter ) ;
String after = samples [ jj ] ;
// do line straight
int len = genTestItems ( before , after , testCase ) ;
for ( int q = 0 ; q < len ; + + q ) {
printLine ( out , testCase [ q ] , ! shortVersion & & q = = 0 , recommended , false ) ;
+ + counter ;
2002-08-08 15:38:16 +00:00
}
}
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
for ( int ii = 0 ; ii < extraSingleSamples . length ; + + ii ) {
printLine ( out , extraSingleSamples [ ii ] , true , recommended , false ) ;
2002-08-08 15:38:16 +00:00
}
2002-08-09 23:56:24 +00:00
out . println ( " # Lines: " + counter ) ;
out . close ( ) ;
2002-08-08 15:38:16 +00:00
}
public void sampleDescription ( PrintWriter out ) { }
abstract public boolean isBreak ( String source , int offset , boolean recommended ) ;
abstract public byte getType ( int cp , boolean recommended ) ;
abstract public String getTypeID ( int s , boolean recommended ) ;
2002-08-09 23:56:24 +00:00
public boolean recommendedDiffers ( ) {
return false ;
}
2002-08-08 15:38:16 +00:00
final public byte getType ( int cp ) {
return getType ( cp , false ) ;
}
final public String getTypeID ( int cp ) {
return getTypeID ( cp , false ) ;
}
final public String getTypeID ( String s ) {
return getTypeID ( s , false ) ;
}
public String getTypeID ( String s , boolean recommended ) {
if ( s = = null ) return " <null> " ;
if ( s . length ( ) = = 1 ) return getTypeID ( s . charAt ( 0 ) , recommended ) ;
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < s . length ( ) ; i + = UTF32 . count16 ( cp ) ) {
cp = UTF32 . char32At ( s , i ) ;
if ( i > 0 ) result . append ( " " ) ;
result . append ( getTypeID ( cp , recommended ) ) ;
}
return result . toString ( ) ;
}
static final int DONE = - 1 ;
public int next ( String source , int offset , boolean recommended ) {
for ( int i = offset + 1 ; i < = source . length ( ) ; + + i ) {
if ( isBreak ( source , i , recommended ) ) return i ;
}
return DONE ;
}
public int previous ( String source , int offset , boolean recommended ) {
for ( int i = offset - 1 ; i > = 0 ; - - i ) {
if ( isBreak ( source , i , recommended ) ) return i ;
}
return DONE ;
}
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
return 1 ;
}
public String getTableEntry ( String before , String after , boolean recommended , String [ ] ruleOut ) {
boolean normalBreak = isBreak ( before + after , before . length ( ) , recommended ) ;
String normalRule = rule ;
ruleOut [ 0 ] = rule ;
return normalBreak ? BREAK : NOBREAK ;
}
public byte getResolvedType ( int cp , boolean recommended ) {
return getType ( cp , recommended ) ;
}
boolean skipType ( int type ) {
return false ;
}
static String getInfo ( String s ) {
if ( s = = null | | s . length ( ) = = 0 ) return " NULL " ;
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < s . length ( ) ; i + = UTF32 . count16 ( cp ) ) {
cp = UTF32 . char32At ( s , i ) ;
if ( i > 0 ) result . append ( " , " ) ;
result . append ( Default . ucd . getCodeAndName ( cp ) ) ;
result . append ( " , gc= " + Default . ucd . getCategoryID_fromIndex ( Default . ucd . getCategory ( cp ) , SHORT ) ) ;
result . append ( " , sc= " + Default . ucd . getScriptID_fromIndex ( Default . ucd . getScript ( cp ) , SHORT ) ) ;
result . append ( " , lb= " + Default . ucd . getLineBreakID_fromIndex ( Default . ucd . getLineBreak ( cp ) ) ) ;
}
return result . toString ( ) ;
}
public void generateTable ( PrintWriter out , boolean recommended ) {
2002-08-09 23:56:24 +00:00
String width = " width=' " + ( 100 / ( tableLimit + 2 ) ) + " %' " ;
2002-08-08 15:38:16 +00:00
out . print ( " <table border='1' cellspacing='0' width='100%'> " ) ;
String types = " " ;
String codes = " " ;
for ( int type = 0 ; type < tableLimit ; + + type ) {
String after = samples [ type ] ;
if ( after = = null ) continue ;
String h = getTypeID ( after , recommended ) ;
types + = " <th " + width + " > " + h + " </th> " ;
codes + = " <th " + width + " title=' " + getInfo ( after ) + " '> " + Utility . hex ( after ) + " </th> " ;
}
out . println ( " <tr><th " + width + " ></th><th " + width + " ></th> " + types + " </tr> " ) ;
out . println ( " <tr><th " + width + " ></th><th " + width + " ></th> " + codes + " </tr> " ) ;
String [ ] rule = new String [ 1 ] ;
String [ ] rule2 = new String [ 1 ] ;
2002-08-09 23:56:24 +00:00
for ( int type = 0 ; type < sampleLimit ; + + type ) {
2002-08-08 15:38:16 +00:00
String before = samples [ type ] ;
if ( before = = null ) continue ;
String line = " <tr><th> " + getTypeID ( before , recommended ) + " </th> "
+ " <th title=' " + Default . ucd . getCodeAndName ( before ) + " '> " + Utility . hex ( before ) + " </th> " ;
for ( int type2 = 0 ; type2 < tableLimit ; + + type2 ) {
String after = samples [ type2 ] ;
if ( after = = null ) continue ;
String t = getTableEntry ( before , after , recommended , rule ) ;
String background = " " ;
String t2 = getTableEntry ( before , after , ! recommended , rule2 ) ;
if ( ! t . equals ( t2 ) ) {
if ( t . equals ( NOBREAK ) ) {
background = " bgcolor='#CCFFFF' " ;
} else {
background = " bgcolor='#FFFF00' " ;
}
} else if ( t . equals ( NOBREAK ) ) {
background = " bgcolor='#CCCCFF' " ;
}
line + = " <th title=' " + rule [ 0 ] + " ' " + background + " > " + t + " </th> " ;
}
out . println ( line + " </tr> " ) ;
}
out . println ( " </table> " ) ;
out . println ( " <ol> " ) ;
for ( int ii = 0 ; ii < extraSingleSamples . length ; + + ii ) {
out . println ( " <li><font size='5'> " ) ;
2002-08-09 23:56:24 +00:00
printLine ( out , extraSingleSamples [ ii ] , true , recommended , true ) ;
2002-08-08 15:38:16 +00:00
out . println ( " </font></li> " ) ;
}
out . println ( " </ol> " ) ;
}
static final String BREAK = " \ u00F7 " ;
static final String NOBREAK = " \ u00D7 " ;
public void printLine ( PrintWriter out , String source , boolean comments , boolean recommended , boolean html ) {
int cp ;
StringBuffer string = new StringBuffer ( ) ;
StringBuffer comment = new StringBuffer ( " \ t# " ) ;
boolean hasBreak = isBreak ( source , 0 , recommended ) ;
String status ;
if ( html ) {
status = hasBreak ? " style='border-right: 1px solid blue' " : " " ;
string . append ( " <span title=' " + rule + " '><span " + status + " > </span> <span> " ) ;
} else {
status = hasBreak ? BREAK : NOBREAK ;
string . append ( status ) ;
}
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( rule ) . append ( ']' ) ;
for ( int offset = 0 ; offset < source . length ( ) ; offset + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , offset ) ;
hasBreak = isBreak ( source , offset + UTF16 . getCharCount ( cp ) , recommended ) ;
if ( html ) {
status = hasBreak ? " style='border-right: 1px solid blue' " : " " ;
string . append ( " <span title=' " +
Utility . quoteXML ( Default . ucd . getCodeAndName ( cp ) + " ( " + getTypeID ( cp ) + " ) " , true )
+ " '> "
+ Utility . quoteXML ( Utility . getDisplay ( cp ) , true )
+ " </span> " ) ;
string . append ( " <span title=' " + rule + " '><span " + status + " > </span> <span> " ) ;
} else {
if ( string . length ( ) > 0 ) {
string . append ( ' ' ) ;
comment . append ( ' ' ) ;
}
status = hasBreak ? BREAK : NOBREAK ;
string . append ( Utility . hex ( cp ) ) ;
comment . append ( Default . ucd . getName ( cp ) + " ( " + getTypeID ( cp ) + " ) " ) ;
string . append ( ' ' ) . append ( status ) ;
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( rule ) . append ( ']' ) ;
}
}
if ( comments & & ! html ) string . append ( comment ) ;
out . println ( string ) ;
}
public void findSamples ( ) {
// what we want is a list of sample characters. In the simple case, this is just one per type.
// However, if there are characters that have different types (when recommended or not), then
// we want a type for each cross-section
BitSet bitset = new BitSet ( ) ;
Map list = new TreeMap ( ) ;
for ( int i = 1 ; i < = 0x10FFFF ; + + i ) {
if ( ! Default . ucd . isAllocated ( i ) ) continue ;
if ( 0xD800 < = i & & i < = 0xDFFF ) continue ;
if ( DEBUG & & i = = 0x1100 ) {
System . out . println ( " debug " ) ;
}
byte lb = getType ( i ) ;
byte lb2 = getType ( i , true ) ;
if ( lb = = lb2 & & skipType ( lb ) ) continue ;
int combined = ( lb < < 7 ) + lb2 ;
if ( ! bitset . get ( combined ) ) {
bitset . set ( combined ) ;
list . put ( new Integer ( combined ) , UTF16 . valueOf ( i ) ) ;
}
/ *
// if the sample slot is full OR
if ( samples [ lb ] = = null ) {
samples [ lb ] = UTF16 . valueOf ( i ) ;
if ( sampleLimit < = lb ) sampleLimit = lb + 1 ;
// byte lb2 = getType(i, true);
// if (lb2 != lb) bs.set(lb);
}
* /
}
Iterator it = list . keySet ( ) . iterator ( ) ;
while ( it . hasNext ( ) ) {
String sample = ( String ) list . get ( it . next ( ) ) ;
samples [ sampleLimit + + ] = sample ;
if ( DEBUG ) System . out . println ( getTypeID ( sample ) + " : \ t " + Default . ucd . getCodeAndName ( sample ) ) ;
}
tableLimit = sampleLimit ;
// now add values that are different
/ *
for ( int i = 1 ; i < = 0x10FFFF ; + + i ) {
if ( ! Default . ucd . isAllocated ( i ) ) continue ;
if ( 0xD800 < = i & & i < = 0xDFFF ) continue ;
byte lb = getType ( i ) ;
byte lb2 = getType ( i , true ) ;
if ( lb = = lb2 ) continue ;
// pick some different ones
if ( ! bs . get ( lb ) ) {
samples [ sampleLimit + + ] = UTF16 . valueOf ( i ) ;
bs . set ( lb ) ;
}
if ( ! bs2 . get ( lb2 ) ) {
samples [ sampleLimit + + ] = UTF16 . valueOf ( i ) ;
bs . set ( lb2 ) ;
}
}
* /
2002-08-09 23:56:24 +00:00
if ( extraSamples . length > 0 ) {
System . arraycopy ( extraSamples , 0 , samples , sampleLimit , extraSamples . length ) ;
sampleLimit + = extraSamples . length ;
}
2002-08-08 15:38:16 +00:00
}
public int findLastNon ( String source , int offset , byte notLBType , boolean recommended ) {
int cp ;
for ( int i = offset - 1 ; i > = 0 ; i - = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
byte f = getResolvedType ( cp , recommended ) ;
if ( f ! = notLBType ) return i ;
}
return - 1 ;
}
2003-02-25 23:38:23 +00:00
static public class Context {
public int cpBefore2 , cpBefore , cpAfter , cpAfter2 ;
public byte tBefore2 , tBefore , tAfter , tAfter2 ;
public String toString ( ) {
return " [ "
+ Utility . hex ( cpBefore2 ) + " ( " + tBefore2 + " ), "
+ Utility . hex ( cpBefore ) + " ( " + tBefore + " ), "
+ Utility . hex ( cpAfter ) + " ( " + tAfter + " ), "
+ Utility . hex ( cpAfter2 ) + " ( " + tAfter2 + " )] " ;
}
}
public void getGraphemeBases ( String source , int offset , boolean recommended , byte ignoreType , Context context ) {
context . cpBefore2 = context . cpBefore = context . cpAfter = context . cpAfter2 = - 1 ;
context . tBefore2 = context . tBefore = context . tAfter = context . tAfter2 = - 1 ;
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType);
MyBreakIterator graphemeIterator = new MyBreakIterator ( ) ;
graphemeIterator . set ( source , offset ) ;
while ( true ) {
int cp = graphemeIterator . previousBase ( ) ;
if ( cp = = - 1 ) break ;
byte t = getResolvedType ( cp , recommended ) ;
if ( t = = ignoreType ) continue ;
if ( context . cpBefore = = - 1 ) {
context . cpBefore = cp ;
context . tBefore = t ;
} else {
context . cpBefore2 = cp ;
context . tBefore2 = t ;
break ;
}
}
graphemeIterator . set ( source , offset ) ;
while ( true ) {
int cp = graphemeIterator . nextBase ( ) ;
if ( cp = = - 1 ) break ;
byte t = getResolvedType ( cp , recommended ) ;
if ( t = = ignoreType ) continue ;
if ( context . cpAfter = = - 1 ) {
context . cpAfter = cp ;
context . tAfter = t ;
} else {
context . cpAfter2 = cp ;
context . tAfter2 = t ;
break ;
}
}
}
2002-08-08 15:38:16 +00:00
// ========================================
static class GenerateLineBreakTest extends GenerateBreakTest {
// all the other items are supplied in UCD_TYPES
static byte LB_L = LB_LIMIT + hL , LB_V = LB_LIMIT + hV , LB_T = LB_LIMIT + hT ,
LB_LV = LB_LIMIT + hLV , LB_LVT = LB_LIMIT + hLVT , LB_SUP = LB_LIMIT + hLIMIT ,
LB2_LIMIT = ( byte ) ( LB_SUP + 1 ) ;
/ *
private byte [ ] AsmusOrderToMyOrder = {
LB_OP , LB_CL , LB_QU , LB_GL , LB_NS , LB_EX , LB_SY , LB_IS , LB_PR , LB_PO ,
LB_NU , LB_AL , LB_ID , LB_IN , LB_HY , LB_BA , LB_BB , LB_B2 , LB_ZW , LB_CM ,
// missing from Pair Table
LB_SP , LB_BK , LB_CR , LB_LF ,
// resolved types below
LB_CB , LB_AI , LB_SA , LB_SG , LB_XX ,
// 3 JAMO CLASSES, plus supplementary
LB_L , LB_V , LB_T , LB_LV , LB_LVT , LB_SUP
} ;
private byte [ ] MyOrderToAsmusOrder = new byte [ AsmusOrderToMyOrder . length ] ;
{
for ( byte i = 0 ; i < AsmusOrderToMyOrder . length ; + + i ) {
MyOrderToAsmusOrder [ AsmusOrderToMyOrder [ i ] ] = i ;
}
* /
{
fileName = " Line " ;
2002-08-09 23:56:24 +00:00
extraSingleSamples = new String [ ] { " can't " , " can \ u2019t " , " ab \ u00ADby " , " -3 " } ;
2002-08-08 15:38:16 +00:00
}
2002-08-09 23:56:24 +00:00
public boolean recommendedDiffers ( ) {
return true ;
}
2002-08-08 15:38:16 +00:00
public void sampleDescription ( PrintWriter out ) {
out . println ( " # Samples: " ) ;
out . println ( " # The test currently takes all pairs of linebreak types*, " ) ;
out . println ( " # picks a sample for each type, and generates three strings: " ) ;
out . println ( " # \ t- the pair alone " ) ;
out . println ( " # \ t- the pair alone with an imbeded space " ) ;
out . println ( " # \ t- the pair alone with embedded combining marks " ) ;
out . println ( " # The sample for each type is simply the first code point (above NULL) " ) ;
out . println ( " # with that property. " ) ;
out . println ( " # * Note: " ) ;
out . println ( " # \ t- SG is omitted " ) ;
out . println ( " # \ t- 3 different Jamo characters and a supplementary character are added " ) ;
out . println ( " # \ t The syllable types for the Jamo (L, V, T) are displayed in comments " ) ;
out . println ( " # \ t instead of the linebreak property " ) ;
out . println ( " # " ) ;
}
// stuff that subclasses need to override
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
results [ 1 ] = before + " " + after ;
results [ 2 ] = before + " \ u0301 \ u0308 " + after ;
return 3 ;
}
// stuff that subclasses need to override
boolean skipType ( int type ) {
return type = = LB_AI | | type = = LB_SA | | type = = LB_SG | | type = = LB_XX ;
}
// stuff that subclasses need to override
public String getTypeID ( int cp , boolean recommended ) {
byte result = getType ( cp , recommended ) ;
if ( result = = LB_SUP ) return " SUP " ;
if ( result > = LB_LIMIT ) return hNames [ result - LB_LIMIT ] ;
return Default . ucd . getLineBreakID_fromIndex ( result ) ; // AsmusOrderToMyOrder[result]);
}
// stuff that subclasses need to override
public byte getType ( int cp , boolean recommended ) {
if ( cp > 0xFFFF ) return LB_SUP ;
byte result = getHangulType ( cp ) ;
if ( result ! = hNot ) return ( byte ) ( result + LB_LIMIT ) ;
// return MyOrderToAsmusOrder[Default.ucd.getLineBreak(cp)];
return Default . ucd . getLineBreak ( cp ) ;
}
public String getTableEntry ( String before , String after , boolean recommended , String [ ] ruleOut ) {
String t = " _ " ;
boolean spaceBreak = isBreak ( before + " " + after , before . length ( ) + 1 , recommended ) ;
String spaceRule = rule ;
boolean spaceBreak2 = isBreak ( before + " " + after , before . length ( ) , recommended ) ;
String spaceRule2 = rule ;
boolean normalBreak = isBreak ( before + after , before . length ( ) , recommended ) ;
String normalRule = rule ;
if ( ! normalBreak ) {
if ( ! spaceBreak & & ! spaceBreak2 ) {
t = " ^ " ;
rule = spaceRule . equals ( normalRule ) ? normalRule : spaceRule + " / " + normalRule ;
if ( ! spaceRule2 . equals ( normalRule ) & & ! spaceRule2 . equals ( spaceRule ) ) {
rule + = " / " + spaceRule2 ;
}
} else {
t = " % " ;
rule = normalRule ;
}
}
ruleOut [ 0 ] = rule ;
return t ;
}
public byte getResolvedType ( int cp , boolean recommended ) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
byte result = getType ( cp ) ;
switch ( result ) {
case LB_AI : result = LB_AI ; break ;
// case LB_CB: result = LB_ID; break;
case LB_SA : result = LB_AL ; break ;
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
case LB_XX : result = LB_AL ; break ;
}
if ( recommended ) {
if ( getHangulType ( cp ) ! = hNot ) {
result = LB_ID ;
}
}
return result ;
}
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
public boolean isBreak ( String source , int offset , boolean recommended ) {
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
// this is taken care of in the getResolvedType function
// LB 2a Never break at the start of text
rule = " 2a " ;
if ( offset < = 0 ) return false ;
// LB 2b Always break at the end of text
rule = " 2b " ;
if ( offset > = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
if ( ! onCodepointBoundary ( source , offset ) ) return false ;
// now get the character before and after, and their types
int cpBefore = UTF16 . charAt ( source , offset - 1 ) ;
int cpAfter = UTF16 . charAt ( source , offset ) ;
byte before = getResolvedType ( cpBefore , recommended ) ;
byte after = getResolvedType ( cpAfter , recommended ) ;
rule = " 3a " ;
// Always break after hard line breaks (but never between CR and LF).
// CR ^ LF
if ( before = = LB_CR & & after = = LB_LF ) return false ;
if ( before = = LB_BK | | before = = LB_LF | | before = = LB_CR ) return true ;
//LB 3b Don<6F> t break before hard line breaks.
rule = " 3b " ;
if ( after = = LB_BK | | after = = LB_LF | after = = LB_CR ) return false ;
// LB 4 Don<6F> t break before spaces or zero-width space.
// <20> SP
// <20> ZW
rule = " 4 " ;
if ( after = = LB_SP | | after = = LB_ZW ) return false ;
// LB 5 Break after zero-width space.
// ZW <20>
rule = " 5 " ;
if ( before = = LB_ZW ) return true ;
// LB 6 Don<6F> t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
rule = " 6 " ;
if ( after = = LB_CM ) return false ;
if ( before = = LB_L & & ( after = = LB_L | | after = = LB_V | | after = = LB_LV | | after = = LB_LVT ) ) return false ;
if ( ( before = = LB_LV | | before = = LB_V ) & & ( after = = LB_V | | after = = LB_T ) ) return false ;
if ( ( before = = LB_LVT | | before = = LB_T ) & & ( after = = LB_T ) ) return false ;
boolean setBase = false ;
if ( before = = LB_CM ) {
setBase = true ;
int backOffset = findLastNon ( source , offset , LB_CM , recommended ) ;
if ( backOffset < 0 ) {
before = LB_ID ;
} else {
before = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
}
}
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
// one would break before an ID.
rule = " 7 " ;
if ( setBase & & before = = LB_SP ) before = LB_ID ;
// LB 8 Don<6F> t break before <20> ]<5D> or <20> !<21> or <20> ;<3B> or <20> /<2F> , even after spaces.
// <20> CL, <20> EX, <20> IS, <20> SY
rule = " 8 " ;
if ( after = = LB_CL | | after = = LB_EX | | after = = LB_SY | after = = LB_IS ) return false ;
// find the last non-space character; we will need it
byte lastNonSpace = before ;
if ( lastNonSpace = = LB_SP ) {
int backOffset = findLastNon ( source , offset , LB_CM , recommended ) ;
if ( backOffset > = 0 ) {
lastNonSpace = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
}
}
// LB 9 Don<6F> t break after <20> [<5B> , even after spaces.
// OP SP* <20>
rule = " 9 " ;
if ( lastNonSpace = = LB_OP ) return false ;
// LB 10 Don<6F> t break within <20> <> [<5B> , , even with intervening spaces.
// QU SP* <20> OP
rule = " 10 " ;
if ( lastNonSpace = = LB_QU & & after = = LB_OP ) return false ;
// LB 11 Don<6F> t break within <20> ]h<> , even with intervening spaces.
// CL SP* <20> NS
rule = " 11 " ;
if ( lastNonSpace = = LB_CL & & after = = LB_NS ) return false ;
// LB 11a Don<6F> t break within <20> <> <EFBFBD> <EFBFBD> , even with intervening spaces.
// B2 <20> B2
rule = " 11a " ;
if ( lastNonSpace = = LB_B2 & & after = = LB_B2 ) return false ;
if ( recommended ) {
// LB 13 Don<6F> t break before or after NBSP or WORD JOINER
// <20> GL
// GL <20>
rule = " 11b " ;
if ( after = = LB_GL | | before = = LB_GL ) return false ;
}
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
rule = " 12 " ;
// LB 12 Break after spaces
// SP <20>
if ( before = = LB_SP ) return true ;
if ( ! recommended ) {
// LB 13 Don<6F> t break before or after NBSP or WORD JOINER
// <20> GL
// GL <20>
rule = " 13 " ;
if ( after = = LB_GL | | before = = LB_GL ) return false ;
}
rule = " 14 " ;
// LB 14 Don<6F> t break before or after <20> <> <EFBFBD>
// <20> QU
// QU <20>
if ( before = = LB_QU | | after = = LB_QU ) return false ;
// LB 15 Don<6F> t break before hyphen-minus, other hyphens, fixed-width spaces,
// small kana and other non- starters, or after acute accents:
// <20> BA
// <20> HY
// <20> NS
// BB <20>
if ( recommended ) {
// LB 14a Break before and after CB
// CB <20>
// <20> CB
if ( before = = LB_CB | | after = = LB_CB ) return true ;
}
rule = " 15 " ;
if ( after = = LB_NS ) return false ;
if ( after = = LB_HY ) return false ;
if ( after = = LB_BA ) return false ;
if ( before = = LB_BB ) return false ;
if ( ! recommended ) {
// LB 15b Break after hyphen-minus, and before acute accents:
// HY <20>
// <20> BB
rule = " 15b " ;
if ( before = = LB_HY ) return true ;
if ( after = = LB_BB ) return true ;
}
// LB 16 Don<6F> t break between two ellipses, or between letters or numbers and ellipsis:
// AL <20> IN
// ID <20> IN
// IN <20> IN
// NU <20> IN
// Examples: <20> 9...<2E> , <20> a...<2E> , <20> H...<2E>
rule = " 16 " ;
if ( ( before = = LB_NU | | before = = LB_AL | | before = = LB_ID ) & & after = = LB_IN ) return false ;
if ( before = = LB_IN & & after = = LB_IN ) return false ;
// Don't break alphanumerics.
// LB 17 Don<6F> t break within <20> a9<61> , <20> 3a<33> , or <20> H%<25>
// ID <20> PO
// AL <20> NU
// NU <20> AL
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
// Examples: $(12.35) 2,1234 (12)<29> 12.54<EFBFBD>
// This is approximated with the following rules. (Some cases already handled above,
// like <20> 9,<2C> , <20> [9<> .)
rule = " 17 " ;
if ( before = = LB_ID & & after = = LB_PO ) return false ;
if ( before = = LB_AL & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_AL ) return false ;
// LB 18 Don<6F> t break between the following pairs of classes.
// CL <20> PO
// HY <20> NU
// IS <20> NU
// NU <20> NU
// NU <20> PO
// PR <20> AL
// PR <20> HY
// PR <20> ID
// PR <20> NU
// PR <20> OP
// SY <20> NU
// Example pairs: <20> $9<> , <20> $[<5B> , <20> $-<2D> , <20> -9<> , <20> /9<> , <20> 99<39> , <20> ,9<> , <20> 9%<25> <20> ]%<25>
rule = " 18 " ;
if ( before = = LB_CL & & after = = LB_PO ) return false ;
if ( before = = LB_HY & & after = = LB_NU ) return false ;
if ( before = = LB_IS & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_PO ) return false ;
if ( before = = LB_PR & & after = = LB_AL ) return false ;
if ( before = = LB_PR & & after = = LB_HY ) return false ;
if ( before = = LB_PR & & after = = LB_ID ) return false ;
if ( before = = LB_PR & & after = = LB_NU ) return false ;
if ( before = = LB_PR & & after = = LB_OP ) return false ;
if ( before = = LB_SY & & after = = LB_NU ) return false ;
if ( recommended ) {
// LB 15b Break after hyphen-minus, and before acute accents:
// HY <20>
// <20> BB
rule = " 18b " ;
if ( before = = LB_HY ) return true ;
if ( after = = LB_BB ) return true ;
}
// LB 19 Don<6F> t break between alphabetics (<28> at<61> )
// AL <20> AL
rule = " 19 " ;
if ( before = = LB_AL & & after = = LB_AL ) return false ;
// LB 20 Break everywhere else
// ALL <20>
// <20> ALL
rule = " 20 " ;
return true ;
}
}
//==============================================
static class GenerateGraphemeBreakTest extends GenerateBreakTest {
static final byte CR = 0 , LF = 1 , Control = 2 , Extend = 3 , Link = 4 , CGJ = 5 , Base = 6 , LetterBase = 7 , Other = 8 ,
oLIMIT = 9 , // RESET THIS IF LIST ABOVE CHANGES!
L = oLIMIT + hL , V = oLIMIT + hV , T = oLIMIT + hT , LV = oLIMIT + hLV , LVT = oLIMIT + hLVT ,
LIMIT = LVT + 1 ;
static final String [ ] Names = { " CR " , " LF " , " CTL " , " Extend " , " Link " , " CGJ " , " Base " , " LetterBase " , " Other " } ;
static UnicodeProperty extendProp = UnifiedBinaryProperty . make ( DERIVED | GraphemeExtend ) ;
static UnicodeProperty baseProp = UnifiedBinaryProperty . make ( DERIVED | GraphemeBase ) ;
static UnicodeProperty linkProp = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | GraphemeLink ) ;
static UnicodeSet otherExtendSet = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Other_GraphemeExtend ) . getSet ( )
. addAll ( new UnicodeSet ( " [ \ u09BE \ u09D7 \ u0B3E \ u0B57 \ u0BD7 \ u0BBE "
+ " \ u0CC2 \ u0CD5 \ u0CD6 \ u0D3E \ u0D57 \ u0DCF \ u0DDF \\ U0001D165 \\ U0001D16E \\ U0001D16F] " ) ) ;
{
fileName = " GraphemeCluster " ;
}
// stuff that subclasses need to override
public String getTypeID ( int cp , boolean recommended ) {
byte type = getType ( cp , recommended ) ;
if ( type > = oLIMIT ) return hNames [ type - oLIMIT ] ;
return Names [ type ] ;
}
// stuff that subclasses need to override
public byte getType ( int cp , boolean recommended ) {
2002-08-09 23:56:24 +00:00
recommended = true ; // don't care about old stuff
2002-08-08 15:38:16 +00:00
// single characters
if ( cp = = 0xA ) return LF ;
if ( cp = = 0xD ) return CR ;
if ( recommended ) {
2003-02-25 23:38:23 +00:00
if ( cp = = 0x034F ) return Extend ;
2002-08-08 15:38:16 +00:00
}
if ( cp = = 0x2028 | | cp = = 0x2029 ) return Control ;
// Hangul
byte result = getHangulType ( cp ) ;
if ( result ! = hNot ) return ( byte ) ( result + oLIMIT ) ;
// other properties
// category based
byte cat = Default . ucd . getCategory ( cp ) ;
if ( cat = = Cc ) return Control ;
if ( recommended ) {
if ( cat = = Cf ) return Control ;
if ( cat = = Me | | cat = = Mn ) return Extend ;
2002-08-09 23:56:24 +00:00
// FOR FUTURE! if (otherExtendSet.contains(cp)) return Extend;
2002-08-08 15:38:16 +00:00
return Base ;
}
if ( cat = = Cf ) return Extend ;
if ( ( ( 1 < < cat ) & LETTER_MASK ) ! = 0 ) return LetterBase ;
// other binary properties
if ( linkProp . hasValue ( cp ) ) return Link ;
if ( extendProp . hasValue ( cp ) ) return Extend ;
if ( baseProp . hasValue ( cp ) ) return Base ;
return Other ;
}
public boolean isBreak ( String source , int offset , boolean recommended ) {
2002-08-09 23:56:24 +00:00
recommended = true ; // don't care about old stuff
2002-08-08 15:38:16 +00:00
rule = " 1 " ;
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
rule = " 2 " ;
if ( offset = = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
if ( ! onCodepointBoundary ( source , offset ) ) return false ;
// now get the character before and after, and their types
int cpBefore = UTF16 . charAt ( source , offset - 1 ) ;
int cpAfter = UTF16 . charAt ( source , offset ) ;
byte before = getResolvedType ( cpBefore , recommended ) ;
byte after = getResolvedType ( cpAfter , recommended ) ;
rule = " 3 " ;
if ( before = = CR & & after = = LF ) return false ;
rule = " 4 " ;
if ( before = = CR | | before = = LF | | before = = Control
| | after = = Control | | after = = LF | | after = = CR ) return true ;
rule = " 6 " ;
if ( before = = L & & ( after = = L | | after = = V | | after = = LV | | after = = LVT ) ) return false ;
rule = " 7 " ;
if ( ( before = = LV | | before = = V ) & & ( after = = V | | after = = T ) ) return false ;
rule = " 8 " ;
if ( ( before = = LVT | | before = = T ) & & ( after = = T ) ) return false ;
rule = " 9 " ;
if ( after = = Extend ) return false ;
if ( recommended ) {
if ( after = = Link | | after = = CGJ ) return false ;
} else {
// Do not break around a CGJ.
rule = " 10 " ;
if ( before = = CGJ & & ( after = = Base
| | after = = LetterBase | | after = = L | | after = = V | | after = = T | | after = = LV | | after = = LVT ) ) return false ;
rule = " 11 " ;
if ( after = = CGJ ) return false ;
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
rule = " 12 " ;
//Link Extend* <20> LetterBase (12)
if ( after = = LetterBase | | after = = L | | after = = V | | after = = T | | after = = LV | | after = = LVT ) {
int backOffset = findLastNon ( source , offset , Extend , recommended ) ;
if ( backOffset > = 0 ) {
byte last = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
if ( last = = Link ) return false ;
}
}
rule = " 13 " ;
if ( after = = Link ) return false ;
}
// Otherwise break after all characters.
rule = " 14 " ;
return true ;
}
}
//==============================================
static class GenerateWordBreakTest extends GenerateBreakTest {
2002-08-09 23:56:24 +00:00
static final byte Format = 0 , Katakana = 1 , ALetter = 2 , MidLetter = 3 , Hyphen = 4 ,
2002-08-08 15:38:16 +00:00
Numeric = 5 , Infix_Numeric = 6 , Prefix_Numeric = 7 , Postfix_Numeric = 8 ,
2002-08-09 23:56:24 +00:00
Prefix = 9 , Postfix = 10 , MidNumLet = 11 , Hiragana = 12 , Other = 13 ,
2002-08-08 15:38:16 +00:00
LIMIT = Other + 1 ;
2002-08-09 23:56:24 +00:00
static final String [ ] Names = { " Format " , " Katakana " , " ALetter " , " MidLetter " , " Hyphen " ,
" Numeric " , " INum " , " PrNum " , " PoNum " , " PreLet " , " PostLet " , " MidNumLet " , " Hiragana " , " Other " } ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest ( ) ;
static Context context = new Context ( ) ;
2002-08-08 15:38:16 +00:00
static String LENGTH = " [ \ u30FC \ uFF70] " ;
static String HALFWIDTH_KATAKANA = " [ \ uFF65- \ uFF9F] " ;
static String KATAKANA_ITERATION = " [ \ u30FD \ u30FE] " ;
static String HIRAGANA_ITERATION = " [ \ u309D \ u309E] " ;
static UnicodeSet extraKatakana = new UnicodeSet ( " [ " + LENGTH + HALFWIDTH_KATAKANA + KATAKANA_ITERATION + " ] " ) ;
static UnicodeProperty baseProp = UnifiedBinaryProperty . make ( DERIVED | GraphemeBase ) ;
static UnicodeProperty linkProp = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | GraphemeLink ) ;
static UnicodeSet prefixSet = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Logical_Order_Exception ) . getSet ( ) ;
static UnicodeSet postfixSet ;
static UnicodeSet exceptionLetters ;
static UnicodeSet normalLetters ;
static UnicodeSet thaiLaoNormal ;
static UnicodeSet marks = UnifiedBinaryProperty . make ( CATEGORY | Mn ) . getSet ( )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Me ) . getSet ( ) ) ;
static UnicodeSet oughtToBeLm = new UnicodeSet ( " [ \ u02B9- \ u02BA \ u02C2- \ u02CF \ u02D2- \ u02DF \ u02E5- \ u02ED] " ) ;
/ *
U + 02B9 . . U + 02BA # MODIFIER LETTER PRIME . . MODIFIER LETTER DOUBLE PRIME
U + 02C2 . . U + 02CF # MODIFIER LETTER LEFT ARROWHEAD . . MODIFIER LETTER LOW ACUTE ACCENT
U + 02D2 . . U + 02DF # MODIFIER LETTER CENTRED RIGHT HALF RING . . MODIFIER LETTER CROSS ACCE
U + 02E5 . . U + 02ED # MODIFIER LETTER EXTRA - HIGH TONE BAR . . MODIFIER LETTER UNASPIRATED
* /
static UnicodeSet letterSet = UnifiedBinaryProperty . make ( CATEGORY | Lo ) . getSet ( )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Lu ) . getSet ( ) )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Lt ) . getSet ( ) )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Ll ) . getSet ( ) )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Lm ) . getSet ( ) )
. addAll ( UnifiedBinaryProperty . make ( CATEGORY | Mc ) . getSet ( ) )
. addAll ( oughtToBeLm ) ;
{
UnicodeSet linebreakNS = UnifiedBinaryProperty . make ( LINE_BREAK | LB_NS ) . getSet ( ) ;
postfixSet = new UnicodeSet ( linebreakNS )
. retainAll ( letterSet )
. addAll ( new UnicodeSet ( " [ \ u0e30- \ u0E3A \ u0e45- \ u0e4e] " ) )
. addAll ( new UnicodeSet ( " [[ \ u0eb0- \ u0EBd \ u0ec6- \ u0ece]-[:Cn:]] " ) )
. removeAll ( new UnicodeSet ( " [:mn:] " ) )
. removeAll ( new UnicodeSet ( " [:katakana:] " ) . addAll ( extraKatakana ) ) ;
thaiLaoNormal = new UnicodeSet ( " [[:thai:][:lao:]] " ) . removeAll ( prefixSet ) . removeAll ( postfixSet ) ;
// we want ideographics, hiragana, thai (except prefix/suffix)
2002-08-09 23:56:24 +00:00
//UnicodeSet compatIdeographics = new UnicodeSet("[\uf900-\ufa6a\\U0002F800-\\U0002FA1D]");
2002-08-08 15:38:16 +00:00
UnicodeSet hiragana = UnifiedBinaryProperty . make ( SCRIPT | HIRAGANA_SCRIPT ) . getSet ( ) ;
UnicodeSet smallHiragana = new UnicodeSet ( hiragana ) . retainAll ( linebreakNS ) ;
UnicodeSet missingKatakana = new UnicodeSet ( extraKatakana ) . removeAll ( new UnicodeSet ( " [:katakana:] " ) ) ;
if ( DEBUG ) {
System . out . println ( " missingKatakana: " + missingKatakana . toPattern ( true ) ) ;
Utility . showSetNames ( " " , missingKatakana , false , Default . ucd ) ;
System . out . println ( " oughtToBeLm: " + oughtToBeLm . toPattern ( true ) ) ;
Utility . showSetNames ( " " , oughtToBeLm , false , Default . ucd ) ;
System . out . println ( " Prefix: " + prefixSet . toPattern ( true ) ) ;
Utility . showSetNames ( " " , prefixSet , false , Default . ucd ) ;
System . out . println ( " Postfix: " + postfixSet . toPattern ( true ) ) ;
Utility . showSetNames ( " " , postfixSet , false , Default . ucd ) ;
System . out . println ( " exceptionLetters: " + exceptionLetters . toPattern ( true ) ) ;
System . out . println ( " hiragana: " + hiragana . toPattern ( true ) ) ;
System . out . println ( " smallHiragana: " + hiragana . toPattern ( true ) ) ;
Utility . showSetNames ( " " , smallHiragana , true , Default . ucd ) ;
System . out . println ( " midLetterSet: " + midLetterSet . toPattern ( true ) ) ;
Utility . showSetNames ( " " , midLetterSet , true , Default . ucd ) ;
}
fileName = " Word " ;
extraSamples = new String [ ] {
2002-08-09 23:56:24 +00:00
" \ uFF70 " , " \ uFF65 " , " \ u30FD " , " a \ u2060 " , " a: " , " a' " , " a' \ u2060 " , " a, " , " 1: " , " 1' " , " 1, " , " 1. \ u2060 "
2002-08-08 15:38:16 +00:00
} ;
if ( DEBUG ) {
System . out . println ( " length not covered: "
+ new UnicodeSet ( LENGTH ) . removeAll ( new UnicodeSet ( " [:katakana:] " ) ) . toPattern ( true ) ) ;
System . out . println ( " half-width not covered: "
+ new UnicodeSet ( HALFWIDTH_KATAKANA ) . removeAll ( new UnicodeSet ( " [:katakana:] " ) ) . toPattern ( true ) ) ;
System . out . println ( " k.iteration not covered: "
+ new UnicodeSet ( KATAKANA_ITERATION ) . removeAll ( new UnicodeSet ( " [:katakana:] " ) ) . toPattern ( true ) ) ;
System . out . println ( " h.iteration not covered: "
+ new UnicodeSet ( HIRAGANA_ITERATION ) . removeAll ( new UnicodeSet ( " [:hiragana:] " ) ) . toPattern ( true ) ) ;
System . out . println ( " L1: " + getTypeID ( '\u30FC' ) ) ;
System . out . println ( " L2: " + getTypeID ( '\uFF70' ) ) ;
}
2002-08-09 23:56:24 +00:00
String [ ] temp = { " can't " , " can \ u2019t " , " ab \ u00ADby " , " a$-34,567.14%b " , " 3a " } ;
extraSingleSamples = new String [ temp . length * 2 ] ;
System . arraycopy ( temp , 0 , extraSingleSamples , 0 , temp . length ) ;
for ( int i = 0 ; i < temp . length ; + + i ) {
extraSingleSamples [ i + temp . length ] = insertEverywhere ( temp [ i ] , " \ u2060 " , grapheme ) ;
}
2002-08-08 15:38:16 +00:00
}
// stuff that subclasses need to override
public String getTypeID ( int cp , boolean recommended ) {
byte type = getType ( cp , recommended ) ;
return Names [ type ] ;
}
// stuff that subclasses need to override
public byte getType ( int cp , boolean recommended ) {
byte cat = Default . ucd . getCategory ( cp ) ;
2002-08-09 23:56:24 +00:00
if ( cat = = Cf ) return Format ;
2002-08-08 15:38:16 +00:00
byte script = Default . ucd . getScript ( cp ) ;
if ( script = = KATAKANA_SCRIPT ) return Katakana ;
2002-08-09 23:56:24 +00:00
if ( extraKatakana . contains ( cp ) ) return Katakana ;
if ( script = = HIRAGANA_SCRIPT | | script = = THAI_SCRIPT | | script = = LAO_SCRIPT ) return Other ;
if ( ideographicSet . contains ( cp ) ) return Other ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
if ( alphabeticSet . contains ( cp ) ) return ALetter ;
2002-08-08 15:38:16 +00:00
byte lb = Default . ucd . getLineBreak ( cp ) ;
if ( lb = = LB_NU ) return Numeric ;
2002-08-09 23:56:24 +00:00
if ( midLetterSet . contains ( cp ) ) {
if ( lb = = LB_IS ) return MidNumLet ;
return MidLetter ;
}
2002-08-08 15:38:16 +00:00
if ( lb = = LB_IS ) return Infix_Numeric ;
2002-08-09 23:56:24 +00:00
2002-08-08 15:38:16 +00:00
return Other ;
}
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
results [ 1 ] = 'a' + before + " \ u0301 \ u0308 " + after + " \ u0301 \ u0308 " + 'a' ;
results [ 2 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidLetter ] + after + " \ u0301 \ u0308 " + 'a' ;
results [ 3 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ Infix_Numeric ] + after + " \ u0301 \ u0308 " + 'a' ;
return 3 ;
}
public boolean isBreak ( String source , int offset , boolean recommended ) {
2002-08-09 23:56:24 +00:00
recommended = true ; // don't care about old stuff
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
rule = " 1 " ;
2002-08-08 15:38:16 +00:00
if ( offset < 0 | | offset > source . length ( ) ) return false ;
2002-08-09 23:56:24 +00:00
2002-08-08 15:38:16 +00:00
if ( offset = = 0 ) return true ;
2002-08-09 23:56:24 +00:00
rule = " 2 " ;
2002-08-08 15:38:16 +00:00
if ( offset = = source . length ( ) ) return true ;
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
// GC => FB
2002-08-09 23:56:24 +00:00
rule = " 3 " ;
2002-08-08 15:38:16 +00:00
if ( ! grapheme . isBreak ( source , offset , recommended ) ) return false ;
// now get the base character before and after, and their types
2003-02-25 23:38:23 +00:00
getGraphemeBases ( source , offset , recommended , Format , context ) ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
byte before = context . tBefore ;
byte after = context . tAfter ;
byte before2 = context . tBefore2 ;
byte after2 = context . tAfter2 ;
2002-08-08 15:38:16 +00:00
//Don't break between most letters
2002-08-09 23:56:24 +00:00
// ALetter <20> ALetter
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
rule = " 5 " ;
if ( before = = ALetter & & after = = ALetter ) return false ;
2002-08-08 15:38:16 +00:00
// Don<6F> t break letters across certain punctuation
2002-08-09 23:56:24 +00:00
// ALetter<65> (MidLetter | MidNumLet) ALetter(6)
// ALetter (MidLetter | MidNumLet)<29> ALetter(7)
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
rule = " 6 " ;
if ( before = = ALetter & & ( after = = MidLetter | | after = = MidNumLet ) & & after2 = = ALetter ) return false ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
rule = " 7 " ;
if ( before2 = = ALetter & & ( before = = MidLetter | | before = = MidNumLet ) & & after = = ALetter ) return false ;
2002-08-08 15:38:16 +00:00
// Don<6F> t break within sequences of digits, or digits adjacent to letters.
// Numeric <20> Numeric (5)
2002-08-09 23:56:24 +00:00
rule = " 8 " ;
2002-08-08 15:38:16 +00:00
if ( before = = Numeric & & after = = Numeric ) return false ;
2002-08-09 23:56:24 +00:00
// ALetter <20> Numeric (6)
rule = " 9 " ;
if ( before = = ALetter & & after = = Numeric ) return false ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// Numeric <20> ALetter (7)
rule = " 10 " ;
if ( before = = Numeric & & after = = ALetter ) return false ;
2002-08-08 15:38:16 +00:00
// Don<6F> t break within sequences like: '-3.2'
2002-08-09 23:56:24 +00:00
// Numeric (MidNum | MidNumLet)<29> Numeric(11)
rule = " 11 " ;
if ( before2 = = Numeric & & ( before = = Infix_Numeric | | before = = MidNumLet ) & & after = = Numeric ) return false ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// Numeric<69> (MidNum | MidNumLet) Numeric(12)
rule = " 12 " ;
if ( before = = Numeric & & ( after = = Infix_Numeric | | after = = MidNumLet ) & & after2 = = Numeric ) return false ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// Don't break between Hiragana
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// Hiragana <20> Hiragana (13)
rule = " 13 " ;
if ( before = = Hiragana & & after = = Hiragana ) return false ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// Otherwise break always.
rule = " 14 " ;
return true ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
//==============================================
static class GenerateSentenceBreakTest extends GenerateBreakTest {
static final byte Format = 0 , Sep = 1 , Sp = 2 , OLetter = 3 , Lower = 4 , Upper = 5 ,
2003-02-25 23:38:23 +00:00
Numeric = 6 , Close = 7 , ATerm = 8 , Term = 9 , Other = 10 ,
2002-08-09 23:56:24 +00:00
LIMIT = Other + 1 ;
2003-02-25 23:38:23 +00:00
static final String [ ] Names = { " Format " , " Sep " , " Sp " , " OLetter " , " Lower " , " Upper " , " Numeric " ,
2002-08-09 23:56:24 +00:00
" Close " , " ATerm " , " Term " , " Other " } ;
static GenerateGraphemeBreakTest grapheme = new GenerateGraphemeBreakTest ( ) ;
static UnicodeSet sepSet = new UnicodeSet ( " [ \\ u000a \\ u000d \\ u0085 \\ u2029 \\ u2028] " ) ;
static UnicodeSet atermSet = new UnicodeSet ( " [ \\ u002E] " ) ;
2003-02-25 23:38:23 +00:00
static UnicodeSet termSet = new UnicodeSet (
" [ \\ u0021 \\ u003F \\ u0589 \\ u061f \\ u06d4 \\ u0700- \\ u0702 \\ u0934 "
+ " \\ u1362 \\ u1367 \\ u1368 \\ u104A \\ u104B \\ u166E "
+ " \\ u1803 \\ u1809 \\ u203c \\ u203d "
+ " \\ u2048 \\ u2049 \\ u3002 \\ ufe52 \\ ufe57 \\ uff01 \\ uff0e \\ uff1f \\ uff61] " ) ;
2002-08-09 23:56:24 +00:00
static UnicodeProperty lowercaseProp = UnifiedBinaryProperty . make ( DERIVED | PropLowercase ) ;
static UnicodeProperty uppercaseProp = UnifiedBinaryProperty . make ( DERIVED | PropUppercase ) ;
2003-02-25 23:38:23 +00:00
UnicodeSet linebreakNS = UnifiedBinaryProperty . make ( LINE_BREAK | LB_NU ) . getSet ( ) ;
2002-08-09 23:56:24 +00:00
{
fileName = " Sentence " ;
extraSamples = new String [ ] {
} ;
2003-02-25 23:38:23 +00:00
extraSingleSamples = new String [ ] {
2002-08-09 23:56:24 +00:00
" ( \" Go. \" ) (He did.) " ,
2003-02-25 23:38:23 +00:00
" ( \ u201CGo? \ u201D) (He did.) " ,
2002-08-09 23:56:24 +00:00
" U.S.A \ u0300. is " ,
" U.S.A \ u0300? He " ,
" U.S.A \ u0300. " ,
2003-02-25 23:38:23 +00:00
" 3.4 " ,
" c.d " ,
" etc.) \ u2019<EFBFBD> \ u2018(the " ,
" etc.) \ u2019<EFBFBD> \ u2018(The " ,
" the resp. leaders are " ,
" \ u5B57. \ u5B57 " ,
" etc. \ u5B83 " ,
" etc. \ u3002 " ,
" \ u5B57 \ u3002 \ u5B83 " ,
2002-08-09 23:56:24 +00:00
} ;
2003-02-25 23:38:23 +00:00
String [ ] temp = new String [ extraSingleSamples . length * 2 ] ;
System . arraycopy ( extraSingleSamples , 0 , temp , 0 , extraSingleSamples . length ) ;
for ( int i = 0 ; i < extraSingleSamples . length ; + + i ) {
temp [ i + extraSingleSamples . length ] = insertEverywhere ( extraSingleSamples [ i ] , " \ u2060 " , grapheme ) ;
2002-08-08 15:38:16 +00:00
}
2003-02-25 23:38:23 +00:00
extraSingleSamples = temp ;
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
}
// stuff that subclasses need to override
public String getTypeID ( int cp , boolean recommended ) {
byte type = getType ( cp , recommended ) ;
return Names [ type ] ;
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// stuff that subclasses need to override
public byte getType ( int cp , boolean recommended ) {
byte cat = Default . ucd . getCategory ( cp ) ;
if ( cat = = Cf ) return Format ;
if ( sepSet . contains ( cp ) ) return Sep ;
if ( Default . ucd . getBinaryProperty ( cp , White_space ) ) return Sp ;
2003-02-25 23:38:23 +00:00
if ( linebreakNS . contains ( cp ) ) return Numeric ;
2002-08-09 23:56:24 +00:00
if ( lowercaseProp . hasValue ( cp ) ) return Lower ;
if ( uppercaseProp . hasValue ( cp ) | | cat = = Lt ) return Upper ;
2003-02-25 23:38:23 +00:00
if ( alphabeticSet . contains ( cp ) ) return OLetter ;
2002-08-09 23:56:24 +00:00
if ( atermSet . contains ( cp ) ) return ATerm ;
if ( termSet . contains ( cp ) ) return Term ;
if ( cat = = Po | | cat = = Pe
| | Default . ucd . getLineBreak ( cp ) = = LB_QU ) return Close ;
return Other ;
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
/ *
results [ 1 ] = 'a' + before + " \ u0301 \ u0308 " + after + " \ u0301 \ u0308 " + 'a' ;
results [ 2 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidLetter ] + after + " \ u0301 \ u0308 " + 'a' ;
results [ 3 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ Infix_Numeric ] + after + " \ u0301 \ u0308 " + 'a' ;
* /
return 1 ;
2002-08-08 15:38:16 +00:00
}
2003-02-25 23:38:23 +00:00
static Context context = new Context ( ) ;
2002-08-09 23:56:24 +00:00
public boolean isBreak ( String source , int offset , boolean recommended ) {
rule = " 1 " ;
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
rule = " 2 " ;
if ( offset = = source . length ( ) ) return true ;
// Sep <20> (3)
rule = " 3 " ;
2003-02-25 23:38:23 +00:00
byte beforeChar = getResolvedType ( source . charAt ( offset - 1 ) , recommended ) ;
if ( beforeChar = = Sep ) return true ;
2002-08-09 23:56:24 +00:00
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
// GC => FB
// Ignore interior Format characters. That is, ignore Format characters in all subsequent rules.
// X Format*
// ?
// X
// (5)
rule = " 3 " ;
if ( ! grapheme . isBreak ( source , offset , recommended ) ) return false ;
2003-02-25 23:38:23 +00:00
getGraphemeBases ( source , offset , recommended , Format , context ) ;
byte before = context . tBefore ;
byte after = context . tAfter ;
byte before2 = context . tBefore2 ;
byte after2 = context . tAfter2 ;
2002-08-09 23:56:24 +00:00
2003-02-25 23:38:23 +00:00
// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
// ATerm <20> (Lower | Numeric) (6)
// Upper ATerm <20> Upper (7)
2002-08-09 23:56:24 +00:00
2003-02-25 23:38:23 +00:00
if ( before = = ATerm ) {
rule = " 6 " ;
if ( after = = Lower | | after = = Numeric ) return false ;
rule = " 7 " ;
if ( DEBUG_GRAPHEMES ) System . out . println ( context + " , " + Upper ) ;
if ( before2 = = Upper & & after = = Upper ) return false ;
}
// The following cases are all handled together.
2002-08-09 23:56:24 +00:00
// First we loop backwards, checking for the different types.
MyBreakIterator graphemeIterator = new MyBreakIterator ( ) ;
graphemeIterator . set ( source , offset ) ;
int state = 0 ;
byte lookAfter = - 1 ;
int cp ;
byte t ;
boolean gotSpace = false ;
boolean gotClose = false ;
behindLoop :
while ( true ) {
cp = graphemeIterator . previousBase ( ) ;
if ( cp = = - 1 ) break ;
t = getResolvedType ( cp , recommended ) ;
if ( SHOW_TYPE ) System . out . println ( Default . ucd . getCodeAndName ( cp ) + " , " + getTypeID ( cp , recommended ) ) ;
if ( t = = Format ) continue ; // ignore all formats!
switch ( state ) {
case 0 :
if ( t = = Sp ) {
// loop as long as we have Space
gotSpace = true ;
continue behindLoop ;
} else if ( t = = Close ) {
gotClose = true ;
state = 1 ; // go to close loop
continue behindLoop ;
}
break ;
case 1 :
if ( t = = Close ) {
// loop as long as we have Close
continue behindLoop ;
}
break ;
}
if ( t = = ATerm ) {
lookAfter = ATerm ;
} else if ( t = = Term ) {
lookAfter = Term ;
}
break ;
}
// if we didn't find ATerm or Term, bail
if ( lookAfter = = - 1 ) {
// Otherwise, do not break
// Any <20> Any (11)
2003-02-25 23:38:23 +00:00
rule = " 12 " ;
2002-08-09 23:56:24 +00:00
return false ;
}
2003-02-25 23:38:23 +00:00
// ATerm Close* Sp*<2A> (<28> ( OLetter))* Lower(8)
2002-08-09 23:56:24 +00:00
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
2003-02-25 23:38:23 +00:00
// ( Term | ATerm ) Close*<2A> ( Close | Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp<53> ( Sp | Sep )(10)
// ( Term | ATerm ) Close* Sp*<2A> (11)
2002-08-09 23:56:24 +00:00
// We DID find one. Loop to see if the right side is ok.
graphemeIterator . set ( source , offset ) ;
boolean isFirst = true ;
while ( true ) {
cp = graphemeIterator . nextBase ( ) ;
if ( cp = = - 1 ) break ;
t = getResolvedType ( cp , recommended ) ;
if ( SHOW_TYPE ) System . out . println ( Default . ucd . getCodeAndName ( cp ) + " , " + getTypeID ( cp , recommended ) ) ;
if ( t = = Format ) continue ; // skip format characters!
if ( isFirst ) {
isFirst = false ;
if ( lookAfter = = ATerm & & t = = Upper ) {
2003-02-25 23:38:23 +00:00
rule = " 8 " ;
2002-08-09 23:56:24 +00:00
return false ;
}
if ( gotSpace ) {
if ( t = = Sp | | t = = Sep ) {
2003-02-25 23:38:23 +00:00
rule = " 10 " ;
2002-08-09 23:56:24 +00:00
return false ;
}
} else if ( t = = Close | | t = = Sp | | t = = Sep ) {
2003-02-25 23:38:23 +00:00
rule = " 9 " ;
2002-08-09 23:56:24 +00:00
return false ;
}
if ( lookAfter = = Term ) break ;
}
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
if ( t ! = OLetter & & t ! = Upper & & t ! = Lower ) continue ;
if ( t = = Lower ) {
2003-02-25 23:38:23 +00:00
rule = " 8 " ;
2002-08-09 23:56:24 +00:00
return false ;
}
break ;
}
2003-02-25 23:38:23 +00:00
rule = " 11 " ;
2002-08-09 23:56:24 +00:00
return true ;
}
}
2003-02-25 23:38:23 +00:00
static final boolean DEBUG_GRAPHEMES = false ;
2002-08-09 23:56:24 +00:00
static class MyBreakIterator {
int offset = 0 ;
String string = " " ;
GenerateBreakTest breaker = new GenerateGraphemeBreakTest ( ) ;
boolean recommended = true ;
public MyBreakIterator set ( String source , int offset ) {
2003-02-25 23:38:23 +00:00
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset);
2002-08-09 23:56:24 +00:00
string = source ;
this . offset = offset ;
return this ;
}
public int nextBase ( ) {
if ( offset > = string . length ( ) ) return - 1 ;
int result = UTF16 . charAt ( string , offset ) ;
for ( + + offset ; offset < string . length ( ) ; + + offset ) {
if ( breaker . isBreak ( string , offset , recommended ) ) break ;
}
2003-02-25 23:38:23 +00:00
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
2002-08-09 23:56:24 +00:00
return result ;
}
public int previousBase ( ) {
if ( offset < = 0 ) return - 1 ;
for ( - - offset ; offset > = 0 ; - - offset ) {
if ( breaker . isBreak ( string , offset , recommended ) ) break ;
}
2003-02-25 23:38:23 +00:00
int result = UTF16 . charAt ( string , offset ) ;
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
return result ;
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
}
}