2002-07-30 09:57:18 +00:00
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
package com.ibm.text.UCD ;
import java.util.* ;
import java.io.* ;
import com.ibm.text.utility.* ;
import com.ibm.icu.text.UTF16 ;
import com.ibm.icu.text.UnicodeSet ;
public class GenerateLineBreakTest implements UCD_Types {
2002-08-04 21:38:45 +00:00
// COMMON STUFF for Hangul
// Hangul syllable-type codes returned by getHangulType(): hNot means "not Hangul",
// hL..hLVT are the real types, and hLIMIT is one past the last real type.
static final byte hNot = - 1 , hL = 0 , hV = 1 , hT = 2 , hLV = 3 , hLVT = 4 , hLIMIT = 5 ;
// Display names for the real types above, indexed by hL..hLVT.
static final String [ ] hNames = { " L " , " V " , " T " , " LV " , " LVT " } ;
static byte getHangulType ( int cp ) {
2004-02-07 01:01:17 +00:00
if ( Default . ucd ( ) . isLeadingJamo ( cp ) ) return hL ;
if ( Default . ucd ( ) . isVowelJamo ( cp ) ) return hV ;
if ( Default . ucd ( ) . isTrailingJamo ( cp ) ) return hT ;
if ( Default . ucd ( ) . isHangulSyllable ( cp ) ) {
if ( Default . ucd ( ) . isDoubleHangul ( cp ) ) return hLV ;
2002-08-04 21:38:45 +00:00
return hLVT ;
}
return hNot ;
}
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
//============================
// Label of the rule that decided the most recent isBreak() call (isBreak sets it as a side effect).
protected String rule ;
// Prefix of the generated output file names (see run()); the nested subclass replaces it.
protected String fileName = " Line " ;
// all the other items are supplied in UCD_TYPES
// Extra pseudo types appended after LB_LIMIT: one per Hangul syllable type, plus LB_SUP,
// which getType() returns for every code point above U+FFFF. LB2_LIMIT is one past LB_SUP.
static byte LB_L = LB_LIMIT + hL , LB_V = LB_LIMIT + hV , LB_T = LB_LIMIT + hT ,
LB_LV = LB_LIMIT + hLV , LB_LVT = LB_LIMIT + hLVT , LB_SUP = LB_LIMIT + hLIMIT ,
LB2_LIMIT = ( byte ) ( LB_SUP + 1 ) ;
// First sample string found for each type, indexed by the type value; filled in by findSamples().
String [ ] samples = new String [ 100 ] ;
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
// Order in which the types are visited when the pair table and the test files are generated.
// run() reads the first getLimit() entries and generateTable() the first getTableLimit() entries;
// the nested subclass replaces this array with its own.
byte [ ] TypeOrder = {
2002-07-30 09:57:18 +00:00
LB_OP , LB_CL , LB_QU , LB_GL , LB_NS , LB_EX , LB_SY , LB_IS , LB_PR , LB_PO ,
LB_NU , LB_AL , LB_ID , LB_IN , LB_HY , LB_BA , LB_BB , LB_B2 , LB_ZW , LB_CM ,
// missing from Pair Table
LB_SP , LB_BK , LB_CR , LB_LF ,
// resolved types below
LB_CB , LB_AI , LB_SA , LB_SG , LB_XX ,
2002-08-04 21:38:45 +00:00
// 3 JAMO CLASSES, plus supplementary
LB_L , LB_V , LB_T , LB_LV , LB_LVT , LB_SUP
2002-07-30 09:57:18 +00:00
} ;
public static void main ( String [ ] args ) throws IOException {
2004-02-07 01:01:17 +00:00
2002-08-04 21:38:45 +00:00
new GenerateLineBreakTest ( ) . run ( ) ;
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
new GenerateWordBreakTest ( ) . run ( ) ;
}
// stuff that subclasses need to override
// Generates all output for this break type:
//   1. collects one sample string per type (findSamples),
//   2. writes an HTML file containing the pair table twice (recommended = false, then true),
//   3. writes two text test files that cover every ordered pair of types taken from TypeOrder.
// Declared to throw IOException (presumably raised by Utility.openPrintWriter — confirm).
// NOTE(review): "out" is closed only on the normal path; an exception while writing leaves it open.
public void run ( ) throws IOException {
2002-07-30 09:57:18 +00:00
findSamples ( ) ;
// test individual cases
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
2002-08-04 21:38:45 +00:00
// --- HTML pair tables ---
PrintWriter out = Utility . openPrintWriter ( fileName + " BreakTest.html " , Utility . UTF8_WINDOWS ) ;
out . println ( " <html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title> "
+ fileName + " </title></head> " ) ;
out . println ( " <body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3> " ) ;
2002-07-30 09:57:18 +00:00
generateTable ( out , false ) ;
2002-08-04 21:38:45 +00:00
out . println ( " <h3>Recommended:</h3> " ) ;
2002-07-30 09:57:18 +00:00
generateTable ( out , true ) ;
out . println ( " </body></html> " ) ;
out . close ( ) ;
2002-08-04 21:38:45 +00:00
// scratch buffer that genTestItems() fills for each pair; reused across all pairs
String [ ] testCase = new String [ 50 ] ;
2002-07-30 09:57:18 +00:00
// do main test
// two passes: k == 0 writes the "SHORT" file, k == 1 the full one (see the printLine call below)
for ( int k = 0 ; k < 2 ; + + k ) {
2002-08-04 21:38:45 +00:00
out = Utility . openPrintWriter ( fileName + ( k = = 0 ? " Test_SHORT.txt " : " Test.txt " ) , Utility . LATIN1_WINDOWS ) ;
2002-07-30 09:57:18 +00:00
// number of test lines written to this file; reported in the trailer line
int counter = 0 ;
2002-08-04 21:38:45 +00:00
// file header describing the format
out . println ( " # Default " + fileName + " Break Test " ) ;
out . println ( " # Generated: " + Default . getDate ( ) + " , MED " ) ;
out . println ( " # " ) ;
out . println ( " # Format: " ) ;
out . println ( " # <string> (# <comment>)? " ) ;
out . println ( " # <string> contains hex Unicode code points, with " ) ;
out . println ( " # \ t " + BREAK + " wherever there is a break opportunity, and " ) ;
out . println ( " # \ t " + NOBREAK + " wherever there is not. " ) ;
out . println ( " # <comment> the format can change, but currently it shows: " ) ;
out . println ( " # \ t- the sample character name " ) ;
out . println ( " # \ t- (x) the line_break property* for the sample character " ) ;
out . println ( " # \ t- [x] the rule that determines whether there is a break or not " ) ;
out . println ( " # " ) ;
out . println ( " # Samples: " ) ;
out . println ( " # The test currently takes all pairs of linebreak types*, " ) ;
out . println ( " # picks a sample for each type, and generates three strings: " ) ;
out . println ( " # \ t- the pair alone " ) ;
out . println ( " # \ t- the pair alone with an imbeded space " ) ;
out . println ( " # \ t- the pair alone with embedded combining marks " ) ;
out . println ( " # The sample for each type is simply the first code point (above NULL) " ) ;
out . println ( " # with that property. " ) ;
out . println ( " # * Note: " ) ;
out . println ( " # \ t- SG is omitted " ) ;
out . println ( " # \ t- 3 different Jamo characters and a supplementary character are added " ) ;
out . println ( " # \ t The syllable types for the Jamo (L, V, T) are displayed in comments " ) ;
out . println ( " # \ t instead of the linebreak property " ) ;
out . println ( " # These samples may be extended in the future. " ) ;
2002-07-30 09:57:18 +00:00
out . println ( " # " ) ;
2002-08-04 21:38:45 +00:00
// outer loop: type of the first ("before") sample
for ( int ii = 0 ; ii < getLimit ( ) ; + + ii ) {
int i = TypeOrder [ ii ] ;
// entries whose value equals LB_SG are left out of the test files
if ( i = = LB_SG ) continue ;
2002-07-30 09:57:18 +00:00
String before = samples [ i ] ;
2002-08-04 21:38:45 +00:00
// inner loop: type of the second ("after") sample
for ( int jj = 0 ; jj < getLimit ( ) ; + + jj ) {
// presumably progress output — confirm against Utility.dot
Utility . dot ( counter ) ;
int j = TypeOrder [ jj ] ;
if ( j = = LB_SG ) continue ;
2002-07-30 09:57:18 +00:00
String after = samples [ j ] ;
// do line straight
2002-08-04 21:38:45 +00:00
int len = genTestItems ( before , after , testCase ) ;
for ( int q = 0 ; q < len ; + + q ) {
// comments are written only in the second file (k != 0) and only for the first
// item of each pair; the "recommended" argument is always false here
printLine ( out , testCase [ q ] , k ! = 0 & & q = = 0 , false ) ;
+ + counter ;
}
2002-07-30 09:57:18 +00:00
}
}
out . println ( " # Lines: " + counter ) ;
out . close ( ) ;
}
}
2002-08-04 21:38:45 +00:00
// stuff that subclasses need to override
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
results [ 1 ] = before + " " + after ;
results [ 2 ] = before + " \ u0301 \ u0308 " + after ;
return 3 ;
}
// stuff that subclasses need to override
boolean skipType ( byte type ) {
return type = = LB_AI | | type = = LB_SA | | type = = LB_SG | | type = = LB_XX ;
}
// stuff that subclasses need to override
public String getTypeID ( int cp ) {
byte result = getType ( cp ) ;
if ( result = = LB_SUP ) return " SUP " ;
if ( result > = LB_LIMIT ) return hNames [ result - LB_LIMIT ] ;
2004-02-07 01:01:17 +00:00
return Default . ucd ( ) . getLineBreakID_fromIndex ( result ) ;
2002-08-04 21:38:45 +00:00
}
// stuff that subclasses need to override
public byte getType ( int cp ) {
if ( cp > 0xFFFF ) return LB_SUP ;
byte result = getHangulType ( cp ) ;
if ( result ! = hNot ) return ( byte ) ( result + LB_LIMIT ) ;
2004-02-07 01:01:17 +00:00
return Default . ucd ( ) . getLineBreak ( cp ) ;
2002-08-04 21:38:45 +00:00
}
/**
 * @return how many leading TypeOrder entries run() enumerates when it writes
 *         the test files (LB2_LIMIT here)
 */
public int getLimit() {
    final int limit = LB2_LIMIT;
    return limit;
}
/**
 * @return how many leading TypeOrder entries generateTable() enumerates; LB_SUP here,
 *         so the last entry is skipped
 */
public int getTableLimit() {
    final int limit = LB_SUP; // skip last
    return limit;
}
// Writes one HTML pair table to "out".
// Rows and columns are the non-skipped types among the first getTableLimit() entries of TypeOrder;
// each cell holds getTableEntry(before, after, recommended) and carries the deciding rule in its
// title attribute. The cell background depends on a comparison with the entry computed for the
// opposite setting (!recommended).
public void generateTable ( PrintWriter out , boolean recommended ) {
// same percentage width for every column, including the row-header column
String width = " width=' " + ( 100 / ( getTableLimit ( ) + 1 ) ) + " %' " ;
out . print ( " <table border='1' cellspacing='0'><tr><th " + width + " ></th> " ) ;
byte type ;
// header row: one column heading per non-skipped type
for ( int i = 0 ; i < getTableLimit ( ) ; + + i ) {
type = TypeOrder [ i ] ;
if ( skipType ( type ) ) continue ;
String h = getTypeID ( samples [ TypeOrder [ i ] ] ) ;
out . print ( " <th " + width + " > " + h + " </th> " ) ;
2002-07-30 09:57:18 +00:00
}
out . print ( " </tr> " ) ;
// one-element out-parameters for getTableEntry; the local "rule" shadows the field of the same name
String [ ] rule = new String [ 1 ] ;
String [ ] rule2 = new String [ 1 ] ;
2002-08-04 21:38:45 +00:00
// body: one row per "before" type
for ( int i = 0 ; i < getTableLimit ( ) ; + + i ) {
type = TypeOrder [ i ] ;
if ( skipType ( type ) ) continue ;
String before = samples [ type ] ;
String line = " <tr><th> " + getTypeID ( before ) + " </th> " ;
for ( int j = 0 ; j < getTableLimit ( ) ; + + j ) {
// "type" is reused for the column; "before" was already captured for this row
type = TypeOrder [ j ] ;
if ( skipType ( type ) ) continue ;
String after = samples [ type ] ;
2002-07-30 09:57:18 +00:00
String t = getTableEntry ( before , after , recommended , rule ) ;
String background = " " ;
2002-08-04 21:38:45 +00:00
// entry under the opposite setting, used only to pick the highlight colour
String t2 = getTableEntry ( before , after , ! recommended , rule2 ) ;
if ( ! t . equals ( t2 ) ) {
// the two settings disagree: one colour when this entry equals NOBREAK, another otherwise
if ( t . equals ( NOBREAK ) ) {
background = " bgcolor='#CCFFFF' " ;
} else {
background = " bgcolor='#FFFF00' " ;
}
} else if ( t . equals ( NOBREAK ) ) {
// the two settings agree and the entry equals NOBREAK
background = " bgcolor='#CCCCFF' " ;
2002-07-30 09:57:18 +00:00
}
line + = " <th title=' " + rule [ 0 ] + " ' " + background + " > " + t + " </th> " ;
}
out . println ( line + " </tr> " ) ;
}
out . println ( " </table> " ) ;
}
2002-08-04 21:38:45 +00:00
// Computes the pair-table symbol for (before, after) and reports the deciding rule.
// Three positions are probed with isBreak():
//   - after the space in  before + space + after   (spaceBreak)
//   - before the space in the same string          (spaceBreak2)
//   - directly between before and after            (normalBreak)
// When a direct break is allowed, t keeps its initial value. Otherwise t becomes the caret
// symbol if neither space position breaks, or the percent symbol if at least one does.
// Side effects: overwrites the field "rule" and stores its final value in ruleOut[0].
public String getTableEntry ( String before , String after , boolean recommended , String [ ] ruleOut ) {
2002-07-30 09:57:18 +00:00
String t = " _ " ;
// every isBreak() call overwrites the field "rule", so it is captured right after each call
boolean spaceBreak = isBreak ( before + " " + after , before . length ( ) + 1 , recommended ) ;
String spaceRule = rule ;
boolean spaceBreak2 = isBreak ( before + " " + after , before . length ( ) , recommended ) ;
String spaceRule2 = rule ;
boolean normalBreak = isBreak ( before + after , before . length ( ) , recommended ) ;
String normalRule = rule ;
if ( ! normalBreak ) {
if ( ! spaceBreak & & ! spaceBreak2 ) {
t = " ^ " ;
// combine the distinct rule labels that led to this entry
rule = spaceRule . equals ( normalRule ) ? normalRule : spaceRule + " / " + normalRule ;
if ( ! spaceRule2 . equals ( normalRule ) & & ! spaceRule2 . equals ( spaceRule ) ) {
rule + = " / " + spaceRule2 ;
}
} else {
t = " % " ;
rule = normalRule ;
}
}
// when a direct break is allowed, "rule" still holds the label from the last isBreak() call above
ruleOut [ 0 ] = rule ;
return t ;
}
2002-08-04 21:38:45 +00:00
// Markers written around code points in the generated files: BREAK where isBreak() reports a
// break opportunity, NOBREAK where it does not.
// NOTE(review): the two literals look like whitespace-damaged Unicode escapes
// (presumably U+00F7 and U+00D7) — confirm against the repository copy.
static final String BREAK = " \ u00F7 " ;
static final String NOBREAK = " \ u00D7 " ;
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
// Writes one test line for "source" to "out".
// The line starts with the break status at offset 0, followed by each code point in hex and the
// break status after it. In parallel a comment is built from each code point's name, its type ID
// and the rule that decided each status; it is appended to the line only when "comments" is true.
// "recommended" is passed through to isBreak().
public void printLine ( PrintWriter out , String source , boolean comments , boolean recommended ) {
int cp ;
StringBuffer string = new StringBuffer ( ) ;
StringBuffer comment = new StringBuffer ( " \ t# " ) ;
// status before the first code point; isBreak() also sets the field "rule" read just below
String status = isBreak ( source , 0 , recommended ) ? BREAK : NOBREAK ;
string . append ( status ) ;
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( rule ) . append ( ']' ) ;
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
// step through the string one code point at a time
for ( int offset = 0 ; offset < source . length ( ) ; offset + = UTF16 . getCharCount ( cp ) ) {
2002-07-30 09:57:18 +00:00
2002-08-04 21:38:45 +00:00
cp = UTF16 . charAt ( source , offset ) ;
// NOTE(review): "string" already holds the initial status, so this condition is always true here
if ( string . length ( ) > 0 ) {
string . append ( ' ' ) ;
comment . append ( ' ' ) ;
}
string . append ( Utility . hex ( cp ) ) ;
2004-02-07 01:01:17 +00:00
comment . append ( Default . ucd ( ) . getName ( cp ) + " ( " + getTypeID ( cp ) + " ) " ) ;
2002-08-04 21:38:45 +00:00
// status at the boundary right after this code point
status = isBreak ( source , offset + UTF16 . getCharCount ( cp ) , recommended ) ? BREAK : NOBREAK ;
string . append ( ' ' ) . append ( status ) ;
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( rule ) . append ( ']' ) ;
}
if ( comments ) string . append ( comment ) ;
out . println ( string ) ;
2002-07-30 09:57:18 +00:00
}
2002-08-04 21:38:45 +00:00
// Fills "samples" with the first code point found for each type, then lists the samples on stdout.
// Code points 1..0x10FFFF are scanned in ascending order; unallocated code points and the
// surrogate range D800..DFFF are skipped.
public void findSamples ( ) {
2002-07-30 09:57:18 +00:00
for ( int i = 1 ; i < = 0x10FFFF ; + + i ) {
2004-02-07 01:01:17 +00:00
if ( ! Default . ucd ( ) . isAllocated ( i ) ) continue ;
2002-08-04 21:38:45 +00:00
if ( 0xD800 < = i & & i < = 0xDFFF ) continue ;
// NOTE(review): prints a marker to stdout when U+1100 is reached; looks like a leftover
// debugging hook — confirm before removing
if ( i = = 0x1100 ) {
System . out . print ( " here " ) ;
}
byte lb = getType ( i ) ;
2002-07-30 09:57:18 +00:00
// keep only the first code point seen for each type
if ( samples [ lb ] = = null ) {
samples [ lb ] = UTF16 . valueOf ( i ) ;
}
}
2002-08-04 21:38:45 +00:00
// report the samples; note that "samples" is indexed by i here, not by TypeOrder[i]
for ( int i = 0 ; i < TypeOrder . length ; + + i ) {
String sample = samples [ i ] ;
2004-02-07 01:01:17 +00:00
System . out . println ( getTypeID ( sample ) + " : \ t " + Default . ucd ( ) . getCodeAndName ( sample ) ) ;
2002-08-04 21:38:45 +00:00
}
2002-07-30 09:57:18 +00:00
}
2002-08-04 21:38:45 +00:00
public String getTypeID ( String s ) {
if ( s = = null ) return " <null> " ;
if ( s . length ( ) = = 1 ) return getTypeID ( s . charAt ( 0 ) ) ;
2002-07-30 09:57:18 +00:00
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < s . length ( ) ; i + = UTF32 . count16 ( cp ) ) {
cp = UTF32 . char32At ( s , i ) ;
if ( i > 0 ) result . append ( " " ) ;
2002-08-04 21:38:45 +00:00
result . append ( getTypeID ( cp ) ) ;
2002-07-30 09:57:18 +00:00
}
return result . toString ( ) ;
}
2002-08-04 21:38:45 +00:00
public int findLastNon ( String source , int offset , byte notLBType , boolean recommended ) {
2002-07-30 09:57:18 +00:00
int cp ;
2002-08-04 21:38:45 +00:00
for ( int i = offset - 1 ; i > = 0 ; i - = UTF16 . getCharCount ( cp ) ) {
2002-07-30 09:57:18 +00:00
cp = UTF16 . charAt ( source , i ) ;
2002-08-04 21:38:45 +00:00
byte f = getResolvedType ( cp , recommended ) ;
if ( f ! = notLBType ) return i ;
2002-07-30 09:57:18 +00:00
}
2002-08-04 21:38:45 +00:00
return - 1 ;
2002-07-30 09:57:18 +00:00
}
2002-08-04 21:38:45 +00:00
public byte getResolvedType ( int cp , boolean recommended ) {
2002-07-30 09:57:18 +00:00
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
2002-08-04 21:38:45 +00:00
byte result = getType ( cp ) ;
2002-07-30 09:57:18 +00:00
switch ( result ) {
case LB_AI : result = LB_AI ; break ;
// case LB_CB: result = LB_ID; break;
case LB_SA : result = LB_AL ; break ;
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
case LB_XX : result = LB_AL ; break ;
}
2002-08-04 21:38:45 +00:00
if ( recommended ) {
if ( getHangulType ( cp ) ! = hNot ) {
result = LB_ID ;
}
}
2002-07-30 09:57:18 +00:00
return result ;
}
2002-08-04 21:38:45 +00:00
public boolean onCodepointBoundary ( String s , int offset ) {
if ( offset < 0 | | offset > s . length ( ) ) return false ;
if ( offset = = 0 | | offset = = s . length ( ) ) return true ;
if ( UTF16 . isLeadSurrogate ( s . charAt ( offset - 1 ) )
& & UTF16 . isTrailSurrogate ( s . charAt ( offset ) ) ) return false ;
return true ;
}
2002-07-30 09:57:18 +00:00
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
2002-08-04 21:38:45 +00:00
public boolean isBreak ( String source , int offset , boolean recommended ) {
2002-07-30 09:57:18 +00:00
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
2002-08-04 21:38:45 +00:00
// this is taken care of in the getResolvedType function
2002-07-30 09:57:18 +00:00
// LB 2a Never break at the start of text
rule = " 2a " ;
if ( offset < = 0 ) return false ;
// LB 2b Always break at the end of text
rule = " 2b " ;
if ( offset > = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
2002-08-04 21:38:45 +00:00
if ( ! onCodepointBoundary ( source , offset ) ) return false ;
2002-07-30 09:57:18 +00:00
// now get the character before and after, and their types
int cpBefore = UTF16 . charAt ( source , offset - 1 ) ;
int cpAfter = UTF16 . charAt ( source , offset ) ;
2002-08-04 21:38:45 +00:00
byte before = getResolvedType ( cpBefore , recommended ) ;
byte after = getResolvedType ( cpAfter , recommended ) ;
2002-07-30 09:57:18 +00:00
rule = " 3a " ;
// Always break after hard line breaks (but never between CR and LF).
// CR ^ LF
if ( before = = LB_CR & & after = = LB_LF ) return false ;
if ( before = = LB_BK | | before = = LB_LF | | before = = LB_CR ) return true ;
2004-04-17 18:21:39 +00:00
//LB 3b Don’ t break before hard line breaks.
2002-07-30 09:57:18 +00:00
rule = " 3b " ;
if ( after = = LB_BK | | after = = LB_LF | after = = LB_CR ) return false ;
2004-04-17 18:21:39 +00:00
// LB 4 Don’ t break before spaces or zero-width space.
// × SP
// × ZW
2002-07-30 09:57:18 +00:00
rule = " 4 " ;
if ( after = = LB_SP | | after = = LB_ZW ) return false ;
// LB 5 Break after zero-width space.
2004-04-17 18:21:39 +00:00
// ZW ÷
2002-07-30 09:57:18 +00:00
rule = " 5 " ;
if ( before = = LB_ZW ) return true ;
2004-04-17 18:21:39 +00:00
// LB 6 Don’ t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
2002-07-30 09:57:18 +00:00
rule = " 6 " ;
if ( after = = LB_CM ) return false ;
2002-08-04 21:38:45 +00:00
if ( before = = LB_L & & ( after = = LB_L | | after = = LB_V | | after = = LB_LV | | after = = LB_LVT ) ) return false ;
if ( ( before = = LB_LV | | before = = LB_V ) & & ( after = = LB_V | | after = = LB_T ) ) return false ;
if ( ( before = = LB_LVT | | before = = LB_T ) & & ( after = = LB_T ) ) return false ;
2002-07-30 09:57:18 +00:00
boolean setBase = false ;
if ( before = = LB_CM ) {
setBase = true ;
2002-08-04 21:38:45 +00:00
int backOffset = findLastNon ( source , offset , LB_CM , recommended ) ;
if ( backOffset < 0 ) {
2002-07-30 09:57:18 +00:00
before = LB_ID ;
} else {
2002-08-04 21:38:45 +00:00
before = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
2002-07-30 09:57:18 +00:00
}
}
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
// one would break before an ID.
rule = " 7 " ;
if ( setBase & & before = = LB_SP ) before = LB_ID ;
2004-04-17 18:21:39 +00:00
// LB 8 Don’ t break before ‘ ]’ or ‘ !’ or ‘ ;’ or ‘ /’ , even after spaces.
// × CL, × EX, × IS, × SY
2002-07-30 09:57:18 +00:00
rule = " 8 " ;
if ( after = = LB_CL | | after = = LB_EX | | after = = LB_SY | after = = LB_IS ) return false ;
// find the last non-space character; we will need it
byte lastNonSpace = before ;
if ( lastNonSpace = = LB_SP ) {
2002-08-04 21:38:45 +00:00
int backOffset = findLastNon ( source , offset , LB_CM , recommended ) ;
if ( backOffset > = 0 ) {
lastNonSpace = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
2002-07-30 09:57:18 +00:00
}
}
2004-04-17 18:21:39 +00:00
// LB 9 Don’ t break after ‘ [’ , even after spaces.
// OP SP* ×
2002-07-30 09:57:18 +00:00
rule = " 9 " ;
if ( lastNonSpace = = LB_OP ) return false ;
2004-04-17 18:21:39 +00:00
// LB 10 Don’ t break within ‘”[’ , , even with intervening spaces.
// QU SP* × OP
2002-07-30 09:57:18 +00:00
rule = " 10 " ;
if ( lastNonSpace = = LB_QU & & after = = LB_OP ) return false ;
2004-04-17 18:21:39 +00:00
// LB 11 Don’ t break within ‘ ]h’ , even with intervening spaces.
// CL SP* × NS
2002-07-30 09:57:18 +00:00
rule = " 11 " ;
if ( lastNonSpace = = LB_CL & & after = = LB_NS ) return false ;
2004-04-17 18:21:39 +00:00
// LB 11a Don’ t break within ‘——’, even with intervening spaces.
// B2 × B2
2002-07-30 09:57:18 +00:00
rule = " 11a " ;
if ( lastNonSpace = = LB_B2 & & after = = LB_B2 ) return false ;
if ( recommended ) {
2004-04-17 18:21:39 +00:00
// LB 13 Don’ t break before or after NBSP or WORD JOINER
// × GL
// GL ×
2002-07-30 09:57:18 +00:00
rule = " 11b " ;
if ( after = = LB_GL | | before = = LB_GL ) return false ;
}
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
rule = " 12 " ;
// LB 12 Break after spaces
2004-04-17 18:21:39 +00:00
// SP ÷
2002-07-30 09:57:18 +00:00
if ( before = = LB_SP ) return true ;
if ( ! recommended ) {
2004-04-17 18:21:39 +00:00
// LB 13 Don’ t break before or after NBSP or WORD JOINER
// × GL
// GL ×
2002-07-30 09:57:18 +00:00
rule = " 13 " ;
if ( after = = LB_GL | | before = = LB_GL ) return false ;
}
rule = " 14 " ;
2004-04-17 18:21:39 +00:00
// LB 14 Don’ t break before or after ‘”’
// × QU
// QU ×
2002-07-30 09:57:18 +00:00
if ( before = = LB_QU | | after = = LB_QU ) return false ;
2004-04-17 18:21:39 +00:00
// LB 15 Don’ t break before hyphen-minus, other hyphens, fixed-width spaces,
2002-07-30 09:57:18 +00:00
// small kana and other non- starters, or after acute accents:
2004-04-17 18:21:39 +00:00
// × BA
// × HY
// × NS
// BB ×
2002-07-30 09:57:18 +00:00
if ( recommended ) {
// LB 14a Break before and after CB
2004-04-17 18:21:39 +00:00
// CB ÷
// ÷ CB
2002-07-30 09:57:18 +00:00
if ( before = = LB_CB | | after = = LB_CB ) return true ;
}
rule = " 15 " ;
if ( after = = LB_NS ) return false ;
if ( after = = LB_HY ) return false ;
if ( after = = LB_BA ) return false ;
if ( before = = LB_BB ) return false ;
if ( ! recommended ) {
// LB 15b Break after hyphen-minus, and before acute accents:
2004-04-17 18:21:39 +00:00
// HY ÷
// ÷ BB
2002-07-30 09:57:18 +00:00
rule = " 15b " ;
if ( before = = LB_HY ) return true ;
if ( after = = LB_BB ) return true ;
}
2004-04-17 18:21:39 +00:00
// LB 16 Don’ t break between two ellipses, or between letters or numbers and ellipsis:
// AL × IN
// ID × IN
// IN × IN
// NU × IN
// Examples: ’ 9...’ , ‘ a...’ , ‘ H...’
2002-07-30 09:57:18 +00:00
rule = " 16 " ;
if ( ( before = = LB_NU | | before = = LB_AL | | before = = LB_ID ) & & after = = LB_IN ) return false ;
if ( before = = LB_IN & & after = = LB_IN ) return false ;
// Don't break alphanumerics.
2004-04-17 18:21:39 +00:00
// LB 17 Don’ t break within ‘ a9’ , ‘ 3a’ , or ‘ H%’
// ID × PO
// AL × NU
// NU × AL
2002-07-30 09:57:18 +00:00
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
2004-04-17 18:21:39 +00:00
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
2002-07-30 09:57:18 +00:00
// This is approximated with the following rules. (Some cases already handled above,
2004-04-17 18:21:39 +00:00
// like ‘ 9,’ , ‘ [9’ .)
2002-07-30 09:57:18 +00:00
rule = " 17 " ;
if ( before = = LB_ID & & after = = LB_PO ) return false ;
if ( before = = LB_AL & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_AL ) return false ;
2004-04-17 18:21:39 +00:00
// LB 18 Don’ t break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: ‘ $9’ , ‘ $[’ , ‘ $-‘ , ‘ -9’ , ‘ /9’ , ‘ 99’ , ‘ ,9’ , ‘ 9%’ ‘ ]%’
2002-07-30 09:57:18 +00:00
rule = " 18 " ;
if ( before = = LB_CL & & after = = LB_PO ) return false ;
if ( before = = LB_HY & & after = = LB_NU ) return false ;
if ( before = = LB_IS & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_PO ) return false ;
if ( before = = LB_PR & & after = = LB_AL ) return false ;
if ( before = = LB_PR & & after = = LB_HY ) return false ;
if ( before = = LB_PR & & after = = LB_ID ) return false ;
if ( before = = LB_PR & & after = = LB_NU ) return false ;
if ( before = = LB_PR & & after = = LB_OP ) return false ;
if ( before = = LB_SY & & after = = LB_NU ) return false ;
if ( recommended ) {
// LB 15b Break after hyphen-minus, and before acute accents:
2004-04-17 18:21:39 +00:00
// HY ÷
// ÷ BB
2002-07-30 09:57:18 +00:00
rule = " 18b " ;
if ( before = = LB_HY ) return true ;
if ( after = = LB_BB ) return true ;
}
2004-04-17 18:21:39 +00:00
// LB 19 Don’ t break between alphabetics (“at”)
// AL × AL
2002-07-30 09:57:18 +00:00
rule = " 19 " ;
if ( before = = LB_AL & & after = = LB_AL ) return false ;
// LB 20 Break everywhere else
2004-04-17 18:21:39 +00:00
// ALL ÷
// ÷ ALL
2002-07-30 09:57:18 +00:00
rule = " 20 " ;
return true ;
}
2002-08-04 21:38:45 +00:00
static class GenerateWordBreakTest extends GenerateLineBreakTest {
static final byte CR = 0 , LF = 1 , Control = 2 , Extend = 3 , Link = 4 , CGJ = 5 , Base = 6 , LetterBase = 7 , Other = 8 ,
oLIMIT = 9 , // RESET THIS IF LIST ABOVE CHANGES!
L = oLIMIT + hL , V = oLIMIT + hV , T = oLIMIT + hT , LV = oLIMIT + hLV , LVT = oLIMIT + hLVT ,
LIMIT = LVT + 1 ;
static final String [ ] Names = { " CR " , " LF " , " CTL " , " Extend " , " Link " , " CGJ " , " Base " , " LetterBase " , " Other " } ;
2004-02-06 18:32:05 +00:00
static UCDProperty extendProp = UnifiedBinaryProperty . make ( DERIVED | GraphemeExtend ) ;
static UCDProperty baseProp = UnifiedBinaryProperty . make ( DERIVED | GraphemeBase ) ;
static UCDProperty linkProp = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | GraphemeLink ) ;
2002-08-04 21:38:45 +00:00
{
fileName = " Word " ;
TypeOrder = new byte [ LIMIT ] ;
for ( byte i = 0 ; i < TypeOrder . length ; + + i ) {
TypeOrder [ i ] = i ;
}
}
boolean skipType ( byte type ) {
return false ;
}
public int getLimit ( ) {
return LIMIT ;
}
public int getTableLimit ( ) {
return LIMIT ;
}
// stuff that subclasses need to override
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
return 1 ;
}
public String getTableEntry ( String before , String after , boolean recommended , String [ ] ruleOut ) {
boolean normalBreak = isBreak ( before + after , before . length ( ) , recommended ) ;
String normalRule = rule ;
ruleOut [ 0 ] = rule ;
return normalBreak ? BREAK : NOBREAK ;
}
// stuff that subclasses need to override
public String getTypeID ( int cp ) {
byte type = getType ( cp ) ;
if ( type > = oLIMIT ) return hNames [ type - oLIMIT ] ;
return Names [ type ] ;
}
// stuff that subclasses need to override
public byte getType ( int cp ) {
// single characters
if ( cp = = 0xA ) return LF ;
if ( cp = = 0xD ) return CR ;
if ( cp = = 0x034F ) return CGJ ;
if ( cp = = 0x2028 | | cp = = 0x2029 ) return Control ;
// Hangul
byte result = getHangulType ( cp ) ;
if ( result ! = hNot ) return ( byte ) ( result + oLIMIT ) ;
// other properties
// category based
2004-02-07 01:01:17 +00:00
byte cat = Default . ucd ( ) . getCategory ( cp ) ;
2002-08-04 21:38:45 +00:00
if ( cat = = Cc ) return Control ;
if ( cat = = Cf ) return Extend ;
if ( ( ( 1 < < cat ) & LETTER_MASK ) ! = 0 ) return LetterBase ;
// other binary properties
if ( linkProp . hasValue ( cp ) ) return Link ;
if ( extendProp . hasValue ( cp ) ) return Extend ;
if ( baseProp . hasValue ( cp ) ) return Base ;
return Other ;
}
public byte getResolvedType ( int cp , boolean recommended ) {
return getType ( cp ) ;
}
public boolean isBreak ( String source , int offset , boolean recommended ) {
rule = " 1 " ;
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
rule = " 2 " ;
if ( offset = = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
if ( ! onCodepointBoundary ( source , offset ) ) return false ;
// now get the character before and after, and their types
int cpBefore = UTF16 . charAt ( source , offset - 1 ) ;
int cpAfter = UTF16 . charAt ( source , offset ) ;
byte before = getResolvedType ( cpBefore , recommended ) ;
byte after = getResolvedType ( cpAfter , recommended ) ;
rule = " 3 " ;
if ( before = = CR & & after = = LF ) return false ;
rule = " 4 " ;
if ( before = = CR | | before = = LF | | before = = Control
| | after = = Control | | after = = LF | | after = = CR ) return true ;
rule = " 6 " ;
if ( before = = L & & ( after = = L | | after = = V | | after = = LV | | after = = LVT ) ) return false ;
rule = " 7 " ;
if ( ( before = = LV | | before = = V ) & & ( after = = V | | after = = T ) ) return false ;
rule = " 8 " ;
if ( ( before = = LVT | | before = = T ) & & ( after = = T ) ) return false ;
rule = " 9 " ;
if ( after = = Extend ) return false ;
if ( recommended ) {
if ( after = = Link | | after = = CGJ ) return false ;
} else {
// Do not break around a CGJ.
rule = " 10 " ;
if ( before = = CGJ & & ( after = = Base
| | after = = LetterBase | | after = = L | | after = = V | | after = = T | | after = = LV | | after = = LVT ) ) return false ;
rule = " 11 " ;
if ( after = = CGJ ) return false ;
// Do not break between linking characters and letters, or before linking characters. This provides for Indic graphemes, where virama (halant) will link character clusters together.
rule = " 12 " ;
2004-04-17 18:21:39 +00:00
//Link Extend* × LetterBase (12)
2002-08-04 21:38:45 +00:00
if ( after = = LetterBase | | after = = L | | after = = V | | after = = T | | after = = LV | | after = = LVT ) {
int backOffset = findLastNon ( source , offset , Extend , recommended ) ;
if ( backOffset > = 0 ) {
byte last = getResolvedType ( UTF16 . charAt ( source , backOffset ) , recommended ) ;
if ( last = = Link ) return false ;
}
}
rule = " 13 " ;
if ( after = = Link ) return false ;
}
// Otherwise break after all characters.
rule = " 14 " ;
return true ;
}
}
2002-07-30 09:57:18 +00:00
}