2002-08-08 15:38:16 +00:00
/ * *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright ( C ) 1996 - 2001 , International Business Machines Corporation and *
* others . All Rights Reserved . *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
*
* $Source : / xsrl / Nsvn / icu / unicodetools / com / ibm / text / UCD / GenerateBreakTest . java , v $
2004-04-17 18:21:39 +00:00
* $Date : 2004 / 04 / 17 18 : 21 : 39 $
* $Revision : 1 . 12 $
2002-08-08 15:38:16 +00:00
*
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* /
package com.ibm.text.UCD ;
import java.util.* ;
import java.io.* ;
import com.ibm.text.utility.* ;
import com.ibm.icu.text.UTF16 ;
import com.ibm.icu.text.UnicodeSet ;
abstract public class GenerateBreakTest implements UCD_Types {
// When true, findSamples() prints extra diagnostics to System.out.
static boolean DEBUG = false ;
2003-04-03 02:29:31 +00:00
// NOTE(review): not read anywhere in the visible part of this file - confirm it is still used.
static final boolean SHOW_TYPE = false ;
2004-02-06 18:32:05 +00:00
// Character database supplied to the constructor.
UCD ucd ;
// NFD and NFKD normalizers, created in the constructor for ucd's version.
Normalizer nfd ;
Normalizer nfkd ;
2003-04-03 02:29:31 +00:00
2004-02-12 08:23:19 +00:00
// When non-null, run() appends a "Character Type Breakdown" table built from this map.
OldUnicodeMap sampleMap = null ;
// Label/set map; the nested GenerateGraphemeBreakTest registers its character types here.
OldUnicodeMap map = new OldUnicodeMap ( ) ;
2003-04-03 02:29:31 +00:00
// ====================== Main ===========================
public static void main ( String [ ] args ) throws IOException {
System . out . println ( " Remember to add length marks (half & full) and other punctuation for sentence, with FF61 " ) ;
//Default.setUCD();
2004-02-07 01:01:17 +00:00
new GenerateGraphemeBreakTest ( Default . ucd ( ) ) . run ( ) ;
new GenerateWordBreakTest ( Default . ucd ( ) ) . run ( ) ;
new GenerateLineBreakTest ( Default . ucd ( ) ) . run ( ) ;
new GenerateSentenceBreakTest ( Default . ucd ( ) ) . run ( ) ;
2004-02-06 18:32:05 +00:00
}
/**
 * Stores the character database and creates the NFD and NFKD normalizers for
 * that database's version.
 *
 * @param ucd the character database this generator works from
 */
GenerateBreakTest(UCD ucd) {
    this.ucd = ucd;
    this.nfd = new Normalizer(Normalizer.NFD, ucd.getVersion());
    this.nfkd = new Normalizer(Normalizer.NFKD, ucd.getVersion());
}
2002-08-08 15:38:16 +00:00
// COMMON STUFF for Hangul
2003-04-03 02:29:31 +00:00
/ *
2002-08-08 15:38:16 +00:00
static final byte hNot = - 1 , hL = 0 , hV = 1 , hT = 2 , hLV = 3 , hLVT = 4 , hLIMIT = 5 ;
static final String [ ] hNames = { " L " , " V " , " T " , " LV " , " LVT " } ;
2003-04-03 02:29:31 +00:00
2002-08-08 15:38:16 +00:00
static byte getHangulType ( int cp ) {
2004-02-06 18:32:05 +00:00
if ( ucd . isLeadingJamo ( cp ) ) return hL ;
if ( ucd . isVowelJamo ( cp ) ) return hV ;
if ( ucd . isTrailingJamo ( cp ) ) return hT ;
if ( ucd . isHangulSyllable ( cp ) ) {
if ( ucd . isDoubleHangul ( cp ) ) return hLV ;
2002-08-08 15:38:16 +00:00
return hLVT ;
}
return hNot ;
}
2003-04-03 02:29:31 +00:00
* /
/ * static {
2004-02-06 18:32:05 +00:00
setUCD ( ) ;
2003-04-03 02:29:31 +00:00
}
* /
2002-08-08 15:38:16 +00:00
public static boolean onCodepointBoundary ( String s , int offset ) {
if ( offset < 0 | | offset > s . length ( ) ) return false ;
if ( offset = = 0 | | offset = = s . length ( ) ) return true ;
if ( UTF16 . isLeadSurrogate ( s . charAt ( offset - 1 ) )
& & UTF16 . isTrailSurrogate ( s . charAt ( offset ) ) ) return false ;
return true ;
}
// finds the first base character, or the first character if there is no base
2004-02-06 18:32:05 +00:00
public int findFirstBase ( String source , int start , int limit ) {
2002-08-08 15:38:16 +00:00
int cp ;
for ( int i = start ; i < limit ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
2004-02-06 18:32:05 +00:00
byte cat = ucd . getCategory ( cp ) ;
2002-08-08 15:38:16 +00:00
if ( ( ( 1 < < cat ) & MARK_MASK ) ! = 0 ) continue ;
return cp ;
}
return UTF16 . charAt ( source , start ) ;
}
2002-08-09 23:56:24 +00:00
// quick & dirty routine
String insertEverywhere ( String source , String insertion , GenerateBreakTest breaker ) {
String result = insertion ;
for ( int i = 0 ; i < source . length ( ) ; + + i ) {
result + = source . charAt ( i ) ;
2004-02-06 18:32:05 +00:00
if ( breaker . isBreak ( source , i ) ) {
2002-08-09 23:56:24 +00:00
result + = insertion ;
}
}
return result + insertion ;
}
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
/**
 * Diagnostic pass over canonical decompositions. For each of the grapheme, word and
 * line testers it walks the code points, takes the NFD form of every allocated,
 * non-Hangul-syllable code point that is not already NFD-normalized, and prints every
 * interior offset of that decomposition at which the tester reports a break. The
 * character and its decomposition are described once, before the first such offset.
 *
 * @param ucd the character database used for the allocation and Hangul checks, the
 *            normalizer version and the printed descriptions
 */
static void checkDecomps ( UCD ucd ) {
// Properties listed next to each character by showData().
UCDProperty [ ] INFOPROPS = { UnifiedProperty . make ( CATEGORY ) , UnifiedProperty . make ( LINE_BREAK ) } ;
2002-08-08 15:38:16 +00:00
GenerateBreakTest [ ] tests = {
2004-02-06 18:32:05 +00:00
new GenerateGraphemeBreakTest ( ucd ) ,
new GenerateWordBreakTest ( ucd ) ,
new GenerateLineBreakTest ( ucd ) ,
2002-08-08 15:38:16 +00:00
} ;
2004-02-06 18:32:05 +00:00
// NOTE(review): the result of this call is discarded - presumably a debugging hook; confirm.
tests [ 0 ] . isBreak ( " \ u0300 \ u0903 " , 1 ) ;
Normalizer nfd = new Normalizer ( Normalizer . NFD , ucd . getVersion ( ) ) ;
2002-08-08 15:38:16 +00:00
System . out . println ( " Check Decomps " ) ;
2003-04-01 02:53:07 +00:00
//System.out.println("otherExtendSet: " + ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet.toPattern(true));
2004-02-06 18:32:05 +00:00
//Utility.showSetNames("", ((GenerateGraphemeBreakTest)tests[0]).otherExtendSet, false, ucd);
2002-08-08 15:38:16 +00:00
for ( int k = 0 ; k < tests . length ; + + k ) {
// NOTE(review): the bound is exclusive, so U+10FFFF itself is never visited, whereas
// findSamples() iterates with <= 0x10FFFF - confirm this is intended.
for ( int i = 0 ; i < 0x10FFFF ; + + i ) {
2004-02-06 18:32:05 +00:00
if ( ! ucd . isAllocated ( i ) ) continue ;
if ( ucd . isHangulSyllable ( i ) ) continue ;
if ( nfd . isNormalized ( i ) ) continue ;
String decomp = nfd . normalize ( i ) ;
2002-08-08 15:38:16 +00:00
// Ensures the character/decomposition header is printed at most once per code point.
boolean shown = false ;
String test = decomp ;
// Only interior offsets are checked (offset 0 and the end are excluded).
for ( int j = 1 ; j < test . length ( ) ; + + j ) {
2004-02-06 18:32:05 +00:00
if ( tests [ k ] . isBreak ( test , j ) ) {
2002-08-08 15:38:16 +00:00
if ( ! shown ) {
2004-02-06 18:32:05 +00:00
System . out . println ( showData ( ucd , UTF16 . valueOf ( i ) , INFOPROPS , " \ r \ n \ t " ) ) ;
System . out . println ( " => " + showData ( ucd , decomp , INFOPROPS , " \ r \ n \ t " ) ) ;
2002-08-08 15:38:16 +00:00
shown = true ;
}
System . out . println ( j + " : " + tests [ k ] . fileName ) ;
}
}
}
}
}
2004-02-06 18:32:05 +00:00
static String showData ( UCD ucd , String source , UCDProperty [ ] props , String separator ) {
2002-08-08 15:38:16 +00:00
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < source . length ( ) ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
if ( i ! = 0 ) result . append ( separator ) ;
2004-02-06 18:32:05 +00:00
result . append ( ucd . getCodeAndName ( cp ) ) ;
2002-08-08 15:38:16 +00:00
for ( int j = 0 ; j < props . length ; + + j ) {
result . append ( " , " ) ;
2004-02-18 03:09:02 +00:00
result . append ( props [ j ] . getPropertyName ( SHORT ) ) . append ( '=' ) . append ( props [ j ] . getValue ( cp , SHORT ) ) ;
2002-08-08 15:38:16 +00:00
}
}
return result . toString ( ) ;
}
2004-02-06 18:32:05 +00:00
void showSet ( String title , UnicodeSet set ) {
2002-08-09 23:56:24 +00:00
System . out . println ( title + " : " + set . toPattern ( true ) ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , set , false , ucd ) ;
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
// determines if string is of form Base NSM*
2004-02-06 18:32:05 +00:00
boolean isBaseNSMStar ( String source ) {
2002-08-08 15:38:16 +00:00
int cp ;
int status = 0 ;
for ( int i = 0 ; i < source . length ( ) ; i + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
2004-02-06 18:32:05 +00:00
byte cat = ucd . getCategory ( cp ) ;
2002-08-08 15:38:16 +00:00
int catMask = 1 < < cat ;
switch ( status ) {
case 0 : if ( ( catMask & BASE_MASK ) = = 0 ) return false ;
status = 1 ;
break ;
case 1 : if ( ( catMask & NONSPACING_MARK_MASK ) = = 0 ) return false ;
break ;
}
}
return true ;
}
2004-02-06 18:32:05 +00:00
UnicodeSet getClosure ( UnicodeSet source ) {
2002-08-08 15:38:16 +00:00
UnicodeSet result = new UnicodeSet ( source ) ;
for ( int i = 0 ; i < 0x10FFFF ; + + i ) {
2004-02-06 18:32:05 +00:00
if ( ! ucd . isAllocated ( i ) ) continue ;
if ( nfkd . isNormalized ( i ) ) continue ;
String decomp = nfkd . normalize ( i ) ;
2002-08-08 15:38:16 +00:00
if ( source . containsAll ( decomp ) ) result . add ( i ) ;
}
return result ;
}
2003-04-01 02:53:07 +00:00
2003-04-03 02:29:31 +00:00
/ *
static UnicodeSet extraAlpha = new UnicodeSet ( " [ \\ u02B9- \\ u02BA \\ u02C2- \\ u02CF \\ u02D2- \\ u02DF \\ u02E5- \\ u02ED \\ u05F3] " ) ;
static UnicodeSet alphabeticSet = UnifiedBinaryProperty . make ( DERIVED | PropAlphabetic ) . getSet ( )
. addAll ( extraAlpha ) ;
static UnicodeSet ideographicSet = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Ideographic ) . getSet ( ) ;
static {
if ( false ) System . out . println ( " alphabetic: " + alphabeticSet . toPattern ( true ) ) ;
}
* /
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
/**
 * Console report about terminal punctuation. Prints the current Terminal_Punctuation
 * set, the characters that getClosure() adds to it (labelled "Missing"), and three
 * hard-coded sets (mid-letter, ambiguous sentence punctuation, sentence punctuation)
 * together with their patterns. Nothing is returned or stored.
 */
void generateTerminalClosure ( ) {
2003-04-03 02:29:31 +00:00
// Hard-coded sample sets; they are only printed below.
UnicodeSet midLetterSet = new UnicodeSet ( " [ \ u0027 \ u002E \ u003A \ u00AD \ u05F3 \ u05F4 \ u2019 \ uFE52 \ uFE55 \ uFF07 \ uFF0E \ uFF1A] " ) ;
UnicodeSet ambigSentPunct = new UnicodeSet ( " [ \ u002E \ u0589 \ u06D4] " ) ;
UnicodeSet sentPunct = new UnicodeSet ( " [ \ u0021 \ u003F \ u0387 \ u061F \ u0964 \ u203C \ u203D \ u2048 \ u2049 "
+ " \ u3002 \ ufe52 \ ufe57 \ uff01 \ uff0e \ uff1f \ uff61] " ) ;
2002-08-08 15:38:16 +00:00
UnicodeSet terminals = UnifiedBinaryProperty . make ( BINARY_PROPERTIES | Terminal_Punctuation ) . getSet ( ) ;
// extras = characters added by the closure that are not already in terminals.
UnicodeSet extras = getClosure ( terminals ) . removeAll ( terminals ) ;
System . out . println ( " Current Terminal_Punctuation " ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , terminals , true , ucd ) ;
2002-08-08 15:38:16 +00:00
System . out . println ( " Missing Terminal_Punctuation " ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , extras , true , ucd ) ;
2002-08-08 15:38:16 +00:00
System . out . println ( " midLetterSet " ) ;
System . out . println ( midLetterSet . toPattern ( true ) ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , midLetterSet , true , ucd ) ;
2002-08-08 15:38:16 +00:00
System . out . println ( " ambigSentPunct " ) ;
System . out . println ( ambigSentPunct . toPattern ( true ) ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , ambigSentPunct , true , ucd ) ;
2002-08-08 15:38:16 +00:00
System . out . println ( " sentPunct " ) ;
System . out . println ( sentPunct . toPattern ( true ) ) ;
2004-02-06 18:32:05 +00:00
Utility . showSetNames ( " " , sentPunct , true , ucd ) ;
2002-08-08 15:38:16 +00:00
// The remainder is a disabled draft of a sentence-punctuation set.
/ *
UnicodeSet sentencePunctuation = new UnicodeSet ( " [ \ u0021 \ 003F ; Terminal_Punctuation # Po QUESTION MARK
037E ; Terminal_Punctuation # Po GREEK QUESTION MARK
061F ; Terminal_Punctuation # Po ARABIC QUESTION MARK
06D4 ; Terminal_Punctuation # Po ARABIC FULL STOP
203C . . 203D ; Terminal_Punctuation # Po [ 2 ] DOUBLE EXCLAMATION MARK . . INTERROBANG
3002 ; Terminal_Punctuation # Po IDEOGRAPHIC FULL STOP
2048 . . 2049 ; Terminal_Punctuation # Po [ 2 ] QUESTION EXCLAMATION MARK . . EXCLAMATION QUESTION MARK
* /
}
//============================
2003-04-03 02:29:31 +00:00
// Most recent rule label passed to setRule(); returned by getRule().
protected String currentRule ;
2002-08-08 15:38:16 +00:00
// Base name used in output file names and page titles.
protected String fileName ;
// Sample strings; findSamples() fills entries [0, sampleLimit).
// NOTE(review): fixed capacity of 100 with no bounds check in findSamples() - confirm it is sufficient.
protected String [ ] samples = new String [ 100 ] ;
// Additional samples that findSamples() appends after the per-type samples.
protected String [ ] extraSamples = new String [ 0 ] ;
// Whole sample strings printed on their own by generateTest() and generateTable().
protected String [ ] extraSingleSamples = new String [ 0 ] ;
// Number of used entries in samples.
protected int sampleLimit = 0 ;
// Number of per-type samples (set by findSamples() before extraSamples are appended); -1 until then.
protected int tableLimit = - 1 ;
2003-04-03 02:29:31 +00:00
// Indexed by type: a code point of each type that skipType() suppressed.
protected int [ ] skippedSamples = new int [ 100 ] ;
// True once findSamples() has suppressed at least one type.
protected boolean didSkipSamples = false ;
// Rule labels recorded by setRule() while collectingRules is true.
private String [ ] ruleList = new String [ 100 ] ;
private int ruleListCount = 0 ;
protected boolean collectingRules = false ;
public void setRule ( String rule ) {
if ( collectingRules ) {
ruleList [ ruleListCount + + ] = rule ;
}
currentRule = rule ;
}
/**
 * Returns the rule most recently recorded through {@code setRule}.
 *
 * @return the current rule label, or {@code null} if none was recorded yet
 */
public String getRule() {
    return this.currentRule;
}
2002-08-08 15:38:16 +00:00
public void run ( ) throws IOException {
findSamples ( ) ;
// test individual cases
//printLine(out, samples[LB_ZW], "", samples[LB_CL]);
//printLine(out, samples[LB_ZW], " ", samples[LB_CL]);
2004-02-06 18:32:05 +00:00
PrintWriter out = Utility . openPrintWriter ( " TR29 \\ "
+ fileName + " BreakTest- "
+ ucd . getVersion ( )
+ " .html " , Utility . UTF8_WINDOWS ) ;
2003-04-01 02:53:07 +00:00
out . println ( " <html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'> " ) ;
out . println ( " <title> " + fileName + " Break Chart</title> " ) ;
out . println ( " <style> " ) ;
out . println ( " td, th { vertical-align: top } " ) ;
out . println ( " </style></head> " ) ;
2003-04-03 02:29:31 +00:00
out . println ( " <body bgcolor='#FFFFFF'> " ) ;
out . println ( " <h2> " + fileName + " Break Chart</h2> " ) ;
2004-02-06 18:32:05 +00:00
out . println ( " <p><b>Unicode Version:</b> " + ucd . getVersion ( ) + " ; <b>Date:</b> " + ucd . getDate ( ) + " </p> " ) ;
generateTable ( out ) ;
2003-04-01 02:53:07 +00:00
if ( sampleMap ! = null ) {
out . println ( " <h3>Character Type Breakdown</h3> " ) ;
out . println ( " <table border='1' cellspacing='0' width='100%'> " ) ;
for ( int i = 0 ; i < sampleMap . size ( ) ; + + i ) {
out . println ( " <tr><th> " + sampleMap . getLabelFromIndex ( i )
+ " </th><td> " + sampleMap . getSetFromIndex ( i )
+ " </td></tr> " ) ;
}
out . println ( " </table> " ) ;
}
2002-08-08 15:38:16 +00:00
out . close ( ) ;
2002-08-09 23:56:24 +00:00
2004-02-06 18:32:05 +00:00
generateTest ( false ) ;
2003-04-01 02:53:07 +00:00
2002-08-09 23:56:24 +00:00
}
2004-02-06 18:32:05 +00:00
/**
 * Writes the plain-text break test file: a comment header describing the format,
 * one line for every test item produced by genTestItems() for each ordered pair of
 * samples, one line for each extra single sample, and a final line count.
 *
 * @param shortVersion when true, the file name gets a short-version marker and the
 *                     per-line comments are omitted for the pair lines
 * @throws IOException if the output file cannot be opened
 */
public void generateTest ( boolean shortVersion ) throws IOException {
2002-08-08 15:38:16 +00:00
// Scratch array that genTestItems() fills for each pair.
String [ ] testCase = new String [ 50 ] ;
// do main test
2003-04-01 02:53:07 +00:00
// NOTE(review): out is not closed if anything below throws - consider try/finally.
PrintWriter out = Utility . openPrintWriter ( " TR29 \\ " + fileName + " BreakTest "
2002-08-09 23:56:24 +00:00
+ ( shortVersion ? " _SHORT " : " " )
2004-02-06 18:32:05 +00:00
+ " - " + ucd . getVersion ( )
2003-02-25 23:38:23 +00:00
+ " .txt " , Utility . UTF8_WINDOWS ) ;
2002-08-09 23:56:24 +00:00
int counter = 0 ;
out . println ( " # Default " + fileName + " Break Test " ) ;
2004-02-06 18:32:05 +00:00
out . println ( " # Generated: " + ucd . getDate ( ) + " , MED " ) ;
2002-08-09 23:56:24 +00:00
out . println ( " # " ) ;
out . println ( " # Format: " ) ;
out . println ( " # <string> (# <comment>)? " ) ;
out . println ( " # <string> contains hex Unicode code points, with " ) ;
out . println ( " # \ t " + BREAK + " wherever there is a break opportunity, and " ) ;
out . println ( " # \ t " + NOBREAK + " wherever there is not. " ) ;
out . println ( " # <comment> the format can change, but currently it shows: " ) ;
out . println ( " # \ t- the sample character name " ) ;
out . println ( " # \ t- (x) the line_break property* for the sample character " ) ;
out . println ( " # \ t- [x] the rule that determines whether there is a break or not " ) ;
out . println ( " # " ) ;
// Subclass hook for additional header text.
sampleDescription ( out ) ;
out . println ( " # These samples may be extended or changed in the future. " ) ;
out . println ( " # " ) ;
// Every ordered (before, after) pair of samples, including the extra samples.
for ( int ii = 0 ; ii < sampleLimit ; + + ii ) {
String before = samples [ ii ] ;
for ( int jj = 0 ; jj < sampleLimit ; + + jj ) {
Utility . dot ( counter ) ;
String after = samples [ jj ] ;
// do line straight
int len = genTestItems ( before , after , testCase ) ;
for ( int q = 0 ; q < len ; + + q ) {
2004-02-06 18:32:05 +00:00
// Comments are requested only for the first item of a pair, and only in the long version.
printLine ( out , testCase [ q ] , ! shortVersion & & q = = 0 , false ) ;
2002-08-09 23:56:24 +00:00
+ + counter ;
2002-08-08 15:38:16 +00:00
}
}
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// These lines are not included in the counter printed below.
for ( int ii = 0 ; ii < extraSingleSamples . length ; + + ii ) {
2004-02-06 18:32:05 +00:00
printLine ( out , extraSingleSamples [ ii ] , true , false ) ;
2002-08-08 15:38:16 +00:00
}
2002-08-09 23:56:24 +00:00
out . println ( " # Lines: " + counter ) ;
out . close ( ) ;
2002-08-08 15:38:16 +00:00
}
/**
 * Hook called by generateTest() while writing the file header; the default writes nothing.
 *
 * @param out the test file being written
 */
public void sampleDescription ( PrintWriter out ) { }
2004-02-06 18:32:05 +00:00
/**
 * Decides whether there is a break in {@code source} at {@code offset}.
 * Callers in this class read getRule() afterwards to label the decision.
 */
abstract public boolean isBreak ( String source , int offset ) ;
2003-04-03 02:29:31 +00:00
/**
 * Supplies the string that generateTable() passes to isBreak() at offset 1 while
 * collecting the rule list.
 */
abstract public String fullBreakSample ( ) ;
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
/** Returns the break type of the given code point as a byte index. */
abstract public byte getType ( int cp ) ;
2003-04-02 05:16:44 +00:00
2004-02-06 18:32:05 +00:00
public byte getSampleType ( int cp ) {
return getType ( cp ) ;
2003-04-03 02:29:31 +00:00
}
2003-04-02 05:16:44 +00:00
/**
 * Maps a type value before it is combined into a sample key. The default is
 * the identity mapping.
 *
 * @param input the type value
 * @return {@code input}, unchanged
 */
public int mapType(int input) {
    final int mapped = input;
    return mapped;
}
/**
 * Tells the table generator whether the cell for row type {@code x} and
 * column type {@code y} should be highlighted. The default never highlights.
 *
 * @param x row sample index
 * @param y column sample index
 * @param s the cell's break/no-break text
 * @return always {@code false}
 */
public boolean highlightTableEntry(int x, int y, String s) {
    final boolean highlight = false;
    return highlight;
}
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
/** Returns the display label of the break type of the given code point. */
abstract public String getTypeID ( int s ) ;
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
public String getTypeID ( String s ) {
2002-08-08 15:38:16 +00:00
if ( s = = null ) return " <null> " ;
2004-02-06 18:32:05 +00:00
if ( s . length ( ) = = 1 ) return getTypeID ( s . charAt ( 0 ) ) ;
2002-08-08 15:38:16 +00:00
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < s . length ( ) ; i + = UTF32 . count16 ( cp ) ) {
cp = UTF32 . char32At ( s , i ) ;
if ( i > 0 ) result . append ( " " ) ;
2004-02-06 18:32:05 +00:00
result . append ( getTypeID ( cp ) ) ;
2002-08-08 15:38:16 +00:00
}
return result . toString ( ) ;
}
// Returned by next() and previous() when no further break is found.
static final int DONE = - 1 ;
2004-02-06 18:32:05 +00:00
public int next ( String source , int offset ) {
2002-08-08 15:38:16 +00:00
for ( int i = offset + 1 ; i < = source . length ( ) ; + + i ) {
2004-02-06 18:32:05 +00:00
if ( isBreak ( source , i ) ) return i ;
2002-08-08 15:38:16 +00:00
}
return DONE ;
}
2004-02-06 18:32:05 +00:00
public int previous ( String source , int offset ) {
2002-08-08 15:38:16 +00:00
for ( int i = offset - 1 ; i > = 0 ; - - i ) {
2004-02-06 18:32:05 +00:00
if ( isBreak ( source , i ) ) return i ;
2002-08-08 15:38:16 +00:00
}
return DONE ;
}
/**
 * Produces the test strings for one ordered pair of samples. The default
 * produces a single item: the two samples concatenated.
 *
 * @param before  the sample placed first
 * @param after   the sample placed second
 * @param results receives the generated items starting at index 0
 * @return the number of items written to {@code results}
 */
public int genTestItems(String before, String after, String[] results) {
    final String joined = before + after;
    results[0] = joined;
    return 1;
}
2004-02-06 18:32:05 +00:00
public String getTableEntry ( String before , String after , String [ ] ruleOut ) {
boolean normalBreak = isBreak ( before + after , before . length ( ) ) ;
2003-04-03 02:29:31 +00:00
String normalRule = getRule ( ) ;
ruleOut [ 0 ] = normalRule ;
2002-08-08 15:38:16 +00:00
return normalBreak ? BREAK : NOBREAK ;
}
2004-02-06 18:32:05 +00:00
public byte getResolvedType ( int cp ) {
return getType ( cp ) ;
2002-08-08 15:38:16 +00:00
}
/**
 * Tells {@code findSamples} whether samples of the given type should be
 * suppressed. The default suppresses nothing.
 *
 * @param type the type value
 * @return always {@code false}
 */
boolean skipType(int type) {
    final boolean skip = false;
    return skip;
}
2004-02-06 18:32:05 +00:00
String getInfo ( String s ) {
2002-08-08 15:38:16 +00:00
if ( s = = null | | s . length ( ) = = 0 ) return " NULL " ;
StringBuffer result = new StringBuffer ( ) ;
int cp ;
for ( int i = 0 ; i < s . length ( ) ; i + = UTF32 . count16 ( cp ) ) {
cp = UTF32 . char32At ( s , i ) ;
if ( i > 0 ) result . append ( " , " ) ;
2004-02-06 18:32:05 +00:00
result . append ( ucd . getCodeAndName ( cp ) ) ;
result . append ( " , gc= " + ucd . getCategoryID_fromIndex ( ucd . getCategory ( cp ) , SHORT ) ) ;
result . append ( " , sc= " + ucd . getScriptID_fromIndex ( ucd . getScript ( cp ) , SHORT ) ) ;
result . append ( " , lb= " + ucd . getLineBreakID_fromIndex ( ucd . getLineBreak ( cp ) )
+ " = " + ucd . getLineBreakID_fromIndex ( ucd . getLineBreak ( cp ) , LONG ) ) ;
2002-08-08 15:38:16 +00:00
}
return result . toString ( ) ;
}
2004-02-06 18:32:05 +00:00
/**
 * Writes the HTML body of the break chart to {@code out}: the pair table (one row per
 * sample, one column per per-type sample), an optional list of suppressed types, the
 * list of rules, and the optional list of extra sample strings.
 *
 * Must be called after findSamples(): the column width is computed from tableLimit,
 * which is -1 until then (100 / (tableLimit + 1) would divide by zero).
 *
 * @param out the chart file being written
 */
public void generateTable ( PrintWriter out ) {
2003-04-01 02:53:07 +00:00
String width = " width=' " + ( 100 / ( tableLimit + 1 ) ) + " %' " ;
2002-08-08 15:38:16 +00:00
out . print ( " <table border='1' cellspacing='0' width='100%'> " ) ;
// Accumulates the header cells.
String types = " " ;
// NOTE(review): codes is never read; its only use is in the commented-out lines below.
String codes = " " ;
for ( int type = 0 ; type < tableLimit ; + + type ) {
String after = samples [ type ] ;
if ( after = = null ) continue ;
2004-02-06 18:32:05 +00:00
String h = getTypeID ( after ) ;
2003-04-02 05:16:44 +00:00
// NOTE(review): the <a> element opened here is never closed with </a>.
types + = " <th " + width + " title=' " + getInfo ( after ) + " '><a class='lbclass' href='# " + h + " '> " + h + " </th> " ;
2003-04-01 02:53:07 +00:00
//codes += "<th " + width + " title='" + getInfo(after) + "'>" + Utility.hex(after) + "</th>";
2002-08-08 15:38:16 +00:00
}
2003-04-01 02:53:07 +00:00
out . println ( " <tr><th " + width + " ></th> " + types + " </tr> " ) ;
// out.println("<tr><th " + width + "></th><th " + width + "></th>" + codes + "</tr>");
2002-08-08 15:38:16 +00:00
// One-element arrays used as out-parameters for getTableEntry().
String [ ] rule = new String [ 1 ] ;
String [ ] rule2 = new String [ 1 ] ;
2002-08-09 23:56:24 +00:00
// Rows cover all samples (including extras); columns only the per-type samples.
for ( int type = 0 ; type < sampleLimit ; + + type ) {
2003-04-03 02:29:31 +00:00
// A thin blue separator row marks where the extra samples start.
if ( type = = tableLimit ) {
out . println ( " <tr><td bgcolor='#0000FF' colSpan=' " + ( tableLimit + 1 ) + " ' style='font-size: 1px'> </td></tr> " ) ;
}
2002-08-08 15:38:16 +00:00
String before = samples [ type ] ;
if ( before = = null ) continue ;
2004-02-06 18:32:05 +00:00
String h = getTypeID ( before ) ;
String line = " <tr><th title=' " + ucd . getCodeAndName ( before ) + " '><a class='lbclass' href='# " + h + " '> "
2003-04-02 05:16:44 +00:00
+ h + " </th> " ;
2002-08-08 15:38:16 +00:00
for ( int type2 = 0 ; type2 < tableLimit ; + + type2 ) {
2003-04-02 05:16:44 +00:00
2002-08-08 15:38:16 +00:00
String after = samples [ type2 ] ;
if ( after = = null ) continue ;
2004-02-06 18:32:05 +00:00
String t = getTableEntry ( before , after , rule ) ;
2002-08-08 15:38:16 +00:00
String background = " " ;
2004-02-06 18:32:05 +00:00
// NOTE(review): t2 is computed with the same arguments as t, so the !t.equals(t2)
// branch below looks unreachable unless getTableEntry() is non-deterministic - confirm.
String t2 = getTableEntry ( before , after , rule2 ) ;
2003-04-02 05:16:44 +00:00
if ( highlightTableEntry ( type , type2 , t ) ) {
background = " bgcolor='#FFFF00' " ;
}
2002-08-08 15:38:16 +00:00
// This chain may overwrite the highlight colour chosen just above.
if ( ! t . equals ( t2 ) ) {
if ( t . equals ( NOBREAK ) ) {
background = " bgcolor='#CCFFFF' " ;
} else {
background = " bgcolor='#FFFF00' " ;
}
} else if ( t . equals ( NOBREAK ) ) {
background = " bgcolor='#CCCCFF' " ;
}
2003-04-03 02:29:31 +00:00
line + = " <th title=' " + rule [ 0 ] + " ' " + background + " class='pairItem'> " + t + " </th> " ;
2002-08-08 15:38:16 +00:00
}
out . println ( line + " </tr> " ) ;
}
out . println ( " </table> " ) ;
2003-04-03 02:29:31 +00:00
// List one representative code point for every type that findSamples() suppressed.
if ( didSkipSamples ) {
out . println ( " <p><b>Suppressed:</b> " ) ;
for ( int i = 0 ; i < skippedSamples . length ; + + i ) {
if ( skippedSamples [ i ] > 0 ) {
2003-04-23 20:18:43 +00:00
String tmp = UTF16 . valueOf ( skippedSamples [ i ] ) ;
2004-02-06 18:32:05 +00:00
out . println ( " <span title=' " + getInfo ( tmp ) + " '> " + getTypeID ( tmp ) + " </span> " ) ;
2003-04-03 02:29:31 +00:00
}
}
out . println ( " </p> " ) ;
}
// gather the data for the rules
// While collectingRules is true, every setRule() call made by isBreak() is recorded.
collectingRules = true ;
2004-02-06 18:32:05 +00:00
isBreak ( fullBreakSample ( ) , 1 ) ;
2003-04-03 02:29:31 +00:00
collectingRules = false ;
2003-04-01 02:53:07 +00:00
2003-04-03 02:29:31 +00:00
out . println ( " <h3>Rules</h3> " ) ;
out . println ( " <ul> " ) ;
// NOTE(review): ruleListCount is never reset, so a second call would list the rules twice - confirm.
for ( int ii = 0 ; ii < ruleListCount ; + + ii ) {
out . println ( " <li> " + ruleList [ ii ] + " </li> " ) ;
2002-08-08 15:38:16 +00:00
}
2003-04-03 02:29:31 +00:00
out . println ( " </ul> " ) ;
if ( extraSingleSamples . length > 0 ) {
out . println ( " <h3>Sample Strings</h3> " ) ;
out . println ( " <ol> " ) ;
for ( int ii = 0 ; ii < extraSingleSamples . length ; + + ii ) {
out . println ( " <li><font size='5'> " ) ;
2004-02-06 18:32:05 +00:00
printLine ( out , extraSingleSamples [ ii ] , true , true ) ;
2003-04-03 02:29:31 +00:00
out . println ( " </font></li> " ) ;
}
out . println ( " </ol> " ) ;
}
2002-08-08 15:38:16 +00:00
}
// Markers written into the test file and chart: BREAK where there is a break
// opportunity, NOBREAK where there is not.
static final String BREAK = " \ u00F7 " ;
static final String NOBREAK = " \ u00D7 " ;
2004-02-06 18:32:05 +00:00
/**
 * Prints one test line for {@code source}: the break status before the first code
 * point, then each code point followed by the break status after it.
 *
 * In text mode each code point is written in hex with BREAK/NOBREAK markers, and a
 * trailing comment (names, type IDs and deciding rules) is appended when requested.
 * In HTML mode each code point and each boundary is wrapped in span elements whose
 * title carries the character info or the deciding rule.
 *
 * @param out      destination writer
 * @param source   the sample string
 * @param comments whether to append the explanatory comment (text mode only)
 * @param html     whether to emit HTML spans instead of the plain-text format
 */
public void printLine ( PrintWriter out , String source , boolean comments , boolean html ) {
2002-08-08 15:38:16 +00:00
int cp ;
StringBuffer string = new StringBuffer ( ) ;
StringBuffer comment = new StringBuffer ( " \ t# " ) ;
2004-02-06 18:32:05 +00:00
// Status at the very start of the string.
boolean hasBreak = isBreak ( source , 0 ) ;
2002-08-08 15:38:16 +00:00
String status ;
if ( html ) {
status = hasBreak ? " style='border-right: 1px solid blue' " : " " ;
2003-04-03 02:29:31 +00:00
string . append ( " <span title=' " + getRule ( ) + " '><span " + status + " > </span> <span> " ) ;
2002-08-08 15:38:16 +00:00
} else {
status = hasBreak ? BREAK : NOBREAK ;
string . append ( status ) ;
}
2003-04-03 02:29:31 +00:00
// getRule() still reflects the isBreak() call above.
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( getRule ( ) ) . append ( ']' ) ;
2002-08-08 15:38:16 +00:00
for ( int offset = 0 ; offset < source . length ( ) ; offset + = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , offset ) ;
2004-02-06 18:32:05 +00:00
// Status at the boundary immediately after this code point.
hasBreak = isBreak ( source , offset + UTF16 . getCharCount ( cp ) ) ;
2002-08-08 15:38:16 +00:00
if ( html ) {
status = hasBreak ? " style='border-right: 1px solid blue' " : " " ;
string . append ( " <span title=' " +
2004-02-06 18:32:05 +00:00
Utility . quoteXML ( ucd . getCodeAndName ( cp ) + " ( " + getTypeID ( cp ) + " ) " , true )
2002-08-08 15:38:16 +00:00
+ " '> "
+ Utility . quoteXML ( Utility . getDisplay ( cp ) , true )
+ " </span> " ) ;
2003-04-03 02:29:31 +00:00
string . append ( " <span title=' " + getRule ( ) + " '><span " + status + " > </span> <span> " ) ;
2002-08-08 15:38:16 +00:00
} else {
if ( string . length ( ) > 0 ) {
string . append ( ' ' ) ;
comment . append ( ' ' ) ;
}
status = hasBreak ? BREAK : NOBREAK ;
string . append ( Utility . hex ( cp ) ) ;
2004-02-06 18:32:05 +00:00
comment . append ( ucd . getName ( cp ) + " ( " + getTypeID ( cp ) + " ) " ) ;
2002-08-08 15:38:16 +00:00
string . append ( ' ' ) . append ( status ) ;
2003-04-03 02:29:31 +00:00
comment . append ( ' ' ) . append ( status ) . append ( " [ " ) . append ( getRule ( ) ) . append ( ']' ) ;
2002-08-08 15:38:16 +00:00
}
}
// The comment is built in every mode but only emitted for plain-text lines.
if ( comments & & ! html ) string . append ( comment ) ;
out . println ( string ) ;
}
/**
 * Fills {@code samples} with one sample character per distinct (mapped) sample type,
 * ordered by the combined type key, then appends {@code extraSamples}. Sets
 * {@code tableLimit} to the number of per-type samples and {@code sampleLimit} to the
 * total. Types rejected by skipType() are recorded in {@code skippedSamples} instead.
 */
public void findSamples ( ) {
// what we want is a list of sample characters. In the simple case, this is just one per type.
// However, if there are characters that have different types (when recommended or not), then
// we want a type for each cross-section
// bitset marks the combined keys already seen; list maps each key to its first code point.
BitSet bitset = new BitSet ( ) ;
Map list = new TreeMap ( ) ;
// Starts at 1, so U+0000 is never chosen as a sample.
for ( int i = 1 ; i < = 0x10FFFF ; + + i ) {
2004-02-06 18:32:05 +00:00
if ( ! ucd . isAllocated ( i ) ) continue ;
2002-08-08 15:38:16 +00:00
// Skip the surrogate code points.
if ( 0xD800 < = i & & i < = 0xDFFF ) continue ;
if ( DEBUG & & i = = 0x1100 ) {
System . out . println ( " debug " ) ;
}
2004-02-06 18:32:05 +00:00
byte lb = getSampleType ( i ) ;
// lb2 is currently always equal to lb, so the lb == lb2 test below is always true.
byte lb2 = lb ; // HACK
2003-04-03 02:29:31 +00:00
if ( lb = = lb2 & & skipType ( lb ) ) {
// Keeps the last code point seen for each suppressed type.
// NOTE(review): lb is used as an index unchecked - assumes 0 <= lb < 100; confirm.
skippedSamples [ lb ] = i ;
didSkipSamples = true ;
continue ;
}
2002-08-08 15:38:16 +00:00
2003-04-02 05:16:44 +00:00
// Pack both mapped types into one key, 7 bits for the second one.
int combined = ( mapType ( lb ) < < 7 ) + mapType ( lb2 ) ;
2002-08-08 15:38:16 +00:00
// Only the first code point of each key becomes the sample.
if ( ! bitset . get ( combined ) ) {
bitset . set ( combined ) ;
list . put ( new Integer ( combined ) , UTF16 . valueOf ( i ) ) ;
}
/ *
// if the sample slot is full OR
if ( samples [ lb ] = = null ) {
samples [ lb ] = UTF16 . valueOf ( i ) ;
if ( sampleLimit < = lb ) sampleLimit = lb + 1 ;
// byte lb2 = getType(i, true);
// if (lb2 != lb) bs.set(lb);
}
* /
}
// TreeMap iteration yields the samples in ascending key order.
Iterator it = list . keySet ( ) . iterator ( ) ;
while ( it . hasNext ( ) ) {
String sample = ( String ) list . get ( it . next ( ) ) ;
// NOTE(review): no bounds check against samples.length (100) - confirm the type count stays below it.
samples [ sampleLimit + + ] = sample ;
2004-02-06 18:32:05 +00:00
if ( DEBUG ) System . out . println ( getTypeID ( sample ) + " : \ t " + ucd . getCodeAndName ( sample ) ) ;
2002-08-08 15:38:16 +00:00
}
// Everything added so far forms the columns of the chart.
tableLimit = sampleLimit ;
// now add values that are different
/ *
for ( int i = 1 ; i < = 0x10FFFF ; + + i ) {
2004-02-06 18:32:05 +00:00
if ( ! ucd . isAllocated ( i ) ) continue ;
2002-08-08 15:38:16 +00:00
if ( 0xD800 < = i & & i < = 0xDFFF ) continue ;
byte lb = getType ( i ) ;
byte lb2 = getType ( i , true ) ;
if ( lb = = lb2 ) continue ;
// pick some different ones
if ( ! bs . get ( lb ) ) {
samples [ sampleLimit + + ] = UTF16 . valueOf ( i ) ;
bs . set ( lb ) ;
}
if ( ! bs2 . get ( lb2 ) ) {
samples [ sampleLimit + + ] = UTF16 . valueOf ( i ) ;
bs . set ( lb2 ) ;
}
}
* /
2002-08-09 23:56:24 +00:00
// Extra samples become additional rows only (they lie beyond tableLimit).
if ( extraSamples . length > 0 ) {
System . arraycopy ( extraSamples , 0 , samples , sampleLimit , extraSamples . length ) ;
sampleLimit + = extraSamples . length ;
}
2002-08-08 15:38:16 +00:00
}
2004-02-06 18:32:05 +00:00
public int findLastNon ( String source , int offset , byte notLBType ) {
2002-08-08 15:38:16 +00:00
int cp ;
for ( int i = offset - 1 ; i > = 0 ; i - = UTF16 . getCharCount ( cp ) ) {
cp = UTF16 . charAt ( source , i ) ;
2004-02-06 18:32:05 +00:00
byte f = getResolvedType ( cp ) ;
2002-08-08 15:38:16 +00:00
if ( f ! = notLBType ) return i ;
}
return - 1 ;
}
2004-02-06 18:32:05 +00:00
public static UnicodeSet getSet ( UCD ucd , int prop , byte propValue ) {
return UnifiedBinaryProperty . make ( prop | propValue , ucd ) . getSet ( ) ;
2003-04-01 02:53:07 +00:00
}
2002-08-08 15:38:16 +00:00
2003-02-25 23:38:23 +00:00
/**
 * Holder for the two code points on each side of an offset, together with
 * their types; filled in by {@code getGraphemeBases}.
 */
static public class Context {
    public int cpBefore2, cpBefore, cpAfter, cpAfter2;
    public byte tBefore2, tBefore, tAfter, tAfter2;

    /**
     * Lists the four code points in hex, each followed by its type in
     * parentheses, in the order before2, before, after, after2.
     */
    public String toString() {
        StringBuffer text = new StringBuffer(" [ ");
        text.append(Utility.hex(cpBefore2)).append(" ( ").append(tBefore2).append(" ), ");
        text.append(Utility.hex(cpBefore)).append(" ( ").append(tBefore).append(" ), ");
        text.append(Utility.hex(cpAfter)).append(" ( ").append(tAfter).append(" ), ");
        text.append(Utility.hex(cpAfter2)).append(" ( ").append(tAfter2).append(" )] ");
        return text.toString();
    }
}
2004-02-06 18:32:05 +00:00
public void getGraphemeBases ( MyBreakIterator graphemeIterator , String source , int offset , int ignoreType , Context context ) {
2003-02-25 23:38:23 +00:00
context . cpBefore2 = context . cpBefore = context . cpAfter = context . cpAfter2 = - 1 ;
context . tBefore2 = context . tBefore = context . tAfter = context . tAfter2 = - 1 ;
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(source) + "; " + offset + "; " + ignoreType);
2004-02-06 18:32:05 +00:00
//MyBreakIterator graphemeIterator = new MyBreakIterator(new GenerateGraphemeBreakTest(ucd));
2003-02-25 23:38:23 +00:00
graphemeIterator . set ( source , offset ) ;
while ( true ) {
int cp = graphemeIterator . previousBase ( ) ;
if ( cp = = - 1 ) break ;
2004-02-06 18:32:05 +00:00
byte t = getResolvedType ( cp ) ;
2003-02-25 23:38:23 +00:00
if ( t = = ignoreType ) continue ;
if ( context . cpBefore = = - 1 ) {
context . cpBefore = cp ;
context . tBefore = t ;
} else {
context . cpBefore2 = cp ;
context . tBefore2 = t ;
break ;
}
}
graphemeIterator . set ( source , offset ) ;
while ( true ) {
int cp = graphemeIterator . nextBase ( ) ;
if ( cp = = - 1 ) break ;
2004-02-06 18:32:05 +00:00
byte t = getResolvedType ( cp ) ;
2003-02-25 23:38:23 +00:00
if ( t = = ignoreType ) continue ;
if ( context . cpAfter = = - 1 ) {
context . cpAfter = cp ;
context . tAfter = t ;
} else {
context . cpAfter2 = cp ;
context . tAfter2 = t ;
break ;
}
}
}
2003-04-01 02:53:07 +00:00
//==============================================
static class GenerateGraphemeBreakTest extends GenerateBreakTest {
2004-02-06 18:32:05 +00:00
/**
 * Creates the grapheme-cluster generator: sets the output base name and
 * publishes the type map as the sample map.
 *
 * @param ucd the character database, passed on to the superclass
 */
GenerateGraphemeBreakTest(UCD ucd) {
    super(ucd);
    this.fileName = " GraphemeCluster ";
    this.sampleMap = this.map;
}
2003-04-01 02:53:07 +00:00
2004-02-06 18:32:05 +00:00
// Type constants for the grapheme-cluster rules. Each one is the value returned by
// map.add() when its labelled set is registered, so the declaration order below is
// also the registration order.
final int
2003-04-23 20:18:43 +00:00
CR = map . add ( " CR " , new UnicodeSet ( 0xD , 0xD ) ) ,
LF = map . add ( " LF " , new UnicodeSet ( 0xA , 0xA ) ) ,
2003-04-01 02:53:07 +00:00
// Control = categories Cc, Cf, Zp and Zl, minus the CR and LF sets registered above.
Control = map . add ( " Control " ,
2004-02-06 18:32:05 +00:00
getSet ( ucd , CATEGORY , Cc )
. addAll ( getSet ( ucd , CATEGORY , Cf ) )
. addAll ( getSet ( ucd , CATEGORY , Zp ) )
. addAll ( getSet ( ucd , CATEGORY , Zl ) )
2003-04-01 02:53:07 +00:00
. removeAll ( map . getSetFromIndex ( CR ) )
. removeAll ( map . getSetFromIndex ( LF ) ) ) ,
2004-02-06 18:32:05 +00:00
Extend = map . add ( " Extend " , getSet ( ucd , DERIVED , GraphemeExtend ) ) ,
// Hangul syllable types.
L = map . add ( " L " , getSet ( ucd , HANGUL_SYLLABLE_TYPE , UCD_Types . L ) ) ,
V = map . add ( " V " , getSet ( ucd , HANGUL_SYLLABLE_TYPE , UCD_Types . V ) ) ,
T = map . add ( " T " , getSet ( ucd , HANGUL_SYLLABLE_TYPE , UCD_Types . T ) ) ,
LV = map . add ( " LV " , getSet ( ucd , HANGUL_SYLLABLE_TYPE , UCD_Types . LV ) ) ,
LVT = map . add ( " LVT " , getSet ( ucd , HANGUL_SYLLABLE_TYPE , UCD_Types . LVT ) ) ,
2003-04-01 02:53:07 +00:00
// NOTE(review): presumably the two false flags keep this catch-all range from overriding
// the entries registered above - confirm against OldUnicodeMap.add.
Other = map . add ( " Other " , new UnicodeSet ( 0 , 0x10FFFF ) , false , false ) ;
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public String getTypeID ( int cp ) {
2003-04-01 02:53:07 +00:00
return map . getLabel ( cp ) ;
}
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public byte getType ( int cp ) {
2003-04-01 02:53:07 +00:00
return ( byte ) map . getIndex ( cp ) ;
}
2003-04-03 02:29:31 +00:00
public String fullBreakSample ( ) {
return " aa " ;
}
2003-04-01 02:53:07 +00:00
2004-02-06 18:32:05 +00:00
public boolean isBreak ( String source , int offset ) {
2003-04-03 02:29:31 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 1: sot ÷ " ) ;
2003-04-01 02:53:07 +00:00
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 2: ÷ eot " ) ;
2003-04-01 02:53:07 +00:00
if ( offset = = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
if ( ! onCodepointBoundary ( source , offset ) ) return false ;
// now get the character before and after, and their types
int cpBefore = UTF16 . charAt ( source , offset - 1 ) ;
int cpAfter = UTF16 . charAt ( source , offset ) ;
2004-02-06 18:32:05 +00:00
byte before = getResolvedType ( cpBefore ) ;
byte after = getResolvedType ( cpAfter ) ;
2003-04-01 02:53:07 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 3: CR × LF " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = CR & & after = = LF ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 4: ( Control | CR | LF ) ÷ " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = CR | | before = = LF | | before = = Control ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 5: ÷ ( Control | CR | LF ) " ) ;
2003-04-01 02:53:07 +00:00
if ( after = = Control | | after = = LF | | after = = CR ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 6: L × ( L | V | LV | LVT ) " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = L & & ( after = = L | | after = = V | | after = = LV | | after = = LVT ) ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 7: ( LV | V ) × ( V | T ) " ) ;
2003-04-01 02:53:07 +00:00
if ( ( before = = LV | | before = = V ) & & ( after = = V | | after = = T ) ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 8: ( LVT | T ) × T " ) ;
2003-04-01 02:53:07 +00:00
if ( ( before = = LVT | | before = = T ) & & ( after = = T ) ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 9: × Extend " ) ;
2003-04-01 02:53:07 +00:00
if ( after = = Extend ) return false ;
// Otherwise break after all characters.
2004-04-17 18:21:39 +00:00
setRule ( " 10: Any ÷ Any " ) ;
2003-04-01 02:53:07 +00:00
return true ;
}
}
//==============================================
static class GenerateWordBreakTest extends GenerateBreakTest {
2004-02-06 18:32:05 +00:00
// grapheme supplies the grapheme-cluster rules, breaker is the iterator built on
// it, and context is the reusable holder that isBreak fills via getGraphemeBases.
GenerateGraphemeBreakTest grapheme ;
MyBreakIterator breaker ;
Context context = new Context ( ) ;
// Builds the grapheme helper and its iterator, sets fileName and sampleMap, and
// prepares the extra sample strings for the word-break test.
GenerateWordBreakTest ( UCD ucd ) {
super ( ucd ) ;
grapheme = new GenerateGraphemeBreakTest ( ucd ) ;
breaker = new MyBreakIterator ( grapheme ) ;
fileName = " Word " ;
sampleMap = map ;
extraSamples = new String [ ] {
/*"\uFF70", "\uFF65", "\u30FD", */ " a \ u2060 " , " a: " , " a' " , " a' \ u2060 " , " a, " , " 1: " , " 1' " , " 1, " , " 1. \ u2060 "
} ;
String [ ] temp = { " can't " , " can \ u2019t " , " ab \ u00ADby " , " a$-34,567.14%b " , " 3a " } ;
// extraSingleSamples = the entries of temp, followed by one extra variant per
// entry produced by insertEverywhere with the U+2060 string and the grapheme rules.
extraSingleSamples = new String [ temp . length * 2 ] ;
System . arraycopy ( temp , 0 , extraSingleSamples , 0 , temp . length ) ;
for ( int i = 0 ; i < temp . length ; + + i ) {
extraSingleSamples [ i + temp . length ] = insertEverywhere ( temp [ i ] , " \ u2060 " , grapheme ) ;
}
// Disabled diagnostic (the condition is the constant false): would pass the
// Katakana class set and the Script=Katakana set to Utility.showSetDifferences.
if ( false ) Utility . showSetDifferences ( " Katakana " , map . getSetFromIndex ( Katakana ) ,
" Script=Katakana " , getSet ( ucd , SCRIPT , KATAKANA_SCRIPT ) , false , ucd ) ;
}
2003-04-01 02:53:07 +00:00
//static String LENGTH = "[\u30FC\uFF70]";
//static String HALFWIDTH_KATAKANA = "[\uFF66-\uFF9F]";
//static String KATAKANA_ITERATION = "[\u30FD\u30FE]";
//static String HIRAGANA_ITERATION = "[\u309D\u309E]";
2004-02-06 18:32:05 +00:00
// Word-break character classes. Each constant stores the int returned by map.add
// for one labelled class; isBreak compares resolved types against these values.
// Declaration order matters: ALetter subtracts the Katakana set registered before
// it, MidNum subtracts MidNumLet, and Other (all code points) is registered last.
final int
// Format = general category Cf without U+00AD (which is listed under MidLetter).
Format = map . add ( " Format " , getSet ( ucd , CATEGORY , Cf ) . remove ( 0x00AD ) ) ,
Katakana = map . add ( " Katakana " , getSet ( ucd , SCRIPT , KATAKANA_SCRIPT )
2003-04-01 02:53:07 +00:00
. addAll ( new UnicodeSet ( " [ \ u30FC \ uFF70 \ uFF9E \ uFF9F] " ) )
//.addAll(new UnicodeSet(HALFWIDTH_KATAKANA))
//.addAll(new UnicodeSet(KATAKANA_ITERATION))
) ,
// ALetter = Alphabetic plus U+05F3, minus Katakana, Ideographic, Thai, Lao
// and Hiragana.
ALetter = map . add ( " ALetter " ,
2004-02-06 18:32:05 +00:00
getSet ( ucd , DERIVED , PropAlphabetic )
2003-04-01 02:53:07 +00:00
. add ( 0x05F3 , 0x05F3 )
. removeAll ( map . getSetFromIndex ( Katakana ) )
2004-02-06 18:32:05 +00:00
. removeAll ( getSet ( ucd , BINARY_PROPERTIES , Ideographic ) )
. removeAll ( getSet ( ucd , SCRIPT , THAI_SCRIPT ) )
. removeAll ( getSet ( ucd , SCRIPT , LAO_SCRIPT ) )
. removeAll ( getSet ( ucd , SCRIPT , HIRAGANA_SCRIPT ) )
2003-04-01 02:53:07 +00:00
) ,
MidLetter = map . add ( " MidLetter " ,
new UnicodeSet ( " [ \\ u0027 \\ u00AD \\ u00B7 \\ u05f4 \\ u05F4 \\ u2019 \\ u2027] " ) ) ,
MidNumLet = map . add ( " MidNumLet " ,
new UnicodeSet ( " [ \\ u002E \\ u003A] " ) ) ,
2004-02-06 18:32:05 +00:00
// MidNum is the infix numeric separator line-break class (IS) minus the
// MidNumLet characters. The previous code used LB_IN (Inseparable), which
// contains neither of the MidNumLet characters nor the comma, so separators
// such as "," were classified as Other.
MidNum = map . add ( " MidNum " , getSet ( ucd , LINE_BREAK , LB_IS )
2003-04-01 02:53:07 +00:00
. removeAll ( map . getSetFromIndex ( MidNumLet ) ) ) ,
2004-02-06 18:32:05 +00:00
Numeric = map . add ( " Numeric " , getSet ( ucd , LINE_BREAK , LB_NU ) ) ,
2003-04-01 02:53:07 +00:00
// NOTE(review): the meaning of the two boolean arguments is not visible here.
Other = map . add ( " Other " , new UnicodeSet ( 0 , 0x10FFFF ) , false , false ) ;
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
// Returns the label that map holds for cp.
public String getTypeID ( int cp ) {
2003-04-01 02:53:07 +00:00
return map . getLabel ( cp ) ;
}
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
// Returns the index that map holds for cp, narrowed to a byte.
public byte getType ( int cp ) {
2003-04-01 02:53:07 +00:00
return ( byte ) map . getIndex ( cp ) ;
}
2003-04-03 02:29:31 +00:00
// Returns the fixed sample string for this test type.
public String fullBreakSample ( ) {
return " a " ;
}
2003-04-01 02:53:07 +00:00
// Fills results with test strings built from the pair (before, after): the bare
// pair, the pair with each side followed by two combining marks and wrapped in
// 'a', and the same with the MidLetter or MidNum sample inserted between them.
// NOTE(review): four entries (0..3) are written but 3 is returned. If the return
// value is the number of usable entries, results[3] is never used - confirm
// against the caller before changing.
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
results [ 1 ] = 'a' + before + " \ u0301 \ u0308 " + after + " \ u0301 \ u0308 " + 'a' ;
results [ 2 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidLetter ] + after + " \ u0301 \ u0308 " + 'a' ;
results [ 3 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidNum ] + after + " \ u0301 \ u0308 " + 'a' ;
return 3 ;
}
2004-02-06 18:32:05 +00:00
public boolean isBreak ( String source , int offset ) {
2003-04-01 02:53:07 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 1: sot ÷ " ) ;
2003-04-01 02:53:07 +00:00
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 2: ÷ eot " ) ;
2003-04-01 02:53:07 +00:00
if ( offset = = source . length ( ) ) return true ;
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
2003-04-03 02:29:31 +00:00
setRule ( " 3: GC -> FC " ) ;
2004-02-06 18:32:05 +00:00
if ( ! grapheme . isBreak ( source , offset ) ) return false ;
2003-04-01 02:53:07 +00:00
2003-04-03 02:29:31 +00:00
setRule ( " 4: X Format* -> X " ) ;
2004-02-06 18:32:05 +00:00
byte afterChar = getResolvedType ( source . charAt ( offset ) ) ;
2003-04-03 02:29:31 +00:00
if ( afterChar = = Format ) return false ;
2003-04-01 02:53:07 +00:00
// now get the base character before and after, and their types
2004-02-06 18:32:05 +00:00
getGraphemeBases ( breaker , source , offset , Format , context ) ;
2003-04-01 02:53:07 +00:00
byte before = context . tBefore ;
byte after = context . tAfter ;
byte before2 = context . tBefore2 ;
byte after2 = context . tAfter2 ;
//Don't break between most letters
2004-04-17 18:21:39 +00:00
setRule ( " 5: ALetter × ALetter " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = ALetter & & after = = ALetter ) return false ;
2004-04-17 18:21:39 +00:00
// Don’ t break letters across certain punctuation
2003-04-01 02:53:07 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 6: ALetter × (MidLetter | MidNumLet) ALetter " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = ALetter & & ( after = = MidLetter | | after = = MidNumLet ) & & after2 = = ALetter ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 7: ALetter (MidLetter | MidNumLet) × ALetter " ) ;
2003-04-01 02:53:07 +00:00
if ( before2 = = ALetter & & ( before = = MidLetter | | before = = MidNumLet ) & & after = = ALetter ) return false ;
2004-04-17 18:21:39 +00:00
// Don’ t break within sequences of digits, or digits adjacent to letters.
2003-04-01 02:53:07 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 8: Numeric × Numeric " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = Numeric & & after = = Numeric ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 9: ALetter × Numeric " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = ALetter & & after = = Numeric ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 10: Numeric × ALetter " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = Numeric & & after = = ALetter ) return false ;
2004-04-17 18:21:39 +00:00
// Don’ t break within sequences like: '-3.2'
setRule ( " 11: Numeric (MidNum | MidNumLet) × Numeric " ) ;
2003-04-01 02:53:07 +00:00
if ( before2 = = Numeric & & ( before = = MidNum | | before = = MidNumLet ) & & after = = Numeric ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 12: Numeric × (MidNum | MidNumLet) Numeric " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = Numeric & & ( after = = MidNum | | after = = MidNumLet ) & & after2 = = Numeric ) return false ;
// Don't break between Katakana
2004-04-17 18:21:39 +00:00
setRule ( " 13: Katakana × Katakana " ) ;
2003-04-01 02:53:07 +00:00
if ( before = = Katakana & & after = = Katakana ) return false ;
// Otherwise break always.
2004-04-17 18:21:39 +00:00
setRule ( " 14: Any ÷ Any " ) ;
2003-04-01 02:53:07 +00:00
return true ;
}
}
2002-08-08 15:38:16 +00:00
// ========================================
static class GenerateLineBreakTest extends GenerateBreakTest {
2004-02-06 18:32:05 +00:00
// grapheme supplies the grapheme-cluster rules, breaker is the iterator built on
// it, and context is the reusable holder that isBreak fills via getGraphemeBases.
GenerateGraphemeBreakTest grapheme ;
MyBreakIterator breaker ;
Context context = new Context ( ) ;
// Builds the grapheme helper and its iterator, sets sampleMap and fileName, and
// lists the extra single sample strings for the line-break test.
GenerateLineBreakTest ( UCD ucd ) {
super ( ucd ) ;
grapheme = new GenerateGraphemeBreakTest ( ucd ) ;
breaker = new MyBreakIterator ( grapheme ) ;
sampleMap = map ;
fileName = " Line " ;
extraSingleSamples = new String [ ] { " can't " , " can \ u2019t " , " ab \ u00ADby " ,
" -3 " ,
" e.g. " ,
" \ u4e00. \ u4e00. " ,
" a b " ,
" a \ u200bb " ,
" a \ u0308b " ,
" 1 \ u0308b(a)-(b) " ,
} ;
}
2002-08-08 15:38:16 +00:00
// all the other items are supplied in UCD_TYPES
2003-04-02 05:16:44 +00:00
/ * static byte LB_L = LB_LIMIT + hL , LB_V = LB_LIMIT + hV , LB_T = LB_LIMIT + hT ,
2002-08-08 15:38:16 +00:00
LB_LV = LB_LIMIT + hLV , LB_LVT = LB_LIMIT + hLVT , LB_SUP = LB_LIMIT + hLIMIT ,
LB2_LIMIT = ( byte ) ( LB_SUP + 1 ) ;
2003-04-02 05:16:44 +00:00
* /
2002-08-08 15:38:16 +00:00
/ *
private byte [ ] AsmusOrderToMyOrder = {
LB_OP , LB_CL , LB_QU , LB_GL , LB_NS , LB_EX , LB_SY , LB_IS , LB_PR , LB_PO ,
LB_NU , LB_AL , LB_ID , LB_IN , LB_HY , LB_BA , LB_BB , LB_B2 , LB_ZW , LB_CM ,
// missing from Pair Table
LB_SP , LB_BK , LB_CR , LB_LF ,
// resolved types below
LB_CB , LB_AI , LB_SA , LB_SG , LB_XX ,
// 3 JAMO CLASSES, plus supplementary
LB_L , LB_V , LB_T , LB_LV , LB_LVT , LB_SUP
} ;
private byte [ ] MyOrderToAsmusOrder = new byte [ AsmusOrderToMyOrder . length ] ;
{
for ( byte i = 0 ; i < AsmusOrderToMyOrder . length ; + + i ) {
MyOrderToAsmusOrder [ AsmusOrderToMyOrder [ i ] ] = i ;
}
2003-04-02 05:16:44 +00:00
* /
2004-02-06 18:32:05 +00:00
// Instance initializer: records the line break value of every code point
// (0..0x10FFFF) in map, then labels each index below LB_LIMIT with the ID that
// ucd.getLineBreakID_fromIndex returns for it in SHORT form.
{
2003-04-02 05:16:44 +00:00
//System.out.println("Adding Linebreak");
for ( int i = 0 ; i < = 0x10FFFF ; + + i ) {
2004-02-06 18:32:05 +00:00
map . put ( i , ucd . getLineBreak ( i ) ) ;
2003-04-02 05:16:44 +00:00
}
for ( int i = 0 ; i < LB_LIMIT ; + + i ) {
2004-02-06 18:32:05 +00:00
map . setLabel ( i , ucd . getLineBreakID_fromIndex ( ( byte ) i , SHORT ) ) ;
2003-04-02 05:16:44 +00:00
}
//System.out.println(map.getSetFromIndex(LB_CL));
//System.out.println("Done adding Linebreak");
}
// Translates selected line break class values to the fixed numbers 16-30; any
// other input is returned unchanged.
// NOTE(review): presumably these numbers fix the position of each class in the
// generated output - confirm against the caller.
public int mapType(int input) {
    switch (input) {
        case LB_BA: return 16;
        case LB_BB: return 17;
        case LB_B2: return 18;
        case LB_ZW: return 19;
        case LB_CM: return 20;
        case LB_WJ: return 21;
        case LB_SP: return 22;
        case LB_BK: return 23;
        case LB_NL: return 24;
        case LB_CR: return 25;
        case LB_LF: return 26;
        case LB_CB: return 27;
        case LB_SA: return 28;
        case LB_AI: return 29;
        case LB_SG: return 30;
        default: return input;
    }
}
2002-08-08 15:38:16 +00:00
// Writes the "# Samples:" comment header describing how the line-break test
// strings are built to out.
// NOTE(review): the emitted text contains the typo "imbeded" (embedded).
// NOTE(review): the note about added Jamo and supplementary characters may be
// stale - the code that defined those extra types is commented out in this
// class. Confirm before relying on it.
public void sampleDescription ( PrintWriter out ) {
out . println ( " # Samples: " ) ;
out . println ( " # The test currently takes all pairs of linebreak types*, " ) ;
out . println ( " # picks a sample for each type, and generates three strings: " ) ;
out . println ( " # \ t- the pair alone " ) ;
out . println ( " # \ t- the pair alone with an imbeded space " ) ;
out . println ( " # \ t- the pair alone with embedded combining marks " ) ;
out . println ( " # The sample for each type is simply the first code point (above NULL) " ) ;
out . println ( " # with that property. " ) ;
out . println ( " # * Note: " ) ;
out . println ( " # \ t- SG is omitted " ) ;
out . println ( " # \ t- 3 different Jamo characters and a supplementary character are added " ) ;
out . println ( " # \ t The syllable types for the Jamo (L, V, T) are displayed in comments " ) ;
out . println ( " # \ t instead of the linebreak property " ) ;
out . println ( " # " ) ;
}
// stuff that subclasses need to override
// Fills results[0..2] with the three test strings for the pair (before, after):
// the pair alone, the pair with a separator string between them, and the pair
// with two combining marks between them. Returns 3, the number of entries written.
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
results [ 1 ] = before + " " + after ;
results [ 2 ] = before + " \ u0301 \ u0308 " + after ;
return 3 ;
}
// stuff that subclasses need to override
boolean skipType ( int type ) {
2003-04-02 05:16:44 +00:00
return type = = LB_AI | | type = = LB_SA | | type = = LB_SG | | type = = LB_XX
| | type = = LB_CB | | type = = LB_CR | | type = = LB_BK | | type = = LB_LF
| | type = = LB_NL | | type = = LB_SP ;
2002-08-08 15:38:16 +00:00
}
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
// Returns the line break ID that ucd reports for cp. The commented-out code
// is an earlier variant that special-cased supplementary and Hangul types.
public String getTypeID ( int cp ) {
2003-04-02 05:16:44 +00:00
/ *
2004-02-06 18:32:05 +00:00
byte result = getType ( cp ) ;
2002-08-08 15:38:16 +00:00
if ( result = = LB_SUP ) return " SUP " ;
if ( result > = LB_LIMIT ) return hNames [ result - LB_LIMIT ] ;
2003-04-02 05:16:44 +00:00
* /
2004-02-06 18:32:05 +00:00
// return ucd.getLineBreakID_fromIndex(cp); // AsmusOrderToMyOrder[result]);
return ucd . getLineBreakID ( cp ) ; // AsmusOrderToMyOrder[result]);
2002-08-08 15:38:16 +00:00
}
2003-04-03 02:29:31 +00:00
// Returns the fixed sample string for this test type.
public String fullBreakSample ( ) {
return " )a " ;
}
2002-08-08 15:38:16 +00:00
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
// Returns the line break value that ucd reports for cp. The commented-out code
// is an earlier variant that returned extra types for supplementary and Hangul
// characters.
public byte getType ( int cp ) {
2003-04-02 05:16:44 +00:00
/ * if ( cp > 0xFFFF ) return LB_SUP ;
2002-08-08 15:38:16 +00:00
byte result = getHangulType ( cp ) ;
if ( result ! = hNot ) return ( byte ) ( result + LB_LIMIT ) ;
2003-04-02 05:16:44 +00:00
* /
2004-02-06 18:32:05 +00:00
// return MyOrderToAsmusOrder[ucd.getLineBreak(cp)];
return ucd . getLineBreak ( cp ) ;
2002-08-08 15:38:16 +00:00
}
2004-02-06 18:32:05 +00:00
// Computes the table symbol for the pair (before, after) and stores the deciding
// rule text in ruleOut[0].
// isBreak is evaluated three times, and getRule() is read after each call because
// isBreak replaces the current rule:
//   spaceBreak  - separator inserted between the pair, tested after the separator
//   spaceBreak2 - same string, tested before the separator
//   normalBreak - the pair alone, tested between the two parts
// When the plain pair does not break, the rules from the two separator cases are
// appended to ruleOut[0] if they differ from the normal rule (and from each other).
public String getTableEntry ( String before , String after , String [ ] ruleOut ) {
2003-04-02 05:16:44 +00:00
String t = " _ " ; // break
2004-02-06 18:32:05 +00:00
boolean spaceBreak = isBreak ( before + " " + after , before . length ( ) + 1 ) ;
2003-04-03 02:29:31 +00:00
String spaceRule = getRule ( ) ;
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
boolean spaceBreak2 = isBreak ( before + " " + after , before . length ( ) ) ;
2003-04-03 02:29:31 +00:00
String spaceRule2 = getRule ( ) ;
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
boolean normalBreak = isBreak ( before + after , before . length ( ) ) ;
2003-04-03 02:29:31 +00:00
String normalRule = getRule ( ) ;
2002-08-08 15:38:16 +00:00
2003-04-03 02:29:31 +00:00
ruleOut [ 0 ] = normalRule ;
2002-08-08 15:38:16 +00:00
if ( ! normalBreak ) {
if ( ! spaceBreak & & ! spaceBreak2 ) {
2003-04-02 05:16:44 +00:00
t = " ^ " ; // don't break, even with intervening spaces
2002-08-08 15:38:16 +00:00
} else {
2003-04-02 05:16:44 +00:00
t = " % " ; // don't break, but break with intervening spaces
}
if ( ! spaceRule2 . equals ( normalRule ) ) {
2003-04-03 02:29:31 +00:00
ruleOut [ 0 ] + = " [ " + spaceRule2 + " ] " ;
2003-04-02 05:16:44 +00:00
}
if ( ! spaceRule . equals ( normalRule ) & & ! spaceRule . equals ( spaceRule2 ) ) {
2003-04-03 02:29:31 +00:00
ruleOut [ 0 ] + = " { " + spaceRule + " } " ;
2002-08-08 15:38:16 +00:00
}
}
return t ;
}
2003-04-02 05:16:44 +00:00
// Always returns false, so no table entry is highlighted. The commented-out code
// is the earlier behavior, which compared s with an oldLineBreak table entry.
public boolean highlightTableEntry ( int x , int y , String s ) {
2004-02-06 18:32:05 +00:00
return false ;
/ *
2003-04-02 05:16:44 +00:00
try {
return ! oldLineBreak [ x ] [ y ] . equals ( s ) ;
} catch ( Exception e ) { }
return true ;
2004-02-06 18:32:05 +00:00
* /
2003-04-02 05:16:44 +00:00
}
2004-02-06 18:32:05 +00:00
/ *
2003-04-02 05:16:44 +00:00
String [ ] [ ] oldLineBreak = {
{ " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " ^ " , " ^ " , " ^ " , " ^ " , " " , " % " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " ^ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " ^ " , " % " } ,
{ " % " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " % " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " % " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " % " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " % " , " % " , " % " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " % " , " % " , " % " , " _ " , " % " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " % " , " % " , " _ " , " % " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " % " , " _ " , " _ " , " _ " , " % " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " _ " , " ^ " , " % " } ,
{ " % " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " % " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " % " , " % " , " _ " , " ^ " , " ^ " , " % " } ,
{ " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " _ " , " ^ " , " % " } ,
{ " _ " , " ^ " , " % " , " % " , " % " , " ^ " , " ^ " , " ^ " , " _ " , " _ " , " % " , " % " , " _ " , " % " , " % " , " % " , " _ " , " _ " , " ^ " , " % " }
} ;
2004-02-06 18:32:05 +00:00
* /
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
public byte getResolvedType ( int cp ) {
2002-08-08 15:38:16 +00:00
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
byte result = getType ( cp ) ;
switch ( result ) {
case LB_AI : result = LB_AI ; break ;
// case LB_CB: result = LB_ID; break;
case LB_SA : result = LB_AL ; break ;
// case LB_SG: result = LB_XX; break; Surrogates; will never occur
case LB_XX : result = LB_AL ; break ;
}
2003-04-03 02:29:31 +00:00
/ *
2002-08-08 15:38:16 +00:00
if ( recommended ) {
if ( getHangulType ( cp ) ! = hNot ) {
result = LB_ID ;
}
}
2003-04-03 02:29:31 +00:00
* /
2002-08-08 15:38:16 +00:00
return result ;
}
2003-04-03 02:29:31 +00:00
2004-02-06 18:32:05 +00:00
// Returns LB_XX for any code point whose Hangul syllable type is not NA;
// otherwise returns getType(cp).
public byte getSampleType ( int cp ) {
if ( ucd . getHangulSyllableType ( cp ) ! = NA ) return LB_XX ;
return getType ( cp ) ;
2003-04-03 02:29:31 +00:00
}
2002-08-08 15:38:16 +00:00
// find out whether there is a break at offset
// WARNING: as a side effect, sets "rule"
2004-02-06 18:32:05 +00:00
public boolean isBreak ( String source , int offset ) {
2002-08-08 15:38:16 +00:00
// LB 1 Assign a line break category to each character of the input.
// Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
// this is taken care of in the getResolvedType function
// LB 2a Never break at the start of text
2004-04-17 18:21:39 +00:00
setRule ( " 2a: × sot " ) ;
2002-08-08 15:38:16 +00:00
if ( offset < = 0 ) return false ;
// LB 2b Always break at the end of text
2003-04-03 02:29:31 +00:00
setRule ( " 2b: ! eot " ) ;
2002-08-08 15:38:16 +00:00
if ( offset > = source . length ( ) ) return true ;
// UTF-16: never break in the middle of a code point
2003-04-02 05:16:44 +00:00
// now get the base character before and after, and their types
2004-02-06 18:32:05 +00:00
getGraphemeBases ( breaker , source , offset , - 1 , context ) ;
2003-04-02 05:16:44 +00:00
byte before = context . tBefore ;
byte after = context . tAfter ;
byte before2 = context . tBefore2 ;
byte after2 = context . tAfter2 ;
//if (!onCodepointBoundary(source, offset)) return false;
2002-08-08 15:38:16 +00:00
// now get the character before and after, and their types
2003-04-02 05:16:44 +00:00
//int cpBefore = UTF16.charAt(source, offset-1);
//int cpAfter = UTF16.charAt(source, offset);
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
//byte before = getResolvedType(cpBefore);
//byte after = getResolvedType(cpAfter);
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 3a: CR × LF ; ( BK | CR | LF | NL ) ! " ) ;
2003-04-02 05:16:44 +00:00
2002-08-08 15:38:16 +00:00
// Always break after hard line breaks (but never between CR and LF).
// CR ^ LF
if ( before = = LB_CR & & after = = LB_LF ) return false ;
if ( before = = LB_BK | | before = = LB_LF | | before = = LB_CR ) return true ;
2004-04-17 18:21:39 +00:00
//LB 3b Don’ t break before hard line breaks.
setRule ( " 3b: × ( BK | CR | LF ) " ) ;
2003-04-03 02:29:31 +00:00
if ( after = = LB_BK | | after = = LB_LF | | after = = LB_CR ) return false ;
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
// LB 4 Don’ t break before spaces or zero-width space.
setRule ( " 4: × ( SP | ZW ) " ) ;
2002-08-08 15:38:16 +00:00
if ( after = = LB_SP | | after = = LB_ZW ) return false ;
// LB 5 Break after zero-width space.
2004-04-17 18:21:39 +00:00
setRule ( " 5: ZW ÷ " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_ZW ) return true ;
2004-04-17 18:21:39 +00:00
// LB 6 Don’ t break graphemes (before combining marks, around virama or on sequences of conjoining Jamos.
2003-04-23 20:18:43 +00:00
setRule ( " 6: DGC -> FC " ) ;
2004-02-06 18:32:05 +00:00
if ( ! grapheme . isBreak ( source , offset ) ) return false ;
2003-04-03 02:29:31 +00:00
2003-04-02 05:16:44 +00:00
/ *
2002-08-08 15:38:16 +00:00
if ( before = = LB_L & & ( after = = LB_L | | after = = LB_V | | after = = LB_LV | | after = = LB_LVT ) ) return false ;
if ( ( before = = LB_LV | | before = = LB_V ) & & ( after = = LB_V | | after = = LB_T ) ) return false ;
if ( ( before = = LB_LVT | | before = = LB_T ) & & ( after = = LB_T ) ) return false ;
2003-04-02 05:16:44 +00:00
* /
2003-04-03 02:29:31 +00:00
2003-04-23 20:18:43 +00:00
byte backBase = - 1 ;
2002-08-08 15:38:16 +00:00
boolean setBase = false ;
if ( before = = LB_CM ) {
setBase = true ;
2004-02-06 18:32:05 +00:00
int backOffset = findLastNon ( source , offset , LB_CM ) ;
2003-04-23 20:18:43 +00:00
if ( backOffset > = 0 ) {
2004-02-06 18:32:05 +00:00
backBase = getResolvedType ( UTF16 . charAt ( source , backOffset ) ) ;
2002-08-08 15:38:16 +00:00
}
}
2003-04-02 05:16:44 +00:00
2002-08-08 15:38:16 +00:00
// LB 7 In all of the following rules, if a space is the base character for a combining mark,
// the space is changed to type ID. In other words, break before SP CM* in the same cases as
// one would break before an ID.
2003-04-03 02:29:31 +00:00
setRule ( " 7: SP CM* -> ID " ) ;
2003-04-23 20:18:43 +00:00
if ( setBase & & backBase = = LB_SP ) before = LB_ID ;
2003-04-02 05:16:44 +00:00
if ( after = = LB_SP & & after2 = = LB_CM ) after = LB_ID ;
2002-08-08 15:38:16 +00:00
2003-04-23 20:18:43 +00:00
setRule ( " 7a: X CM* -> X " ) ;
if ( after = = LB_CM ) return false ;
if ( setBase & & backBase ! = - 1 ) before = LB_ID ;
setRule ( " 7b: CM -> AL " ) ;
if ( setBase & & backBase = = - 1 ) before = LB_AL ;
2004-04-17 18:21:39 +00:00
// LB 8 Don’ t break before ‘ ]’ or ‘ !’ or ‘ ;’ or ‘ /’ , even after spaces.
// × CL, × EX, × IS, × SY
setRule ( " 8: × ( CL | EX | IS | SY ) " ) ;
2002-08-08 15:38:16 +00:00
if ( after = = LB_CL | | after = = LB_EX | | after = = LB_SY | after = = LB_IS ) return false ;
// find the last non-space character; we will need it
byte lastNonSpace = before ;
if ( lastNonSpace = = LB_SP ) {
2004-02-06 18:32:05 +00:00
int backOffset = findLastNon ( source , offset , LB_SP ) ;
2002-08-08 15:38:16 +00:00
if ( backOffset > = 0 ) {
2004-02-06 18:32:05 +00:00
lastNonSpace = getResolvedType ( UTF16 . charAt ( source , backOffset ) ) ;
2002-08-08 15:38:16 +00:00
}
}
2004-04-17 18:21:39 +00:00
// LB 9 Don’ t break after ‘ [’ , even after spaces.
// OP SP* ×
setRule ( " 9: OP SP* × " ) ;
2002-08-08 15:38:16 +00:00
if ( lastNonSpace = = LB_OP ) return false ;
2004-04-17 18:21:39 +00:00
// LB 10 Don’ t break within ‘”[’ , , even with intervening spaces.
// QU SP* × OP
setRule ( " 10: QU SP* × OP " ) ;
2002-08-08 15:38:16 +00:00
if ( lastNonSpace = = LB_QU & & after = = LB_OP ) return false ;
2004-04-17 18:21:39 +00:00
// LB 11 Don’ t break within ‘ ]h’ , even with intervening spaces.
// CL SP* × NS
setRule ( " 11: CL SP* × NS " ) ;
2002-08-08 15:38:16 +00:00
if ( lastNonSpace = = LB_CL & & after = = LB_NS ) return false ;
2004-04-17 18:21:39 +00:00
// LB 11a Don’ t break within ‘——’, even with intervening spaces.
// B2 × B2
setRule ( " 11a: B2 × B2 " ) ;
2002-08-08 15:38:16 +00:00
if ( lastNonSpace = = LB_B2 & & after = = LB_B2 ) return false ;
2004-04-17 18:21:39 +00:00
// LB 13 Don’ t break before or after NBSP or WORD JOINER
// × GL
// GL ×
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 11b: × WJ ; WJ × " ) ;
2003-04-02 05:16:44 +00:00
if ( after = = LB_WJ | | before = = LB_WJ ) return false ;
2002-08-08 15:38:16 +00:00
// [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]
// LB 12 Break after spaces
2004-04-17 18:21:39 +00:00
setRule ( " 12: SP ÷ " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_SP ) return true ;
2004-04-17 18:21:39 +00:00
// LB 13 Don’ t break before or after NBSP or WORD JOINER
setRule ( " 13: × GL ; GL × " ) ;
2003-04-02 05:16:44 +00:00
if ( after = = LB_GL | | before = = LB_GL ) return false ;
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
// LB 14 Don’ t break before or after ‘”’
setRule ( " 14: × QU ; QU × " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_QU | | after = = LB_QU ) return false ;
// LB 14a Break before and after CB
2004-04-17 18:21:39 +00:00
setRule ( " 14a: ÷ CB ; CB ÷ " ) ;
2003-04-02 05:16:44 +00:00
if ( before = = LB_CB | | after = = LB_CB ) return true ;
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
// LB 15 Don’ t break before hyphen-minus, other hyphens, fixed-width spaces,
2003-04-02 05:16:44 +00:00
// small kana and other non- starters, or after acute accents:
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 15: × ( BA | HY | NS ) ; BB × " ) ;
2002-08-08 15:38:16 +00:00
if ( after = = LB_NS ) return false ;
if ( after = = LB_HY ) return false ;
if ( after = = LB_BA ) return false ;
if ( before = = LB_BB ) return false ;
2004-04-17 18:21:39 +00:00
//setRule("15a: HY × NU"); // NEW
2003-04-02 05:16:44 +00:00
//if (before == LB_HY && after == LB_NU) return false;
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
// LB 16 Don’ t break between two ellipses, or between letters or numbers and ellipsis:
// Examples: ’ 9...’ , ‘ a...’ , ‘ H...’
setRule ( " 16: ( AL | ID | IN | NU ) × IN " ) ;
2002-08-08 15:38:16 +00:00
if ( ( before = = LB_NU | | before = = LB_AL | | before = = LB_ID ) & & after = = LB_IN ) return false ;
if ( before = = LB_IN & & after = = LB_IN ) return false ;
// Don't break alphanumerics.
2004-04-17 18:21:39 +00:00
// LB 17 Don’ t break within ‘ a9’ , ‘ 3a’ , or ‘ H%’
2002-08-08 15:38:16 +00:00
// Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
2004-04-17 18:21:39 +00:00
// Examples: $(12.35) 2,1234 (12)¢ 12.54¢
2002-08-08 15:38:16 +00:00
// This is approximated with the following rules. (Some cases already handled above,
2004-04-17 18:21:39 +00:00
// like ‘ 9,’ , ‘ [9’ .)
setRule ( " 17: ID × PO ; AL × NU; NU × AL " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_ID & & after = = LB_PO ) return false ;
if ( before = = LB_AL & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_AL ) return false ;
2004-04-17 18:21:39 +00:00
// LB 18 Don’ t break between the following pairs of classes.
// CL × PO
// HY × NU
// IS × NU
// NU × NU
// NU × PO
// PR × AL
// PR × HY
// PR × ID
// PR × NU
// PR × OP
// SY × NU
// Example pairs: ‘ $9’ , ‘ $[’ , ‘ $-‘ , ‘ -9’ , ‘ /9’ , ‘ 99’ , ‘ ,9’ , ‘ 9%’ ‘ ]%’
setRule ( " 18: CL × PO ; NU × PO ; ( IS | NU | HY | PR | SY ) × NU ; PR × ( AL | HY | ID | OP ) " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_CL & & after = = LB_PO ) return false ;
if ( before = = LB_IS & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_NU ) return false ;
if ( before = = LB_NU & & after = = LB_PO ) return false ;
2003-04-02 05:16:44 +00:00
if ( before = = LB_HY & & after = = LB_NU ) return false ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_PR & & after = = LB_AL ) return false ;
if ( before = = LB_PR & & after = = LB_HY ) return false ;
if ( before = = LB_PR & & after = = LB_ID ) return false ;
if ( before = = LB_PR & & after = = LB_NU ) return false ;
if ( before = = LB_PR & & after = = LB_OP ) return false ;
if ( before = = LB_SY & & after = = LB_NU ) return false ;
2003-04-02 05:16:44 +00:00
// LB 15b Break after hyphen-minus, and before acute accents:
2004-04-17 18:21:39 +00:00
setRule ( " 18b: HY ÷ ; ÷ BB " ) ;
2003-04-02 05:16:44 +00:00
if ( before = = LB_HY ) return true ;
if ( after = = LB_BB ) return true ;
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
// LB 19 Don’ t break between alphabetics (“at”)
// AL × AL
2002-08-08 15:38:16 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 19: AL × AL " ) ;
2002-08-08 15:38:16 +00:00
if ( before = = LB_AL & & after = = LB_AL ) return false ;
// LB 20 Break everywhere else
2004-04-17 18:21:39 +00:00
// ALL ÷
// ÷ ALL
2002-08-08 15:38:16 +00:00
2004-02-06 18:32:05 +00:00
if ( ucd . getCompositeVersion ( ) > 0x040000 ) {
2004-04-17 18:21:39 +00:00
setRule ( " 19b: IS × AL " ) ;
2004-02-06 18:32:05 +00:00
if ( before = = LB_IS & & after = = LB_AL ) return false ;
}
// LB 20 Break everywhere else
2004-04-17 18:21:39 +00:00
// ALL ÷
// ÷ ALL
2004-02-06 18:32:05 +00:00
2004-04-17 18:21:39 +00:00
setRule ( " 20: ALL ÷ ; ÷ ALL " ) ;
2002-08-08 15:38:16 +00:00
return true ;
}
}
//==============================================
2003-04-01 02:53:07 +00:00
static class GenerateSentenceBreakTest extends GenerateBreakTest {
2004-02-06 18:32:05 +00:00
// Grapheme-cluster tester created in the constructor; isBreak consults it for rule 4
// and uses it to build the iterators that step over grapheme cluster bases.
GenerateGraphemeBreakTest grapheme ;
// Iterator built on top of grapheme in the constructor; isBreak hands it to getGraphemeBases(...).
MyBreakIterator breaker ;
// Builds the sentence-break test generator.
//
// ucd is forwarded to the superclass constructor and to the GenerateGraphemeBreakTest helper.
// After the helpers are created, the constructor sets the output file name and installs the
// sample strings: extraSamples is left empty, and extraSingleSamples receives the literal list
// below followed by one derived variant of each entry (see the loop at the end).
GenerateSentenceBreakTest ( UCD ucd ) {
super ( ucd ) ;
grapheme = new GenerateGraphemeBreakTest ( ucd ) ;
breaker = new MyBreakIterator ( grapheme ) ;
fileName = " Sentence " ;
extraSamples = new String [ ] {
} ;
// Hand-picked sentences: quoted and parenthesised terminators, abbreviations with a
// combining mark, numeric and lowercase periods, and ideographic text.
extraSingleSamples = new String [ ] {
" ( \" Go. \" ) (He did.) " ,
" ( \ u201CGo? \ u201D) (He did.) " ,
" U.S.A \ u0300. is " ,
" U.S.A \ u0300? He " ,
" U.S.A \ u0300. " ,
" 3.4 " ,
" c.d " ,
2004-04-17 18:21:39 +00:00
" etc.) \ u2019 \ u2018(the " ,
" etc.) \ u2019 \ u2018(The " ,
2004-02-06 18:32:05 +00:00
" the resp. leaders are " ,
" \ u5B57. \ u5B57 " ,
" etc. \ u5B83 " ,
" etc. \ u3002 " ,
" \ u5B57 \ u3002 \ u5B83 " ,
} ;
// Double the sample list: the first half is the list above, the second half is each sample
// passed through insertEverywhere with the U+2060 string and the grapheme tester.
// NOTE(review): presumably insertEverywhere inserts that string at every grapheme
// boundary - confirm against its definition.
String [ ] temp = new String [ extraSingleSamples . length * 2 ] ;
System . arraycopy ( extraSingleSamples , 0 , temp , 0 , extraSingleSamples . length ) ;
for ( int i = 0 ; i < extraSingleSamples . length ; + + i ) {
temp [ i + extraSingleSamples . length ] = insertEverywhere ( extraSingleSamples [ i ] , " \ u2060 " , grapheme ) ;
}
extraSingleSamples = temp ;
}
// Sentence-break property values. Each name holds the int returned by map.add(label, set);
// getType() returns map.getIndex(cp) cast to byte and isBreak compares those bytes with
// these fields. They are instance fields, so they are initialised for every instance.
// Declaration order matters: several sets subtract sets registered earlier, which they
// fetch back with map.getSetFromIndex(...).
final int
2003-04-01 02:53:07 +00:00
// Sep: U+000A, U+000D, U+0085, U+2028, U+2029.
Sep = map . add ( " Sep " , new UnicodeSet ( " [ \\ u000A \\ u000D \\ u0085 \\ u2028 \\ u2029] " ) ) ,
2004-02-06 18:32:05 +00:00
// Format: the set returned for general category Cf.
Format = map . add ( " Format " , getSet ( ucd , CATEGORY , Cf ) ) ,
// Sp: the White_space binary property, minus everything already in Sep.
Sp = map . add ( " Sp " , getSet ( ucd , BINARY_PROPERTIES , White_space )
2003-04-01 02:53:07 +00:00
. removeAll ( map . getSetFromIndex ( Sep ) ) ) ,
2004-02-06 18:32:05 +00:00
// Lower: derived PropLowercase. Upper: category Lt plus derived PropUppercase.
Lower = map . add ( " Lower " , getSet ( ucd , DERIVED , PropLowercase ) ) ,
Upper = map . add ( " Upper " , getSet ( ucd , CATEGORY , Lt )
. addAll ( getSet ( ucd , DERIVED , PropUppercase ) ) ) ,
2003-04-01 02:53:07 +00:00
// OLetter: derived PropAlphabetic plus U+05F3, minus Lower and Upper.
OLetter = map . add ( " OLetter " ,
2004-02-06 18:32:05 +00:00
getSet ( ucd , DERIVED , PropAlphabetic )
2003-04-01 02:53:07 +00:00
. add ( 0x05F3 , 0x05F3 )
. removeAll ( map . getSetFromIndex ( Lower ) )
. removeAll ( map . getSetFromIndex ( Upper ) )
) ,
2004-02-06 18:32:05 +00:00
// Numeric: line-break class NU.
Numeric = map . add ( " Numeric " , getSet ( ucd , LINE_BREAK , LB_NU ) ) ,
2003-04-01 02:53:07 +00:00
// ATerm: U+002E only. Term: the explicit list of terminators in the pattern below.
ATerm = map . add ( " ATerm " , new UnicodeSet ( 0x002E , 0x002E ) ) ,
Term = map . add ( " Term " , new UnicodeSet (
" [ \\ u0021 \\ u003F \\ u0589 \\ u061F \\ u06D4 \\ u0700 \\ u0701 \\ u0702 \\ u0964 \\ u1362 \\ u1367 "
+ " \\ u1368 \\ u104A \\ u104B \\ u166E \\ u1803 \\ u1809 \\ u203C \\ u203D \\ u2047 \\ u2048 \\ u2049 "
+ " \\ u3002 \\ uFE52 \\ uFE57 \\ uFF01 \\ uFF0E \\ uFF1F \\ uFF61] " ) ) ,
// Close: categories Po and Pe plus line-break class QU, minus ATerm, Term and U+05F3
// (U+05F3 was added to OLetter above).
Close = map . add ( " Close " ,
2004-02-06 18:32:05 +00:00
getSet ( ucd , CATEGORY , Po )
. addAll ( getSet ( ucd , CATEGORY , Pe ) )
. addAll ( getSet ( ucd , LINE_BREAK , LB_QU ) )
2003-04-01 02:53:07 +00:00
. removeAll ( map . getSetFromIndex ( ATerm ) )
. removeAll ( map . getSetFromIndex ( Term ) )
. remove ( 0x05F3 )
) ,
// Other: registered over the whole range 0..0x10FFFF with two extra false arguments.
// NOTE(review): the meaning of those two flags is defined by OldUnicodeMap.add, which is
// not visible here - presumably they keep this entry from claiming already-mapped code points.
Other = map . add ( " Other " , new UnicodeSet ( 0 , 0x10FFFF ) , false , false ) ;
2002-08-08 15:38:16 +00:00
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public String getTypeID ( int cp ) {
2003-04-01 02:53:07 +00:00
return map . getLabel ( cp ) ;
2002-08-08 15:38:16 +00:00
}
2003-04-03 02:29:31 +00:00
/**
 * Supplies the fixed sample text this generator reports as its full-break sample.
 *
 * @return the constant sample string
 */
public String fullBreakSample() {
    final String sample = " !a ";
    return sample;
}
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public byte getType ( int cp ) {
2003-04-01 02:53:07 +00:00
return ( byte ) map . getIndex ( cp ) ;
2002-08-08 15:38:16 +00:00
}
2003-04-02 05:16:44 +00:00
/ * LB_XX = 0 , LB_OP = 1 , LB_CL = 2 , LB_QU = 3 , LB_GL = 4 , LB_NS = 5 , LB_EX = 6 , LB_SY = 7 ,
LB_IS = 8 , LB_PR = 9 , LB_PO = 10 , LB_NU = 11 , LB_AL = 12 , LB_ID = 13 , LB_IN = 14 , LB_HY = 15 ,
LB_CM = 16 , LB_BB = 17 , LB_BA = 18 , LB_SP = 19 , LB_BK = 20 , LB_CR = 21 , LB_LF = 22 , LB_CB = 23 ,
LB_SA = 24 , LB_AI = 25 , LB_B2 = 26 , LB_SG = 27 , LB_ZW = 28 ,
LB_NL = 29 ,
LB_WJ = 30 ,
* /
2003-04-01 02:53:07 +00:00
/ *
2002-08-09 23:56:24 +00:00
static final byte Format = 0 , Sep = 1 , Sp = 2 , OLetter = 3 , Lower = 4 , Upper = 5 ,
2003-02-25 23:38:23 +00:00
Numeric = 6 , Close = 7 , ATerm = 8 , Term = 9 , Other = 10 ,
2002-08-09 23:56:24 +00:00
LIMIT = Other + 1 ;
2003-02-25 23:38:23 +00:00
static final String [ ] Names = { " Format " , " Sep " , " Sp " , " OLetter " , " Lower " , " Upper " , " Numeric " ,
2002-08-09 23:56:24 +00:00
" Close " , " ATerm " , " Term " , " Other " } ;
static UnicodeSet sepSet = new UnicodeSet ( " [ \\ u000a \\ u000d \\ u0085 \\ u2029 \\ u2028] " ) ;
static UnicodeSet atermSet = new UnicodeSet ( " [ \\ u002E] " ) ;
2003-02-25 23:38:23 +00:00
static UnicodeSet termSet = new UnicodeSet (
" [ \\ u0021 \\ u003F \\ u0589 \\ u061f \\ u06d4 \\ u0700- \\ u0702 \\ u0934 "
+ " \\ u1362 \\ u1367 \\ u1368 \\ u104A \\ u104B \\ u166E "
+ " \\ u1803 \\ u1809 \\ u203c \\ u203d "
+ " \\ u2048 \\ u2049 \\ u3002 \\ ufe52 \\ ufe57 \\ uff01 \\ uff0e \\ uff1f \\ uff61] " ) ;
2002-08-09 23:56:24 +00:00
static UnicodeProperty lowercaseProp = UnifiedBinaryProperty . make ( DERIVED | PropLowercase ) ;
static UnicodeProperty uppercaseProp = UnifiedBinaryProperty . make ( DERIVED | PropUppercase ) ;
2003-02-25 23:38:23 +00:00
UnicodeSet linebreakNS = UnifiedBinaryProperty . make ( LINE_BREAK | LB_NU ) . getSet ( ) ;
2003-04-01 02:53:07 +00:00
* /
2003-02-25 23:38:23 +00:00
2003-04-01 02:53:07 +00:00
/ *
2002-08-09 23:56:24 +00:00
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public String getTypeID ( int cp ) {
byte type = getType ( cp ) ;
2002-08-09 23:56:24 +00:00
return Names [ type ] ;
}
2002-08-08 15:38:16 +00:00
2002-08-09 23:56:24 +00:00
// stuff that subclasses need to override
2004-02-06 18:32:05 +00:00
public byte getType ( int cp ) {
byte cat = ucd . getCategory ( cp ) ;
2002-08-09 23:56:24 +00:00
if ( cat = = Cf ) return Format ;
if ( sepSet . contains ( cp ) ) return Sep ;
2004-02-06 18:32:05 +00:00
if ( ucd . getBinaryProperty ( cp , White_space ) ) return Sp ;
2003-02-25 23:38:23 +00:00
if ( linebreakNS . contains ( cp ) ) return Numeric ;
2002-08-09 23:56:24 +00:00
if ( lowercaseProp . hasValue ( cp ) ) return Lower ;
if ( uppercaseProp . hasValue ( cp ) | | cat = = Lt ) return Upper ;
2003-02-25 23:38:23 +00:00
if ( alphabeticSet . contains ( cp ) ) return OLetter ;
2002-08-09 23:56:24 +00:00
if ( atermSet . contains ( cp ) ) return ATerm ;
if ( termSet . contains ( cp ) ) return Term ;
if ( cat = = Po | | cat = = Pe
2004-02-06 18:32:05 +00:00
| | ucd . getLineBreak ( cp ) = = LB_QU ) return Close ;
2002-08-09 23:56:24 +00:00
return Other ;
}
2003-04-01 02:53:07 +00:00
* /
2002-08-09 23:56:24 +00:00
public int genTestItems ( String before , String after , String [ ] results ) {
results [ 0 ] = before + after ;
/ *
results [ 1 ] = 'a' + before + " \ u0301 \ u0308 " + after + " \ u0301 \ u0308 " + 'a' ;
results [ 2 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidLetter ] + after + " \ u0301 \ u0308 " + 'a' ;
2003-04-01 02:53:07 +00:00
results [ 3 ] = 'a' + before + " \ u0301 \ u0308 " + samples [ MidNum ] + after + " \ u0301 \ u0308 " + 'a' ;
2002-08-09 23:56:24 +00:00
* /
return 1 ;
2002-08-08 15:38:16 +00:00
}
2003-02-25 23:38:23 +00:00
// Scratch holder used by isBreak: it is passed to getGraphemeBases(...) and isBreak then reads
// tBefore, tAfter, tBefore2 and tAfter2 from it (presumably filled in by that call - confirm).
// Declared static, so one instance is shared by every GenerateSentenceBreakTest.
static Context context = new Context ( ) ;
2004-02-06 18:32:05 +00:00
public boolean isBreak ( String source , int offset ) {
2003-04-01 02:53:07 +00:00
// Break at the start and end of text.
2004-04-17 18:21:39 +00:00
setRule ( " 1: sot ÷ " ) ;
2002-08-09 23:56:24 +00:00
if ( offset < 0 | | offset > source . length ( ) ) return false ;
if ( offset = = 0 ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 2: ÷ eot " ) ;
2002-08-09 23:56:24 +00:00
if ( offset = = source . length ( ) ) return true ;
2004-04-17 18:21:39 +00:00
setRule ( " 3: Sep ÷ " ) ;
2004-02-06 18:32:05 +00:00
byte beforeChar = getResolvedType ( source . charAt ( offset - 1 ) ) ;
2003-02-25 23:38:23 +00:00
if ( beforeChar = = Sep ) return true ;
2002-08-09 23:56:24 +00:00
// Treat a grapheme cluster as if it were a single character:
// the first base character, if there is one; otherwise the first character.
2003-04-03 02:29:31 +00:00
setRule ( " 4: GC -> FC " ) ;
2004-02-06 18:32:05 +00:00
if ( ! grapheme . isBreak ( source , offset ) ) return false ;
2002-08-09 23:56:24 +00:00
2003-04-03 02:29:31 +00:00
// Ignore interior Format characters. That is, ignore Format characters in all subsequent rules.
setRule ( " 5: X Format* -> X " ) ;
2004-02-06 18:32:05 +00:00
byte afterChar = getResolvedType ( source . charAt ( offset ) ) ;
2003-04-03 02:29:31 +00:00
if ( afterChar = = Format ) return false ;
2003-02-25 23:38:23 +00:00
2004-02-06 18:32:05 +00:00
getGraphemeBases ( breaker , source , offset , Format , context ) ;
2003-02-25 23:38:23 +00:00
byte before = context . tBefore ;
byte after = context . tAfter ;
byte before2 = context . tBefore2 ;
byte after2 = context . tAfter2 ;
2003-04-03 02:29:31 +00:00
// HACK COPY for rule collection!
if ( collectingRules ) {
2004-04-17 18:21:39 +00:00
setRule ( " 6: ATerm × ( Numeric | Lower ) " ) ;
setRule ( " 7: Upper ATerm × Upper " ) ;
setRule ( " 8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower " ) ;
setRule ( " 9: ( Term | ATerm ) Close* × ( Close | Sp | Sep ) " ) ;
setRule ( " 10: ( Term | ATerm ) Close* Sp × ( Sp | Sep ) " ) ;
setRule ( " 11: ( Term | ATerm ) Close* Sp* ÷ " ) ;
setRule ( " 12: Any × Any " ) ;
2003-04-03 02:29:31 +00:00
collectingRules = false ;
}
2002-08-09 23:56:24 +00:00
2003-02-25 23:38:23 +00:00
// Do not break after ambiguous terminators like period, if immediately followed by a number or lowercase letter, is between uppercase letters, or if the first following letter (optionally after certain punctuation) is lowercase. For example, a period may be an abbreviation or numeric period, and not mark the end of a sentence.
2003-04-03 02:29:31 +00:00
2003-02-25 23:38:23 +00:00
if ( before = = ATerm ) {
2004-04-17 18:21:39 +00:00
setRule ( " 6: ATerm × ( Numeric | Lower ) " ) ;
2003-02-25 23:38:23 +00:00
if ( after = = Lower | | after = = Numeric ) return false ;
2004-04-17 18:21:39 +00:00
setRule ( " 7: Upper ATerm × Upper " ) ;
2003-02-25 23:38:23 +00:00
if ( DEBUG_GRAPHEMES ) System . out . println ( context + " , " + Upper ) ;
if ( before2 = = Upper & & after = = Upper ) return false ;
}
// The following cases are all handled together.
2002-08-09 23:56:24 +00:00
// First we loop backwards, checking for the different types.
2004-02-06 18:32:05 +00:00
MyBreakIterator graphemeIterator = new MyBreakIterator ( grapheme ) ;
2002-08-09 23:56:24 +00:00
graphemeIterator . set ( source , offset ) ;
int state = 0 ;
2003-04-01 02:53:07 +00:00
int lookAfter = - 1 ;
2002-08-09 23:56:24 +00:00
int cp ;
byte t ;
boolean gotSpace = false ;
boolean gotClose = false ;
behindLoop :
while ( true ) {
cp = graphemeIterator . previousBase ( ) ;
if ( cp = = - 1 ) break ;
2004-02-06 18:32:05 +00:00
t = getResolvedType ( cp ) ;
if ( SHOW_TYPE ) System . out . println ( ucd . getCodeAndName ( cp ) + " , " + getTypeID ( cp ) ) ;
2002-08-09 23:56:24 +00:00
if ( t = = Format ) continue ; // ignore all formats!
switch ( state ) {
case 0 :
if ( t = = Sp ) {
// loop as long as we have Space
gotSpace = true ;
continue behindLoop ;
} else if ( t = = Close ) {
gotClose = true ;
state = 1 ; // go to close loop
continue behindLoop ;
}
break ;
case 1 :
if ( t = = Close ) {
// loop as long as we have Close
continue behindLoop ;
}
break ;
}
if ( t = = ATerm ) {
lookAfter = ATerm ;
} else if ( t = = Term ) {
lookAfter = Term ;
}
break ;
}
// if we didn't find ATerm or Term, bail
if ( lookAfter = = - 1 ) {
// Otherwise, do not break
2004-04-17 18:21:39 +00:00
// Any × Any (11)
setRule ( " 12: Any × Any " ) ;
2002-08-09 23:56:24 +00:00
return false ;
}
2004-04-17 18:21:39 +00:00
// ATerm Close* Sp*× (¬( OLetter))* Lower(8)
2002-08-09 23:56:24 +00:00
// Break after sentence terminators, but include closing punctuation, trailing spaces, and (optionally) a paragraph separator.
2004-04-17 18:21:39 +00:00
// ( Term | ATerm ) Close*× ( Close | Sp | Sep )(9)
// ( Term | ATerm ) Close* Sp× ( Sp | Sep )(10)
// ( Term | ATerm ) Close* Sp*÷(11)
2003-02-25 23:38:23 +00:00
2002-08-09 23:56:24 +00:00
// We DID find one. Loop to see if the right side is ok.
graphemeIterator . set ( source , offset ) ;
boolean isFirst = true ;
while ( true ) {
cp = graphemeIterator . nextBase ( ) ;
if ( cp = = - 1 ) break ;
2004-02-06 18:32:05 +00:00
t = getResolvedType ( cp ) ;
if ( SHOW_TYPE ) System . out . println ( ucd . getCodeAndName ( cp ) + " , " + getTypeID ( cp ) ) ;
2002-08-09 23:56:24 +00:00
if ( t = = Format ) continue ; // skip format characters!
if ( isFirst ) {
isFirst = false ;
if ( lookAfter = = ATerm & & t = = Upper ) {
2004-04-17 18:21:39 +00:00
setRule ( " 8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower " ) ;
2002-08-09 23:56:24 +00:00
return false ;
}
if ( gotSpace ) {
if ( t = = Sp | | t = = Sep ) {
2004-04-17 18:21:39 +00:00
setRule ( " 10: ( Term | ATerm ) Close* Sp × ( Sp | Sep ) " ) ;
2002-08-09 23:56:24 +00:00
return false ;
}
} else if ( t = = Close | | t = = Sp | | t = = Sep ) {
2004-04-17 18:21:39 +00:00
setRule ( " 9: ( Term | ATerm ) Close* × ( Close | Sp | Sep ) " ) ;
2002-08-09 23:56:24 +00:00
return false ;
}
if ( lookAfter = = Term ) break ;
}
// at this point, we have an ATerm. All other conditions are ok, but we need to verify 6
if ( t ! = OLetter & & t ! = Upper & & t ! = Lower ) continue ;
if ( t = = Lower ) {
2004-04-17 18:21:39 +00:00
setRule ( " 8: ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower) )* Lower " ) ;
2002-08-09 23:56:24 +00:00
return false ;
}
break ;
}
2004-04-17 18:21:39 +00:00
setRule ( " 11: ( Term | ATerm ) Close* Sp* ÷ " ) ;
2002-08-09 23:56:24 +00:00
return true ;
}
}
2003-02-25 23:38:23 +00:00
// Debug switch: when true, GenerateSentenceBreakTest.isBreak prints the shared context and the
// Upper index while testing rule 7. The other uses in MyBreakIterator are commented out.
static final boolean DEBUG_GRAPHEMES = false ;
2002-08-09 23:56:24 +00:00
static class MyBreakIterator {
int offset = 0 ;
String string = " " ;
2004-02-06 18:32:05 +00:00
GenerateBreakTest breaker ;
2002-08-09 23:56:24 +00:00
boolean recommended = true ;
2004-02-06 18:32:05 +00:00
MyBreakIterator ( GenerateBreakTest breaker ) {
this . breaker = breaker ; // = new GenerateGraphemeBreakTest()
}
2002-08-09 23:56:24 +00:00
public MyBreakIterator set ( String source , int offset ) {
2003-02-25 23:38:23 +00:00
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(string) + "; " + offset);
2002-08-09 23:56:24 +00:00
string = source ;
this . offset = offset ;
return this ;
}
public int nextBase ( ) {
if ( offset > = string . length ( ) ) return - 1 ;
int result = UTF16 . charAt ( string , offset ) ;
for ( + + offset ; offset < string . length ( ) ; + + offset ) {
2004-02-06 18:32:05 +00:00
if ( breaker . isBreak ( string , offset ) ) break ;
2002-08-09 23:56:24 +00:00
}
2003-02-25 23:38:23 +00:00
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
2002-08-09 23:56:24 +00:00
return result ;
}
public int previousBase ( ) {
if ( offset < = 0 ) return - 1 ;
for ( - - offset ; offset > = 0 ; - - offset ) {
2004-02-06 18:32:05 +00:00
if ( breaker . isBreak ( string , offset ) ) break ;
2002-08-09 23:56:24 +00:00
}
2003-02-25 23:38:23 +00:00
int result = UTF16 . charAt ( string , offset ) ;
//if (DEBUG_GRAPHEMES) System.out.println(Utility.hex(result));
return result ;
2002-08-09 23:56:24 +00:00
}
2002-08-08 15:38:16 +00:00
}
2004-02-06 18:32:05 +00:00
/ *
*
* if ( false ) {
PrintWriter log = Utility . openPrintWriter ( " Diff.txt " , Utility . UTF8_WINDOWS ) ;
UnicodeSet Term = new UnicodeSet (
" [ \\ u0021 \\ u003F \\ u0589 \\ u061F \\ u06D4 \\ u0700 \\ u0701 \\ u0702 \\ u0964 \\ u1362 \\ u1367 "
+ " \\ u1368 \\ u104A \\ u104B \\ u166E \\ u1803 \\ u1809 \\ u203C \\ u203D \\ u2047 \\ u2048 \\ u2049 "
+ " \\ u3002 \\ uFE52 \\ uFE57 \\ uFF01 \\ uFF0E \\ uFF1F \\ uFF61] " ) ;
UnicodeSet terminal_punctuation = getSet ( BINARY_PROPERTIES , Terminal_Punctuation ) ;
UnicodeMap names = new UnicodeMap ( ) ;
names . add ( " Pd " , getSet ( CATEGORY , Pd ) ) ;
names . add ( " Ps " , getSet ( CATEGORY , Ps ) ) ;
names . add ( " Pe " , getSet ( CATEGORY , Pe ) ) ;
names . add ( " Pc " , getSet ( CATEGORY , Pc ) ) ;
names . add ( " Po " , getSet ( CATEGORY , Po ) ) ;
names . add ( " Pi " , getSet ( CATEGORY , Pi ) ) ;
names . add ( " Pf " , getSet ( CATEGORY , Pf ) ) ;
Utility . showSetDifferences ( log , " Term " , Term , " Terminal_Punctuation " , terminal_punctuation , true , true , names , ucd ) ;
Utility . showSetDifferences ( log , " Po " , getSet ( CATEGORY , Po ) , " Terminal_Punctuation " , terminal_punctuation , true , true , names , ucd ) ;
log . close ( ) ;
if ( true ) return ;
UnicodeSet whitespace = getSet ( BINARY_PROPERTIES , White_space ) ;
UnicodeSet space = getSet ( CATEGORY , Zs ) . addAll ( getSet ( CATEGORY , Zp ) ) . addAll ( getSet ( CATEGORY , Zl ) ) ;
Utility . showSetDifferences ( " White_Space " , whitespace , " Z " , space , true , ucd ) ;
UnicodeSet isSpace = new UnicodeSet ( ) ;
UnicodeSet isSpaceChar = new UnicodeSet ( ) ;
UnicodeSet isWhitespace = new UnicodeSet ( ) ;
for ( int i = 0 ; i < = 0xFFFF ; + + i ) {
if ( Character . isSpace ( ( char ) i ) ) isSpace . add ( i ) ;
if ( Character . isSpaceChar ( ( char ) i ) ) isSpaceChar . add ( i ) ;
if ( Character . isWhitespace ( ( char ) i ) ) isWhitespace . add ( i ) ;
}
Utility . showSetDifferences ( " White_Space " , whitespace , " isSpace " , isSpace , true , ucd ) ;
Utility . showSetDifferences ( " White_Space " , whitespace , " isSpaceChar " , isSpaceChar , true , ucd ) ;
Utility . showSetDifferences ( " White_Space " , whitespace , " isWhitespace " , isWhitespace , true , ucd ) ;
return ;
}
if ( DEBUG ) {
checkDecomps ( ) ;
Utility . showSetNames ( " " , new UnicodeSet ( " [ \ u034F \ u00AD \ u1806[:DI:]-[:Cs:]-[:Cn:]] " ) , true , ucd ) ;
System . out . println ( " *** Extend - Cf " ) ;
generateTerminalClosure ( ) ;
GenerateWordBreakTest gwb = new GenerateWordBreakTest ( ) ;
PrintWriter systemPrintWriter = new PrintWriter ( System . out ) ;
gwb . printLine ( systemPrintWriter , " n \ u0308't " , true , true , false ) ;
systemPrintWriter . flush ( ) ;
//showSet("sepSet", GenerateSentenceBreakTest.sepSet);
//showSet("atermSet", GenerateSentenceBreakTest.atermSet);
//showSet("termSet", GenerateSentenceBreakTest.termSet);
}
if ( true ) {
GenerateBreakTest foo = new GenerateLineBreakTest ( ) ;
//foo.isBreak("(\"Go.\") (He did)", 5, true);
foo . isBreak ( " \ u4e00 \ u4300 " , 1 , true ) ;
/ *
GenerateSentenceBreakTest foo = new GenerateSentenceBreakTest ( ) ;
//foo.isBreak("(\"Go.\") (He did)", 5, true);
foo . isBreak ( " 3.4 " , 2 , true ) ;
* /
}
new GenerateGraphemeBreakTest ( ) . run ( ) ;
new GenerateWordBreakTest ( ) . run ( ) ;
new GenerateLineBreakTest ( ) . run ( ) ;
new GenerateSentenceBreakTest ( ) . run ( ) ;
//if (true) return; // cut short for now
}
* /
2002-08-08 15:38:16 +00:00
}