ICU-1126 Add Title Case Break Iterator.

Updated word and line Break Iterator rules  for CJK Extension A

X-SVN-Rev: 7831
This commit is contained in:
Andy Heninger 2002-03-01 02:37:47 +00:00
parent ce608af1c0
commit 757791498b
4 changed files with 140 additions and 14 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/rbbi/Attic/WriteTablesToFiles.java,v $
* $Date: 2002/02/16 03:05:35 $
* $Revision: 1.2 $
* $Date: 2002/03/01 02:37:47 $
* $Revision: 1.3 $
*
*****************************************************************************************
*/
@ -42,6 +42,10 @@ public class WriteTablesToFiles {
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
"sent" + suffix + ".brk"), littleEndian);
bi = BreakIterator.getTitleInstance();
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
"title" + suffix + ".brk"), littleEndian);
java.util.Locale thai = new java.util.Locale("th", "", "");
bi = BreakIterator.getWordInstance(thai);
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java,v $
* $Date: 2002/02/16 03:05:38 $
* $Revision: 1.9 $
* $Date: 2002/03/01 02:37:47 $
* $Revision: 1.10 $
*
*****************************************************************************************
*/
@ -38,7 +38,8 @@ public class BreakIteratorRules extends ListResourceBundle {
new String[] { "RuleBasedBreakIterator", // character-break iterator class
"RuleBasedBreakIterator", // word-break iterator class
"RuleBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator" } // sentence-break iterator class
"RuleBasedBreakIterator", // sentence-break iterator class
"RuleBasedBreakIterator"} // Title-Case break iterator class
},
// rules describing how to break between logical characters
@ -116,7 +117,7 @@ public class BreakIteratorRules extends ListResourceBundle {
// Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
// other letters, and digits
+ "$danda=[\u0964\u0965];"
+ "$kanji=[\u3005\u4e00-\u9fa5\uf900-\ufa2d$surr_hi_ideo$pua];"
+ "$kanji=[\u3005\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a$surr_hi_ideo$pua];"
+ "$kata=[\u3099-\u309c\u30a1-\u30fe];"
+ "$hira=[\u3041-\u309e\u30fc];"
+ "$let=[[[:L:][:Mc:]$surr_hi_let]-[$kanji$kata$hira]];"
@ -234,7 +235,7 @@ public class BreakIteratorRules extends ListResourceBundle {
// Kanji: actually includes both Kanji and Kana, except for small Kana and
// CJK diacritics
+ "$kanji=[[$surr_hi_ideo$pua\u4e00-\u9fa5\uf900-\ufa2d\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"
+ "$kanji=[[$surr_hi_ideo$pua\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"
// digits
+ "$digit=[[:Nd:][:No:]];"
@ -360,6 +361,31 @@ public class BreakIteratorRules extends ListResourceBundle {
// followed by an optional run of ending punctuation, followed by
// a sentence terminator, this is a safe place to turn around
+ "![$sent_start$lc$digit]$start*$space*$end*$term;"
}
},
// default rules for finding Title Case boundaries.
// See Unicode Technical Report #21 for more information about these rules.
{ "TitleBreakRules",
"$case_ignorable=[[:Mn:][:Me:][:Cf:][:Lm:][:Sk:]\\u0027\u00AD\u2019];"
+ "$cased=[[[:Lu:][:Lt:][:Ll:]"
+ "\u2160-\u216f" // Other Uppercase
+ "\u24b6-\u24cf"
+ "\u02b0-\u02b8" // Other Lower case
+ "\u02c0-\u02c1"
+ "\u02e0-\u02e4"
+ "\u0345\u037a"
+ "\u2170-\u217f"
+ "\u24d0-\u24e9]"
+ "-$case_ignorable];" // Remove anything that is case_ignorable
// from $cased.
+ "$not_cased=[^$cased$case_ignorable];"
// First time only, eat through any leading non-word-like stuff.
+ "[$not_cased$case_ignorable]*;"
// Match a word (a cased item), plus any following spaces or other non-cased junk,
// up to the start of the next cased item.
+ "$cased[$cased$case_ignorable]*[$not_cased]*;"
+ "!$not_cased*[$cased$case_ignorable]*$not_cased*;" // Backwards rule.
}
};
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/BreakIterator.java,v $
* $Date: 2002/02/16 03:06:03 $
* $Revision: 1.5 $
* $Date: 2002/03/01 02:37:47 $
* $Revision: 1.6 $
*
*****************************************************************************************
*/
@ -402,7 +402,8 @@ public abstract class BreakIterator implements Cloneable
private static final int WORD_INDEX = 1; //ibm.597
private static final int LINE_INDEX = 2; //ibm.597
private static final int SENTENCE_INDEX = 3; //ibm.597
private static final SoftReference[] iterCache = new SoftReference[4]; //ibm.597
private static final int TITLE_INDEX = 4;
private static final SoftReference[] iterCache = new SoftReference[5]; //ibm.597
/**
* Returns a new instance of BreakIterator that locates word boundaries.
@ -507,6 +508,30 @@ public abstract class BreakIterator implements Cloneable
"SentenceBreakDictionary"); //ibm.597
} //ibm.597
/**
* Returns a new instance of BreakIterator that locates title-case boundaries.
* This function assumes the text being analyzed is in the default locale's
* language; it simply delegates to {@link #getTitleInstance(Locale)} with
* <code>Locale.getDefault()</code>.
* @return A new instance of BreakIterator that locates title-case boundaries.
*/
public static BreakIterator getTitleInstance()
{
return getTitleInstance(Locale.getDefault());
}
/**
* Returns a new instance of BreakIterator that locates title-case boundaries.
* The iterator is obtained from getBreakInstance, passing TITLE_INDEX as the
* iterator type and "TitleBreakRules" / "TitleBreakDictionary" as the
* rule and dictionary names.
* @param where A Locale specifying the language of the text being analyzed.
* @return A new instance of BreakIterator that locates title-case boundaries.
*/
public static BreakIterator getTitleInstance(Locale where)
{
return getBreakInstance(where,
TITLE_INDEX,
"TitleBreakRules",
"TitleBreakDictionary");
}
private static BreakIterator getBreakInstance(Locale where, //ibm.597
int type, //ibm.597
String rulesName, //ibm.597

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java,v $
* $Date: 2002/02/25 22:43:58 $
* $Revision: 1.17 $
* $Date: 2002/03/01 02:37:47 $
* $Revision: 1.18 $
*
*****************************************************************************************
*/
@ -241,7 +241,7 @@ import java.io.*;
* &nbsp; For examples, see the resource data (which is annotated).</p>
*
* @author Richard Gillam
* $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.17 $ $Date: 2002/02/25 22:43:58 $
* $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.18 $ $Date: 2002/03/01 02:37:47 $
*/
public class RuleBasedBreakIterator extends BreakIterator {
@ -394,6 +394,77 @@ public class RuleBasedBreakIterator extends BreakIterator {
return description.hashCode();
}
/**
 * Dumps a more-or-less human readable form of the complete state table
 * and the character class definitions to <code>System.out</code>.
 * <p>
 * The first section lists, for each character category, the ranges of
 * char values (in hex) that <code>charCategoryTable</code> maps to that
 * category.  All char values from 0x0000 through 0xffff inclusive are
 * scanned, and the last range is closed just like every other range.
 * <p>
 * The second section prints <code>stateTable</code> one state per row and
 * one category per column.  A row is prefixed with "*" when the state is
 * an end state and with "%" when it is a look-ahead state; a table entry
 * of zero is printed as ".".
 */
public void debugDumpTables() {
    // Sentinel meaning "no range has been opened yet".
    // NOTE(review): assumes charCategoryTable.elementAt() never yields 257 -- confirm.
    final int noClass = 257;
    // One past the last char value; used to close the final open range.
    final int charLimit = 0x10000;

    System.out.println("Character Classes:");
    int currentCharClass = noClass;
    int startCurrentRange = 0;
    int initialStringLength = 0;
    StringBuffer[] charClassRanges = new StringBuffer[numCategories];
    for (int i = 0; i < numCategories; i++) {
        charClassRanges[i] = new StringBuffer();
    }

    for (int i = 0; i < charLimit; i++) {
        int charClass = (int)charCategoryTable.elementAt((char)i);
        if (charClass != currentCharClass) {
            if (currentCharClass != noClass) {
                // Complete the output of the previous range.  A range that
                // covers a single char is left as just its start value.
                if (i != startCurrentRange + 1) {
                    charClassRanges[currentCharClass].append("-" + Integer.toHexString(i - 1));
                }
                // Start a new output line once this range pushed the text
                // for its category across a 72-column boundary.
                if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
                    charClassRanges[currentCharClass].append("\n ");
                }
            }

            // Output the start of the new range.
            currentCharClass = charClass;
            startCurrentRange = i;
            initialStringLength = charClassRanges[currentCharClass].length();
            if (charClassRanges[currentCharClass].length() > 0) {
                charClassRanges[currentCharClass].append(", ");
            }
            charClassRanges[currentCharClass].append(Integer.toHexString(i));
        }
    }

    // The loop only closes a range when the category changes, so the range
    // that runs up to 0xffff is still open here and must be closed explicitly.
    if (currentCharClass != noClass && charLimit != startCurrentRange + 1) {
        charClassRanges[currentCharClass].append("-" + Integer.toHexString(charLimit - 1));
    }

    for (int i = 0; i < numCategories; i++) {
        System.out.println(i + ": " + charClassRanges[i]);
    }

    System.out.println("\n\nState Table. *: end state %: look ahead state");
    System.out.print("C:\t");
    for (int i = 0; i < numCategories; i++) {
        System.out.print(Integer.toString(i) + "\t");
    }
    System.out.println();
    System.out.print("=================================================");

    // stateTable is walked as a flat array: entry i belongs to state
    // (i / numCategories) and category (i % numCategories).
    for (int i = 0; i < stateTable.length; i++) {
        if (i % numCategories == 0) {
            // First column of a state: end the previous output line and
            // print the row header (flags plus state number).
            System.out.println();
            if (endStates[i / numCategories]) {
                System.out.print("*");
            } else {
                System.out.print(" ");
            }
            if (lookaheadStates[i / numCategories]) {
                System.out.print("%");
            } else {
                System.out.print(" ");
            }
            System.out.print(Integer.toString(i / numCategories) + ":\t");
        }
        if (stateTable[i] == 0) {
            System.out.print(".\t");
        } else {
            System.out.print(Integer.toString(stateTable[i]) + "\t");
        }
    }
    System.out.println();
}
// DELETE ME BEFORE RELEASE!!!
public void writeTablesToFile(FileOutputStream file, boolean littleEndian) throws IOException {
// NOTE: The format being written here is designed to be compatible with