ICU-1126 Add Title Case Break Iterator.
Updated word and line Break Iterator rules for CJK Extension A X-SVN-Rev: 7831
This commit is contained in:
parent
ce608af1c0
commit
757791498b
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/rbbi/Attic/WriteTablesToFiles.java,v $
|
||||
* $Date: 2002/02/16 03:05:35 $
|
||||
* $Revision: 1.2 $
|
||||
* $Date: 2002/03/01 02:37:47 $
|
||||
* $Revision: 1.3 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -42,6 +42,10 @@ public class WriteTablesToFiles {
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
"sent" + suffix + ".brk"), littleEndian);
|
||||
|
||||
bi = BreakIterator.getTitleInstance();
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
"title" + suffix + ".brk"), littleEndian);
|
||||
|
||||
java.util.Locale thai = new java.util.Locale("th", "", "");
|
||||
bi = BreakIterator.getWordInstance(thai);
|
||||
((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java,v $
|
||||
* $Date: 2002/02/16 03:05:38 $
|
||||
* $Revision: 1.9 $
|
||||
* $Date: 2002/03/01 02:37:47 $
|
||||
* $Revision: 1.10 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -38,7 +38,8 @@ public class BreakIteratorRules extends ListResourceBundle {
|
||||
new String[] { "RuleBasedBreakIterator", // character-break iterator class
|
||||
"RuleBasedBreakIterator", // word-break iterator class
|
||||
"RuleBasedBreakIterator", // line-break iterator class
|
||||
"RuleBasedBreakIterator" } // sentence-break iterator class
|
||||
"RuleBasedBreakIterator", // sentence-break iterator class
|
||||
"RuleBasedBreakIterator"} // Title-Case break iterator class
|
||||
},
|
||||
|
||||
// rules describing how to break between logical characters
|
||||
@ -116,7 +117,7 @@ public class BreakIteratorRules extends ListResourceBundle {
|
||||
// Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
|
||||
// other letters, and digits
|
||||
+ "$danda=[\u0964\u0965];"
|
||||
+ "$kanji=[\u3005\u4e00-\u9fa5\uf900-\ufa2d$surr_hi_ideo$pua];"
|
||||
+ "$kanji=[\u3005\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a$surr_hi_ideo$pua];"
|
||||
+ "$kata=[\u3099-\u309c\u30a1-\u30fe];"
|
||||
+ "$hira=[\u3041-\u309e\u30fc];"
|
||||
+ "$let=[[[:L:][:Mc:]$surr_hi_let]-[$kanji$kata$hira]];"
|
||||
@ -234,7 +235,7 @@ public class BreakIteratorRules extends ListResourceBundle {
|
||||
|
||||
// Kanji: actually includes both Kanji and Kana, except for small Kana and
|
||||
// CJK diacritics
|
||||
+ "$kanji=[[$surr_hi_ideo$pua\u4e00-\u9fa5\uf900-\ufa2d\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"
|
||||
+ "$kanji=[[$surr_hi_ideo$pua\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"
|
||||
|
||||
// digits
|
||||
+ "$digit=[[:Nd:][:No:]];"
|
||||
@ -360,6 +361,31 @@ public class BreakIteratorRules extends ListResourceBundle {
|
||||
// followed by an optional run of ending punctuation, followed by
|
||||
// a sentence terminator, this is a safe place to turn around
|
||||
+ "![$sent_start$lc$digit]$start*$space*$end*$term;"
|
||||
},
|
||||
|
||||
// default rules for finding Title Case boundaries.
|
||||
// See Unicode Technical Report #21 for more information about these rules.
|
||||
{ "TitleBreakRules",
|
||||
"$case_ignorable=[[:Mn:][:Me:][:Cf:][:Lm:][:Sk:]\\u0027\u00AD\u2019];"
|
||||
+ "$cased=[[[:Lu:][:Lt:][:Ll:]"
|
||||
+ "\u2160-\u216f" // Other Uppercase
|
||||
+ "\u24b6-\u24cf"
|
||||
+ "\u02b0-\u02b8" // Other Lower case
|
||||
+ "\u02c0-\u02c1"
|
||||
+ "\u02e0-\u02e4"
|
||||
+ "\u0345\u037a"
|
||||
+ "\u2170-\u217f"
|
||||
+ "\u24d0-\u24e9]"
|
||||
+ "-$case_ignorable];" // Remove anything that is case_ignorable
|
||||
// from $cased.
|
||||
+ "$not_cased=[^$cased$case_ignorable];"
|
||||
// First time only, eat through any leading non-word-like stuff.
|
||||
+ "[$not_cased$case_ignorable]*;"
|
||||
// Match a word (a cased item), plus any following spaces or other non-cased junk,
|
||||
// up to the start of the next cased item.
|
||||
+ "$cased[$cased$case_ignorable]*[$not_cased]*;"
|
||||
+ "!$not_cased*[$cased$case_ignorable]*$not_cased*;" // Backwards rule.
|
||||
}
|
||||
|
||||
};
|
||||
}
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/BreakIterator.java,v $
|
||||
* $Date: 2002/02/16 03:06:03 $
|
||||
* $Revision: 1.5 $
|
||||
* $Date: 2002/03/01 02:37:47 $
|
||||
* $Revision: 1.6 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -402,7 +402,8 @@ public abstract class BreakIterator implements Cloneable
|
||||
private static final int WORD_INDEX = 1; //ibm.597
|
||||
private static final int LINE_INDEX = 2; //ibm.597
|
||||
private static final int SENTENCE_INDEX = 3; //ibm.597
|
||||
private static final SoftReference[] iterCache = new SoftReference[4]; //ibm.597
|
||||
private static final int TITLE_INDEX = 4;
|
||||
private static final SoftReference[] iterCache = new SoftReference[5]; //ibm.597
|
||||
|
||||
/**
|
||||
* Returns a new instance of BreakIterator that locates word boundaries.
|
||||
@ -507,6 +508,30 @@ public abstract class BreakIterator implements Cloneable
|
||||
"SentenceBreakDictionary"); //ibm.597
|
||||
} //ibm.597
|
||||
|
||||
/**
|
||||
* Returns a new instance of BreakIterator that locates title boundaries.
|
||||
* This function assumes the text being analyzed is in the default locale's
|
||||
* language.
|
||||
* @return A new instance of BreakIterator that locates title boundaries.
|
||||
*/
|
||||
public static BreakIterator getTitleInstance()
|
||||
{
|
||||
return getTitleInstance(Locale.getDefault());
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a new instance of BreakIterator that locates title boundaries.
|
||||
* @param where A Locale specifying the language of the text being analyzed.
|
||||
* @return A new instance of BreakIterator that locates title boundaries.
|
||||
*/
|
||||
public static BreakIterator getTitleInstance(Locale where)
|
||||
{
|
||||
return getBreakInstance(where,
|
||||
TITLE_INDEX,
|
||||
"TitleBreakRules",
|
||||
"TitleBreakDictionary");
|
||||
}
|
||||
|
||||
private static BreakIterator getBreakInstance(Locale where, //ibm.597
|
||||
int type, //ibm.597
|
||||
String rulesName, //ibm.597
|
||||
|
@ -5,8 +5,8 @@
|
||||
*******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java,v $
|
||||
* $Date: 2002/02/25 22:43:58 $
|
||||
* $Revision: 1.17 $
|
||||
* $Date: 2002/03/01 02:37:47 $
|
||||
* $Revision: 1.18 $
|
||||
*
|
||||
*****************************************************************************************
|
||||
*/
|
||||
@ -241,7 +241,7 @@ import java.io.*;
|
||||
* For examples, see the resource data (which is annotated).</p>
|
||||
*
|
||||
* @author Richard Gillam
|
||||
* $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.17 $ $Date: 2002/02/25 22:43:58 $
|
||||
* $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.18 $ $Date: 2002/03/01 02:37:47 $
|
||||
*/
|
||||
public class RuleBasedBreakIterator extends BreakIterator {
|
||||
|
||||
@ -394,6 +394,77 @@ public class RuleBasedBreakIterator extends BreakIterator {
|
||||
return description.hashCode();
|
||||
}
|
||||
|
||||
//
|
||||
// Dump out a more-or-less human readable form of the
|
||||
// complete state table and character class definitions
|
||||
//
|
||||
public void debugDumpTables() {
|
||||
System.out.println("Character Classes:");
|
||||
int currentCharClass = 257;
|
||||
int startCurrentRange = 0;
|
||||
int initialStringLength = 0;
|
||||
|
||||
StringBuffer[] charClassRanges = new StringBuffer[numCategories];
|
||||
for (int i=0; i<numCategories; i++) {
|
||||
charClassRanges[i] = new StringBuffer();
|
||||
}
|
||||
|
||||
for (int i = 0; i < 0xffff; i++) {
|
||||
if ((int)charCategoryTable.elementAt((char)i) != currentCharClass) {
|
||||
if (currentCharClass != 257) {
|
||||
// Complete the output of the previous range.
|
||||
if (i != startCurrentRange+1) {
|
||||
charClassRanges[currentCharClass].append("-"+ Integer.toHexString(i-1));
|
||||
}
|
||||
if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
|
||||
charClassRanges[currentCharClass].append("\n ");
|
||||
}
|
||||
}
|
||||
|
||||
// Output the start of the new range.
|
||||
currentCharClass = (int)charCategoryTable.elementAt((char)i);
|
||||
startCurrentRange = i;
|
||||
initialStringLength = charClassRanges[currentCharClass].length();
|
||||
if (charClassRanges[currentCharClass].length() > 0)
|
||||
charClassRanges[currentCharClass].append(", ");
|
||||
charClassRanges[currentCharClass].append(Integer.toHexString(i));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i=0; i<numCategories; i++) {
|
||||
System.out.println(i + ": " + charClassRanges[i]);
|
||||
}
|
||||
|
||||
|
||||
System.out.println("\n\nState Table. *: end state %: look ahead state");
|
||||
System.out.print("C:\t");
|
||||
for (int i = 0; i < numCategories; i++)
|
||||
System.out.print(Integer.toString(i) + "\t");
|
||||
System.out.println(); System.out.print("=================================================");
|
||||
for (int i = 0; i < stateTable.length; i++) {
|
||||
if (i % numCategories == 0) {
|
||||
System.out.println();
|
||||
if (endStates[i / numCategories])
|
||||
System.out.print("*");
|
||||
else
|
||||
System.out.print(" ");
|
||||
if (lookaheadStates[i / numCategories]) {
|
||||
System.out.print("%");
|
||||
}
|
||||
else
|
||||
System.out.print(" ");
|
||||
System.out.print(Integer.toString(i / numCategories) + ":\t");
|
||||
}
|
||||
if (stateTable[i] == 0) {
|
||||
System.out.print(".\t");
|
||||
} else {
|
||||
System.out.print(Integer.toString(stateTable[i]) + "\t");
|
||||
}
|
||||
}
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
|
||||
// DELETE ME BEFORE RELEASE!!!
|
||||
public void writeTablesToFile(FileOutputStream file, boolean littleEndian) throws IOException {
|
||||
// NOTE: The format being written here is designed to be compatible with
|
||||
|
Loading…
Reference in New Issue
Block a user