ICU-1126 Add Title Case Break Iterator.

Updated word and line Break Iterator rules for CJK Extension A X-SVN-Rev: 7831
2002-03-01 02:37:47 +00:00 · 2002-03-01 02:37:47 +00:00 · 757791498b
commit 757791498b
parent ce608af1c0
4 changed files with 140 additions and 14 deletions
--- a/icu4j/src/com/ibm/icu/dev/tool/rbbi/WriteTablesToFiles.java
+++ b/icu4j/src/com/ibm/icu/dev/tool/rbbi/WriteTablesToFiles.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/rbbi/Attic/WriteTablesToFiles.java,v $ 
- * $Date: 2002/02/16 03:05:35 $ 
- * $Revision: 1.2 $
+ * $Date: 2002/03/01 02:37:47 $ 
+ * $Revision: 1.3 $
 *
 *****************************************************************************************
 */
@ -42,6 +42,10 @@ public class WriteTablesToFiles {
        ((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
                    "sent" + suffix + ".brk"), littleEndian);

+        bi = BreakIterator.getTitleInstance();
+        ((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
+                    "title" + suffix + ".brk"), littleEndian);
+
        java.util.Locale thai = new java.util.Locale("th", "", "");
        bi = BreakIterator.getWordInstance(thai);
        ((RuleBasedBreakIterator)bi).writeTablesToFile(new FileOutputStream(
--- a/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java
+++ b/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/data/BreakIteratorRules.java,v $
- * $Date: 2002/02/16 03:05:38 $
- * $Revision: 1.9 $
+ * $Date: 2002/03/01 02:37:47 $
+ * $Revision: 1.10 $
 *
 *****************************************************************************************
 */
@ -38,7 +38,8 @@ public class BreakIteratorRules extends ListResourceBundle {
            new String[] { "RuleBasedBreakIterator",     // character-break iterator class
                           "RuleBasedBreakIterator",     // word-break iterator class
                           "RuleBasedBreakIterator",     // line-break iterator class
-                           "RuleBasedBreakIterator" }    // sentence-break iterator class
+                           "RuleBasedBreakIterator",     // sentence-break iterator class
+                           "RuleBasedBreakIterator"}     // Title-Case break iterator class
        },

        // rules describing how to break between logical characters
@ -116,7 +117,7 @@ public class BreakIteratorRules extends ListResourceBundle {
            // Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
            // other letters, and digits
            + "$danda=[\u0964\u0965];"
-            + "$kanji=[\u3005\u4e00-\u9fa5\uf900-\ufa2d$surr_hi_ideo$pua];"
+            + "$kanji=[\u3005\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a$surr_hi_ideo$pua];"
            + "$kata=[\u3099-\u309c\u30a1-\u30fe];"
            + "$hira=[\u3041-\u309e\u30fc];"
            + "$let=[[[:L:][:Mc:]$surr_hi_let]-[$kanji$kata$hira]];"
@ -234,7 +235,7 @@ public class BreakIteratorRules extends ListResourceBundle {

            // Kanji: actually includes both Kanji and Kana, except for small Kana and
            // CJK diacritics
-            + "$kanji=[[$surr_hi_ideo$pua\u4e00-\u9fa5\uf900-\ufa2d\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"
+            + "$kanji=[[$surr_hi_ideo$pua\u3400-\u4db5\u4e00-\u9fa5\uf900-\ufa6a\u3041-\u3094\u30a1-\u30fa]-[$post_word$_ignore_]];"

            // digits
            + "$digit=[[:Nd:][:No:]];"
@ -360,6 +361,31 @@ public class BreakIteratorRules extends ListResourceBundle {
            // followed by an optional run of ending punctuation, followed by
            // a sentence terminator, this is a safe place to turn around
            + "![$sent_start$lc$digit]$start*$space*$end*$term;"
-        }
+        },
+
+		// default rules for finding Title Case boundaries.
+		//   See Unicode Technical Report #21 for more information about these rules.
+		{ "TitleBreakRules",
+               "$case_ignorable=[[:Mn:][:Me:][:Cf:][:Lm:][:Sk:]\\u0027\u00AD\u2019];"
+             + "$cased=[[[:Lu:][:Lt:][:Ll:]"
+             +     "\u2160-\u216f"                       // Other Uppercase
+             +     "\u24b6-\u24cf"
+             +     "\u02b0-\u02b8"                       // Other Lower case
+             +     "\u02c0-\u02c1"
+             +     "\u02e0-\u02e4"
+             +     "\u0345\u037a"
+             +     "\u2170-\u217f"
+             +     "\u24d0-\u24e9]"
+             +     "-$case_ignorable];"                  // Remove anything that is case_ignorable
+                                                         //   from $cased.
+		     + "$not_cased=[^$cased$case_ignorable];"
+             // First time only, eat through any leading non-word-like stuff.
+		     + "[$not_cased$case_ignorable]*;"
+             //  Match a word (a cased item), plus any following spaces or other non-cased junk,
+             //    up to the start of the next cased item.
+             + "$cased[$cased$case_ignorable]*[$not_cased]*;"
+             + "!$not_cased*[$cased$case_ignorable]*$not_cased*;"   // Backwards rule.
+	    }
+
    };
 }
--- a/icu4j/src/com/ibm/icu/text/BreakIterator.java
+++ b/icu4j/src/com/ibm/icu/text/BreakIterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/BreakIterator.java,v $ 
- * $Date: 2002/02/16 03:06:03 $ 
- * $Revision: 1.5 $
+ * $Date: 2002/03/01 02:37:47 $ 
+ * $Revision: 1.6 $
 *
 *****************************************************************************************
 */
@ -402,7 +402,8 @@ public abstract class BreakIterator implements Cloneable
    private static final int WORD_INDEX = 1;                        //ibm.597
    private static final int LINE_INDEX = 2;                        //ibm.597
    private static final int SENTENCE_INDEX = 3;                    //ibm.597
-    private static final SoftReference[] iterCache = new SoftReference[4];  //ibm.597
+    private static final int TITLE_INDEX = 4;
+    private static final SoftReference[] iterCache = new SoftReference[5];  //ibm.597

    /**
     * Returns a new instance of BreakIterator that locates word boundaries.
@ -507,6 +508,30 @@ public abstract class BreakIterator implements Cloneable
                                "SentenceBreakDictionary");         //ibm.597
    }                                                               //ibm.597

+    /**
+     * Returns a new instance of BreakIterator that locates title-case boundaries.
+     * This function assumes the text being analyzed is in the default locale's
+     * language.
+     * @return A new instance of BreakIterator that locates title-case boundaries.
+     */
+    public static BreakIterator getTitleInstance()
+    {
+        return getTitleInstance(Locale.getDefault());
+    }
+
+    /**
+     * Returns a new instance of BreakIterator that locates title-case boundaries.
+     * @param where A Locale specifying the language of the text being analyzed.
+     * @return A new instance of BreakIterator that locates title-case boundaries.
+     */
+    public static BreakIterator getTitleInstance(Locale where)
+    {
+        return getBreakInstance(where,
+                                TITLE_INDEX,
+                                "TitleBreakRules",
+                                "TitleBreakDictionary");
+    }
+
    private static BreakIterator getBreakInstance(Locale where,     //ibm.597
                                                  int type,         //ibm.597
                                                  String rulesName, //ibm.597
--- a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@ -5,8 +5,8 @@
 *******************************************************************************
 *
 * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java,v $
- * $Date: 2002/02/25 22:43:58 $
- * $Revision: 1.17 $
+ * $Date: 2002/03/01 02:37:47 $
+ * $Revision: 1.18 $
 *
 *****************************************************************************************
 */
@ -241,7 +241,7 @@ import java.io.*;
 * &nbsp; For examples, see the resource data (which is annotated).</p>
 *
 * @author Richard Gillam
- * $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.17 $ $Date: 2002/02/25 22:43:58 $
+ * $RCSfile: RuleBasedBreakIterator.java,v $ $Revision: 1.18 $ $Date: 2002/03/01 02:37:47 $
 */
 public class RuleBasedBreakIterator extends BreakIterator {

@ -394,6 +394,77 @@ public class RuleBasedBreakIterator extends BreakIterator {
        return description.hashCode();
    }

+/**
+ * Dumps a more-or-less human readable form of the complete state table
+ * and character class definitions to System.out.
+ */
+public void debugDumpTables() {
+    System.out.println("Character Classes:");
+    int currentCharClass = 257;      // marker meaning "no range is open yet"
+    int startCurrentRange = 0;
+    int initialStringLength = 0;
+
+    StringBuffer[] charClassRanges = new StringBuffer[numCategories];
+    for (int i=0; i<numCategories; i++) {
+        charClassRanges[i] = new StringBuffer();
+    }
+
+    // Visit U+0000 through U+FFFF inclusive.  The extra step at 0x10000 is given
+    // the 257 marker as its class, which forces the last open range to be closed.
+    for (int i = 0; i <= 0x10000; i++) {
+        int charClass = (i <= 0xffff) ? (int)charCategoryTable.elementAt((char)i) : 257;
+        if (charClass != currentCharClass) {
+            if (currentCharClass != 257) {
+                // Complete the output of the previous range.
+                if (i != startCurrentRange+1) {
+                    charClassRanges[currentCharClass].append("-"+ Integer.toHexString(i-1));
+                }
+                if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
+                    charClassRanges[currentCharClass].append("\n     ");
+                }
+            }
+            if (i > 0xffff) {
+                break;               // nothing starts after U+FFFF
+            }
+
+            // Output the start of the new range.
+            currentCharClass = charClass;
+            startCurrentRange = i;
+            initialStringLength = charClassRanges[currentCharClass].length();
+            if (charClassRanges[currentCharClass].length() > 0)
+                charClassRanges[currentCharClass].append(", ");
+            charClassRanges[currentCharClass].append(Integer.toHexString(i));
+        }
+    }
+
+    for (int i=0; i<numCategories; i++) {
+        System.out.println(i + ":     " + charClassRanges[i]);
+    }
+
+    System.out.println("\n\nState Table.   *: end state     %: look ahead state");
+    System.out.print("C:\t");
+    for (int i = 0; i < numCategories; i++)
+        System.out.print(Integer.toString(i) + "\t");
+    System.out.println(); System.out.print("=================================================");
+    for (int i = 0; i < stateTable.length; i++) {
+        if (i % numCategories == 0) {
+            // Start of a row: the two flag columns first, then the state number.
+            int state = i / numCategories;
+            System.out.println();
+            System.out.print(endStates[state] ? "*" : " ");
+            System.out.print(lookaheadStates[state] ? "%" : " ");
+            System.out.print(Integer.toString(state) + ":\t");
+        }
+        if (stateTable[i] == 0) {
+            System.out.print(".\t");         // 0 entries are shown as "."
+        } else {
+            System.out.print(Integer.toString(stateTable[i]) + "\t");
+        }
+    }
+    System.out.println();
+}
+
+
 // DELETE ME BEFORE RELEASE!!!
 public void writeTablesToFile(FileOutputStream file, boolean littleEndian) throws IOException {
    // NOTE: The format being written here is designed to be compatible with