From 13e01fb91d2cacf3506d0421a95cccad8e62520a Mon Sep 17 00:00:00 2001
From: Andy Heninger <andy.heninger@gmail.com>
Date: Thu, 28 Feb 2002 01:28:04 +0000
Subject: [PATCH] ICU-1126 Add title break iterator

X-SVN-Rev: 7801
---
 icu4c/source/common/brkiter.cpp        | 22 ++++++
 icu4c/source/common/rbbi.cpp           | 14 +++-
 icu4c/source/common/rbbi_tbl.cpp       | 96 ++++++++++++++++++++++++++
 icu4c/source/common/rbbi_tbl.h         |  8 +++
 icu4c/source/common/ubrk.cpp           |  4 ++
 icu4c/source/common/unicode/brkiter.h  | 20 ++++++
 icu4c/source/common/unicode/rbbi.h     |  3 +
 icu4c/source/common/unicode/ubrk.h     |  9 ++-
 icu4c/source/data/Makefile.in          |  2 +-
 icu4c/source/data/makedata.mak         |  5 +-
 icu4c/source/test/intltest/rbbitst.cpp | 42 ++++++++++-
 icu4c/source/test/intltest/rbbitst.h   |  4 ++
 icu4c/source/tools/Makefile.in         |  2 +-
 13 files changed, 223 insertions(+), 8 deletions(-)

diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp
index d54d6b973e..dbc43d1b80 100644
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@@ -152,6 +152,28 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
 
 // -------------------------------------
 
+// Creates a break iterator for title casing breaks.  Returns NULL, with a
+// failure code in status, if the data cannot be opened or allocation fails.
+BreakIterator*
+BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
+{
+    // WARNING: This routine is currently written specifically to handle only the
+    // default rules files (key is ignored).  This function will have to be made
+    // fully general at some time in the future!
+    if (U_FAILURE(status))
+        return NULL;
+    UDataMemory* file = udata_open(NULL, "brk", "title", &status);
+    if (U_FAILURE(status))
+        return NULL;
+    BreakIterator* result = new RuleBasedBreakIterator(file);
+    if (result == NULL) {
+        udata_close(file);   // no iterator exists to own the data; don't leak it
+        status = U_MEMORY_ALLOCATION_ERROR;
+    }
+    return result;
+}
+// -------------------------------------
+
 // Gets all the available locales that has localized text boundary data.
 const Locale*
 BreakIterator::getAvailableLocales(int32_t& count)
diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp
index 1ac225f711..aed08985ac 100644
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@@ -466,7 +466,12 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
         // to the last saved lookup-state position
         if (tables->isLookaheadState(state)) {
             if (tables->isEndState(state)) {
-                result = lookaheadResult;
+                if (lookaheadResult > 0) {
+                    result = lookaheadResult;
+                }
+                else {
+                    result = text->getIndex() + 1;
+                }
             }
             else {
                 lookaheadResult = text->getIndex() + 1;
@@ -658,5 +663,12 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
     return localIterator;    
 }
 
+#ifdef RBBI_DEBUG
+void RuleBasedBreakIterator::debugDumpTables() const {
+    tables->debugDumpTables();   // debugging aid: just forwards to the tables object
+}
+#endif  // RBBI_DEBUG
+
+
 U_NAMESPACE_END
 
diff --git a/icu4c/source/common/rbbi_tbl.cpp b/icu4c/source/common/rbbi_tbl.cpp
index 6280f99b58..7073b73f08 100644
--- a/icu4c/source/common/rbbi_tbl.cpp
+++ b/icu4c/source/common/rbbi_tbl.cpp
@@ -10,6 +10,10 @@
 #include "ucmp8.h"
 #include "cmemory.h"
 #include "rbbi_tbl.h"
+#include "unicode/unistr.h"
+#ifdef RBBI_DEBUG
+#include <stdio.h>
+#endif
 
 U_NAMESPACE_BEGIN
 
@@ -146,5 +150,97 @@ UBool
 RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
     return lookaheadStates[state];
 }
+
+
+#ifdef RBBI_DEBUG
+//   debugDumpTables
+//      Prints the character classes (as hex ranges over U+0000..U+FFFF) and
+//      the state table to stdout.  For debugging only.
+void RuleBasedBreakIteratorTables::debugDumpTables() const {
+    printf("Character Classes:\n");
+    int currentCharClass = 257;       // 257: no range is open yet
+    int startCurrentRange = 0;
+    int initialStringLength = 0;
+    char  buf[80];
+    UnicodeString *charClassRanges = new UnicodeString[numCategories];
+
+    // Walk U+0000..U+FFFF inclusive.  i == 0x10000 is one extra pass that only
+    // closes the final range, which would otherwise be printed without its end.
+    for (int i = 0; i <= 0x10000; i++) {
+        if (i == 0x10000 || ucmp8_get(charCategoryTable, i) != currentCharClass) {
+            if (currentCharClass != 257) {
+                // Complete the output of the previous range.
+                if (i != startCurrentRange+1) {
+                    sprintf(buf, "-%x", i-1);
+                    charClassRanges[currentCharClass].append(buf);
+                }
+                if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
+                    charClassRanges[currentCharClass].append("\n     ");
+                }
+            }
+            if (i == 0x10000) break;    // extra pass: there is no new range to start
+
+            // Output the start of the new range.
+            currentCharClass = ucmp8_get(charCategoryTable, i);
+            startCurrentRange = i;
+            initialStringLength = charClassRanges[currentCharClass].length();
+            if (charClassRanges[currentCharClass].length() > 0)
+                charClassRanges[currentCharClass].append(", ");
+            sprintf(buf, "%x", i);
+            charClassRanges[currentCharClass].append(buf);
+        }
+    }
+
+    for (int i=0; i<numCategories; i++) {
+        printf("%d:   ", i);
+        // Write out the chars in the UnicodeStrings.
+        //    We know we didn't put anything into them except for plain ascii chars.
+        for (int j=0; j<charClassRanges[i].length(); j++) {
+            putchar(charClassRanges[i].charAt(j));
+        }
+        putchar('\n');
+    }
+    delete [] charClassRanges;
+
+    // State table length might be too big by one, because the only indication
+    //   we have is the  pointer to the start of the next item in the memory
+    //   image, the backwardsStateTable, which is 4 byte aligned.
+    //
+    int   stateTableLength = backwardsStateTable - stateTable;
+    if ((stateTableLength % numCategories) == 1) {
+        stateTableLength -= 1;
+    }
+
+    printf("\n\nState Table.   *: end state     %%: look ahead state\n");
+    printf("C:\t");
+    for (int i = 0; i < numCategories; i++) {
+        printf("%d\t", i);
+    }
+    printf("\n=================================================");
+
+    for (int i = 0; i < stateTableLength; i++) {
+        if (i % numCategories == 0) {
+            putchar('\n');
+            if (endStates[i / numCategories])
+                putchar('*');
+            else
+                putchar(' ');
+            if (lookaheadStates[i / numCategories]) {
+                putchar('%');
+            }
+            else
+                putchar(' ');
+            printf("%d:\t", i / numCategories);
+        }
+        if (stateTable[i] == 0) {
+            printf(".\t");
+        } else {
+            printf("%d\t", stateTable[i]);
+        }
+    }
+    printf("\n\n\n");
+}
+#endif // RBBI_DEBUG
+
 U_NAMESPACE_END
 
diff --git a/icu4c/source/common/rbbi_tbl.h b/icu4c/source/common/rbbi_tbl.h
index fa821c078b..07023a83f8 100644
--- a/icu4c/source/common/rbbi_tbl.h
+++ b/icu4c/source/common/rbbi_tbl.h
@@ -198,6 +198,14 @@ protected:
      */
     virtual UBool isLookaheadState(int32_t state) const;
 
+#ifdef RBBI_DEBUG
+    //
+    // Print out the character classes and the state table to stdout.
+    //    For debugging only; declared only when RBBI_DEBUG is defined.
+    //
+    void debugDumpTables() const;
+#endif
+
     friend class RuleBasedBreakIterator;
     friend class DictionaryBasedBreakIterator;
 };
diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp
index 904cd998e1..ecbd52e40a 100644
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@@ -43,6 +43,10 @@ ubrk_open(UBreakIteratorType type,
   case UBRK_SENTENCE:
     result = BreakIterator::createSentenceInstance(Locale(locale), *status);
     break;
+
+  case UBRK_TITLE:
+    result = BreakIterator::createTitleInstance(Locale(locale), *status);
+    break;
   }
 
   // check for allocation error
diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h
index 22a5680479..a1907348fa 100644
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@@ -419,6 +419,26 @@ public:
     static BreakIterator* createSentenceInstance(const Locale& where, 
                                                        UErrorCode& status);
 
+    /**
+     * Create BreakIterator for title-casing breaks using the specified locale.
+     * Returns an instance of a BreakIterator implementing title breaks.
+     * @param where the locale.
+     * @param status used to return status information to the user.
+     * To check whether the construction succeeded or not, you should check
+     * the value of U_SUCCESS(status).  If you wish more detailed information, you
+     * can check for informational error results which still indicate success.
+     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
+     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used; neither the requested locale nor any of its fall back locales
+     * could be found.
+     * @return A BreakIterator for title-breaks.
+     * The caller owns the returned object and is responsible for deleting it.
+     * @stable
+     */
+    static BreakIterator* createTitleInstance(const Locale& where, 
+                                                       UErrorCode& status);
+
     /**
      * Get the set of Locales for which TextBoundaries are installed
      * @param count the output parameter of number of elements in the locale list
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index 02a21ca54a..771a7d2c39 100644
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -428,6 +428,9 @@ RuleBasedBreakIterator(UDataMemory* image);
     virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                                int32_t &BufferSize,
                                                UErrorCode &status);
+#ifdef RBBI_DEBUG
+    void debugDumpTables() const;   // debugging aid; declared only when RBBI_DEBUG is defined
+#endif
 
 
 protected:
diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h
index 12d54f01ff..6e176e1c9b 100644
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@@ -41,6 +41,11 @@
  * stored as a base character and a diacritical mark. What users
  * consider to be a character can differ between languages.
  * <P>
+ * Title boundary analysis locates all positions,
+ * typically starts of words, that should be set to Title Case
+ * when title casing the text.
+ * <P>
+ * 
  * This is the interface for all text boundaries.
  * <P>
  * Examples:
@@ -177,7 +182,9 @@ enum UBreakIteratorType {
   /** Line breaks */
   UBRK_LINE,
   /** Sentence breaks */
-  UBRK_SENTENCE
+  UBRK_SENTENCE,
+  /** Title Case breaks */
+  UBRK_TITLE
 };
 typedef enum UBreakIteratorType UBreakIteratorType;
 
diff --git a/icu4c/source/data/Makefile.in b/icu4c/source/data/Makefile.in
index 07e92c2e67..8b81c48c54 100644
--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@@ -132,7 +132,7 @@ TEST_DAT_FILES=$(TESTBUILDDIR)/test.dat
 
 ## BRK files
 # ALL of these files can be deleted (the following BRK files) - they are copied
-BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/word_th.brk
+BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/title.brk $(BUILDDIR)/word_th.brk
 # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
 
 ## UCM files 
diff --git a/icu4c/source/data/makedata.mak b/icu4c/source/data/makedata.mak
index 9395379dfe..92b41914e7 100644
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@@ -220,7 +220,7 @@ testdata: ucadata.dat $(TRANSLIT_FILES) $(RB_FILES)  {"$(ICUTOOLS)\genrb\$(CFG)"
 	@echo building testdata...
 	nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)"
 
-BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
+BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\title.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
 
 #invoke pkgdata for ICU common data
 #  pkgdata will drop all output files (.dat, .dll, .lib) into the target (ICUDBLD) directory.
@@ -266,6 +266,9 @@ $(BRK_FILES:.brk" =.brk"
 "$(ICUDBLD)\word.brk" : "$(ICUBRK)\wordLE.brk"
     copy "$(ICUBRK)\wordLE.brk" "$(ICUDBLD)\word.brk"
 
+"$(ICUDBLD)\title.brk" : "$(ICUBRK)\titleLE.brk"
+    copy "$(ICUBRK)\titleLE.brk" "$(ICUDBLD)\title.brk"
+
 "$(ICUDBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk"
     copy "$(ICUBRK)\line_thLE.brk" "$(ICUDBLD)\line_th.brk"
 
diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp
index 36517050da..b49170d9f4 100644
--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@@ -708,6 +708,33 @@ void RBBITest::TestHindiWordBreak()
     delete e;
     delete hindiWordData;
 }
+
+
+void RBBITest::TestTitleBreak()
+{
+    UErrorCode status= U_ZERO_ERROR;
+    RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
+    if(U_FAILURE(status) || titleI == NULL){   // a NULL iterator must not reach *titleI below
+          errln("FAIL : in construction");
+          delete titleI;
+          return;
+    }
+    // titleI->debugDumpTables();
+
+    Vector *titleData = new Vector();
+    titleData->addElement("   ");
+    titleData->addElement("This ");
+    titleData->addElement("is ");
+    titleData->addElement("a ");
+    titleData->addElement("simple ");
+    titleData->addElement("sample ");
+    titleData->addElement("sentence. ");
+    titleData->addElement("This ");
+    generalIteratorTest(*titleI, titleData);
+    delete titleI;
+    delete titleData;
+}
+
 /*
 //Bug: if there is no word break before and after danda when it is followed by a space
 void RBBITest::TestDanda()
@@ -979,6 +1006,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
             if(exec) TestHindiCharacterBreak();                break;
         case 5: name = "TestHindiWordBreak";
             if(exec) TestHindiWordBreak();                     break;
+        case 6: name = "TestTitleBreak";
+            if(exec) TestTitleBreak();                         break;
+
 //      case 6: name = "TestDanda()";
 //           if(exec) TestDanda();                             break;
 //      case 7: name = "TestHindiCharacterWrapping()";
@@ -1069,9 +1099,11 @@ Vector* RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& te
     while (p != RuleBasedBreakIterator::DONE) {
         p = bi.next();
         if (p != RuleBasedBreakIterator::DONE) {
-            if (p <= lastP)
+            if (p <= lastP) {
                 errln((UnicodeString)"next() failed to move forward: next() on position "
                                 + lastP + (UnicodeString)" yielded " + p);
+                break;
+            }
 
             text.extractBetween(lastP, p, selection);  
             result->addElement(selection);
@@ -1097,16 +1129,20 @@ Vector* RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString&
     while (p != RuleBasedBreakIterator::DONE) {
         p = bi.previous();
         if (p != RuleBasedBreakIterator::DONE) {
-            if (p >= lastP)
+            if (p >= lastP) {
                 errln((UnicodeString)"previous() failed to move backward: previous() on position "
                                 + lastP + (UnicodeString)" yielded " + p);
+                break;
+            }
             text.extractBetween(p, lastP, selection);
             result->insertElementAt(selection, 0);
         }
         else {
-            if (lastP != 0)
+            if (lastP != 0) {
                 errln((UnicodeString)"previous() returned DONE prematurely: offset was "
                                 + lastP + (UnicodeString)" instead of 0");
+                break;
+            }
         }
         lastP = p;
     }
diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h
index 0c2f56e7b4..7ca529ad94 100644
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@@ -51,6 +51,10 @@ public:
      * Tests Hindi(Devanagiri) word iteration
      **/  
     void TestHindiWordBreak(void);
+    /**
+     * Tests Title Case break iteration
+     **/  
+    void TestTitleBreak(void);
     /**
     * Test Hindi Danda i.e make sure we have a break point before and after danda 
     **/ 
diff --git a/icu4c/source/tools/Makefile.in b/icu4c/source/tools/Makefile.in
index 336fc48826..efa9ea7942 100644
--- a/icu4c/source/tools/Makefile.in
+++ b/icu4c/source/tools/Makefile.in
@@ -99,7 +99,7 @@ all-local: build-local
 
 DAT_FILES=uprops.dat unames.dat cnvalias.dat tz.dat
 # ALL of these files can be deleted (the following BRK files) - they are copied
-BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk word_th.brk
+BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk title.brk word_th.brk
 # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
 
 DATAFILESD=$(DAT_FILES:%=$(OBJDATADIR)/%)