ICU-1126 Add title break iterator

X-SVN-Rev: 7801
2002-02-28 01:28:04 +00:00 · 2002-02-28 01:28:04 +00:00 · 13e01fb91d
commit 13e01fb91d
parent 7aadc85a12
13 changed files with 223 additions and 8 deletions
--- a/icu4c/source/common/brkiter.cpp
+++ b/icu4c/source/common/brkiter.cpp
@ -152,6 +152,28 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)

 // -------------------------------------

+/* Creates a break iterator that finds title-casing boundaries.
+ *
+ * Opens the data item named "title" of type "brk" with udata_open() and, when
+ * that succeeds, wraps the opened data in a new RuleBasedBreakIterator.
+ *
+ * @param key     The requested locale.  This body never reads it; the same
+ *                "title" item is opened whatever locale is passed.
+ * @param status  In/out error code.  If it already indicates failure on entry,
+ *                NULL is returned before any data is opened; otherwise it
+ *                carries the outcome of udata_open().
+ * @return A RuleBasedBreakIterator allocated with new, or NULL when status
+ *         indicates failure.
+ */
+BreakIterator*
+BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
+{
+    // WARNING: This routine is currently written specifically to handle only the
+    // default rules files.  This function will have to be made fully general
+    // at some time in the future!
+    BreakIterator* result = NULL;
+    static const char filename[] = "title";   // name of the data item handed to udata_open()
+
+    if (U_FAILURE(status))
+        return NULL;   // incoming status already failed: open nothing
+    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
+
+    if (!U_FAILURE(status)) {
+        result = new RuleBasedBreakIterator(file);   // NOTE(review): presumably the iterator takes over 'file' -- confirm who closes it
+    }
+
+    return result;
+}
+// -------------------------------------
+
 // Gets all the available locales that has localized text boundary data.
 const Locale*
 BreakIterator::getAvailableLocales(int32_t& count)
--- a/icu4c/source/common/rbbi.cpp
+++ b/icu4c/source/common/rbbi.cpp
@ -466,7 +466,12 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
        // to the last saved lookup-state position
        if (tables->isLookaheadState(state)) {
            if (tables->isEndState(state)) {
-                result = lookaheadResult;
+                if (lookaheadResult > 0) {
+                    result = lookaheadResult;
+                }
+                else {
+                    result = text->getIndex() + 1;
+                }
            }
            else {
                lookaheadResult = text->getIndex() + 1;
@ -658,5 +663,12 @@ BreakIterator *  RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
    return localIterator;    
 }

+#ifdef RBBI_DEBUG   // debugging aid; compiled only when RBBI_DEBUG is defined
+void RuleBasedBreakIterator::debugDumpTables() const {
+    tables->debugDumpTables();   // delegate to the tables object held in 'tables'
+}
+#endif // RBBI_DEBUG
+
+
 U_NAMESPACE_END

--- a/icu4c/source/common/rbbi_tbl.cpp
+++ b/icu4c/source/common/rbbi_tbl.cpp
@ -10,6 +10,10 @@
 #include "ucmp8.h"
 #include "cmemory.h"
 #include "rbbi_tbl.h"
+#include "unicode/unistr.h"
+#ifdef RBBI_DEBUG
+#include <stdio.h>
+#endif

 U_NAMESPACE_BEGIN

@ -146,5 +150,97 @@ UBool
 RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
    return lookaheadStates[state];
 }
+
+
+#ifdef RBBI_DEBUG
+// debugDumpTables:  debugging aid, compiled only when RBBI_DEBUG is defined.  Prints to stdout
+//   (1) per character category, the hex ranges of character values that charCategoryTable maps to it, and
+//   (2) stateTable: one row per state, one column per category; '*' marks end states, '%' look-ahead states.
+void RuleBasedBreakIteratorTables::debugDumpTables() const {
+    printf("Character Classes:\n");
+    int currentCharClass = 257;      // 257 = "no range open yet" marker (tested below); presumably outside what ucmp8_get returns -- TODO confirm
+    int startCurrentRange = 0;       // first character value of the range being accumulated
+    int initialStringLength = 0;     // length of the current class's text when the current range was started
+    char  buf[80];                   // scratch buffer for the sprintf calls below
+
+    UnicodeString *charClassRanges = new UnicodeString[numCategories];   // one output string per category; released with delete [] below
+
+    // NOTE(review): the bound "i < 0xffff" means the value 0xffff itself is never examined -- confirm this is intended.
+    for (int i = 0; i < 0xffff; i++) {
+        if ( ucmp8_get(charCategoryTable, i) != currentCharClass) {
+            if (currentCharClass != 257) {
+                // Complete the output of the previous range: append "-<last>" unless the range held a single value.
+                if (i != startCurrentRange+1) {
+                    sprintf(buf, "-%x", i-1);
+                    charClassRanges[currentCharClass].append(buf);
+                }
+                // If length mod 72 fell below its value at the start of the range, the text crossed a multiple of 72: wrap.
+                if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
+                    charClassRanges[currentCharClass].append("\n     ");
+                }
+            }
+
+            // Output the start of the new range, preceded by ", " when the class already has text.
+            currentCharClass = ucmp8_get(charCategoryTable, i);
+            startCurrentRange = i;
+            initialStringLength = charClassRanges[currentCharClass].length();
+            if (charClassRanges[currentCharClass].length() > 0)
+                charClassRanges[currentCharClass].append(", ");
+            sprintf(buf, "%x", i);
+            charClassRanges[currentCharClass].append(buf);
+        }
+    }   // NOTE(review): nothing closes the range still open here, so the final range is printed without its "-<last>" part.
+
+    for (int i=0; i<numCategories; i++) {
+        printf("%d:   ", i);
+        // Write out the chars in the UnicodeStrings, one putchar() per character.
+        //    We know we didn't put anything into them except for plain ascii chars.
+        for (int j=0; j<charClassRanges[i].length(); j++) {
+            putchar(charClassRanges[i].charAt(j));
+        }
+        putchar('\n');
+    }
+
+    delete [] charClassRanges;
+
+
+    // State table length might be too big by one, because the only indication
+    //   we have is the pointer to the start of the next item in the memory
+    //   image, the backwardsStateTable, which is 4 byte aligned.
+    //   A remainder of exactly 1 entry is treated as that padding and dropped.
+    int   stateTableLength = backwardsStateTable - stateTable;
+    if ((stateTableLength % numCategories) == 1) {
+        stateTableLength -= 1;
+    }
+
+    printf("\n\nState Table.   *: end state     %%: look ahead state\n");
+    printf("C:\t");
+    for (int i = 0; i < numCategories; i++) {   // column header: one category number per column
+        printf("%d\t", i);
+    }
+    printf("\n=================================================");
+ 
+    for (int i = 0; i < stateTableLength; i++) {
+        if (i % numCategories == 0) {           // first entry of a row: print the row prefix for state i / numCategories
+            putchar('\n');
+            if (endStates[i / numCategories])
+                putchar('*');
+            else
+                putchar(' ');
+            if (lookaheadStates[i / numCategories]) {
+                putchar('%');
+            }
+            else
+                putchar(' ');
+            printf("%d:\t", i / numCategories);
+        }
+        if (stateTable[i] == 0) {               // zero entries are shown as "." instead of a number
+            printf(".\t");
+        } else {
+            printf("%d\t", stateTable[i]);
+        }
+    }
+    printf("\n\n\n");
+}
+#endif // RBBI_DEBUG
+
 U_NAMESPACE_END

--- a/icu4c/source/common/rbbi_tbl.h
+++ b/icu4c/source/common/rbbi_tbl.h
@ -198,6 +198,14 @@ protected:
     */
    virtual UBool isLookaheadState(int32_t state) const;

+#ifdef RBBI_DEBUG
+    //
+    // Print out the character classes and the state table.
+    //    For debugging only; declared only when RBBI_DEBUG is defined.
+    //
+    void debugDumpTables() const;
+#endif // RBBI_DEBUG
+
    friend class RuleBasedBreakIterator;
    friend class DictionaryBasedBreakIterator;
 };
--- a/icu4c/source/common/ubrk.cpp
+++ b/icu4c/source/common/ubrk.cpp
@ -43,6 +43,10 @@ ubrk_open(UBreakIteratorType type,
  case UBRK_SENTENCE:
    result = BreakIterator::createSentenceInstance(Locale(locale), *status);
    break;
+
+  case UBRK_TITLE:
+    result = BreakIterator::createTitleInstance(Locale(locale), *status);
+    break;
  }

  // check for allocation error
--- a/icu4c/source/common/unicode/brkiter.h
+++ b/icu4c/source/common/unicode/brkiter.h
@ -419,6 +419,26 @@ public:
    static BreakIterator* createSentenceInstance(const Locale& where, 
                                                       UErrorCode& status);

+    /**
+     * Create BreakIterator for title-casing breaks using the specified locale.
+     * Returns an instance of a BreakIterator implementing title breaks.
+     * @param where the locale.
+     * @param status the UErrorCode& used to return status information to the
+     * user.
+     * To check whether the construction succeeded or not, you should check
+     * the value of U_SUCCESS(status).  If you wish more detailed information, you
+     * can check for informational error results which still indicate success.
+     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * example, 'de_CH' was requested, but nothing was found there, so 'de' was
+     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used; neither the requested locale nor any of its fall back locales
+     * could be found.
+     * @return A BreakIterator for title-breaks.
+     * The caller owns the returned object and is responsible for deleting it.
+     * @stable
+     */
+    static BreakIterator* createTitleInstance(const Locale& where, 
+                                                       UErrorCode& status);
+
    /**
     * Get the set of Locales for which TextBoundaries are installed
     * @param count the output parameter of number of elements in the locale list
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@ -428,6 +428,9 @@ RuleBasedBreakIterator(UDataMemory* image);
    virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                               int32_t &BufferSize,
                                               UErrorCode &status);
+#ifdef RBBI_DEBUG
+    void debugDumpTables() const;   // debugging aid; declared only when RBBI_DEBUG is defined
+#endif // RBBI_DEBUG


 protected:
--- a/icu4c/source/common/unicode/ubrk.h
+++ b/icu4c/source/common/unicode/ubrk.h
@ -41,6 +41,11 @@
 * stored as a base character and a diacritical mark. What users
 * consider to be a character can differ between languages.
 * <P>
+ * Title boundary analysis locates all positions,
+ * typically the starts of words, that should be set to Title Case
+ * when title-casing the text.
+ * <P>
+ * 
 * This is the interface for all text boundaries.
 * <P>
 * Examples:
@ -177,7 +182,9 @@ enum UBreakIteratorType {
  /** Line breaks */
  UBRK_LINE,
  /** Sentence breaks */
-  UBRK_SENTENCE
+  UBRK_SENTENCE,
+  /** Title Case breaks */
+  UBRK_TITLE
 };
 typedef enum UBreakIteratorType UBreakIteratorType;

--- a/icu4c/source/data/Makefile.in
+++ b/icu4c/source/data/Makefile.in
@ -132,7 +132,7 @@ TEST_DAT_FILES=$(TESTBUILDDIR)/test.dat

 ## BRK files
 # ALL of these files can be deleted (the following BRK files) - they are copied
-BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/word_th.brk
+BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/title.brk $(BUILDDIR)/word_th.brk
 # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted

 ## UCM files 
--- a/icu4c/source/data/makedata.mak
+++ b/icu4c/source/data/makedata.mak
@ -220,7 +220,7 @@ testdata: ucadata.dat $(TRANSLIT_FILES) $(RB_FILES)  {"$(ICUTOOLS)\genrb\$(CFG)"
 	@echo building testdata...
 	nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)"

-BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
+BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\title.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"

 #invoke pkgdata for ICU common data
 #  pkgdata will drop all output files (.dat, .dll, .lib) into the target (ICUDBLD) directory.
@ -266,6 +266,9 @@ $(BRK_FILES:.brk" =.brk"
 "$(ICUDBLD)\word.brk" : "$(ICUBRK)\wordLE.brk"
    copy "$(ICUBRK)\wordLE.brk" "$(ICUDBLD)\word.brk"

+"$(ICUDBLD)\title.brk" : "$(ICUBRK)\titleLE.brk"
+    copy "$(ICUBRK)\titleLE.brk" "$(ICUDBLD)\title.brk"
+
 "$(ICUDBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk"
    copy "$(ICUBRK)\line_thLE.brk" "$(ICUDBLD)\line_th.brk"

--- a/icu4c/source/test/intltest/rbbitst.cpp
+++ b/icu4c/source/test/intltest/rbbitst.cpp
@ -708,6 +708,33 @@ void RBBITest::TestHindiWordBreak()
    delete e;
    delete hindiWordData;
 }
+
+/* Exercises the title-casing break iterator on a short English sample.
+ *
+ * Obtains the iterator from createTitleInstance() for the default locale,
+ * reports "FAIL : in construction" and returns if status indicates failure,
+ * then passes the iterator and a Vector of string pieces to
+ * generalIteratorTest().  Both the iterator and the Vector are deleted
+ * before returning on the success path.
+ *
+ * NOTE(review): the result is cast, unchecked, to RuleBasedBreakIterator* and
+ * dereferenced without a NULL test -- confirm the factory cannot return NULL
+ * while status still indicates success.
+ */
+void RBBITest::TestTitleBreak()
+{
+    UErrorCode status= U_ZERO_ERROR;
+    RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
+    if(U_FAILURE(status)){
+          errln("FAIL : in construction");
+          return;
+    }
+    // titleI->debugDumpTables();   (debug aid; the method is declared only when RBBI_DEBUG is defined)
+
+    Vector *titleData = new Vector();   // presumably the pieces expected between successive boundaries, in order -- TODO confirm against generalIteratorTest()
+    titleData->addElement("   ");
+    titleData->addElement("This ");
+    titleData->addElement("is ");
+    titleData->addElement("a ");
+    titleData->addElement("simple ");
+    titleData->addElement("sample ");
+    titleData->addElement("sentence. ");
+    titleData->addElement("This ");
+
+    generalIteratorTest(*titleI, titleData);
+    delete titleI;
+    delete titleData;
+}
+
 /*
 //Bug: if there is no word break before and after danda when it is followed by a space
 void RBBITest::TestDanda()
@ -979,6 +1006,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
            if(exec) TestHindiCharacterBreak();                break;
        case 5: name = "TestHindiWordBreak";
            if(exec) TestHindiWordBreak();                     break;
+        case 6: name = "TestTitleBreak";
+            if(exec) TestTitleBreak();                         break;
+
 //      case 6: name = "TestDanda()";
 //           if(exec) TestDanda();                             break;
 //      case 7: name = "TestHindiCharacterWrapping()";
@ -1069,9 +1099,11 @@ Vector* RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& te
    while (p != RuleBasedBreakIterator::DONE) {
        p = bi.next();
        if (p != RuleBasedBreakIterator::DONE) {
-            if (p <= lastP)
+            if (p <= lastP) {
                errln((UnicodeString)"next() failed to move forward: next() on position "
                                + lastP + (UnicodeString)" yielded " + p);
+                break;
+            }

            text.extractBetween(lastP, p, selection);  
            result->addElement(selection);
@ -1097,16 +1129,20 @@ Vector* RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString&
    while (p != RuleBasedBreakIterator::DONE) {
        p = bi.previous();
        if (p != RuleBasedBreakIterator::DONE) {
-            if (p >= lastP)
+            if (p >= lastP) {
                errln((UnicodeString)"previous() failed to move backward: previous() on position "
                                + lastP + (UnicodeString)" yielded " + p);
+                break;
+            }
            text.extractBetween(p, lastP, selection);
            result->insertElementAt(selection, 0);
        }
        else {
-            if (lastP != 0)
+            if (lastP != 0) {
                errln((UnicodeString)"previous() returned DONE prematurely: offset was "
                                + lastP + (UnicodeString)" instead of 0");
+                break;
+            }
        }
        lastP = p;
    }
--- a/icu4c/source/test/intltest/rbbitst.h
+++ b/icu4c/source/test/intltest/rbbitst.h
@ -51,6 +51,10 @@ public:
     * Tests Hindi(Devanagiri) word iteration
     **/  
    void TestHindiWordBreak(void);
+    /**
+     * Tests Title Case break iteration
+     **/  
+    void TestTitleBreak(void);
    /**
    * Test Hindi Danda i.e make sure we have a break point before and after danda 
    **/ 
--- a/icu4c/source/tools/Makefile.in
+++ b/icu4c/source/tools/Makefile.in
@ -99,7 +99,7 @@ all-local: build-local

 DAT_FILES=uprops.dat unames.dat cnvalias.dat tz.dat
 # ALL of these files can be deleted (the following BRK files) - they are copied
-BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk word_th.brk
+BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk title.brk word_th.brk
 # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted

 DATAFILESD=$(DAT_FILES:%=$(OBJDATADIR)/%)