From 13e01fb91d2cacf3506d0421a95cccad8e62520a Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 28 Feb 2002 01:28:04 +0000 Subject: [PATCH] ICU-1126 Add title break iterator X-SVN-Rev: 7801 --- icu4c/source/common/brkiter.cpp | 22 ++++++ icu4c/source/common/rbbi.cpp | 14 +++- icu4c/source/common/rbbi_tbl.cpp | 96 ++++++++++++++++++++++++++ icu4c/source/common/rbbi_tbl.h | 8 +++ icu4c/source/common/ubrk.cpp | 4 ++ icu4c/source/common/unicode/brkiter.h | 20 ++++++ icu4c/source/common/unicode/rbbi.h | 3 + icu4c/source/common/unicode/ubrk.h | 9 ++- icu4c/source/data/Makefile.in | 2 +- icu4c/source/data/makedata.mak | 5 +- icu4c/source/test/intltest/rbbitst.cpp | 42 ++++++++++- icu4c/source/test/intltest/rbbitst.h | 4 ++ icu4c/source/tools/Makefile.in | 2 +- 13 files changed, 223 insertions(+), 8 deletions(-) diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index d54d6b973e..dbc43d1b80 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -152,6 +152,28 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status) // ------------------------------------- +// Creates a simple text boundary for title casing breaks. +BreakIterator* +BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status) +{ + // WARNING: This routine is currently written specifically to handle only the + // default rules files. This function will have to be made fully general + // at some time in the future! + BreakIterator* result = NULL; + static const char filename[] = "title"; + + if (U_FAILURE(status)) + return NULL; + UDataMemory* file = udata_open(NULL, "brk", filename, &status); + + if (!U_FAILURE(status)) { + result = new RuleBasedBreakIterator(file); + } + + return result; +} +// ------------------------------------- + // Gets all the available locales that has localized text boundary data. 
const Locale* BreakIterator::getAvailableLocales(int32_t& count) diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index 1ac225f711..aed08985ac 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -466,7 +466,12 @@ int32_t RuleBasedBreakIterator::handleNext(void) { // to the last saved lookup-state position if (tables->isLookaheadState(state)) { if (tables->isEndState(state)) { - result = lookaheadResult; + if (lookaheadResult > 0) { + result = lookaheadResult; + } + else { + result = text->getIndex() + 1; + } } else { lookaheadResult = text->getIndex() + 1; @@ -658,5 +663,12 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer, return localIterator; } +#ifdef RBBI_DEBUG +void RuleBasedBreakIterator::debugDumpTables() const { + tables->debugDumpTables(); +} +#endif + + U_NAMESPACE_END diff --git a/icu4c/source/common/rbbi_tbl.cpp b/icu4c/source/common/rbbi_tbl.cpp index 6280f99b58..7073b73f08 100644 --- a/icu4c/source/common/rbbi_tbl.cpp +++ b/icu4c/source/common/rbbi_tbl.cpp @@ -10,6 +10,10 @@ #include "ucmp8.h" #include "cmemory.h" #include "rbbi_tbl.h" +#include "unicode/unistr.h" +#ifdef RBBI_DEBUG +#include <stdio.h> +#endif U_NAMESPACE_BEGIN @@ -146,5 +150,97 @@ UBool RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const { return lookaheadStates[state]; } + + +#ifdef RBBI_DEBUG +// +// debugDumpTables +// +void RuleBasedBreakIteratorTables::debugDumpTables() const { + printf("Character Classes:\n"); + int currentCharClass = 257; + int startCurrentRange = 0; + int initialStringLength = 0; + char buf[80]; + + UnicodeString *charClassRanges = new UnicodeString[numCategories]; + + for (int i = 0; i < 0xffff; i++) { + if ( ucmp8_get(charCategoryTable, i) != currentCharClass) { + if (currentCharClass != 257) { + // Complete the output of the previous range. 
+ if (i != startCurrentRange+1) { + sprintf(buf, "-%x", i-1); + charClassRanges[currentCharClass].append(buf); + } + if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) { + charClassRanges[currentCharClass].append("\n "); + } + } + + // Output the start of the new range. + currentCharClass = ucmp8_get(charCategoryTable, i); + startCurrentRange = i; + initialStringLength = charClassRanges[currentCharClass].length(); + if (charClassRanges[currentCharClass].length() > 0) + charClassRanges[currentCharClass].append(", "); + sprintf(buf, "%x", i); + charClassRanges[currentCharClass].append(buf); + } + } + + for (int i=0; i + * Title boundary analysis locates all positions, + * typically starts of words, that should be set to Title Case + * when title casing the text. + *

+ * * This is the interface for all text boundaries. *

* Examples: @@ -177,7 +182,9 @@ enum UBreakIteratorType { /** Line breaks */ UBRK_LINE, /** Sentence breaks */ - UBRK_SENTENCE + UBRK_SENTENCE, + /** Title Case breaks */ + UBRK_TITLE }; typedef enum UBreakIteratorType UBreakIteratorType; diff --git a/icu4c/source/data/Makefile.in b/icu4c/source/data/Makefile.in index 07e92c2e67..8b81c48c54 100644 --- a/icu4c/source/data/Makefile.in +++ b/icu4c/source/data/Makefile.in @@ -132,7 +132,7 @@ TEST_DAT_FILES=$(TESTBUILDDIR)/test.dat ## BRK files # ALL of these files can be deleted (the following BRK files) - they are copied -BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/word_th.brk +BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/title.brk $(BUILDDIR)/word_th.brk # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted ## UCM files diff --git a/icu4c/source/data/makedata.mak b/icu4c/source/data/makedata.mak index 9395379dfe..92b41914e7 100644 --- a/icu4c/source/data/makedata.mak +++ b/icu4c/source/data/makedata.mak @@ -220,7 +220,7 @@ testdata: ucadata.dat $(TRANSLIT_FILES) $(RB_FILES) {"$(ICUTOOLS)\genrb\$(CFG)" @echo building testdata... nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)" -BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk" +BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\title.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk" #invoke pkgdata for ICU common data # pkgdata will drop all output files (.dat, .dll, .lib) into the target (ICUDBLD) directory. 
@@ -266,6 +266,9 @@ $(BRK_FILES:.brk" =.brk" "$(ICUDBLD)\word.brk" : "$(ICUBRK)\wordLE.brk" copy "$(ICUBRK)\wordLE.brk" "$(ICUDBLD)\word.brk" +"$(ICUDBLD)\title.brk" : "$(ICUBRK)\titleLE.brk" + copy "$(ICUBRK)\titleLE.brk" "$(ICUDBLD)\title.brk" + "$(ICUDBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk" copy "$(ICUBRK)\line_thLE.brk" "$(ICUDBLD)\line_th.brk" diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 36517050da..b49170d9f4 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -708,6 +708,33 @@ void RBBITest::TestHindiWordBreak() delete e; delete hindiWordData; } + + +void RBBITest::TestTitleBreak() +{ + UErrorCode status= U_ZERO_ERROR; + RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status); + if(U_FAILURE(status)){ + errln("FAIL : in construction"); + return; + } + // titleI->debugDumpTables(); + + Vector *titleData = new Vector(); + titleData->addElement(" "); + titleData->addElement("This "); + titleData->addElement("is "); + titleData->addElement("a "); + titleData->addElement("simple "); + titleData->addElement("sample "); + titleData->addElement("sentence. 
"); + titleData->addElement("This "); + + generalIteratorTest(*titleI, titleData); + delete titleI; + delete titleData; +} + /* //Bug: if there is no word break before and after danda when it is followed by a space void RBBITest::TestDanda() @@ -979,6 +1006,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha if(exec) TestHindiCharacterBreak(); break; case 5: name = "TestHindiWordBreak"; if(exec) TestHindiWordBreak(); break; + case 6: name = "TestTitleBreak"; + if(exec) TestTitleBreak(); break; + // case 6: name = "TestDanda()"; // if(exec) TestDanda(); break; // case 7: name = "TestHindiCharacterWrapping()"; @@ -1069,9 +1099,11 @@ Vector* RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& te while (p != RuleBasedBreakIterator::DONE) { p = bi.next(); if (p != RuleBasedBreakIterator::DONE) { - if (p <= lastP) + if (p <= lastP) { errln((UnicodeString)"next() failed to move forward: next() on position " + lastP + (UnicodeString)" yielded " + p); + break; + } text.extractBetween(lastP, p, selection); result->addElement(selection); @@ -1097,16 +1129,20 @@ Vector* RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString& while (p != RuleBasedBreakIterator::DONE) { p = bi.previous(); if (p != RuleBasedBreakIterator::DONE) { - if (p >= lastP) + if (p >= lastP) { errln((UnicodeString)"previous() failed to move backward: previous() on position " + lastP + (UnicodeString)" yielded " + p); + break; + } text.extractBetween(p, lastP, selection); result->insertElementAt(selection, 0); } else { - if (lastP != 0) + if (lastP != 0) { errln((UnicodeString)"previous() returned DONE prematurely: offset was " + lastP + (UnicodeString)" instead of 0"); + break; + } } lastP = p; } diff --git a/icu4c/source/test/intltest/rbbitst.h b/icu4c/source/test/intltest/rbbitst.h index 0c2f56e7b4..7ca529ad94 100644 --- a/icu4c/source/test/intltest/rbbitst.h +++ b/icu4c/source/test/intltest/rbbitst.h @@ -51,6 +51,10 @@ public: * Tests 
Hindi(Devanagiri) word iteration **/ void TestHindiWordBreak(void); + /** + * Tests Title Case break iteration + **/ + void TestTitleBreak(void); /** * Test Hindi Danda i.e make sure we have a break point before and after danda **/ diff --git a/icu4c/source/tools/Makefile.in b/icu4c/source/tools/Makefile.in index 336fc48826..efa9ea7942 100644 --- a/icu4c/source/tools/Makefile.in +++ b/icu4c/source/tools/Makefile.in @@ -99,7 +99,7 @@ all-local: build-local DAT_FILES=uprops.dat unames.dat cnvalias.dat tz.dat # ALL of these files can be deleted (the following BRK files) - they are copied -BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk word_th.brk +BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk title.brk word_th.brk # don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted DATAFILESD=$(DAT_FILES:%=$(OBJDATADIR)/%)