ICU-1126 Add title break iterator

X-SVN-Rev: 7801
This commit is contained in:
Andy Heninger 2002-02-28 01:28:04 +00:00
parent 7aadc85a12
commit 13e01fb91d
13 changed files with 223 additions and 8 deletions

View File

@ -152,6 +152,28 @@ BreakIterator::createSentenceInstance(const Locale& key, UErrorCode& status)
// -------------------------------------
// Creates a simple text boundary for title casing breaks.
// key     Requested locale. Currently unused: only the default "title"
//         rule data is ever loaded (see the WARNING below).
// status  In/out error code. If it already indicates failure on entry,
//         nothing is done. On exit it reports a failure to open the rule
//         data, or U_MEMORY_ALLOCATION_ERROR if the iterator could not be
//         allocated.
// Returns a newly allocated iterator owned by the caller, or NULL on failure.
BreakIterator*
BreakIterator::createTitleInstance(const Locale& key, UErrorCode& status)
{
    // WARNING: This routine is currently written specifically to handle only the
    // default rules files. This function will have to be made fully general
    // at some time in the future!
    static const char filename[] = "title";

    if (U_FAILURE(status)) {
        return NULL;
    }

    UDataMemory* file = udata_open(NULL, "brk", filename, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    BreakIterator* result = new RuleBasedBreakIterator(file);
    if (result == NULL) {
        // No iterator was constructed, so nothing has taken charge of the
        // opened data: release it here and report the failure instead of
        // handing back NULL together with a success status.
        udata_close(file);
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    return result;
}
// -------------------------------------
// Gets all the available locales that have localized text boundary data.
const Locale*
BreakIterator::getAvailableLocales(int32_t& count)

View File

@ -466,7 +466,12 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
// to the last saved lookup-state position
if (tables->isLookaheadState(state)) {
if (tables->isEndState(state)) {
result = lookaheadResult;
if (lookaheadResult > 0) {
result = lookaheadResult;
}
else {
result = text->getIndex() + 1;
}
}
else {
lookaheadResult = text->getIndex() + 1;
@ -658,5 +663,12 @@ BreakIterator * RuleBasedBreakIterator::createBufferClone(void *stackBuffer,
return localIterator;
}
#ifdef RBBI_DEBUG
void RuleBasedBreakIterator::debugDumpTables() const {
tables->debugDumpTables();
}
#endif
U_NAMESPACE_END

View File

@ -10,6 +10,10 @@
#include "ucmp8.h"
#include "cmemory.h"
#include "rbbi_tbl.h"
#include "unicode/unistr.h"
#ifdef RBBI_DEBUG
#include <stdio.h>
#endif
U_NAMESPACE_BEGIN
@ -146,5 +150,97 @@ UBool
RuleBasedBreakIteratorTables::isLookaheadState(int32_t state) const {
// Direct lookup in the lookaheadStates table. `state` is used as the index
// as-is; no range check is performed here.
return lookaheadStates[state];
}
#ifdef RBBI_DEBUG
//
// debugDumpTables
//
void RuleBasedBreakIteratorTables::debugDumpTables() const {
printf("Character Classes:\n");
int currentCharClass = 257;
int startCurrentRange = 0;
int initialStringLength = 0;
char buf[80];
UnicodeString *charClassRanges = new UnicodeString[numCategories];
for (int i = 0; i < 0xffff; i++) {
if ( ucmp8_get(charCategoryTable, i) != currentCharClass) {
if (currentCharClass != 257) {
// Complete the output of the previous range.
if (i != startCurrentRange+1) {
sprintf(buf, "-%x", i-1);
charClassRanges[currentCharClass].append(buf);
}
if (charClassRanges[currentCharClass].length() % 72 < initialStringLength % 72) {
charClassRanges[currentCharClass].append("\n ");
}
}
// Output the start of the new range.
currentCharClass = ucmp8_get(charCategoryTable, i);
startCurrentRange = i;
initialStringLength = charClassRanges[currentCharClass].length();
if (charClassRanges[currentCharClass].length() > 0)
charClassRanges[currentCharClass].append(", ");
sprintf(buf, "%x", i);
charClassRanges[currentCharClass].append(buf);
}
}
for (int i=0; i<numCategories; i++) {
printf("%d: ", i);
// Write out the chars in the UnicodeStrings.
// We know we didn't put anything into them except for plain ascii chars.
for (int j=0; j<charClassRanges[i].length(); j++) {
putchar(charClassRanges[i].charAt(j));
}
putchar('\n');
}
delete [] charClassRanges;
// State table length might be too big by one, because the only indication
// we have is the pointer to the start of the next item in the memory
// image, the backwardsStateTable, which is 4 byte aligned.
//
int stateTableLength = backwardsStateTable - stateTable;
if ((stateTableLength % numCategories) == 1) {
stateTableLength -= 1;
}
printf("\n\nState Table. *: end state %%: look ahead state\n");
printf("C:\t");
for (int i = 0; i < numCategories; i++) {
printf("%d\t", i);
}
printf("\n=================================================");
for (int i = 0; i < stateTableLength; i++) {
if (i % numCategories == 0) {
putchar('\n');
if (endStates[i / numCategories])
putchar('*');
else
putchar(' ');
if (lookaheadStates[i / numCategories]) {
putchar('%');
}
else
putchar(' ');
printf("%d:\t", i / numCategories);
}
if (stateTable[i] == 0) {
printf(".\t");
} else {
printf("%d\t", stateTable[i]);
}
}
printf("\n\n\n");
}
#endif // RBBI_DEBUG
U_NAMESPACE_END

View File

@ -198,6 +198,14 @@ protected:
*/
virtual UBool isLookaheadState(int32_t state) const;
#ifdef RBBI_DEBUG
//
// Print out the character classes and the state table.
// For debugging only; this declaration exists only when RBBI_DEBUG
// is defined.
//
void debugDumpTables() const;
#endif
friend class RuleBasedBreakIterator;
friend class DictionaryBasedBreakIterator;
};

View File

@ -43,6 +43,10 @@ ubrk_open(UBreakIteratorType type,
case UBRK_SENTENCE:
result = BreakIterator::createSentenceInstance(Locale(locale), *status);
break;
case UBRK_TITLE:
result = BreakIterator::createTitleInstance(Locale(locale), *status);
break;
}
// check for allocation error

View File

@ -419,6 +419,26 @@ public:
static BreakIterator* createSentenceInstance(const Locale& where,
UErrorCode& status);
/**
* Create a BreakIterator for title-casing breaks using the specified locale.
* Returns an instance of a BreakIterator implementing title breaks.
* @param where the locale.
* @param status used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(status). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* @return A BreakIterator for title-breaks.
* The caller owns the returned object and is responsible for deleting it.
* @stable
*/
static BreakIterator* createTitleInstance(const Locale& where,
UErrorCode& status);
/**
* Get the set of Locales for which TextBoundaries are installed
* @param count the output parameter of number of elements in the locale list

View File

@ -428,6 +428,9 @@ RuleBasedBreakIterator(UDataMemory* image);
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status);
#ifdef RBBI_DEBUG
/**
* Debugging aid: dump the tables used by this iterator.
* Declared only when RBBI_DEBUG is defined.
*/
void debugDumpTables() const;
#endif
protected:

View File

@ -41,6 +41,11 @@
* stored as a base character and a diacritical mark. What users
* consider to be a character can differ between languages.
* <P>
* Title boundary analysis locates all positions,
* typically starts of words, that should be set to Title Case
* when title casing the text.
* <P>
*
* This is the interface for all text boundaries.
* <P>
* Examples:
@ -177,7 +182,9 @@ enum UBreakIteratorType {
/** Line breaks */
UBRK_LINE,
/** Sentence breaks */
UBRK_SENTENCE
UBRK_SENTENCE,
/** Title Case breaks */
UBRK_TITLE
};
typedef enum UBreakIteratorType UBreakIteratorType;

View File

@ -132,7 +132,7 @@ TEST_DAT_FILES=$(TESTBUILDDIR)/test.dat
## BRK files
# ALL of these files can be deleted (the following BRK files) - they are copied
BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/word_th.brk
BRK_FILES=$(BUILDDIR)/char.brk $(BUILDDIR)/line.brk $(BUILDDIR)/line_th.brk $(BUILDDIR)/sent.brk $(BUILDDIR)/word.brk $(BUILDDIR)/title.brk $(BUILDDIR)/word_th.brk
# don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
## UCM files

View File

@ -220,7 +220,7 @@ testdata: ucadata.dat $(TRANSLIT_FILES) $(RB_FILES) {"$(ICUTOOLS)\genrb\$(CFG)"
@echo building testdata...
nmake /nologo /f "$(TESTDATA)\testdata.mk" TESTDATA=. ICUTOOLS="$(ICUTOOLS)" PKGOPT="$(PKGOPT)" CFG=$(CFG) TESTDATAOUT="$(TESTDATAOUT)" ICUDATA="$(ICUDATA)" TESTDATABLD="$(TESTDATABLD)"
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
BRK_FILES = "$(ICUDBLD)\sent.brk" "$(ICUDBLD)\char.brk" "$(ICUDBLD)\line.brk" "$(ICUDBLD)\word.brk" "$(ICUDBLD)\title.brk" "$(ICUDBLD)\line_th.brk" "$(ICUDBLD)\word_th.brk"
#invoke pkgdata for ICU common data
# pkgdata will drop all output files (.dat, .dll, .lib) into the target (ICUDBLD) directory.
@ -266,6 +266,9 @@ $(BRK_FILES:.brk" =.brk"
"$(ICUDBLD)\word.brk" : "$(ICUBRK)\wordLE.brk"
copy "$(ICUBRK)\wordLE.brk" "$(ICUDBLD)\word.brk"
"$(ICUDBLD)\title.brk" : "$(ICUBRK)\titleLE.brk"
copy "$(ICUBRK)\titleLE.brk" "$(ICUDBLD)\title.brk"
"$(ICUDBLD)\line_th.brk" : "$(ICUBRK)\line_thLE.brk"
copy "$(ICUBRK)\line_thLE.brk" "$(ICUDBLD)\line_th.brk"

View File

@ -708,6 +708,33 @@ void RBBITest::TestHindiWordBreak()
delete e;
delete hindiWordData;
}
void RBBITest::TestTitleBreak()
{
UErrorCode status= U_ZERO_ERROR;
RuleBasedBreakIterator* titleI=(RuleBasedBreakIterator*)RuleBasedBreakIterator::createTitleInstance(Locale::getDefault(), status);
if(U_FAILURE(status)){
errln("FAIL : in construction");
return;
}
// titleI->debugDumpTables();
Vector *titleData = new Vector();
titleData->addElement(" ");
titleData->addElement("This ");
titleData->addElement("is ");
titleData->addElement("a ");
titleData->addElement("simple ");
titleData->addElement("sample ");
titleData->addElement("sentence. ");
titleData->addElement("This ");
generalIteratorTest(*titleI, titleData);
delete titleI;
delete titleData;
}
/*
//Bug: if there is no word break before and after danda when it is followed by a space
void RBBITest::TestDanda()
@ -979,6 +1006,9 @@ void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, cha
if(exec) TestHindiCharacterBreak(); break;
case 5: name = "TestHindiWordBreak";
if(exec) TestHindiWordBreak(); break;
case 6: name = "TestTitleBreak";
if(exec) TestTitleBreak(); break;
// case 6: name = "TestDanda()";
// if(exec) TestDanda(); break;
// case 7: name = "TestHindiCharacterWrapping()";
@ -1069,9 +1099,11 @@ Vector* RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, UnicodeString& te
while (p != RuleBasedBreakIterator::DONE) {
p = bi.next();
if (p != RuleBasedBreakIterator::DONE) {
if (p <= lastP)
if (p <= lastP) {
errln((UnicodeString)"next() failed to move forward: next() on position "
+ lastP + (UnicodeString)" yielded " + p);
break;
}
text.extractBetween(lastP, p, selection);
result->addElement(selection);
@ -1097,16 +1129,20 @@ Vector* RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi, UnicodeString&
while (p != RuleBasedBreakIterator::DONE) {
p = bi.previous();
if (p != RuleBasedBreakIterator::DONE) {
if (p >= lastP)
if (p >= lastP) {
errln((UnicodeString)"previous() failed to move backward: previous() on position "
+ lastP + (UnicodeString)" yielded " + p);
break;
}
text.extractBetween(p, lastP, selection);
result->insertElementAt(selection, 0);
}
else {
if (lastP != 0)
if (lastP != 0) {
errln((UnicodeString)"previous() returned DONE prematurely: offset was "
+ lastP + (UnicodeString)" instead of 0");
break;
}
}
lastP = p;
}

View File

@ -51,6 +51,10 @@ public:
* Tests Hindi (Devanagari) word iteration
**/
void TestHindiWordBreak(void);
/**
* Tests Title Case break iteration
**/
void TestTitleBreak(void);
/**
* Test Hindi Danda, i.e. make sure we have a break point before and after danda
**/

View File

@ -99,7 +99,7 @@ all-local: build-local
DAT_FILES=uprops.dat unames.dat cnvalias.dat tz.dat
# ALL of these files can be deleted (the following BRK files) - they are copied
BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk word_th.brk
BRK_FILES=char.brk line.brk line_th.brk sent.brk word.brk title.brk word_th.brk
# don't include thaidict.brk - it goes into a resource bundle - plus it isn't deleted
DATAFILESD=$(DAT_FILES:%=$(OBJDATADIR)/%)