ICU-2077 RBBI: review comments incorporated. (incomplete, more to come.)

X-SVN-Rev: 9612
This commit is contained in:
Andy Heninger 2002-08-08 00:39:13 +00:00
parent 48acf9f6dd
commit 6df1676310
9 changed files with 287 additions and 197 deletions

View File

@ -37,7 +37,7 @@ const int32_t BreakIterator::DONE = (int32_t)-1;
// -------------------------------------
// Creates a simple text boundary for word breaks.
// Creates a break iterator for word breaks.
BreakIterator*
BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
{
@ -49,31 +49,32 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
if (U_FAILURE(status))
return NULL;
if (!uprv_strcmp(key.getLanguage(), "th"))
{
filename = "word_th";
}
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
if (U_FAILURE(status)) {
return NULL;
}
// The UDataMemory is adopted by the break iterator.
if (U_SUCCESS(status)) {
if(!uprv_strcmp(filename, "word_th")) {
filename = "thaidict.brk";
result = new DictionaryBasedBreakIterator(file, filename, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
else {
result = new RuleBasedBreakIterator(file, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
if(!uprv_strcmp(filename, "word_th")) {
filename = "thaidict.brk";
result = new DictionaryBasedBreakIterator(file, filename, status);
}
else {
result = new RuleBasedBreakIterator(file, status);
}
if (result == NULL) {
udata_close(file);
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) { // Sometimes redundant check, but simple.
delete result;
result = NULL;
}
return result;
@ -81,7 +82,7 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status)
// -------------------------------------
// Creates a simple text boundary for line breaks.
// Creates a break iterator for line breaks.
BreakIterator*
BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
{
@ -93,39 +94,39 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status)
if (U_FAILURE(status))
return NULL;
if (!uprv_strcmp(key.getLanguage(), "th"))
{
filename = "line_th";
}
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
if (!U_FAILURE(status)) {
if (!uprv_strcmp(key.getLanguage(), "th")) {
filename = "thaidict.brk";
result = new DictionaryBasedBreakIterator(file, filename, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
else {
result = new RuleBasedBreakIterator(file, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
}
if (U_FAILURE(status)) {
return NULL;
}
// The UDataMemory is adopted by the break iterator.
if (!uprv_strcmp(key.getLanguage(), "th")) {
filename = "thaidict.brk";
result = new DictionaryBasedBreakIterator(file, filename, status);
}
else {
result = new RuleBasedBreakIterator(file, status);
}
if (result == NULL) {
udata_close(file);
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) { // Sometimes redundant check, but simple.
delete result;
result = NULL;
}
return result;
}
// -------------------------------------
// Creates a simple text boundary for character breaks.
// Creates a break iterator for character breaks.
BreakIterator*
BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& status)
{
@ -138,22 +139,26 @@ BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& stat
if (U_FAILURE(status))
return NULL;
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
if (!U_FAILURE(status)) {
result = new RuleBasedBreakIterator(file, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
if (U_FAILURE(status)) {
return NULL;
}
// The UDataMemory is adopted by the break iterator.
result = new RuleBasedBreakIterator(file, status);
if (result == NULL) {
udata_close(file);
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) { // Sometimes redundant check, but simple.
delete result;
result = NULL;
}
return result;
}
// -------------------------------------
// Creates a simple text boundary for sentence breaks.
// Creates a break iterator for sentence breaks.
BreakIterator*
BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status)
{
@ -166,14 +171,19 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status
if (U_FAILURE(status))
return NULL;
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
if (U_FAILURE(status)) {
return NULL;
}
// The UDataMemory is adopted by the break iterator.
if (!U_FAILURE(status)) {
result = new RuleBasedBreakIterator(file, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
result = new RuleBasedBreakIterator(file, status);
if (result == NULL) {
udata_close(file);
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) { // Sometimes redundant check, but simple.
delete result;
result = NULL;
}
return result;
@ -181,7 +191,7 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status
// -------------------------------------
// Creates a simple text boundary for title casing breaks.
// Creates a break iterator for title casing breaks.
BreakIterator*
BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status)
{
@ -194,14 +204,19 @@ BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status)
if (U_FAILURE(status))
return NULL;
UDataMemory* file = udata_open(NULL, "brk", filename, &status);
if (U_FAILURE(status)) {
return NULL;
}
// The UDataMemory is adopted by the break iterator.
if (!U_FAILURE(status)) {
result = new RuleBasedBreakIterator(file, status);
/* test for NULL */
if(result == 0) {
status = U_MEMORY_ALLOCATION_ERROR;
return 0;
}
result = new RuleBasedBreakIterator(file, status);
if (result == NULL) {
udata_close(file);
status = U_MEMORY_ALLOCATION_ERROR;
}
if (U_FAILURE(status)) { // Sometimes redundant check, but simple.
delete result;
result = NULL;
}
return result;
@ -234,11 +249,11 @@ BreakIterator::getDisplayName(const Locale& objectLocale,
return objectLocale.getDisplayName(displayLocale, name);
}
// -------------------------------------
// Needed because we declare the copy constructor (in order to prevent synthesizing one) and
// so the default constructor is no longer synthesized.
// ------------------------------------------
//
// Default constructor and destructor
//
//-------------------------------------------
BreakIterator::BreakIterator()
{
fBufferClone = FALSE;

View File

@ -18,6 +18,7 @@
#include "rbbirb.h"
#include "filestrm.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
@ -25,8 +26,7 @@ U_NAMESPACE_BEGIN
static const int16_t START_STATE = 1; // The state number of the starting state
static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop"
/**
* Class ID. (value is irrelevant; address is important)
@ -86,6 +86,10 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules,
if (U_FAILURE(status)) {return;};
RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *)
RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status);
// Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that
// creates and returns a complete RBBI. From here, in a constructor, we
// can't just return the object created by the builder factory, hence
// the assignment of the factory created object to "this".
if (U_SUCCESS(status)) {
*this = *bi;
delete bi;
@ -118,16 +122,15 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth
}
//=======================================================================
// boilerplate
//=======================================================================
/**
* Destructor
*/
RuleBasedBreakIterator::~RuleBasedBreakIterator() {
delete fText;
fText = NULL;
if (fData != NULL) {
fData->removeReference();
fData = NULL;
}
}
@ -163,6 +166,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
//-----------------------------------------------------------------------------
//
// init() Shared initialization routine. Used by all the constructors.
// Initializes all fields, leaving the object in a consistent state.
//
//-----------------------------------------------------------------------------
UBool RuleBasedBreakIterator::fTrace = FALSE;
@ -179,7 +183,7 @@ void RuleBasedBreakIterator::init() {
if (debugInitDone == FALSE) {
#ifdef RBBI_DEBUG
char *debugEnv = getenv("U_RBBIDEBUG");
if (debugEnv && strstr(debugEnv, "trace")) {
if (debugEnv && uprv_strstr(debugEnv, "trace")) {
fTrace = TRUE;
}
#endif
@ -268,7 +272,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) {
reset();
delete fText;
fText = newText;
fText->first();
this->first();
}
/**
@ -286,8 +290,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) {
else {
delete fText;
fText = new StringCharacterIterator(newText);
fText->first();
}
this->first();
}
@ -435,11 +439,14 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) {
fLastBreakTagValid = TRUE;
if (fText == NULL || offset >= fText->endIndex()) {
// fText->setToEnd();
return BreakIterator::DONE;
// return BreakIterator::DONE;
last();
return next();
}
else if (offset < fText->startIndex()) {
// fText->setToStart();
return fText->startIndex();
// return fText->startIndex();
return first();
}
// otherwise, set our internal iteration position (temporarily)
@ -476,10 +483,11 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
// just return DONE; if it's before the beginning, return the
// text's starting offset
if (fText == NULL || offset > fText->endIndex()) {
return BreakIterator::DONE;
// return BreakIterator::DONE;
return last();
}
else if (offset < fText->startIndex()) {
return fText->startIndex();
return first();
}
// if we start by updating the current iteration position to the
@ -499,19 +507,25 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) {
UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
// the beginning index of the iterator is always a boundary position by definition
if (fText == NULL || offset == fText->startIndex()) {
first(); // For side effects on current position, tag values.
return TRUE;
}
// out-of-range indexes are never boundary positions
else if (offset < fText->startIndex() || offset > fText->endIndex()) {
if (offset < fText->startIndex()) {
first(); // For side effects on current position, tag values.
return FALSE;
}
if (offset > fText->endIndex()) {
last(); // For side effects on current position, tag values.
return FALSE;
}
// otherwise, we can use following() on the position before the specified
// one and return true of the position we get back is the one the user
// one and return true if the position we get back is the one the user
// specified
else
return following(offset - 1) == offset;
return following(offset - 1) == offset;
}
/**
@ -555,7 +569,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
int32_t result = fText->getIndex() + 1;
int32_t lookaheadResult = 0;
// begin in state 1
// Initialize the state machine. Begin in state 1
int32_t state = START_STATE;
int16_t category;
UChar32 c = fText->current32();
@ -565,16 +579,19 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
fLastBreakTag = 0;
row = (RBBIStateTableRow *)
row = (RBBIStateTableRow *) // Point to starting row of state table.
(fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state));
// Character Category fetch for starting character.
// See comments on character category code within loop, below.
UTRIE_GET16(&fData->fTrie, c, category);
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
category &= ~0x4000;
}
// loop until we reach the end of the text or transition to state 0
for (;;) {
// loop until we reach the end of the text or transition to state 0
for (;;) {
if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) {
// Note: CharacterIterator::DONE is 0xffff, which is also a legal
// character value. Check for DONE first, because it's quicker,
@ -586,15 +603,16 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
// Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
// not the size of the character going in.
//
// And off bit 14, which flags use of a dictionary for dictionary based
// iterators, but should be ignored here.
UTRIE_GET16(&fData->fTrie, c, category);
// Check the dictionary bit in the character's category.
// Counter is only used by dictionary based iterators.
// Counter is only used by dictionary based iterators (subclasses).
// Chars that need to be handled by a dictionary have a flag bit set
// in their category values.
//
if ((category & 0x4000) != 0) {
fDictionaryCharCount++;
// And off the dictionary flag bit.
category &= ~0x4000;
}
@ -616,6 +634,8 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
// Get the next character. Doing it here positions the iterator
// to the correct position for recording matches in the code that
// follows.
// TODO: 16 bit next, and a 16 bit TRIE lookup, with escape code
// for non-BMP chars, would be faster.
c = fText->next32();
if (row->fAccepting == 0 && row->fLookAhead == 0) {
@ -636,7 +656,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) {
if (row->fAccepting == 0 && row->fLookAhead != 0) {
// Lookahead match point. Remember it, but only if no other rule has
// unconditionally matched up to this point.
// TODO: handle case where there's a pending match from a different rule
// TODO: handle case where there's a pending match from a different rule -
// where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead.
int32_t r = fText->getIndex();
if (r > result) {
@ -672,6 +692,7 @@ continueOn:
// a lookahead state, advance the break position to the lookahead position
// (the theory here is that if there are no characters at all after the lookahead
// position, that always matches the lookahead criteria)
// TODO: is this really the right behavior?
if (c == CharacterIterator::DONE &&
fText->hasNext()==FALSE &&
lookaheadResult == fText->endIndex()) {
@ -694,8 +715,9 @@ continueOn:
// This method backs the iterator back up to a "safe position" in the text.
// This is a position that we know, without any context, must be a break position.
// The various calling methods then iterate forward from this safe position to
// the appropriate position to return. (For more information, see the description
// of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.)
// the appropriate position to return.
//
// The logic of this function is very similar to handleNext(), above.
//
//-----------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::handlePrevious(void) {
@ -833,18 +855,27 @@ RuleBasedBreakIterator::reset()
//-------------------------------------------------------------------------------
//
// getRuleStatus()
// getRuleStatus() Return the break rule tag associated with the current
// iterator position. If the iterator arrived at its current
// position by iterating forwards, the value will have been
// cached by the handleNext() function.
//
// If no cached status value is available, the status is
// found by doing a previous() followed by a next(), which
// leaves the iterator where it started, and computes the
// status while doing the next().
//
//-------------------------------------------------------------------------------
int32_t RuleBasedBreakIterator::getRuleStatus() const {
// If the break tag value is unknown, back the iterator up, then move
// forward again. Moving forward will set the fLastBreakTag value correctly.
RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this;
if (fLastBreakTagValid == FALSE) {
if (current() == fText->startIndex()) {
// No cached status is available.
if (fText == NULL || current() == fText->startIndex()) {
// At start of text, or there is no text. Status is always zero.
nonConstThis->fLastBreakTag = 0;
nonConstThis->fLastBreakTagValid = TRUE;
} else {
// Not at start of text. Find status the tedious way.
int32_t pa = current();
nonConstThis->previous();
int32_t pb = nonConstThis->next();
@ -857,7 +888,7 @@ int32_t RuleBasedBreakIterator::getRuleStatus() const {
//-------------------------------------------------------------------------------
//
// getFlattenedData Access to the compiled form of the rules,
// getBinaryRules Access to the compiled form of the rules,
// for use by build system tools that save the data
// for standard iterator types.
//
@ -868,7 +899,7 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) {
if (fData != NULL) {
retPtr = (const uint8_t *)fData->fHeader;
length = fData->fHeader->fLength;
length = fData->fHeader->fLength;
}
return retPtr;
}

View File

@ -1,8 +1,8 @@
/*
**********************************************************************
***************************************************************************
* Copyright (C) 1999-2002 International Business Machines Corporation *
* and others. All rights reserved. *
**********************************************************************
* and others. All rights reserved. *
***************************************************************************
*/
#include "unicode/utypes.h"
@ -156,7 +156,7 @@ int32_t RBBIDataWrapper::hashCode() {
//
//-----------------------------------------------------------------------------
void RBBIDataWrapper::removeReference() {
if (umtx_atomic_dec(&fRefCount) == 0) {
if (umtx_atomic_dec(&fRefCount) == 0) {
delete this;
}
};
@ -221,9 +221,4 @@ void RBBIDataWrapper::printData() {
U_NAMESPACE_END

View File

@ -35,7 +35,8 @@ U_NAMESPACE_BEGIN
// All of them are strung together in a linked list, which is kept in order
// (by character)
//
struct RangeDescriptor : public UObject {
class RangeDescriptor : public UObject {
public:
UChar32 fStartChar; // Start of range, unicode 32 bit value.
UChar32 fEndChar; // End of range, unicode 32 bit value.
int32_t fNum; // runtime-mapped input value for this range.

View File

@ -94,22 +94,27 @@ ubrk_openRules( const UChar *rules,
UParseError *parseErr,
UErrorCode *status) {
BreakIterator *result = 0;
if (status == NULL || U_FAILURE(*status)){
return 0;
}
BreakIterator *result = 0;
UnicodeString ruleString(rules, rulesLength);
result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status);
if(U_FAILURE(*status)) {
return 0;
}
UCharCharacterIterator *iter = 0;
iter = new UCharCharacterIterator(text, textLength);
if(iter == 0) {
*status = U_MEMORY_ALLOCATION_ERROR;
delete result;
return 0;
if (text != NULL) {
UCharCharacterIterator *iter = 0;
iter = new UCharCharacterIterator(text, textLength);
if(iter == 0) {
*status = U_MEMORY_ALLOCATION_ERROR;
delete result;
return 0;
}
result->adoptText(iter);
}
result->adoptText(iter);
return (UBreakIterator *)result;
}
@ -243,7 +248,7 @@ ubrk_countAvailable()
}
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset)
{
return ((BreakIterator *)bi)->isBoundary(offset);

View File

@ -1,10 +1,10 @@
/*
*****************************************************************************************
* Copyright (C) 1997-2001, International Business Machines
* Copyright (C) 1997-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*****************************************************************************************
*
* File BRKITER.H
* File brkiter.h
*
* Modification History:
*
@ -65,13 +65,13 @@ U_NAMESPACE_BEGIN
* <P>
* Helper function to output text
* <pre>
* \code
* \code
* void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
* {
* UnicodeString textBuffer, temp;
* CharacterIterator *strIter = iterator.createText();
* strIter->getText(temp);
* cout << " " << start << " " << end << " |"
* cout << " " << start << " " << end << " |"
* << temp.extractBetween(start, end, textBuffer)
* << "|" << endl;
* delete strIter;
@ -149,7 +149,7 @@ U_NAMESPACE_BEGIN
* BreakIterator* boundary;
* UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
* cout << "Examining: " << stringToExamine << endl;
*
*
* //print each sentence in forward and reverse order
* boundary = BreakIterator::createSentenceInstance( Locale::US );
* boundary->setText(stringToExamine);
@ -158,7 +158,7 @@ U_NAMESPACE_BEGIN
* cout << "----- backward: ----------" << endl;
* printEachBackward(*boundary);
* delete boundary;
*
*
* //print each word in order
* boundary = BreakIterator::createWordInstance();
* boundary->setText(stringToExamine);
@ -173,7 +173,7 @@ U_NAMESPACE_BEGIN
* //print word at charpos 10
* cout << "----- at pos 10: ---------" << endl;
* printAt(*boundary, 10 );
*
*
* delete boundary;
* }
* \endcode
@ -222,6 +222,8 @@ public:
/**
* Return a CharacterIterator over the text being analyzed.
* Changing the state of the returned iterator can have undefined consequences
* on the operation of the break iterator. If you need to change it, clone it first.
* @stable
*/
virtual const CharacterIterator& getText(void) const = 0;
@ -278,8 +280,7 @@ public:
virtual int32_t next(void) = 0;
/**
* Return character index of the text boundary that was most recently
* returned by next(), previous(), first(), or last()
* Return character index of the current iterator position within the text.
* @return The boundary most recently returned.
* @stable
*/
@ -304,9 +305,11 @@ public:
* @stable
*/
virtual int32_t preceding(int32_t offset) = 0;
/**
* Return true if the specified position is a boundary position.
* As a side effect, the current position of the iterator is set
* to the first boundary position at or following the specified offset.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable
@ -328,22 +331,22 @@ public:
* Create BreakIterator for word-breaks using the given locale.
* Returns an instance of a BreakIterator implementing word breaks.
* WordBreak is useful for word selection (ex. double click)
* @param where the locale.
* @param where the locale.
* @param status the error code
* @return A BreakIterator for word-breaks. The UErrorCode& status
* @return A BreakIterator for word-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable
*/
static BreakIterator* createWordInstance(const Locale& where,
static BreakIterator* createWordInstance(const Locale& where,
UErrorCode& status);
/**
@ -354,84 +357,84 @@ public:
* LineBreak is useful for word wrapping text.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for line-breaks. The UErrorCode& status
* @return A BreakIterator for line-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable
*/
static BreakIterator* createLineInstance(const Locale& where,
static BreakIterator* createLineInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for character-breaks using specified locale
* Returns an instance of a BreakIterator implementing character breaks.
* Character breaks are boundaries of combining character sequences.
* @param where the locale.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for character-breaks. The UErrorCode& status
* @return A BreakIterator for character-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable
*/
static BreakIterator* createCharacterInstance(const Locale& where,
static BreakIterator* createCharacterInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for sentence-breaks using specified locale
* Returns an instance of a BreakIterator implementing sentence breaks.
* @param where the locale.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for sentence-breaks. The UErrorCode& status
* @return A BreakIterator for sentence-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable
*/
static BreakIterator* createSentenceInstance(const Locale& where,
static BreakIterator* createSentenceInstance(const Locale& where,
UErrorCode& status);
/**
* Create BreakIterator for title-casing breaks using the specified locale
* Returns an instance of a BreakIterator implementing title breaks.
* @param where the locale.
* @param where the locale.
* @param status The error code.
* @return A BreakIterator for title-breaks. The UErrorCode& status
* @return A BreakIterator for title-breaks. The UErrorCode& status
* parameter is used to return status information to the user.
* To check whether the construction succeeded or not, you should check
* the value of U_SUCCESS(err). If you wish more detailed information, you
* can check for informational error results which still indicate success.
* U_USING_FALLBACK_ERROR indicates that a fall back locale was used. For
* U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
* example, 'de_CH' was requested, but nothing was found there, so 'de' was
* used. U_USING_DEFAULT_ERROR indicates that the default locale data was
* used. U_USING_DEFAULT_WARNING indicates that the default locale data was
* used; neither the requested locale nor any of its fall back locales
* could be found.
* The caller owns the returned object and is responsible for deleting it.
* @stable
* @draft ICU 2.1
*/
static BreakIterator* createTitleInstance(const Locale& where,
static BreakIterator* createTitleInstance(const Locale& where,
UErrorCode& status);
/**
@ -469,24 +472,30 @@ public:
/**
* Thread safe client-buffer-based cloning operation
* Do NOT call delete on a safeclone, since 'new' is not used to create it.
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
* If buffer is not large enough, new memory will be allocated.
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* @param BufferSize reference to size of allocated space.
* If BufferSize == 0, a sufficient size for use in cloning will
* be returned ('pre-flighting')
* If BufferSize is not enough for a stack-based safe clone,
* If BufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
* necessary.
* @return pointer to the new clone
*
* @draft ICU 1.8
*
* @stable
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
UErrorCode &status) = 0;
/**
* Determine whether the BreakIterator was created in user memory by
* createBufferClone(), and thus should not be deleted. Such objects
* must be closed by an explicit call to the destructor (not delete).
* @stable
*/
inline UBool isBufferClone(void);

View File

@ -24,9 +24,9 @@ struct UTrie;
U_NAMESPACE_BEGIN
struct RBBIDataHeader;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
class RuleBasedBreakIteratorTables;
class BreakIterator;
class RBBIDataWrapper;
@ -37,10 +37,6 @@ class RBBIDataWrapper;
* <p>See the ICU User Guide for information on Break Iterator Rules.</p>
*
*/
class U_COMMON_API RuleBasedBreakIterator : public BreakIterator {
protected:
@ -74,7 +70,7 @@ protected:
uint32_t fDictionaryCharCount;
//
// Debugging flag.
// Debugging flag. Trace operation of state machine when true.
//
static UBool fTrace;
@ -117,7 +113,8 @@ protected:
public:
/** Default constructor. Creates an empty shell of an iterator, with no
* rules or text to iterate over. Object can subsequently be assigned.
* rules or text to iterate over. Object can subsequently be assigned to.
* @draft ICU 2.2
*/
RuleBasedBreakIterator();
@ -134,12 +131,14 @@ public:
* @param parseError In the event of a syntax error in the rules, provides the location
* within the rules of the problem.
* @param status Information on any errors encountered.
* @draft ICU 2.2
*/
RuleBasedBreakIterator( const UnicodeString &rules,
UParseError &parseError,
UErrorCode &status);
/**
* Destructor
* @stable
*/
virtual ~RuleBasedBreakIterator();
@ -148,6 +147,7 @@ public:
* and iterate over the same text, as the one passed in.
* @param that The RuleBasedBreakIterator passed in
* @return the newly created RuleBasedBreakIterator
* @stable
*/
RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
@ -157,6 +157,7 @@ public:
* @param that The BreakIterator to be compared for equality
* @return TRUE if both BreakIterators are of the
* same class, have the same behavior, and iterate over the same text.
* @stable
*/
virtual UBool operator==(const BreakIterator& that) const;
@ -165,6 +166,7 @@ public:
* and vice versa.
* @param that The BreakIterator to be compared for inequality
* @return TRUE if the two BreakIterators are not the same.
* @stable
*/
UBool operator!=(const BreakIterator& that) const;
@ -175,18 +177,21 @@ public:
* will correctly clone (copy) a derived class.
* clone() is thread safe. Multiple threads may simultaneously
* clone the same source break iterator.
* @stable
*/
virtual BreakIterator* clone() const;
/**
* Compute a hash code for this BreakIterator
* @return A hash code
* @stable
*/
virtual int32_t hashCode(void) const;
/**
* Returns the description used to create this iterator
* @return the description used to create this iterator
* @stable
*/
virtual const UnicodeString& getRules(void) const;
@ -200,6 +205,7 @@ public:
* Changing the state of this iterator can have undefined consequences. If
* you need to change it, clone it first.
* @return An iterator over the text being analyzed.
* @stable
*/
virtual const CharacterIterator& getText(void) const;
@ -209,6 +215,7 @@ public:
* the current iteration position to the beginning of the text.
* @param newText An iterator over the text to analyze. The BreakIterator
* takes ownership of the character iterator. The caller MUST NOT delete it!
* @stable
*/
virtual void adoptText(CharacterIterator* newText);
@ -216,6 +223,7 @@ public:
* Set the iterator to analyze a new piece of text. This function resets
* the current iteration position to the beginning of the text.
* @param newText The text to analyze.
* @stable
*/
virtual void setText(const UnicodeString& newText);
@ -223,6 +231,7 @@ public:
* Sets the current iteration position to the beginning of the text.
* (i.e., the CharacterIterator's starting offset).
* @return The offset of the beginning of the text.
* @stable
*/
virtual int32_t first(void);
@ -230,6 +239,7 @@ public:
* Sets the current iteration position to the end of the text.
* (i.e., the CharacterIterator's ending offset).
* @return The text's past-the-end offset.
* @stable
*/
virtual int32_t last(void);
@ -241,18 +251,21 @@ public:
* (negative is backwards, and positive is forwards).
* @return The character offset of the boundary position n boundaries away from
* the current one.
* @stable
*/
virtual int32_t next(int32_t n);
/**
* Advances the iterator to the next boundary position.
* @return The position of the first boundary after this one.
* @stable
*/
virtual int32_t next(void);
/**
* Moves the iterator backwards, to the last boundary preceding this one.
* @return The position of the last boundary position preceding this one.
* @stable
*/
virtual int32_t previous(void);
@ -261,6 +274,7 @@ public:
* the specified position.
* @param offset The position from which to begin searching for a break position.
* @return The position of the first break after the specified offset.
* @stable
*/
virtual int32_t following(int32_t offset);
@ -269,6 +283,7 @@ public:
* specified position.
* @param offset The position to begin searching for a break from.
* @return The position of the last boundary before the starting position.
* @stable
*/
virtual int32_t preceding(int32_t offset);
@ -278,12 +293,14 @@ public:
* or after "offset".
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable
*/
virtual UBool isBoundary(int32_t offset);
/**
* Returns the current iteration position.
* @return The current iteration position.
* @stable
*/
virtual int32_t current(void) const;
@ -295,6 +312,7 @@ public:
* status, a default value of 0 is returned.
* @return the status from the break rule that determined the most recently
* returned break position.
* @draft ICU 2.2
*/
virtual int32_t getRuleStatus() const;
@ -336,7 +354,7 @@ public:
* buffer size, but do not clone the object. If the
* size was too small (but not zero), allocate heap
* storage for the cloned object.
*
*
* @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
* returned if the provided buffer was too small, and
* the clone was therefore put on the heap.
@ -344,6 +362,7 @@ public:
* @return Pointer to the clone object. This may differ from the stackBuffer
* address if the byte alignment of the stack buffer was not suitable
* or if the stackBuffer was too small to hold the clone.
* @stable
*/
virtual BreakIterator * createBufferClone(void *stackBuffer,
int32_t &BufferSize,
@ -365,6 +384,7 @@ public:
* @return A pointer to the binary (compiled) rule data. The storage
* belongs to the RuleBasedBreakIterator object, not the
* caller, and must not be modified or deleted.
* @internal
*/
virtual const uint8_t *getBinaryRules(uint32_t &length);

View File

@ -47,7 +47,7 @@
* typically starts of words, that should be set to Title Case
* when title casing the text.
* <P>
*
*
* This is the interface for all text boundaries.
* <P>
* Examples:
@ -204,15 +204,27 @@ typedef enum UBreakIteratorType UBreakIteratorType;
* than for single individual values.
*/
enum UWordBreak {
/** Tag value for "words" that do not fit into any of the other categories.
* Includes spaces and most punctuation. */
UBRK_WORD_NONE = 0,
/** Upper bound for tags for uncategorized words. */
UBRK_WORD_NONE_LIMIT = 100,
/** Tag value for words that appear to be numbers, lower limit. */
UBRK_WORD_NUMBER = 100,
/** Tag value for words that appear to be numbers, upper limit. */
UBRK_WORD_NUMBER_LIMIT = 200,
/** Tag value for words that contain letters, excluding
* hiragana, katakana or ideographic characters, lower limit. */
UBRK_WORD_LETTER = 200,
/** Tag value for words containing letters, upper limit */
UBRK_WORD_LETTER_LIMIT = 300,
/** Lower limit with the same numeric value (300) as UBRK_WORD_KANA.
* NOTE(review): presumably the earlier name for the kana range - confirm
* whether it is still meant to be declared next to UBRK_WORD_KANA. */
UBRK_WORD_HIRAKATA = 300,
/** Upper limit with the same numeric value (400) as UBRK_WORD_KANA_LIMIT. */
UBRK_WORD_HIRAKATA_LIMIT = 400,
/** Tag value for words containing kana characters, lower limit */
UBRK_WORD_KANA = 300,
/** Tag value for words containing kana characters, upper limit */
UBRK_WORD_KANA_LIMIT = 400,
/** Tag value for words containing ideographic characters, lower limit */
UBRK_WORD_IDEO = 400,
/** Tag value for words containing ideographic characters, upper limit */
UBRK_WORD_IDEO_LIMIT = 500
};
typedef enum UWordBreak UWordBreak;
@ -232,7 +244,7 @@ typedef enum UWordBreak UWordBreak;
* @see ubrk_openRules
* @stable
*/
U_CAPI UBreakIterator* U_EXPORT2
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
const char *locale,
const UChar *text,
@ -252,9 +264,9 @@ ubrk_open(UBreakIteratorType type,
* @param status A UErrorCode to receive any errors.
* @return A UBreakIterator for the specified rules.
* @see ubrk_open
* @draft
* @draft ICU 2.2
*/
U_CAPI UBreakIterator* U_EXPORT2
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(const UChar *rules,
int32_t rulesLength,
const UChar *text,
@ -276,9 +288,9 @@ ubrk_openRules(const UChar *rules,
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were necessary.
* @return pointer to the new clone
* @draft ICU 1.8
* @stable
*/
U_CAPI UBreakIterator * U_EXPORT2
U_CAPI UBreakIterator * U_EXPORT2
ubrk_safeClone(
const UBreakIterator *bi,
void *stackBuffer,
@ -293,7 +305,7 @@ ubrk_safeClone(
* @param bi The break iterator to close.
* @stable
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi);
/**
@ -304,7 +316,7 @@ ubrk_close(UBreakIterator *bi);
* @param status The error code
* @stable
*/
U_CAPI void U_EXPORT2
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
const UChar* text,
int32_t textLength,
@ -318,7 +330,7 @@ ubrk_setText(UBreakIterator* bi,
* \Ref{ubrk_first}, or \Ref{ubrk_last}.
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_current(const UBreakIterator *bi);
/**
@ -330,7 +342,7 @@ ubrk_current(const UBreakIterator *bi);
* @see ubrk_previous
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_next(UBreakIterator *bi);
/**
@ -342,7 +354,7 @@ ubrk_next(UBreakIterator *bi);
* @see ubrk_next
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_previous(UBreakIterator *bi);
/**
@ -353,7 +365,7 @@ ubrk_previous(UBreakIterator *bi);
* @see ubrk_last
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_first(UBreakIterator *bi);
/**
@ -366,7 +378,7 @@ ubrk_first(UBreakIterator *bi);
* @see ubrk_first
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_last(UBreakIterator *bi);
/**
@ -378,7 +390,7 @@ ubrk_last(UBreakIterator *bi);
* @see ubrk_following
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_preceding(UBreakIterator *bi,
int32_t offset);
@ -391,7 +403,7 @@ ubrk_preceding(UBreakIterator *bi,
* @see ubrk_preceding
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_following(UBreakIterator *bi,
int32_t offset);
@ -404,7 +416,7 @@ ubrk_following(UBreakIterator *bi,
* @see ubrk_countAvailable
* @stable
*/
U_CAPI const char* U_EXPORT2
U_CAPI const char* U_EXPORT2
ubrk_getAvailable(int32_t index);
/**
@ -415,7 +427,7 @@ ubrk_getAvailable(int32_t index);
* @see ubrk_getAvailable
* @stable
*/
U_CAPI int32_t U_EXPORT2
U_CAPI int32_t U_EXPORT2
ubrk_countAvailable(void);
@ -426,8 +438,9 @@ ubrk_countAvailable(void);
* @param bi The break iterator to use.
* @param offset the offset to check.
* @return True if "offset" is a boundary position.
* @stable
*/
U_CAPI UBool U_EXPORT2
U_CAPI UBool U_EXPORT2
ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
/**
@ -437,6 +450,7 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset);
* status, a default value of 0 is returned.
* <p>
* For word break iterators, the possible values are defined in enum UWordBreak.
* @draft ICU 2.2
*/
U_CAPI int32_t U_EXPORT2
ubrk_getRuleStatus(UBreakIterator *bi);

View File

@ -654,12 +654,12 @@ void RBBIAPITest::TestWordStatus() {
int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER,
UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE,
UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE,
UBRK_WORD_HIRAKATA, UBRK_WORD_NONE, UBRK_WORD_HIRAKATA};
UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA};
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_HIRAKATA_LIMIT};
int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT,
UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT};
UErrorCode status=U_ZERO_ERROR;