From 6df16763109cf4a032258081813bb865f8f465c1 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Thu, 8 Aug 2002 00:39:13 +0000 Subject: [PATCH] ICU-2077 RBBI: review comments incorporated. (incomplete, more to come.) X-SVN-Rev: 9612 --- icu4c/source/common/brkiter.cpp | 153 +++++++++++++----------- icu4c/source/common/rbbi.cpp | 95 ++++++++++----- icu4c/source/common/rbbidata.cpp | 13 +- icu4c/source/common/rbbisetb.h | 3 +- icu4c/source/common/ubrk.cpp | 23 ++-- icu4c/source/common/unicode/brkiter.h | 93 +++++++------- icu4c/source/common/unicode/rbbi.h | 40 +++++-- icu4c/source/common/unicode/ubrk.h | 54 +++++---- icu4c/source/test/intltest/rbbiapts.cpp | 10 +- 9 files changed, 287 insertions(+), 197 deletions(-) diff --git a/icu4c/source/common/brkiter.cpp b/icu4c/source/common/brkiter.cpp index 7b77da8350..dca3e4d4bb 100644 --- a/icu4c/source/common/brkiter.cpp +++ b/icu4c/source/common/brkiter.cpp @@ -37,7 +37,7 @@ const int32_t BreakIterator::DONE = (int32_t)-1; // ------------------------------------- -// Creates a simple text boundary for word breaks. +// Creates a break iterator for word breaks. BreakIterator* BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) { @@ -49,31 +49,32 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) if (U_FAILURE(status)) return NULL; + if (!uprv_strcmp(key.getLanguage(), "th")) { filename = "word_th"; } UDataMemory* file = udata_open(NULL, "brk", filename, &status); + if (U_FAILURE(status)) { + return NULL; + } + // The UDataMemory is adopted by the break iterator. 
- if (U_SUCCESS(status)) { - if(!uprv_strcmp(filename, "word_th")) { - filename = "thaidict.brk"; - result = new DictionaryBasedBreakIterator(file, filename, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } - else { - result = new RuleBasedBreakIterator(file, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } + if(!uprv_strcmp(filename, "word_th")) { + filename = "thaidict.brk"; + result = new DictionaryBasedBreakIterator(file, filename, status); + } + else { + result = new RuleBasedBreakIterator(file, status); + } + if (result == NULL) { + udata_close(file); + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status)) { // Sometimes redundant check, but simple. + delete result; + result = NULL; } return result; @@ -81,7 +82,7 @@ BreakIterator::createWordInstance(const Locale& key, UErrorCode& status) // ------------------------------------- -// Creates a simple text boundary for line breaks. +// Creates a break iterator for line breaks. BreakIterator* BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) { @@ -93,39 +94,39 @@ BreakIterator::createLineInstance(const Locale& key, UErrorCode& status) if (U_FAILURE(status)) return NULL; + if (!uprv_strcmp(key.getLanguage(), "th")) { filename = "line_th"; } UDataMemory* file = udata_open(NULL, "brk", filename, &status); - - if (!U_FAILURE(status)) { - if (!uprv_strcmp(key.getLanguage(), "th")) { - filename = "thaidict.brk"; - result = new DictionaryBasedBreakIterator(file, filename, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } - else { - result = new RuleBasedBreakIterator(file, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } - } + if (U_FAILURE(status)) { + return NULL; } + // The UDataMemory is adopted by the break iterator. 
+ if (!uprv_strcmp(key.getLanguage(), "th")) { + filename = "thaidict.brk"; + result = new DictionaryBasedBreakIterator(file, filename, status); + } + else { + result = new RuleBasedBreakIterator(file, status); + } + if (result == NULL) { + udata_close(file); + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status)) { // Sometimes redundant check, but simple. + delete result; + result = NULL; + } return result; } // ------------------------------------- -// Creates a simple text boundary for character breaks. +// Creates a break iterator for character breaks. BreakIterator* BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& status) { @@ -138,22 +139,26 @@ BreakIterator::createCharacterInstance(const Locale& /* key */, UErrorCode& stat if (U_FAILURE(status)) return NULL; UDataMemory* file = udata_open(NULL, "brk", filename, &status); - - if (!U_FAILURE(status)) { - result = new RuleBasedBreakIterator(file, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } + if (U_FAILURE(status)) { + return NULL; } + // The UDataMemory is adopted by the break iterator. + result = new RuleBasedBreakIterator(file, status); + if (result == NULL) { + udata_close(file); + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status)) { // Sometimes redundant check, but simple. + delete result; + result = NULL; + } return result; } // ------------------------------------- -// Creates a simple text boundary for sentence breaks. +// Creates a break iterator for sentence breaks. BreakIterator* BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status) { @@ -166,14 +171,19 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status if (U_FAILURE(status)) return NULL; UDataMemory* file = udata_open(NULL, "brk", filename, &status); + if (U_FAILURE(status)) { + return NULL; + } + // The UDataMemory is adopted by the break iterator. 
- if (!U_FAILURE(status)) { - result = new RuleBasedBreakIterator(file, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } + result = new RuleBasedBreakIterator(file, status); + if (result == NULL) { + udata_close(file); + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status)) { // Sometimes redundant check, but simple. + delete result; + result = NULL; } return result; @@ -181,7 +191,7 @@ BreakIterator::createSentenceInstance(const Locale& /*key */, UErrorCode& status // ------------------------------------- -// Creates a simple text boundary for title casing breaks. +// Creates a break iterator for title casing breaks. BreakIterator* BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status) { @@ -194,14 +204,19 @@ BreakIterator::createTitleInstance(const Locale& /* key */, UErrorCode& status) if (U_FAILURE(status)) return NULL; UDataMemory* file = udata_open(NULL, "brk", filename, &status); + if (U_FAILURE(status)) { + return NULL; + } + // The UDataMemory is adopted by the break iterator. - if (!U_FAILURE(status)) { - result = new RuleBasedBreakIterator(file, status); - /* test for NULL */ - if(result == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return 0; - } + result = new RuleBasedBreakIterator(file, status); + if (result == NULL) { + udata_close(file); + status = U_MEMORY_ALLOCATION_ERROR; + } + if (U_FAILURE(status)) { // Sometimes redundant check, but simple. + delete result; + result = NULL; } return result; @@ -234,11 +249,11 @@ BreakIterator::getDisplayName(const Locale& objectLocale, return objectLocale.getDisplayName(displayLocale, name); } -// ------------------------------------- - -// Needed because we declare the copy constructor (in order to prevent synthesizing one) and -// so the default constructor is no longer synthesized. 
- +// ------------------------------------------ +// +// Default constructor and destructor +// +//------------------------------------------- BreakIterator::BreakIterator() { fBufferClone = FALSE; diff --git a/icu4c/source/common/rbbi.cpp b/icu4c/source/common/rbbi.cpp index e0dd1ba3a4..f8341e41dd 100644 --- a/icu4c/source/common/rbbi.cpp +++ b/icu4c/source/common/rbbi.cpp @@ -18,6 +18,7 @@ #include "rbbirb.h" #include "filestrm.h" #include "cmemory.h" +#include "cstring.h" #include "uassert.h" @@ -25,8 +26,7 @@ U_NAMESPACE_BEGIN static const int16_t START_STATE = 1; // The state number of the starting state - -static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" +static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" /** * Class ID. (value is irrelevant; address is important) @@ -86,6 +86,10 @@ RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, if (U_FAILURE(status)) {return;}; RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) RBBIRuleBuilder::createRuleBasedBreakIterator(rules, parseError, status); + // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that + // creates and returns a complete RBBI. From here, in a constructor, we + // can't just return the object created by the builder factory, hence + // the assignment of the factory created object to "this". 
if (U_SUCCESS(status)) { *this = *bi; delete bi; @@ -118,16 +122,15 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& oth } -//======================================================================= -// boilerplate -//======================================================================= /** * Destructor */ RuleBasedBreakIterator::~RuleBasedBreakIterator() { delete fText; + fText = NULL; if (fData != NULL) { fData->removeReference(); + fData = NULL; } } @@ -163,6 +166,7 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { //----------------------------------------------------------------------------- // // init() Shared initialization routine. Used by all the constructors. +// Initializes all fields, leaving the object in a consistent state. // //----------------------------------------------------------------------------- UBool RuleBasedBreakIterator::fTrace = FALSE; @@ -179,7 +183,7 @@ void RuleBasedBreakIterator::init() { if (debugInitDone == FALSE) { #ifdef RBBI_DEBUG char *debugEnv = getenv("U_RBBIDEBUG"); - if (debugEnv && strstr(debugEnv, "trace")) { + if (debugEnv && uprv_strstr(debugEnv, "trace")) { fTrace = TRUE; } #endif @@ -268,7 +272,7 @@ RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { reset(); delete fText; fText = newText; - fText->first(); + this->first(); } /** @@ -286,8 +290,8 @@ RuleBasedBreakIterator::setText(const UnicodeString& newText) { else { delete fText; fText = new StringCharacterIterator(newText); - fText->first(); } + this->first(); } @@ -435,11 +439,14 @@ int32_t RuleBasedBreakIterator::following(int32_t offset) { fLastBreakTagValid = TRUE; if (fText == NULL || offset >= fText->endIndex()) { // fText->setToEnd(); - return BreakIterator::DONE; + // return BreakIterator::DONE; + last(); + return next(); } else if (offset < fText->startIndex()) { // fText->setToStart(); - return fText->startIndex(); + // return fText->startIndex(); + return first(); } // otherwise, set 
our internal iteration position (temporarily) @@ -476,10 +483,11 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { // just return DONE; if it's before the beginning, return the // text's starting offset if (fText == NULL || offset > fText->endIndex()) { - return BreakIterator::DONE; + // return BreakIterator::DONE; + return last(); } else if (offset < fText->startIndex()) { - return fText->startIndex(); + return first(); } // if we start by updating the current iteration position to the @@ -499,19 +507,25 @@ int32_t RuleBasedBreakIterator::preceding(int32_t offset) { UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { // the beginning index of the iterator is always a boundary position by definition if (fText == NULL || offset == fText->startIndex()) { + first(); // For side effects on current position, tag values. return TRUE; } // out-of-range indexes are never boundary positions - else if (offset < fText->startIndex() || offset > fText->endIndex()) { + if (offset < fText->startIndex()) { + first(); // For side effects on current position, tag values. + return FALSE; + } + + if (offset > fText->endIndex()) { + last(); // For side effects on current position, tag values. return FALSE; } // otherwise, we can use following() on the position before the specified - // one and return true of the position we get back is the one the user + // one and return true if the position we get back is the one the user // specified - else - return following(offset - 1) == offset; + return following(offset - 1) == offset; } /** @@ -555,7 +569,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) { int32_t result = fText->getIndex() + 1; int32_t lookaheadResult = 0; - // begin in state 1 + // Initialize the state machine. 
Begin in state 1 int32_t state = START_STATE; int16_t category; UChar32 c = fText->current32(); @@ -565,16 +579,19 @@ int32_t RuleBasedBreakIterator::handleNext(void) { fLastBreakTag = 0; - row = (RBBIStateTableRow *) + row = (RBBIStateTableRow *) // Point to starting row of state table. (fData->fForwardTable->fTableData + (fData->fForwardTable->fRowLen * state)); + + // Character Category fetch for starting character. + // See comments on character category code within loop, below. UTRIE_GET16(&fData->fTrie, c, category); if ((category & 0x4000) != 0) { fDictionaryCharCount++; category &= ~0x4000; } - // loop until we reach the end of the text or transition to state 0 - for (;;) { + // loop until we reach the end of the text or transition to state 0 + for (;;) { if (c == CharacterIterator::DONE && fText->hasNext()==FALSE) { // Note: CharacterIterator::DONE is 0xffff, which is also a legal // character value. Check for DONE first, because it's quicker, @@ -586,15 +603,16 @@ int32_t RuleBasedBreakIterator::handleNext(void) { // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, // not the size of the character going in. // - // And off bit 14, which flags use of a dictionary for dictionary based - // iterators, but should be ignored here. UTRIE_GET16(&fData->fTrie, c, category); // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iterators. + // Counter is only used by dictionary based iterators (subclasses). + // Chars that need to be handled by a dictionary have a flag bit set + // in their category values. // if ((category & 0x4000) != 0) { fDictionaryCharCount++; + // And off the dictionary flag bit. category &= ~0x4000; } @@ -616,6 +634,8 @@ int32_t RuleBasedBreakIterator::handleNext(void) { // Get the next character. Doing it here positions the iterator // to the correct position for recording matches in the code that // follows. 
+ // TODO: 16 bit next, and a 16 bit TRIE lookup, with escape code + // for non-BMP chars, would be faster. c = fText->next32(); if (row->fAccepting == 0 && row->fLookAhead == 0) { @@ -636,7 +656,7 @@ int32_t RuleBasedBreakIterator::handleNext(void) { if (row->fAccepting == 0 && row->fLookAhead != 0) { // Lookahead match point. Remember it, but only if no other rule has // unconitionally matched up to this point. - // TODO: handle case where there's a pending match from a different rule + // TODO: handle case where there's a pending match from a different rule - // where lookaheadStatus != 0 && lookaheadStatus != row->fLookAhead. int32_t r = fText->getIndex(); if (r > result) { @@ -672,6 +692,7 @@ continueOn: // a lookahead state, advance the break position to the lookahead position // (the theory here is that if there are no characters at all after the lookahead // position, that always matches the lookahead criteria) + // TODO: is this really the right behavior? if (c == CharacterIterator::DONE && fText->hasNext()==FALSE && lookaheadResult == fText->endIndex()) { @@ -694,8 +715,9 @@ continueOn: // This method backs the iterator back up to a "safe position" in the text. // This is a position that we know, without any context, must be a break position. // The various calling methods then iterate forward from this safe position to -// the appropriate position to return. (For more information, see the description -// of buildBackwardsStateTable() in RuleBasedBreakIterator.Builder.) +// the appropriate position to return. +// +// The logic of this function is very similar to handleNext(), above. 
// //----------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::handlePrevious(void) { @@ -833,18 +855,27 @@ RuleBasedBreakIterator::reset() //------------------------------------------------------------------------------- // -// getRuleStatus() +// getRuleStatus() Return the break rule tag associated with the current +// iterator position. If the iterator arrived at its current +// position by iterating forwards, the value will have been +// cached by the handleNext() function. +// +// If no cached status value is available, the status is +// found by doing a previous() followed by a next(), which +// leaves the iterator where it started, and computes the +// status while doing the next(). // //------------------------------------------------------------------------------- int32_t RuleBasedBreakIterator::getRuleStatus() const { - // If the break tag value is unkown, back the iterator up, then move - // forward again. Moving forward will set the fLastBreakTag value correctly. RuleBasedBreakIterator *nonConstThis = (RuleBasedBreakIterator *)this; if (fLastBreakTagValid == FALSE) { - if (current() == fText->startIndex()) { + // No cached status is available. + if (fText == NULL || current() == fText->startIndex()) { + // At start of text, or there is no text. Status is always zero. nonConstThis->fLastBreakTag = 0; nonConstThis->fLastBreakTagValid = TRUE; } else { + // Not at start of text. Find status the tedious way. int32_t pa = current(); nonConstThis->previous(); int32_t pb = nonConstThis->next(); @@ -857,7 +888,7 @@ int32_t RuleBasedBreakIterator::getRuleStatus() const { //------------------------------------------------------------------------------- // -// getFlattenedData Access to the compiled form of the rules, +// getBinaryRules Access to the compiled form of the rules, // for use by build system tools that save the data // for standard iterator types. 
// @@ -868,7 +899,7 @@ const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { if (fData != NULL) { retPtr = (const uint8_t *)fData->fHeader; - length = fData->fHeader->fLength; + length = fData->fHeader->fLength; } return retPtr; } diff --git a/icu4c/source/common/rbbidata.cpp b/icu4c/source/common/rbbidata.cpp index 38e37ecc78..50540ef761 100644 --- a/icu4c/source/common/rbbidata.cpp +++ b/icu4c/source/common/rbbidata.cpp @@ -1,8 +1,8 @@ /* -********************************************************************** +*************************************************************************** * Copyright (C) 1999-2002 International Business Machines Corporation * -* and others. All rights reserved. * -********************************************************************** +* and others. All rights reserved. * +*************************************************************************** */ #include "unicode/utypes.h" @@ -156,7 +156,7 @@ int32_t RBBIDataWrapper::hashCode() { // //----------------------------------------------------------------------------- void RBBIDataWrapper::removeReference() { - if (umtx_atomic_dec(&fRefCount) == 0) { + if (umtx_atomic_dec(&fRefCount) == 0) { delete this; } }; @@ -221,9 +221,4 @@ void RBBIDataWrapper::printData() { - - - - - U_NAMESPACE_END diff --git a/icu4c/source/common/rbbisetb.h b/icu4c/source/common/rbbisetb.h index 735166aa69..385b0be05a 100644 --- a/icu4c/source/common/rbbisetb.h +++ b/icu4c/source/common/rbbisetb.h @@ -35,7 +35,8 @@ U_NAMESPACE_BEGIN // All of them are strung together in a linked list, which is kept in order // (by character) // -struct RangeDescriptor : public UObject { +class RangeDescriptor : public UObject { +public: UChar32 fStartChar; // Start of range, unicode 32 bit value. UChar32 fEndChar; // End of range, unicode 32 bit value. int32_t fNum; // runtime-mapped input value for this range. 
diff --git a/icu4c/source/common/ubrk.cpp b/icu4c/source/common/ubrk.cpp index a257998bc5..601e734c8a 100644 --- a/icu4c/source/common/ubrk.cpp +++ b/icu4c/source/common/ubrk.cpp @@ -94,22 +94,27 @@ ubrk_openRules( const UChar *rules, UParseError *parseErr, UErrorCode *status) { - BreakIterator *result = 0; + if (status == NULL || U_FAILURE(*status)){ + return 0; + } + BreakIterator *result = 0; UnicodeString ruleString(rules, rulesLength); result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status); if(U_FAILURE(*status)) { return 0; } - UCharCharacterIterator *iter = 0; - iter = new UCharCharacterIterator(text, textLength); - if(iter == 0) { - *status = U_MEMORY_ALLOCATION_ERROR; - delete result; - return 0; + if (text != NULL) { + UCharCharacterIterator *iter = 0; + iter = new UCharCharacterIterator(text, textLength); + if(iter == 0) { + *status = U_MEMORY_ALLOCATION_ERROR; + delete result; + return 0; + } + result->adoptText(iter); } - result->adoptText(iter); return (UBreakIterator *)result; } @@ -243,7 +248,7 @@ ubrk_countAvailable() } -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 ubrk_isBoundary(UBreakIterator *bi, int32_t offset) { return ((BreakIterator *)bi)->isBoundary(offset); diff --git a/icu4c/source/common/unicode/brkiter.h b/icu4c/source/common/unicode/brkiter.h index b015ac97f6..11701bba4b 100644 --- a/icu4c/source/common/unicode/brkiter.h +++ b/icu4c/source/common/unicode/brkiter.h @@ -1,10 +1,10 @@ /* ***************************************************************************************** -* Copyright (C) 1997-2001, International Business Machines +* Copyright (C) 1997-2002, International Business Machines * Corporation and others. All Rights Reserved. ***************************************************************************************** * -* File BRKITER.H +* File brkiter.h * * Modification History: * @@ -65,13 +65,13 @@ U_NAMESPACE_BEGIN *

* Helper function to output text *

- * \code 
+ * \code
  *    void printTextRange( BreakIterator& iterator, int32_t start, int32_t end )
  *    {
  *        UnicodeString textBuffer, temp;
  *        CharacterIterator *strIter = iterator.createText();
  *        strIter->getText(temp);
- *        cout << " " << start << " " << end << " |" 
+ *        cout << " " << start << " " << end << " |"
  *             << temp.extractBetween(start, end, textBuffer)
  *             << "|" << endl;
  *        delete strIter;
@@ -149,7 +149,7 @@ U_NAMESPACE_BEGIN
  *           BreakIterator* boundary;
  *           UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
  *           cout << "Examining: " << stringToExamine << endl;
- * 
+ *
  *           //print each sentence in forward and reverse order
  *           boundary = BreakIterator::createSentenceInstance( Locale::US );
  *           boundary->setText(stringToExamine);
@@ -158,7 +158,7 @@ U_NAMESPACE_BEGIN
  *           cout << "----- backward: ----------" << endl;
  *           printEachBackward(*boundary);
  *           delete boundary;
- * 
+ *
  *           //print each word in order
  *           boundary = BreakIterator::createWordInstance();
  *           boundary->setText(stringToExamine);
@@ -173,7 +173,7 @@ U_NAMESPACE_BEGIN
  *           //print word at charpos 10
  *           cout << "----- at pos 10: ---------" << endl;
  *           printAt(*boundary, 10 );
- * 
+ *
  *           delete boundary;
  *       }
  * \endcode
@@ -222,6 +222,8 @@ public:
 
     /**
      * Return a CharacterIterator over the text being analyzed.
+     * Changing the state of the returned iterator can have undefined consequences
+     * on the operation of the break iterator.  If you need to change it, clone it first.
      * @stable
      */
     virtual const CharacterIterator& getText(void) const = 0;
@@ -278,8 +280,7 @@ public:
     virtual int32_t next(void) = 0;
 
     /**
-     * Return character index of the text boundary that was most recently
-     * returned by next(), previous(), first(), or last()
+     * Return character index of the current iterator position within the text.
      * @return The boundary most recently returned.
      * @stable
      */
@@ -304,9 +305,11 @@ public:
      * @stable
      */
     virtual int32_t preceding(int32_t offset) = 0;
- 
+
     /**
      * Return true if the specfied position is a boundary position.
+     * As a side effect, the current position of the iterator is set
+     * to the first boundary position at or following the specified offset.
      * @param offset the offset to check.
      * @return True if "offset" is a boundary position.
      * @stable
@@ -328,22 +331,22 @@ public:
      * Create BreakIterator for word-breaks using the given locale.
      * Returns an instance of a BreakIterator implementing word breaks.
      * WordBreak is useful for word selection (ex. double click)
-     * @param where the locale. 
+     * @param where the locale.
      * @param status the error code
-     * @return A BreakIterator for word-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for word-breaks.  The UErrorCode& status
      * parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err).  If you wish more detailed information, you
      * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
      * The caller owns the returned object and is responsible for deleting it.
      * @stable
      */
-    static BreakIterator* createWordInstance(const Locale& where, 
+    static BreakIterator* createWordInstance(const Locale& where,
                                                    UErrorCode& status);
 
     /**
@@ -354,84 +357,84 @@ public:
      * LineBreak is useful for word wrapping text.
      * @param where the locale.
      * @param status The error code.
-     * @return A BreakIterator for line-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for line-breaks.  The UErrorCode& status
      * parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err).  If you wish more detailed information, you
      * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
      * The caller owns the returned object and is responsible for deleting it.
      * @stable
      */
-    static BreakIterator* createLineInstance(const Locale& where, 
+    static BreakIterator* createLineInstance(const Locale& where,
                                                    UErrorCode& status);
 
     /**
      * Create BreakIterator for character-breaks using specified locale
      * Returns an instance of a BreakIterator implementing character breaks.
      * Character breaks are boundaries of combining character sequences.
-     * @param where the locale. 
+     * @param where the locale.
      * @param status The error code.
-     * @return A BreakIterator for character-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for character-breaks.  The UErrorCode& status
      * parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err).  If you wish more detailed information, you
      * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
      * The caller owns the returned object and is responsible for deleting it.
      * @stable
      */
-    static BreakIterator* createCharacterInstance(const Locale& where, 
+    static BreakIterator* createCharacterInstance(const Locale& where,
                                                         UErrorCode& status);
 
     /**
      * Create BreakIterator for sentence-breaks using specified locale
      * Returns an instance of a BreakIterator implementing sentence breaks.
-     * @param where the locale. 
+     * @param where the locale.
      * @param status The error code.
-     * @return A BreakIterator for sentence-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for sentence-breaks.  The UErrorCode& status
      * parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err).  If you wish more detailed information, you
      * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
      * The caller owns the returned object and is responsible for deleting it.
      * @stable
      */
-    static BreakIterator* createSentenceInstance(const Locale& where, 
+    static BreakIterator* createSentenceInstance(const Locale& where,
                                                        UErrorCode& status);
 
     /**
      * Create BreakIterator for title-casing breaks using the specified locale
      * Returns an instance of a BreakIterator implementing title breaks.
-     * @param where the locale. 
+     * @param where the locale.
      * @param status The error code.
-     * @return A BreakIterator for title-breaks.  The UErrorCode& status 
+     * @return A BreakIterator for title-breaks.  The UErrorCode& status
      * parameter is used to return status information to the user.
      * To check whether the construction succeeded or not, you should check
      * the value of U_SUCCESS(err).  If you wish more detailed information, you
      * can check for informational error results which still indicate success.
-     * U_USING_FALLBACK_ERROR indicates that a fall back locale was used.  For
+     * U_USING_FALLBACK_WARNING indicates that a fall back locale was used.  For
      * example, 'de_CH' was requested, but nothing was found there, so 'de' was
-     * used.  U_USING_DEFAULT_ERROR indicates that the default locale data was
+     * used.  U_USING_DEFAULT_WARNING indicates that the default locale data was
      * used; neither the requested locale nor any of its fall back locales
      * could be found.
      * The caller owns the returned object and is responsible for deleting it.
-     * @stable
+     * @draft ICU 2.1
      */
-    static BreakIterator* createTitleInstance(const Locale& where, 
+    static BreakIterator* createTitleInstance(const Locale& where,
                                                        UErrorCode& status);
 
     /**
@@ -469,24 +472,30 @@ public:
     /**
      * Thread safe client-buffer-based cloning operation
      *    Do NOT call delete on a safeclone, since 'new' is not used to create it.
-     * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated. 
+     * @param stackBuffer user allocated space for the new clone. If NULL, new memory will be allocated.
      * If buffer is not large enough, new memory will be allocated.
-     * @param BufferSize reference to size of allocated space. 
-     * If BufferSize == 0, a sufficient size for use in cloning will 
+     * @param BufferSize reference to size of allocated space.
+     * If BufferSize == 0, a sufficient size for use in cloning will
      * be returned ('pre-flighting')
-     * If BufferSize is not enough for a stack-based safe clone, 
+     * If BufferSize is not enough for a stack-based safe clone,
      * new memory will be allocated.
      * @param status to indicate whether the operation went on smoothly or there were errors
-     *  An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were 
+     *  An informational status value, U_SAFECLONE_ALLOCATED_WARNING, is used if any allocations were
      *  necessary.
      * @return pointer to the new clone
-     *  
-     * @draft ICU 1.8
+     *
+     * @stable
      */
     virtual BreakIterator *  createBufferClone(void *stackBuffer,
                                                int32_t &BufferSize,
                                                UErrorCode &status) = 0;
 
+    /**
+     *   Determine whether the BreakIterator was created in user memory by
+     *   createBufferClone(), and thus should not be deleted.  Such objects
+     *   must be closed by an explicit call to the destructor (not delete).
+     *  @stable
+     */
     inline UBool isBufferClone(void);
 
 
diff --git a/icu4c/source/common/unicode/rbbi.h b/icu4c/source/common/unicode/rbbi.h
index c2d5b9d43b..af6bf31232 100644
--- a/icu4c/source/common/unicode/rbbi.h
+++ b/icu4c/source/common/unicode/rbbi.h
@@ -24,9 +24,9 @@ struct UTrie;
 U_NAMESPACE_BEGIN
 
 struct RBBIDataHeader;
-class RuleBasedBreakIteratorTables;
-class BreakIterator;
-class RBBIDataWrapper;
+class  RuleBasedBreakIteratorTables;
+class  BreakIterator;
+class  RBBIDataWrapper;
 
 
 
@@ -37,10 +37,6 @@ class RBBIDataWrapper;
  * 

See the ICU User Guide for information on Break Iterator Rules.

* */ - - - - class U_COMMON_API RuleBasedBreakIterator : public BreakIterator { protected: @@ -74,7 +70,7 @@ protected: uint32_t fDictionaryCharCount; // - // Debugging flag. + // Debugging flag. Trace operation of state machine when true. // static UBool fTrace; @@ -117,7 +113,8 @@ protected: public: /** Default constructor. Creates an empty shell of an iterator, with no - * rules or text to iterate over. Object can subsequently be assigned. + * rules or text to iterate over. Object can subsequently be assigned to. + * @draft ICU 2.2 */ RuleBasedBreakIterator(); @@ -134,12 +131,14 @@ public: * @param parseError In the event of a syntax error in the rules, provides the location * within the rules of the problem. * @param status Information on any errors encountered. + * @draft ICU 2.2 */ RuleBasedBreakIterator( const UnicodeString &rules, UParseError &parseError, UErrorCode &status); /** * Destructor + * @stable */ virtual ~RuleBasedBreakIterator(); @@ -148,6 +147,7 @@ public: * and iterate over the same text, as the one passed in. * @param that The RuleBasedBreakItertor passed in * @return the newly created RuleBasedBreakIterator + * @stable */ RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); @@ -157,6 +157,7 @@ public: * @param that The BreakIterator to be compared for equality * @Return TRUE if both BreakIterators are of the * same class, have the same behavior, and iterate over the same text. + * @stable */ virtual UBool operator==(const BreakIterator& that) const; @@ -165,6 +166,7 @@ public: * and vice versa. * @param that The BreakIterator to be compared for inequality * @return TRUE if both BreakIterators are not same. + * @stable */ UBool operator!=(const BreakIterator& that) const; @@ -175,18 +177,21 @@ public: * will correctly clone (copy) a derived class. * clone() is thread safe. Multiple threads may simultaeneously * clone the same source break iterator. 
+ * @stable */ virtual BreakIterator* clone() const; /** * Compute a hash code for this BreakIterator * @return A hash code + * @stable */ virtual int32_t hashCode(void) const; /** * Returns the description used to create this iterator * @return the description used to create this iterator + * @stable */ virtual const UnicodeString& getRules(void) const; @@ -200,6 +205,7 @@ public: * Changing the state of this iterator can have undefined consequences. If * you need to change it, clone it first. * @return An iterator over the text being analyzed. + * @stable */ virtual const CharacterIterator& getText(void) const; @@ -209,6 +215,7 @@ public: * the current iteration position to the beginning of the text. * @param newText An iterator over the text to analyze. The BreakIterator * takes ownership of the character iterator. The caller MUST NOT delete it! + * @stable */ virtual void adoptText(CharacterIterator* newText); @@ -216,6 +223,7 @@ public: * Set the iterator to analyze a new piece of text. This function resets * the current iteration position to the beginning of the text. * @param newText The text to analyze. + * @stable */ virtual void setText(const UnicodeString& newText); @@ -223,6 +231,7 @@ public: * Sets the current iteration position to the beginning of the text. * (i.e., the CharacterIterator's starting offset). * @return The offset of the beginning of the text. + * @stable */ virtual int32_t first(void); @@ -230,6 +239,7 @@ public: * Sets the current iteration position to the end of the text. * (i.e., the CharacterIterator's ending offset). * @return The text's past-the-end offset. + * @stable */ virtual int32_t last(void); @@ -241,18 +251,21 @@ public: * (negative is backwards, and positive is forwards). * @return The character offset of the boundary position n boundaries away from * the current one. + * @stable */ virtual int32_t next(int32_t n); /** * Advances the iterator to the next boundary position. 
* @return The position of the first boundary after this one. + * @stable */ virtual int32_t next(void); /** * Moves the iterator backwards, to the last boundary preceding this one. * @return The position of the last boundary position preceding this one. + * @stable */ virtual int32_t previous(void); @@ -261,6 +274,7 @@ public: * the specified position. * @param offset The position from which to begin searching for a break position. * @return The position of the first break after the current position. + * @stable */ virtual int32_t following(int32_t offset); @@ -269,6 +283,7 @@ public: * specified position. * @param offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. + * @stable */ virtual int32_t preceding(int32_t offset); @@ -278,12 +293,14 @@ public: * or after "offset". * @param offset the offset to check. * @return True if "offset" is a boundary position. + * @stable */ virtual UBool isBoundary(int32_t offset); /** * Returns the current iteration position. * @return The current iteration position. + * @stable */ virtual int32_t current(void) const; @@ -295,6 +312,7 @@ public: * status, a default value of 0 is returned. * @return the status from the break rule that determined the most recently * returned break position. + * @draft ICU 2.2 */ virtual int32_t getRuleStatus() const; @@ -336,7 +354,7 @@ public: * buffer size, but do not clone the object. If the * size was too small (but not zero), allocate heap * storage for the cloned object. - * + * * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be * returned if the the provided buffer was too small, and * the clone was therefore put on the heap. @@ -344,6 +362,7 @@ public: * @return Pointer to the clone object. This may differ from the stackBuffer * address if the byte alignment of the stack buffer was not suitable * or if the stackBuffer was too small to hold the clone. 
+ * @stable */ virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, @@ -365,6 +384,7 @@ public: * @return A pointer to the binary (compiled) rule data. The storage * belongs to the RulesBasedBreakIterator object, not the * caller, and must not be modified or deleted. + * @internal */ virtual const uint8_t *getBinaryRules(uint32_t &length); diff --git a/icu4c/source/common/unicode/ubrk.h b/icu4c/source/common/unicode/ubrk.h index 045dea019f..f627cf6bfe 100644 --- a/icu4c/source/common/unicode/ubrk.h +++ b/icu4c/source/common/unicode/ubrk.h @@ -47,7 +47,7 @@ * typically starts of words, that should be set to Title Case * when title casing the text. *

- * + * * This is the interface for all text boundaries. *

* Examples: @@ -204,15 +204,27 @@ typedef enum UBreakIteratorType UBreakIteratorType; * than for single individual values. */ enum UWordBreak { + /** Tag value for "words" that do not fit into any of other categories. + * Includes spaces and most punctuation. */ UBRK_WORD_NONE = 0, + /** Upper bound for tags for uncategorized words. */ UBRK_WORD_NONE_LIMIT = 100, + /** Tag value for words that appear to be numbers, lower limit. */ UBRK_WORD_NUMBER = 100, + /** Tag value for words that appear to be numbers, upper limit. */ UBRK_WORD_NUMBER_LIMIT = 200, + /** Tag value for words that contain letters, excluding + * hiragana, katakana or ideographic characters, lower limit. */ UBRK_WORD_LETTER = 200, + /** Tag value for words containing letters, upper limit */ UBRK_WORD_LETTER_LIMIT = 300, - UBRK_WORD_HIRAKATA = 300, - UBRK_WORD_HIRAKATA_LIMIT = 400, + /** Tag value for words containing kana characters, lower limit */ + UBRK_WORD_KANA = 300, + /** Tag value for words containing kana characters, upper limit */ + UBRK_WORD_KANA_LIMIT = 400, + /** Tag value for words containing ideographic characters, lower limit */ UBRK_WORD_IDEO = 400, + /** Tag value for words containing ideographic characters, upper limit */ UBRK_WORD_IDEO_LIMIT = 500 }; typedef enum UWordBreak UWordBreak; @@ -232,7 +244,7 @@ typedef enum UWordBreak UWordBreak; * @see ubrk_openRules * @stable */ -U_CAPI UBreakIterator* U_EXPORT2 +U_CAPI UBreakIterator* U_EXPORT2 ubrk_open(UBreakIteratorType type, const char *locale, const UChar *text, @@ -252,9 +264,9 @@ ubrk_open(UBreakIteratorType type, * @param status A UErrorCode to receive any errors. * @return A UBreakIterator for the specified rules. 
* @see ubrk_open - * @draft + * @draft ICU 2.2 */ -U_CAPI UBreakIterator* U_EXPORT2 +U_CAPI UBreakIterator* U_EXPORT2 ubrk_openRules(const UChar *rules, int32_t rulesLength, const UChar *text, @@ -276,9 +288,9 @@ ubrk_openRules(const UChar *rules, * @param status to indicate whether the operation went on smoothly or there were errors * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary. * @return pointer to the new clone - * @draft ICU 1.8 + * @stable */ -U_CAPI UBreakIterator * U_EXPORT2 +U_CAPI UBreakIterator * U_EXPORT2 ubrk_safeClone( const UBreakIterator *bi, void *stackBuffer, @@ -293,7 +305,7 @@ ubrk_safeClone( * @param bi The break iterator to close. * @stable */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 ubrk_close(UBreakIterator *bi); /** @@ -304,7 +316,7 @@ ubrk_close(UBreakIterator *bi); * @param status The error code * @stable */ -U_CAPI void U_EXPORT2 +U_CAPI void U_EXPORT2 ubrk_setText(UBreakIterator* bi, const UChar* text, int32_t textLength, @@ -318,7 +330,7 @@ ubrk_setText(UBreakIterator* bi, * \Ref{ubrk_first}, or \Ref{ubrk_last}. 
* @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_current(const UBreakIterator *bi); /** @@ -330,7 +342,7 @@ ubrk_current(const UBreakIterator *bi); * @see ubrk_previous * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_next(UBreakIterator *bi); /** @@ -342,7 +354,7 @@ ubrk_next(UBreakIterator *bi); * @see ubrk_next * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_previous(UBreakIterator *bi); /** @@ -353,7 +365,7 @@ ubrk_previous(UBreakIterator *bi); * @see ubrk_last * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_first(UBreakIterator *bi); /** @@ -366,7 +378,7 @@ ubrk_first(UBreakIterator *bi); * @see ubrk_first * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_last(UBreakIterator *bi); /** @@ -378,7 +390,7 @@ ubrk_last(UBreakIterator *bi); * @see ubrk_following * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_preceding(UBreakIterator *bi, int32_t offset); @@ -391,7 +403,7 @@ ubrk_preceding(UBreakIterator *bi, * @see ubrk_preceding * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_following(UBreakIterator *bi, int32_t offset); @@ -404,7 +416,7 @@ ubrk_following(UBreakIterator *bi, * @see ubrk_countAvailable * @stable */ -U_CAPI const char* U_EXPORT2 +U_CAPI const char* U_EXPORT2 ubrk_getAvailable(int32_t index); /** @@ -415,7 +427,7 @@ ubrk_getAvailable(int32_t index); * @see ubrk_getAvailable * @stable */ -U_CAPI int32_t U_EXPORT2 +U_CAPI int32_t U_EXPORT2 ubrk_countAvailable(void); @@ -426,8 +438,9 @@ ubrk_countAvailable(void); * @param bi The break iterator to use. * @param offset the offset to check. * @return True if "offset" is a boundary position. +* @stable */ -U_CAPI UBool U_EXPORT2 +U_CAPI UBool U_EXPORT2 ubrk_isBoundary(UBreakIterator *bi, int32_t offset); /** @@ -437,6 +450,7 @@ ubrk_isBoundary(UBreakIterator *bi, int32_t offset); * status, a default value of 0 is returned. *

* For word break iterators, the possible values are defined in enum UWordBreak. + * @draft ICU 2.2 */ U_CAPI int32_t U_EXPORT2 ubrk_getRuleStatus(UBreakIterator *bi); diff --git a/icu4c/source/test/intltest/rbbiapts.cpp b/icu4c/source/test/intltest/rbbiapts.cpp index 8f9e87a59a..3f0dd10d46 100644 --- a/icu4c/source/test/intltest/rbbiapts.cpp +++ b/icu4c/source/test/intltest/rbbiapts.cpp @@ -654,12 +654,12 @@ void RBBIAPITest::TestWordStatus() { int32_t tag_lo[] = {UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_LETTER, UBRK_WORD_NONE, UBRK_WORD_NUMBER, UBRK_WORD_NONE, UBRK_WORD_IDEO, UBRK_WORD_IDEO, UBRK_WORD_NONE, - UBRK_WORD_HIRAKATA, UBRK_WORD_NONE, UBRK_WORD_HIRAKATA}; + UBRK_WORD_KANA, UBRK_WORD_NONE, UBRK_WORD_KANA}; - int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, - UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, - UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, - UBRK_WORD_HIRAKATA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_HIRAKATA_LIMIT}; + int32_t tag_hi[] = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, + UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT, + UBRK_WORD_IDEO_LIMIT, UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT, + UBRK_WORD_KANA_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_KANA_LIMIT}; UErrorCode status=U_ZERO_ERROR;