ICU-10688 branch, work in progress.

X-SVN-Rev: 40686
This commit is contained in:
Andy Heninger 2017-12-03 00:36:54 +00:00
parent c67d9d0a4a
commit ca7b62180e
9 changed files with 47 additions and 121 deletions

View File

@ -59,58 +59,47 @@ LanguageBreakFactory::~LanguageBreakFactory() {
****************************************************************** ******************************************************************
*/ */
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) { (void)status;
fHandled[i] = 0;
}
} }
UnhandledEngine::~UnhandledEngine() { UnhandledEngine::~UnhandledEngine() {
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) { delete fHandled;
if (fHandled[i] != 0) { fHandled = nullptr;
delete fHandled[i];
}
}
} }
UBool UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const { UnhandledEngine::handles(UChar32 c) const {
return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled) return fHandled && fHandled->contains(c);
&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
} }
int32_t int32_t
UnhandledEngine::findBreaks( UText *text, UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */, int32_t /* startPos */,
int32_t endPos, int32_t endPos,
int32_t breakType,
UVector32 &/*foundBreaks*/ ) const { UVector32 &/*foundBreaks*/ ) const {
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) { UChar32 c = utext_current32(text);
UChar32 c = utext_current32(text); while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { utext_next32(text); // TODO: recast loop to work with post-increment operations.
utext_next32(text); // TODO: recast loop to work with post-increment operations. c = utext_current32(text);
c = utext_current32(text);
}
} }
return 0; return 0;
} }
void void
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { UnhandledEngine::handleCharacter(UChar32 c) {
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) { if (fHandled == nullptr) {
if (fHandled[breakType] == 0) { fHandled = new UnicodeSet();
fHandled[breakType] = new UnicodeSet(); if (fHandled == nullptr) {
if (fHandled[breakType] == 0) {
return; return;
} }
} }
if (!fHandled[breakType]->contains(c)) { if (!fHandled->contains(c)) {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
// Apply the entire script of the character. // Apply the entire script of the character.
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
} }
}
} }
/* /*
@ -138,7 +127,7 @@ U_NAMESPACE_BEGIN
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER; static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
const LanguageBreakEngine * const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { ICULanguageBreakFactory::getEngineFor(UChar32 c) {
const LanguageBreakEngine *lbe = NULL; const LanguageBreakEngine *lbe = NULL;
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
@ -156,14 +145,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
int32_t i = fEngines->size(); int32_t i = fEngines->size();
while (--i >= 0) { while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != NULL && lbe->handles(c, breakType)) { if (lbe != NULL && lbe->handles(c)) {
return lbe; return lbe;
} }
} }
} }
// We didn't find an engine. Create one. // We didn't find an engine. Create one.
lbe = loadEngineFor(c, breakType); lbe = loadEngineFor(c);
if (lbe != NULL) { if (lbe != NULL) {
fEngines->push((void *)lbe, status); fEngines->push((void *)lbe, status);
} }
@ -171,11 +160,11 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
} }
const LanguageBreakEngine * const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status); UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) { if (U_SUCCESS(status)) {
DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != NULL) { if (m != NULL) {
const LanguageBreakEngine *engine = NULL; const LanguageBreakEngine *engine = NULL;
switch(code) { switch(code) {
@ -236,7 +225,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
} }
DictionaryMatcher * DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
// open root from brkitr tree. // open root from brkitr tree.
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);

View File

@ -54,11 +54,10 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.</p> * a particular kind of break.</p>
* *
* @param c A character which begins a run that the engine might handle * @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break * @return TRUE if this engine handles the particular character and break
* type. * type.
*/ */
virtual UBool handles(UChar32 c, int32_t breakType) const = 0; virtual UBool handles(UChar32 c) const = 0;
/** /**
* <p>Find any breaks within a run in the supplied text.</p> * <p>Find any breaks within a run in the supplied text.</p>
@ -68,14 +67,12 @@ class LanguageBreakEngine : public UMemory {
* is capable of handling. * is capable of handling.
* @param startPos The start of the run within the supplied text. * @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text. * @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A Vector of int32_t to receive the breaks. * @param foundBreaks A Vector of int32_t to receive the breaks.
* @return The number of breaks found. * @return The number of breaks found.
*/ */
virtual int32_t findBreaks( UText *text, virtual int32_t findBreaks( UText *text,
int32_t startPos, int32_t startPos,
int32_t endPos, int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const = 0; UVector32 &foundBreaks ) const = 0;
}; };
@ -125,11 +122,9 @@ class LanguageBreakFactory : public UMemory {
* *
* @param c A character that begins a run for which a LanguageBreakEngine is * @param c A character that begins a run for which a LanguageBreakEngine is
* sought. * sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0. * @return A LanguageBreakEngine with the desired characteristics, or 0.
*/ */
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0; virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
}; };
@ -152,11 +147,11 @@ class UnhandledEngine : public LanguageBreakEngine {
private: private:
/** /**
* The sets of characters handled, for each break type * The sets of characters handled.
* @internal * @internal
*/ */
UnicodeSet *fHandled[4]; UnicodeSet *fHandled;
public: public:
@ -176,11 +171,10 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.</p> * a particular kind of break.</p>
* *
* @param c A character which begins a run that the engine might handle * @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break * @return TRUE if this engine handles the particular character and break
* type. * type.
*/ */
virtual UBool handles(UChar32 c, int32_t breakType) const; virtual UBool handles(UChar32 c) const;
/** /**
* <p>Find any breaks within a run in the supplied text.</p> * <p>Find any breaks within a run in the supplied text.</p>
@ -190,23 +184,20 @@ class UnhandledEngine : public LanguageBreakEngine {
* is capable of handling. * is capable of handling.
* @param startPos The start of the run within the supplied text. * @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text. * @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any * @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found. * @return The number of breaks found.
*/ */
virtual int32_t findBreaks( UText *text, virtual int32_t findBreaks( UText *text,
int32_t startPos, int32_t startPos,
int32_t endPos, int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const; UVector32 &foundBreaks ) const;
/** /**
* <p>Tell the engine to handle a particular character and break type.</p> * <p>Tell the engine to handle a particular character and break type.</p>
* *
* @param c A character which the engine should handle * @param c A character which the engine should handle
* @param breakType The type of text break for which the engine should handle c
*/ */
virtual void handleCharacter(UChar32 c, int32_t breakType); virtual void handleCharacter(UChar32 c);
}; };
@ -250,11 +241,9 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
* *
* @param c A character that begins a run for which a LanguageBreakEngine is * @param c A character that begins a run for which a LanguageBreakEngine is
* sought. * sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0. * @return A LanguageBreakEngine with the desired characteristics, or 0.
*/ */
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType); virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
protected: protected:
/** /**
@ -263,21 +252,17 @@ protected:
* *
* @param c A character that begins a run for which a LanguageBreakEngine is * @param c A character that begins a run for which a LanguageBreakEngine is
* sought. * sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0. * @return A LanguageBreakEngine with the desired characteristics, or 0.
*/ */
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType); virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
/** /**
* <p>Create a DictionaryMatcher for the specified script and break type.</p> * <p>Create a DictionaryMatcher for the specified script and break type.</p>
* @param script An ISO 15924 script code that identifies the dictionary to be * @param script An ISO 15924 script code that identifies the dictionary to be
* created. * created.
* @param breakType The kind of text break for which a dictionary is
* sought.
* @return A DictionaryMatcher with the desired characteristics, or NULL. * @return A DictionaryMatcher with the desired characteristics, or NULL.
*/ */
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType); virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
}; };
U_NAMESPACE_END U_NAMESPACE_END

View File

@ -52,7 +52,7 @@ U_NAMESPACE_BEGIN
// ------------------------------------- // -------------------------------------
BreakIterator* BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status) BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
{ {
char fnbuff[256]; char fnbuff[256];
char ext[4]={'\0'}; char ext[4]={'\0'};
@ -121,7 +121,6 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind,
U_LOCALE_BASED(locBased, *(BreakIterator*)result); U_LOCALE_BASED(locBased, *(BreakIterator*)result);
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status), locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data()); actualLocale.data());
result->setBreakType(kind);
} }
ures_close(b); ures_close(b);
@ -413,10 +412,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
BreakIterator *result = NULL; BreakIterator *result = NULL;
switch (kind) { switch (kind) {
case UBRK_CHARACTER: case UBRK_CHARACTER:
result = BreakIterator::buildInstance(loc, "grapheme", kind, status); result = BreakIterator::buildInstance(loc, "grapheme", status);
break; break;
case UBRK_WORD: case UBRK_WORD:
result = BreakIterator::buildInstance(loc, "word", kind, status); result = BreakIterator::buildInstance(loc, "word", status);
break; break;
case UBRK_LINE: case UBRK_LINE:
uprv_strcpy(lbType, "line"); uprv_strcpy(lbType, "line");
@ -429,10 +428,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
uprv_strcat(lbType, lbKeyValue); uprv_strcat(lbType, lbKeyValue);
} }
} }
result = BreakIterator::buildInstance(loc, lbType, kind, status); result = BreakIterator::buildInstance(loc, lbType, status);
break; break;
case UBRK_SENTENCE: case UBRK_SENTENCE:
result = BreakIterator::buildInstance(loc, "sentence", kind, status); result = BreakIterator::buildInstance(loc, "sentence", status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
{ {
char ssKeyValue[kKeyValueLenMax] = {0}; char ssKeyValue[kKeyValueLenMax] = {0};
@ -449,7 +448,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
#endif #endif
break; break;
case UBRK_TITLE: case UBRK_TITLE:
result = BreakIterator::buildInstance(loc, "title", kind, status); result = BreakIterator::buildInstance(loc, "title", status);
break; break;
default: default:
status = U_ILLEGAL_ARGUMENT_ERROR; status = U_ILLEGAL_ARGUMENT_ERROR;

View File

@ -37,9 +37,8 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
} }
UBool UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const { DictionaryBreakEngine::handles(UChar32 c) const {
return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes) return fSet.contains(c);
&& fSet.contains(c));
} }
int32_t int32_t

View File

@ -42,27 +42,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
UnicodeSet fSet; UnicodeSet fSet;
/**
* The set of break types handled by this engine
* @internal
*/
uint32_t fTypes;
/**
* <p>Default constructor.</p>
*
*/
DictionaryBreakEngine();
public: public:
/** /**
* <p>Constructor setting the break types handled.</p> * <p>Constructor </p>
*
* @param breakTypes A bitmap of types handled by the engine.
*/ */
DictionaryBreakEngine( uint32_t breakTypes ); DictionaryBreakEngine();
/** /**
* <p>Virtual destructor.</p> * <p>Virtual destructor.</p>
@ -78,7 +63,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* @return TRUE if this engine handles the particular character and break * @return TRUE if this engine handles the particular character and break
* type. * type.
*/ */
virtual UBool handles( UChar32 c, int32_t breakType ) const; virtual UBool handles(UChar32 c) const;
/** /**
* <p>Find any breaks within a run in the supplied text.</p> * <p>Find any breaks within a run in the supplied text.</p>
@ -88,14 +73,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* that starts from the first character in the range. * that starts from the first character in the range.
* @param startPos The start of the run within the supplied text. * @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text. * @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks vector of int32_t to receive the break positions * @param foundBreaks vector of int32_t to receive the break positions
* @return The number of breaks found. * @return The number of breaks found.
*/ */
virtual int32_t findBreaks( UText *text, virtual int32_t findBreaks( UText *text,
int32_t startPos, int32_t startPos,
int32_t endPos, int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const; UVector32 &foundBreaks ) const;
protected: protected:

View File

@ -217,7 +217,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
} }
BreakIterator::operator=(that); BreakIterator::operator=(that);
fBreakType = that.fBreakType;
if (fLanguageBreakEngines != NULL) { if (fLanguageBreakEngines != NULL) {
delete fLanguageBreakEngines; delete fLanguageBreakEngines;
fLanguageBreakEngines = NULL; // Just rebuild for now fLanguageBreakEngines = NULL; // Just rebuild for now
@ -278,11 +277,6 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
fRuleStatusIndex = 0; fRuleStatusIndex = 0;
fDone = false; fDone = false;
fDictionaryCharCount = 0; fDictionaryCharCount = 0;
fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
// dictionary behavior for Break Iterators that are
// built from rules. Even better would be the ability to
// declare the type in the rules.
fLanguageBreakEngines = NULL; fLanguageBreakEngines = NULL;
fUnhandledBreakEngine = NULL; fUnhandledBreakEngine = NULL;
fBreakCache = NULL; fBreakCache = NULL;
@ -1290,14 +1284,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
int32_t i = fLanguageBreakEngines->size(); int32_t i = fLanguageBreakEngines->size();
while (--i >= 0) { while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
if (lbe->handles(c, fBreakType)) { if (lbe->handles(c)) {
return lbe; return lbe;
} }
} }
// No existing dictionary took the character. See if a factory wants to // No existing dictionary took the character. See if a factory wants to
// give us a new LanguageBreakEngine for this character. // give us a new LanguageBreakEngine for this character.
lbe = getLanguageBreakEngineFromFactory(c, fBreakType); lbe = getLanguageBreakEngineFromFactory(c);
// If we got one, use it and push it on our stack. // If we got one, use it and push it on our stack.
if (lbe != NULL) { if (lbe != NULL) {
@ -1327,21 +1321,11 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
// Tell the reject engine about the character; at its discretion, it may // Tell the reject engine about the character; at its discretion, it may
// add more than just the one character. // add more than just the one character.
fUnhandledBreakEngine->handleCharacter(c, fBreakType); fUnhandledBreakEngine->handleCharacter(c);
return fUnhandledBreakEngine; return fUnhandledBreakEngine;
} }
/*int32_t RuleBasedBreakIterator::getBreakType() const {
return fBreakType;
}*/
void RuleBasedBreakIterator::setBreakType(int32_t type) {
fBreakType = type;
}
void RuleBasedBreakIterator::dumpCache() { void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache(); fBreakCache->dumpCache();
} }

View File

@ -168,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// Ask the language object if there are any breaks. It will add them to the cache and // Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one. // leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != NULL) { if (lbe != NULL) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks); foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, *fBreaks);
} }
// Reload the loop variables for the next go-round // Reload the loop variables for the next go-round

View File

@ -616,7 +616,7 @@ public:
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0; virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
private: private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status); static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status); static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status); static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

View File

@ -149,13 +149,6 @@ private:
*/ */
UnhandledEngine *fUnhandledBreakEngine; UnhandledEngine *fUnhandledBreakEngine;
/**
*
* The type of the break iterator, or -1 if it has not been set.
* @internal
*/
int32_t fBreakType;
//======================================================================= //=======================================================================
// constructors // constructors
//======================================================================= //=======================================================================
@ -645,12 +638,6 @@ private:
*/ */
void reset(void); void reset(void);
/**
* Set the type of the break iterator.
* @internal
*/
void setBreakType(int32_t type);
/** /**
* Common initialization function, used by constructors and bufferClone. * Common initialization function, used by constructors and bufferClone.
* @internal * @internal