ICU-10688 branch, work in progress.

X-SVN-Rev: 40686
This commit is contained in:
Andy Heninger 2017-12-03 00:36:54 +00:00
parent c67d9d0a4a
commit ca7b62180e
9 changed files with 47 additions and 121 deletions

View File

@ -59,58 +59,47 @@ LanguageBreakFactory::~LanguageBreakFactory() {
******************************************************************
*/
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) {
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
fHandled[i] = 0;
}
UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) {
(void)status;
}
UnhandledEngine::~UnhandledEngine() {
for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) {
if (fHandled[i] != 0) {
delete fHandled[i];
}
}
delete fHandled;
fHandled = nullptr;
}
UBool
UnhandledEngine::handles(UChar32 c, int32_t breakType) const {
return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)
&& fHandled[breakType] != 0 && fHandled[breakType]->contains(c));
UnhandledEngine::handles(UChar32 c) const {
return fHandled && fHandled->contains(c);
}
int32_t
UnhandledEngine::findBreaks( UText *text,
int32_t /* startPos */,
int32_t endPos,
int32_t breakType,
UVector32 &/*foundBreaks*/ ) const {
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
UChar32 c = utext_current32(text);
while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
utext_next32(text); // TODO: recast loop to work with post-increment operations.
c = utext_current32(text);
}
return 0;
}
void
UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) {
if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) {
if (fHandled[breakType] == 0) {
fHandled[breakType] = new UnicodeSet();
if (fHandled[breakType] == 0) {
UnhandledEngine::handleCharacter(UChar32 c) {
if (fHandled == nullptr) {
fHandled = new UnicodeSet();
if (fHandled == nullptr) {
return;
}
}
if (!fHandled[breakType]->contains(c)) {
if (!fHandled->contains(c)) {
UErrorCode status = U_ZERO_ERROR;
// Apply the entire script of the character.
int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
}
}
}
/*
@ -138,7 +127,7 @@ U_NAMESPACE_BEGIN
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
const LanguageBreakEngine *lbe = NULL;
UErrorCode status = U_ZERO_ERROR;
@ -156,14 +145,14 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
int32_t i = fEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
if (lbe != NULL && lbe->handles(c, breakType)) {
if (lbe != NULL && lbe->handles(c)) {
return lbe;
}
}
}
// We didn't find an engine. Create one.
lbe = loadEngineFor(c, breakType);
lbe = loadEngineFor(c);
if (lbe != NULL) {
fEngines->push((void *)lbe, status);
}
@ -171,11 +160,11 @@ ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) {
}
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
UErrorCode status = U_ZERO_ERROR;
UScriptCode code = uscript_getScript(c, &status);
if (U_SUCCESS(status)) {
DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType);
DictionaryMatcher *m = loadDictionaryMatcherFor(code);
if (m != NULL) {
const LanguageBreakEngine *engine = NULL;
switch(code) {
@ -236,7 +225,7 @@ ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) {
}
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) {
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
UErrorCode status = U_ZERO_ERROR;
// open root from brkitr tree.
UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status);

View File

@ -54,11 +54,10 @@ class LanguageBreakEngine : public UMemory {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, int32_t breakType) const = 0;
virtual UBool handles(UChar32 c) const = 0;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -68,14 +67,12 @@ class LanguageBreakEngine : public UMemory {
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks A Vector of int32_t to receive the breaks.
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const = 0;
};
@ -125,11 +122,9 @@ class LanguageBreakFactory : public UMemory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType) = 0;
virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
};
@ -152,11 +147,11 @@ class UnhandledEngine : public LanguageBreakEngine {
private:
/**
* The sets of characters handled, for each break type
* The sets of characters handled.
* @internal
*/
UnicodeSet *fHandled[4];
UnicodeSet *fHandled;
public:
@ -176,11 +171,10 @@ class UnhandledEngine : public LanguageBreakEngine {
* a particular kind of break.</p>
*
* @param c A character which begins a run that the engine might handle
* @param breakType The type of text break which the caller wants to determine
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles(UChar32 c, int32_t breakType) const;
virtual UBool handles(UChar32 c) const;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -190,23 +184,20 @@ class UnhandledEngine : public LanguageBreakEngine {
* is capable of handling.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks An allocated C array of the breaks found, if any
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const;
/**
* <p>Tell the engine to handle a particular character and break type.</p>
*
* @param c A character which the engine should handle
* @param breakType The type of text break for which the engine should handle c
*/
virtual void handleCharacter(UChar32 c, int32_t breakType);
virtual void handleCharacter(UChar32 c);
};
@ -250,11 +241,9 @@ class ICULanguageBreakFactory : public LanguageBreakFactory {
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *getEngineFor(UChar32 c, int32_t breakType);
virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
protected:
/**
@ -263,21 +252,17 @@ protected:
*
* @param c A character that begins a run for which a LanguageBreakEngine is
* sought.
* @param breakType The kind of text break for which a LanguageBreakEngine is
* sought.
* @return A LanguageBreakEngine with the desired characteristics, or 0.
*/
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c, int32_t breakType);
virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
/**
* <p>Create a DictionaryMatcher for the specified script and break type.</p>
* @param script An ISO 15924 script code that identifies the dictionary to be
* created.
* @param breakType The kind of text break for which a dictionary is
* sought.
* @return A DictionaryMatcher with the desired characteristics, or NULL.
*/
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script, int32_t breakType);
virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
};
U_NAMESPACE_END

View File

@ -52,7 +52,7 @@ U_NAMESPACE_BEGIN
// -------------------------------------
BreakIterator*
BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode &status)
BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &status)
{
char fnbuff[256];
char ext[4]={'\0'};
@ -121,7 +121,6 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, int32_t kind,
U_LOCALE_BASED(locBased, *(BreakIterator*)result);
locBased.setLocaleIDs(ures_getLocaleByType(b, ULOC_VALID_LOCALE, &status),
actualLocale.data());
result->setBreakType(kind);
}
ures_close(b);
@ -413,10 +412,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
BreakIterator *result = NULL;
switch (kind) {
case UBRK_CHARACTER:
result = BreakIterator::buildInstance(loc, "grapheme", kind, status);
result = BreakIterator::buildInstance(loc, "grapheme", status);
break;
case UBRK_WORD:
result = BreakIterator::buildInstance(loc, "word", kind, status);
result = BreakIterator::buildInstance(loc, "word", status);
break;
case UBRK_LINE:
uprv_strcpy(lbType, "line");
@ -429,10 +428,10 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
uprv_strcat(lbType, lbKeyValue);
}
}
result = BreakIterator::buildInstance(loc, lbType, kind, status);
result = BreakIterator::buildInstance(loc, lbType, status);
break;
case UBRK_SENTENCE:
result = BreakIterator::buildInstance(loc, "sentence", kind, status);
result = BreakIterator::buildInstance(loc, "sentence", status);
#if !UCONFIG_NO_FILTERED_BREAK_ITERATION
{
char ssKeyValue[kKeyValueLenMax] = {0};
@ -449,7 +448,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
#endif
break;
case UBRK_TITLE:
result = BreakIterator::buildInstance(loc, "title", kind, status);
result = BreakIterator::buildInstance(loc, "title", status);
break;
default:
status = U_ILLEGAL_ARGUMENT_ERROR;

View File

@ -37,9 +37,8 @@ DictionaryBreakEngine::~DictionaryBreakEngine() {
}
UBool
DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
&& fSet.contains(c));
DictionaryBreakEngine::handles(UChar32 c) const {
return fSet.contains(c);
}
int32_t

View File

@ -42,27 +42,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
UnicodeSet fSet;
/**
* The set of break types handled by this engine
* @internal
*/
uint32_t fTypes;
/**
* <p>Default constructor.</p>
*
*/
DictionaryBreakEngine();
public:
/**
* <p>Constructor setting the break types handled.</p>
*
* @param breakTypes A bitmap of types handled by the engine.
* <p>Constructor </p>
*/
DictionaryBreakEngine( uint32_t breakTypes );
DictionaryBreakEngine();
/**
* <p>Virtual destructor.</p>
@ -78,7 +63,7 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* @return TRUE if this engine handles the particular character and break
* type.
*/
virtual UBool handles( UChar32 c, int32_t breakType ) const;
virtual UBool handles(UChar32 c) const;
/**
* <p>Find any breaks within a run in the supplied text.</p>
@ -88,14 +73,12 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
* that starts from the first character in the range.
* @param startPos The start of the run within the supplied text.
* @param endPos The end of the run within the supplied text.
* @param breakType The type of break desired, or -1.
* @param foundBreaks vector of int32_t to receive the break positions
* @return The number of breaks found.
*/
virtual int32_t findBreaks( UText *text,
int32_t startPos,
int32_t endPos,
int32_t breakType,
UVector32 &foundBreaks ) const;
protected:

View File

@ -217,7 +217,6 @@ RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) {
}
BreakIterator::operator=(that);
fBreakType = that.fBreakType;
if (fLanguageBreakEngines != NULL) {
delete fLanguageBreakEngines;
fLanguageBreakEngines = NULL; // Just rebuild for now
@ -278,11 +277,6 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
fRuleStatusIndex = 0;
fDone = false;
fDictionaryCharCount = 0;
fBreakType = UBRK_WORD; // Defaulting BreakType to word gives reasonable
// dictionary behavior for Break Iterators that are
// built from rules. Even better would be the ability to
// declare the type in the rules.
fLanguageBreakEngines = NULL;
fUnhandledBreakEngine = NULL;
fBreakCache = NULL;
@ -1290,14 +1284,14 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
int32_t i = fLanguageBreakEngines->size();
while (--i >= 0) {
lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i));
if (lbe->handles(c, fBreakType)) {
if (lbe->handles(c)) {
return lbe;
}
}
// No existing dictionary took the character. See if a factory wants to
// give us a new LanguageBreakEngine for this character.
lbe = getLanguageBreakEngineFromFactory(c, fBreakType);
lbe = getLanguageBreakEngineFromFactory(c);
// If we got one, use it and push it on our stack.
if (lbe != NULL) {
@ -1327,21 +1321,11 @@ RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) {
// Tell the reject engine about the character; at its discretion, it may
// add more than just the one character.
fUnhandledBreakEngine->handleCharacter(c, fBreakType);
fUnhandledBreakEngine->handleCharacter(c);
return fUnhandledBreakEngine;
}
/*int32_t RuleBasedBreakIterator::getBreakType() const {
return fBreakType;
}*/
void RuleBasedBreakIterator::setBreakType(int32_t type) {
fBreakType = type;
}
void RuleBasedBreakIterator::dumpCache() {
fBreakCache->dumpCache();
}

View File

@ -168,7 +168,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
// Ask the language object if there are any breaks. It will add them to the cache and
// leave the text pointer on the other side of its range, ready to search for the next one.
if (lbe != NULL) {
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBI->fBreakType, *fBreaks);
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, *fBreaks);
}
// Reload the loop variables for the next go-round

View File

@ -616,7 +616,7 @@ public:
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) = 0;
private:
static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
static BreakIterator* buildInstance(const Locale& loc, const char *type, UErrorCode& status);
static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);

View File

@ -149,13 +149,6 @@ private:
*/
UnhandledEngine *fUnhandledBreakEngine;
/**
*
* The type of the break iterator, or -1 if it has not been set.
* @internal
*/
int32_t fBreakType;
//=======================================================================
// constructors
//=======================================================================
@ -645,12 +638,6 @@ private:
*/
void reset(void);
/**
* Set the type of the break iterator.
* @internal
*/
void setBreakType(int32_t type);
/**
* Common initialization function, used by constructors and bufferClone.
* @internal