ICU-10990 remove STL deps from FilteredBreakIterator implementation

X-SVN-Rev: 36008
This commit is contained in:
Steven R. Loomis 2014-07-08 00:30:21 +00:00
parent ff4dcfc31d
commit 68b3eed7cb
3 changed files with 158 additions and 65 deletions

View File

@ -417,7 +417,10 @@
* @internal * @internal
*/ */
#ifndef UCONFIG_NO_FILTERED_BREAK_ITERATION #ifndef UCONFIG_NO_FILTERED_BREAK_ITERATION
# define UCONFIG_NO_FILTERED_BREAK_ITERATION 1 # define UCONFIG_NO_FILTERED_BREAK_ITERATION 0
#endif #endif
#endif #endif

View File

@ -11,15 +11,107 @@
#include <unicode/ucharstriebuilder.h> #include <unicode/ucharstriebuilder.h>
#include <set>
#include <string>
#include <functional>
#include "uresimp.h" #include "uresimp.h"
#include "ubrkimpl.h" #include "ubrkimpl.h"
#include "uvector.h"
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN
using namespace std; #ifndef FB_DEBUG
#define FB_DEBUG 0
#endif
#if FB_DEBUG
#include <stdio.h>
static void _fb_trace(const char *m, const UnicodeString *s, UBool b, int32_t d, const char *f, int l) {
char buf[2048];
if(s) {
s->extract(0,s->length(),buf,2048);
} else {
strcpy(buf,"NULL");
}
fprintf(stderr,"%s:%d: %s. s='%s'(%p), b=%c, d=%d\n",
f, l, m, buf, (const void*)s, b?'T':'F',(int)d);
}
#define FB_TRACE(m,s,b,d) _fb_trace(m,s,b,d,__FILE__,__LINE__)
#else
#define FB_TRACE(m,s,b,d)
#endif
static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
const UnicodeString &a = *(const UnicodeString*)t1.pointer;
const UnicodeString &b = *(const UnicodeString*)t2.pointer;
return a.compare(b);
}
/**
* A UVector which implements a set of strings.
*/
class U_I18N_API UStringSet : public UVector {
public:
UStringSet(UErrorCode &status) : UVector(uprv_deleteUObject,
uhash_compareUnicodeString,
1,
status) {}
virtual ~UStringSet();
/**
* Is this UnicodeSet contained?
*/
inline UBool contains(const UnicodeString& s) {
return contains((void*) &s);
}
using UVector::contains;
/**
* Return the ith UnicodeString alias
*/
inline const UnicodeString* getStringAt(int32_t i) const {
return (const UnicodeString*)elementAt(i);
}
/**
* Adopt the UnicodeString if not already contained.
* Caller no longer owns the pointer in any case.
* @return true if adopted successfully, false otherwise (error, or else duplicate)
*/
inline UBool adopt(UnicodeString *str, UErrorCode &status) {
if(U_FAILURE(status) || contains(*str)) {
delete str;
return false;
} else {
sortedInsert(str, compareUnicodeString, status);
if(U_FAILURE(status)) {
delete str;
return false;
}
return true;
}
}
/**
* Add by value.
* @return true if successfully adopted.
*/
inline UBool add(const UnicodeString& str, UErrorCode &status) {
if(U_FAILURE(status)) return false;
UnicodeString *t = new UnicodeString(str);
if(t==NULL) {
status = U_MEMORY_ALLOCATION_ERROR; return false;
}
return adopt(t, status);
}
/**
* Remove this string.
* @return true if successfully removed, false otherwise (error, or else it wasn't there)
*/
inline UBool remove(const UnicodeString &s, UErrorCode &status) {
if(U_FAILURE(status)) return false;
return removeElement((void*) &s);
}
};
/**
* Virtual, won't be inlined
*/
UStringSet::~UStringSet() {}
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
static const int32_t kMATCH = (1<<1); //< exact match - skip this one. static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
@ -27,11 +119,11 @@ static const int32_t kSuppressInReverse = (1<<0);
static const int32_t kAddToForward = (1<<1); static const int32_t kAddToForward = (1<<1);
static const UChar kFULLSTOP = 0x002E; // '.' static const UChar kFULLSTOP = 0x002E; // '.'
class ULISentenceBreakIterator : public BreakIterator { class SimpleFilteredSentenceBreakIterator : public BreakIterator {
public: public:
ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status); SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
virtual ~ULISentenceBreakIterator() {} virtual ~SimpleFilteredSentenceBreakIterator() {}
ULISentenceBreakIterator(const ULISentenceBreakIterator& other); SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other);
private: private:
LocalPointer<BreakIterator> fDelegate; LocalPointer<BreakIterator> fDelegate;
LocalUTextPointer fText; LocalUTextPointer fText;
@ -48,7 +140,7 @@ public:
status = U_SAFECLONE_ALLOCATED_WARNING; status = U_SAFECLONE_ALLOCATED_WARNING;
return clone(); return clone();
} }
virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); } virtual BreakIterator* clone(void) const { return new SimpleFilteredSentenceBreakIterator(*this); }
virtual UClassID getDynamicClassID(void) const { return NULL; } virtual UClassID getDynamicClassID(void) const { return NULL; }
virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; } virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; }
@ -77,7 +169,7 @@ public:
}; };
ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other) SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(const SimpleFilteredSentenceBreakIterator& other)
: BreakIterator(other), fDelegate(other.fDelegate->clone()) : BreakIterator(other), fDelegate(other.fDelegate->clone())
{ {
/* /*
@ -92,7 +184,7 @@ ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterato
} }
ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) : SimpleFilteredSentenceBreakIterator::SimpleFilteredSentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status) :
BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)), BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE,status),adopt->getLocale(ULOC_ACTUAL_LOCALE,status)),
fDelegate(adopt), fDelegate(adopt),
fBackwardsTrie(backwards), fBackwardsTrie(backwards),
@ -101,7 +193,7 @@ ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt, UCharsT
// all set.. // all set..
} }
int32_t ULISentenceBreakIterator::next() { int32_t SimpleFilteredSentenceBreakIterator::next() {
int32_t n = fDelegate->next(); int32_t n = fDelegate->next();
if(n == UBRK_DONE || // at end or if(n == UBRK_DONE || // at end or
fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions fBackwardsTrie.isNull()) { // .. no backwards table loaded == no exceptions
@ -196,40 +288,32 @@ int32_t ULISentenceBreakIterator::next() {
return n; return n;
} }
U_NAMESPACE_END /**
* Concrete implementation of builder class.
#if 0 */
// Would improve performance - but, platform issues.
// for the 'set'
namespace std {
template <> struct hash<icu::UnicodeString> {
size_t operator()( const UnicodeString& str ) const {
return (size_t)str.hashCode();
}
};
}
#endif
U_NAMESPACE_BEGIN
class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder { class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
public: public:
virtual ~SimpleFilteredBreakIteratorBuilder(); virtual ~SimpleFilteredBreakIteratorBuilder();
SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status); SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
SimpleFilteredBreakIteratorBuilder(); SimpleFilteredBreakIteratorBuilder(UErrorCode &status);
virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status); virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status); virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status); virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
private: private:
set<UnicodeString> fSet; UStringSet fSet;
}; };
SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder()
{ {
} }
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(UErrorCode &status)
: fSet(status)
{
}
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status) SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status)
: fSet() : fSet(status)
{ {
if(U_SUCCESS(status)) { if(U_SUCCESS(status)) {
LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status)); LocalUResourceBundlePointer b(ures_open(U_ICUDATA_BRKITR, fromLocale.getBaseName(), &status));
@ -252,23 +336,20 @@ SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Loc
} }
} }
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder()
: fSet()
{
}
UBool UBool
SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status) SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{ {
if( U_FAILURE(status) ) return FALSE; UBool r = fSet.add(exception, status);
return fSet.insert(exception).second; FB_TRACE("suppressBreakAfter",&exception,r,0);
return r;
} }
UBool UBool
SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status) SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{ {
if( U_FAILURE(status) ) return FALSE; UBool r = fSet.remove(exception, status);
return ((fSet.erase(exception)) != 0); FB_TRACE("unsuppressBreakAfter",&exception,r,0);
return r;
} }
BreakIterator * BreakIterator *
@ -293,11 +374,19 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr
LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M." LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."
int n=0; int n=0;
for ( set<UnicodeString>::iterator i = fSet.begin(); for ( int32_t i = 0;
i != fSet.end(); i<fSet.size();
i++) { i++) {
const UnicodeString &abbr = *i; const UnicodeString *abbr = fSet.getStringAt(i);
ustrs[n] = abbr; if(abbr) {
FB_TRACE("build",abbr,TRUE,i);
ustrs[n] = *abbr; // copy by value
FB_TRACE("ustrs[n]",&ustrs[n],TRUE,i);
} else {
FB_TRACE("build",abbr,FALSE,i);
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
partials[n] = 0; // default: not partial partials[n] = 0; // default: not partial
n++; n++;
} }
@ -305,34 +394,37 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr
for(int i=0;i<subCount;i++) { for(int i=0;i<subCount;i++) {
int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations int nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
if(nn>-1 && (nn+1)!=ustrs[i].length()) { if(nn>-1 && (nn+1)!=ustrs[i].length()) {
//if(true) u_printf("Is a partial: /%S/\n", ustrs[i].getTerminatedBuffer()); FB_TRACE("partial",&ustrs[i],FALSE,i);
// is partial. // is partial.
// is it unique? // is it unique?
int sameAs = -1; int sameAs = -1;
for(int j=0;j<subCount;j++) { for(int j=0;j<subCount;j++) {
if(j==i) continue; if(j==i) continue;
if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) { if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
//if(true) u_printf("Prefix match: /%S/ to %d\n", ustrs[j].getTerminatedBuffer(), nn+1); FB_TRACE("prefix",&ustrs[j],FALSE,nn+1);
//UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn //UBool otherIsPartial = ((nn+1)!=ustrs[j].length()); // true if ustrs[j] doesn't end at nn
if(partials[j]==0) { // hasn't been processed yet if(partials[j]==0) { // hasn't been processed yet
partials[j] = kSuppressInReverse | kAddToForward; partials[j] = kSuppressInReverse | kAddToForward;
//if(true) u_printf("Suppressing: /%S/\n", ustrs[j].getTerminatedBuffer()); FB_TRACE("suppressing",&ustrs[j],FALSE,j);
} else if(partials[j] & kSuppressInReverse) { } else if(partials[j] & kSuppressInReverse) {
sameAs = j; // the other entry is already in the reverse table. sameAs = j; // the other entry is already in the reverse table.
} }
} }
} }
//if(debug2) u_printf("for partial /%S/ same=%d partials=%d\n", ustrs[i].getTerminatedBuffer(), sameAs, partials[i]); FB_TRACE("for partial same-",&ustrs[i],FALSE,sameAs);
FB_TRACE(" == partial #",&ustrs[i],FALSE,partials[i]);
UnicodeString prefix(ustrs[i], 0, nn+1); UnicodeString prefix(ustrs[i], 0, nn+1);
if(sameAs == -1 && partials[i] == 0) { if(sameAs == -1 && partials[i] == 0) {
// first one - add the prefix to the reverse table. // first one - add the prefix to the reverse table.
prefix.reverse(); prefix.reverse();
builder->add(prefix, kPARTIAL, status); builder->add(prefix, kPARTIAL, status);
revCount++; revCount++;
//if(debug2) u_printf("Added Partial: /%S/ from /%S/ status=%s\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer(), u_errorName(status)); FB_TRACE("Added partial",&prefix,FALSE, i);
FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
partials[i] = kSuppressInReverse | kAddToForward; partials[i] = kSuppressInReverse | kAddToForward;
} else { } else {
//if(debug2) u_printf(" // not adding partial for /%S/ from /%S/\n", prefix.getTerminatedBuffer(), ustrs[i].getTerminatedBuffer()); FB_TRACE("NOT adding partial",&prefix,FALSE, i);
FB_TRACE(u_errorName(status),&ustrs[i],FALSE,i);
} }
} }
} }
@ -341,9 +433,9 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr
ustrs[i].reverse(); ustrs[i].reverse();
builder->add(ustrs[i], kMATCH, status); builder->add(ustrs[i], kMATCH, status);
revCount++; revCount++;
//if(debug2) u_printf("Added: /%S/ status=%s\n", ustrs[i].getTerminatedBuffer(), u_errorName(status)); FB_TRACE(u_errorName(status), &ustrs[i], FALSE, i);
} else { } else {
//if(debug2) u_printf(" Adding fwd: /%S/\n", ustrs[i].getTerminatedBuffer()); FB_TRACE("Adding fwd",&ustrs[i], FALSE, i);
// an optimization would be to only add the portion after the '.' // an optimization would be to only add the portion after the '.'
// for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward, // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
@ -355,12 +447,12 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr
////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status)); ////if(debug2) u_printf("SUPPRESS- not Added(%d): /%S/ status=%s\n",partials[i], ustrs[i].getTerminatedBuffer(), u_errorName(status));
} }
} }
//if(debug) u_printf(" %s has %d abbrs.\n", fJSONSource.c_str(), subCount); FB_TRACE("AbbrCount",NULL,FALSE, subCount);
if(revCount>0) { if(revCount>0) {
backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status)); backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
if(U_FAILURE(status)) { if(U_FAILURE(status)) {
//printf("Error %s building backwards\n", u_errorName(status)); FB_TRACE(u_errorName(status),NULL,FALSE, -1);
return NULL; return NULL;
} }
} }
@ -368,16 +460,16 @@ SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UEr
if(fwdCount>0) { if(fwdCount>0) {
forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status)); forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
if(U_FAILURE(status)) { if(U_FAILURE(status)) {
//printf("Error %s building forwards\n", u_errorName(status)); FB_TRACE(u_errorName(status),NULL,FALSE, -1);
return NULL; return NULL;
} }
} }
return new ULISentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status); return new SimpleFilteredSentenceBreakIterator(adopt.orphan(), forwardsPartialTrie.orphan(), backwardsTrie.orphan(), status);
} }
// ----------- // ----------- Base class implementation
FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() { FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {
} }
@ -388,18 +480,16 @@ FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
FilteredBreakIteratorBuilder * FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) { FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
if(U_FAILURE(status)) return NULL; if(U_FAILURE(status)) return NULL;
LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status)); LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status));
if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR; if(U_SUCCESS(status) && !ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
return ret.orphan(); return ret.orphan();
} }
FilteredBreakIteratorBuilder * FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) { FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
if(U_FAILURE(status)) return NULL; if(U_FAILURE(status)) return NULL;
LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(status));
LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder()); if(U_SUCCESS(status) && !ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
if(!ret.isValid()) status = U_MEMORY_ALLOCATION_ERROR;
return ret.orphan(); return ret.orphan();
} }

View File

@ -10,7 +10,7 @@
#include "unicode/brkiter.h" #include "unicode/brkiter.h"
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING && !UCONFIG_NO_FILTERED_BREAK_ITERATION #if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_FILTERED_BREAK_ITERATION
U_NAMESPACE_BEGIN U_NAMESPACE_BEGIN