ICU-10286 Check in filtered break behavior. Does not load bundles yet. Also fixes to locbased and brkiter ( no way for subclasses to set locale id )
X-SVN-Rev: 35357
This commit is contained in:
parent
bf0d5601c2
commit
bbe5a9e0d8
@ -1,10 +1,10 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1997-2013, International Business Machines Corporation and
|
||||
* Copyright (C) 1997-2014, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
*
|
||||
* File TXTBDRY.CPP
|
||||
* File brkiter.cpp
|
||||
*
|
||||
* Modification History:
|
||||
*
|
||||
@ -461,6 +461,11 @@ int32_t BreakIterator::getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UE
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
 * Protected constructor for subclasses that need to record locale
 * metadata on construction.
 *
 * @param valid  locale stored as this iterator's "valid" locale
 * @param actual locale stored as this iterator's "actual" locale
 */
BreakIterator::BreakIterator(const Locale& valid, const Locale& actual)
{
  // U_LOCALE_BASED declares a helper object (named by its first argument)
  // for the object given as its second argument.
  U_LOCALE_BASED(localeInfo, (*this));
  localeInfo.setLocaleIDs(valid, actual);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2004, International Business Machines
|
||||
* Copyright (c) 2004-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
@ -43,4 +43,9 @@ void LocaleBased::setLocaleIDs(const char* validID, const char* actualID) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Set the valid and actual locale IDs from Locale objects.
 *
 * The previous implementation copied Locale::getName() into the wrapped
 * buffers with an unbounded string copy. This version forwards to the
 * const char* overload so that both entry points share a single copy
 * routine.
 * NOTE(review): this assumes the const char* overload performs a
 * length-limited, always-terminated copy -- confirm against its body.
 *
 * @param validID  locale whose name becomes the valid locale ID
 * @param actualID locale whose name becomes the actual locale ID
 */
void LocaleBased::setLocaleIDs(const Locale& validID, const Locale& actualID) {
  setLocaleIDs(validID.getName(), actualID.getName());
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (c) 2004, International Business Machines
|
||||
* Copyright (c) 2004-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Author: Alan Liu
|
||||
@ -75,6 +75,14 @@ class U_COMMON_API LocaleBased : public UMemory {
|
||||
*/
|
||||
void setLocaleIDs(const char* valid, const char* actual);
|
||||
|
||||
/**
|
||||
* Set the locale meta-data for the service object wrapped by this
|
||||
* object.
|
||||
* @param valid the ID of the valid locale
|
||||
* @param actual the ID of the actual locale
|
||||
*/
|
||||
void setLocaleIDs(const Locale& valid, const Locale& actual);
|
||||
|
||||
private:
|
||||
|
||||
char* valid;
|
||||
|
@ -623,7 +623,8 @@ protected:
|
||||
BreakIterator();
|
||||
/** @internal */
|
||||
BreakIterator (const BreakIterator &other) : UObject(other) {}
|
||||
|
||||
/** @internal */
|
||||
BreakIterator (const Locale& valid, const Locale& actual);
|
||||
private:
|
||||
|
||||
/** @internal */
|
||||
|
@ -7,8 +7,356 @@
|
||||
|
||||
#include "unicode/filteredbrk.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
|
||||
|
||||
#include <unicode/ucharstriebuilder.h>
|
||||
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <set>
|
||||
#include <string>
|
||||
#include <functional>
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
using namespace std;
|
||||
|
||||
static const UBool debug = FALSE;
|
||||
static const int32_t kPARTIAL = (1<<0); //< partial - need to run through forward trie
|
||||
static const int32_t kMATCH = (1<<1); //< exact match - skip this one.
|
||||
static const int32_t kSuppressInReverse = (1<<0);
|
||||
static const int32_t kAddToForward = (1<<1);
|
||||
static const UChar kFULLSTOP = 0x002E; // '.'
|
||||
|
||||
class ULISentenceBreakIterator : public BreakIterator {
|
||||
public:
|
||||
ULISentenceBreakIterator(BreakIterator *adopt, UCharsTrie *forwards, UCharsTrie *backwards, UErrorCode &status);
|
||||
virtual ~ULISentenceBreakIterator() {}
|
||||
ULISentenceBreakIterator(const ULISentenceBreakIterator& other);
|
||||
private:
|
||||
LocalPointer<BreakIterator> fDelegate;
|
||||
LocalUTextPointer fText;
|
||||
LocalPointer<UCharsTrie> fBackwardsTrie; // i.e. ".srM" for Mrs.
|
||||
LocalPointer<UCharsTrie> fForwardsPartialTrie; // Has ".a" for "a.M."
|
||||
|
||||
/* -- subclass interface -- */
|
||||
public:
|
||||
/* -- cloning and other subclass stuff -- */
|
||||
virtual BreakIterator * createBufferClone(void */*stackBuffer*/,
|
||||
int32_t &/*BufferSize*/,
|
||||
UErrorCode &status) {
|
||||
// for now - always deep clone
|
||||
status = U_SAFECLONE_ALLOCATED_WARNING;
|
||||
return clone();
|
||||
}
|
||||
virtual BreakIterator* clone(void) const { return new ULISentenceBreakIterator(*this); }
|
||||
virtual UClassID getDynamicClassID(void) const { return NULL; }
|
||||
virtual UBool operator==(const BreakIterator& o) const { if(*this==o) return true; return false; }
|
||||
|
||||
/* -- text modifying -- */
|
||||
virtual void setText(UText *text, UErrorCode &status) { fDelegate->setText(text,status); }
|
||||
virtual BreakIterator &refreshInputText(UText *input, UErrorCode &status) { fDelegate->refreshInputText(input,status); return *this; }
|
||||
virtual void adoptText(CharacterIterator* it) { fDelegate->adoptText(it); }
|
||||
virtual void setText(const UnicodeString &text) { fDelegate->setText(text); }
|
||||
|
||||
/* -- other functions that are just delegated -- */
|
||||
virtual UText *getUText(UText *fillIn, UErrorCode &status) const { return fDelegate->getUText(fillIn,status); }
|
||||
virtual CharacterIterator& getText(void) const { return fDelegate->getText(); }
|
||||
|
||||
/* -- ITERATION -- */
|
||||
virtual int32_t first(void) { return fDelegate->first(); }
|
||||
virtual int32_t preceding(int32_t offset) { return fDelegate->preceding(offset); }
|
||||
virtual int32_t previous(void) { return fDelegate->previous(); }
|
||||
virtual UBool isBoundary(int32_t offset) { return fDelegate->isBoundary(offset); }
|
||||
virtual int32_t current(void) const { return fDelegate->current(); }
|
||||
|
||||
virtual int32_t next(void);
|
||||
|
||||
virtual int32_t next(int32_t n) { return fDelegate->next(n); }
|
||||
virtual int32_t following(int32_t offset) { return fDelegate->following(offset); }
|
||||
virtual int32_t last(void) { return fDelegate->last(); }
|
||||
|
||||
};
|
||||
|
||||
/**
 * Copy constructor: clones the delegate iterator of `other`.
 *
 * The two tries are NOT copied, so they stay default-constructed in the
 * new object.
 */
ULISentenceBreakIterator::ULISentenceBreakIterator(const ULISentenceBreakIterator& other)
  : BreakIterator(other),
    fDelegate(other.fDelegate->clone())
{
  // TODO: not able to clone Tries. Should be a refcounted hidden master instead.
  //
  // Intended logic once that is possible:
  //   if(other.fBackwardsTrie.isValid()) {
  //     fBackwardsTrie.adoptInstead(other.fBackwardsTrie->clone());
  //   }
  //   if(other.fForwardsPartialTrie.isValid()) {
  //     fForwardsPartialTrie.adoptInstead(other.fForwardsPartialTrie->clone());
  //   }
}
|
||||
|
||||
|
||||
/**
 * Builds a filtering iterator around `adopt`.
 *
 * The valid and actual locales reported by `adopt` are passed to the
 * BreakIterator base constructor. All three pointers are adopted.
 *
 * @param adopt     delegate iterator (dereferenced here, so must not be NULL)
 * @param forwards  forward trie for partial matches
 * @param backwards reversed-exception trie
 * @param status    ICU error code, passed to the getLocale() calls
 */
ULISentenceBreakIterator::ULISentenceBreakIterator(BreakIterator *adopt,
                                                   UCharsTrie *forwards,
                                                   UCharsTrie *backwards,
                                                   UErrorCode &status)
  : BreakIterator(adopt->getLocale(ULOC_VALID_LOCALE, status),
                  adopt->getLocale(ULOC_ACTUAL_LOCALE, status)),
    fDelegate(adopt),
    fBackwardsTrie(backwards),
    fForwardsPartialTrie(forwards)
{
  // Nothing further to do: every member is set up in the initializer list.
}
|
||||
|
||||
/**
 * Advance to the next break that is not suppressed by an exception.
 *
 * For each break `n` reported by the delegate, the text before `n` is fed
 * backwards, code point by code point, into the backwards trie:
 *  - a kMATCH value means the break follows a listed exception, so it is
 *    skipped and the next delegate break is examined;
 *  - a kPARTIAL value means only a prefix matched; the text is then run
 *    forward from the matched position through the forward trie, and the
 *    break is skipped only if that produces a match;
 *  - anything else returns `n` unchanged.
 *
 * @return the next unsuppressed break position, or UBRK_DONE.
 */
int32_t ULISentenceBreakIterator::next() {
  int32_t n = fDelegate->next();
  if(n == UBRK_DONE ||            // at end or
     fBackwardsTrie.isNull()) {   // .. no backwards table loaded == no exceptions
    return n;
  }

  // OK, do we need to break here?
  UErrorCode status = U_ZERO_ERROR;
  // refresh text
  fText.adoptInstead(fDelegate->getUText(fText.orphan(), status));
  if(U_FAILURE(status) || fText.isNull()) {
    return n; // text cannot be inspected: report the delegate's break unfiltered
  }
  UText *ut = fText.getAlias();

  do { // outer loop runs once per underlying break (from fDelegate).
    // loops while 'n' points to an exception.
    utext_setNativeIndex(ut, n); // from n..
    fBackwardsTrie->reset();

    // Assume a space is following the '.' (so we handle the case: "Mr. /Brown").
    // If the code point before n is a space it stays consumed; otherwise step
    // forward again so the backwards scan starts right at n.
    // TODO: skip a class of chars here??  TODO only do this the 1st time?
    UChar32 uch = utext_previous32(ut);
    if(uch != (UChar32)0x0020) {
      uch = utext_next32(ut);
    }

    // Start from "no value yet". The previous seed (INTERMEDIATE_VALUE)
    // satisfied USTRINGTRIE_MATCHES when the start of text was reached before
    // any trie step, causing getValue() to be read from a freshly reset trie.
    UStringTrieResult r = USTRINGTRIE_NO_VALUE;

    int32_t bestPosn = -1;
    int32_t bestValue = -1;

    while((uch=utext_previous32(ut))!=U_SENTINEL &&                      // more to consume backwards and..
          USTRINGTRIE_HAS_NEXT(r=fBackwardsTrie->nextForCodePoint(uch))) { // more in the trie
      if(USTRINGTRIE_HAS_VALUE(r)) { // remember the best match so far
        bestPosn = (int32_t)utext_getNativeIndex(ut);
        bestValue = fBackwardsTrie->getValue();
      }
    }

    if(USTRINGTRIE_MATCHES(r)) { // the last trie step itself carried a value
      bestValue = fBackwardsTrie->getValue();
      bestPosn = (int32_t)utext_getNativeIndex(ut);
    }

    if(bestPosn < 0) {
      return n; // No match - so exit. Not an exception.
    }

    if(bestValue == kMATCH) {
      // exact match: fall through to skip this break.
    } else if(bestValue == kPARTIAL
              && fForwardsPartialTrie.isValid()) { // make sure there's a forward trie
      // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
      // to see if it matches something going forward.
      fForwardsPartialTrie->reset();
      UStringTrieResult rfwd = USTRINGTRIE_NO_VALUE;
      utext_setNativeIndex(ut, bestPosn); // hope that's close ..
      while((uch=utext_next32(ut))!=U_SENTINEL &&
            USTRINGTRIE_HAS_NEXT(rfwd=fForwardsPartialTrie->nextForCodePoint(uch))) {
        // keep feeding the forward trie
      }
      if(!USTRINGTRIE_MATCHES(rfwd)) {
        // no match (no exception) - return the 'underlying' break
        return n;
      }
      // only full matches here, nothing to check: fall through to skip.
    } else {
      return n; // internal error and/or no forwards trie
    }

    // Skip this one. Find the next lower-level break and see whether it is
    // another exception.
    n = fDelegate->next();
  } while(n != UBRK_DONE);
  return n;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
// for the 'set'
|
||||
namespace std {
|
||||
template <> struct hash<icu::UnicodeString> {
|
||||
size_t operator()( const UnicodeString& str ) const {
|
||||
return (size_t)str.hashCode();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class SimpleFilteredBreakIteratorBuilder : public FilteredBreakIteratorBuilder {
|
||||
public:
|
||||
virtual ~SimpleFilteredBreakIteratorBuilder();
|
||||
SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale, UErrorCode &status);
|
||||
SimpleFilteredBreakIteratorBuilder();
|
||||
virtual UBool suppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
|
||||
virtual UBool unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status);
|
||||
virtual BreakIterator *build(BreakIterator* adoptBreakIterator, UErrorCode& status);
|
||||
private:
|
||||
set<UnicodeString> fSet;
|
||||
};
|
||||
|
||||
/** Destructor; no explicit cleanup is performed here. */
SimpleFilteredBreakIteratorBuilder::~SimpleFilteredBreakIteratorBuilder() {}
|
||||
|
||||
/**
 * Locale-based constructor. Loading exception data is not implemented:
 * the set is left empty and status is unconditionally set to
 * U_UNSUPPORTED_ERROR.
 *
 * @param fromLocale currently unused
 * @param status     always set to U_UNSUPPORTED_ERROR
 */
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder(const Locale &fromLocale,
                                                                       UErrorCode &status)
{
  // fSet is default-constructed (empty).
  // TODO: load, set
  status = U_UNSUPPORTED_ERROR;
}
|
||||
|
||||
/** Creates a builder whose exception set starts out empty. */
SimpleFilteredBreakIteratorBuilder::SimpleFilteredBreakIteratorBuilder() {
  // fSet is default-constructed (empty).
}
|
||||
|
||||
/**
 * Registers `exception` in the set.
 *
 * @param exception string to add
 * @param status    not read or written by this implementation
 * @return TRUE if the string was newly inserted, FALSE if it was already present
 */
UBool
SimpleFilteredBreakIteratorBuilder::suppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
  // insert() reports through .second whether a new element was created.
  const bool wasInserted = fSet.insert(exception).second;
  return wasInserted;
}
|
||||
|
||||
/**
 * Removes `exception` from the set.
 *
 * @param exception string to remove
 * @param status    not read or written by this implementation
 * @return TRUE if the string was present and has been removed, FALSE otherwise
 */
UBool
SimpleFilteredBreakIteratorBuilder::unsuppressBreakAfter(const UnicodeString& exception, UErrorCode& status)
{
  // erase(key) returns the number of elements removed.
  const bool wasPresent = (fSet.erase(exception) != 0);
  return wasPresent;
}
|
||||
/**
 * Build a filtering BreakIterator around `adoptBreakIterator` from the
 * current exception set.
 *
 * Exceptions are split over two tries:
 *  - the backwards trie holds reversed strings: whole exceptions with value
 *    kMATCH, and for exceptions with a '.' before their end, the reversed
 *    prefix up to and including the first '.' with value kPARTIAL;
 *  - the forwards trie holds, un-reversed, every exception that was flagged
 *    during the partial pass.
 * A trie is only built when at least one string was added to it.
 *
 * @param adoptBreakIterator iterator to wrap; always adopted (deleted on failure)
 * @param status             ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a
 *                           NULL iterator and U_MEMORY_ALLOCATION_ERROR when an
 *                           allocation fails.
 * @return the new iterator (caller owns), or NULL on failure
 */
BreakIterator *
SimpleFilteredBreakIteratorBuilder::build(BreakIterator* adoptBreakIterator, UErrorCode& status) {
  LocalPointer<BreakIterator> adopt(adoptBreakIterator); // owned from here on, even on early return

  if(U_FAILURE(status)) {
    return NULL;
  }
  if(adopt.isNull()) {
    // The wrapper's constructor dereferences the delegate.
    status = U_ILLEGAL_ARGUMENT_ERROR;
    return NULL;
  }

  LocalPointer<UCharsTrieBuilder> builder(new UCharsTrieBuilder(status));  // backwards
  LocalPointer<UCharsTrieBuilder> builder2(new UCharsTrieBuilder(status)); // forwards

  int32_t revCount = 0;
  int32_t fwdCount = 0;

  const int32_t subCount = (int32_t)fSet.size();
  LocalArray<UnicodeString> ustrs(new UnicodeString[subCount]);
  LocalArray<int> partials(new int[subCount]);

  if(builder.isNull() || builder2.isNull() || ustrs.isNull() || partials.isNull()) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  if(U_FAILURE(status)) {
    return NULL; // a trie builder could not be constructed
  }

  LocalPointer<UCharsTrie> backwardsTrie;       // i.e. ".srM" for Mrs.
  LocalPointer<UCharsTrie> forwardsPartialTrie; // Has ".a" for "a.M."

  // Copy the set into an indexable array and clear the per-string flags.
  int32_t n = 0;
  for(std::set<UnicodeString>::const_iterator it = fSet.begin(); it != fSet.end(); ++it) {
    ustrs[n] = *it;
    partials[n] = 0; // default: not partial
    n++;
  }

  // first pass - find partials.
  for(int32_t i=0;i<subCount;i++) {
    const int32_t nn = ustrs[i].indexOf(kFULLSTOP); // TODO: non-'.' abbreviations
    if(nn>-1 && (nn+1)!=ustrs[i].length()) {
      // is partial: there is text after the first '.'.
      // is it unique?
      int32_t sameAs = -1;
      for(int32_t j=0;j<subCount;j++) {
        if(j==i) continue;
        if(ustrs[i].compare(0,nn+1,ustrs[j],0,nn+1)==0) {
          // ustrs[j] shares the prefix through the first '.'
          if(partials[j]==0) { // hasn't been processed yet
            partials[j] = kSuppressInReverse | kAddToForward;
          } else if(partials[j] & kSuppressInReverse) {
            sameAs = j; // the other entry is already in the reverse table.
          }
        }
      }
      if(sameAs == -1 && partials[i] == 0) {
        // first one - add the prefix to the reverse table.
        UnicodeString prefix(ustrs[i], 0, nn+1);
        prefix.reverse();
        builder->add(prefix, kPARTIAL, status);
        revCount++;
        partials[i] = kSuppressInReverse | kAddToForward;
      }
    }
  }

  // second pass - distribute the strings over the two tries.
  for(int32_t i=0;i<subCount;i++) {
    if(partials[i]==0) {
      ustrs[i].reverse();
      builder->add(ustrs[i], kMATCH, status);
      revCount++;
    } else {
      // an optimization would be to only add the portion after the '.'
      // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the forward,
      // instead of "Ph.D." since we already know the "Ph." part is a match.
      // would need the trie to be able to hold 0-length strings, though.
      builder2->add(ustrs[i], kMATCH, status); // forward
      fwdCount++;
    }
  }

  if(revCount>0) {
    backwardsTrie.adoptInstead(builder->build(USTRINGTRIE_BUILD_FAST, status));
    if(U_FAILURE(status)) {
      return NULL;
    }
  }

  if(fwdCount>0) {
    forwardsPartialTrie.adoptInstead(builder2->build(USTRINGTRIE_BUILD_FAST, status));
    if(U_FAILURE(status)) {
      return NULL;
    }
  }

  // Hand over aliases first and release ownership only once the wrapper
  // exists, so nothing leaks if its allocation fails.
  ULISentenceBreakIterator *result =
      new ULISentenceBreakIterator(adopt.getAlias(),
                                   forwardsPartialTrie.getAlias(),
                                   backwardsTrie.getAlias(),
                                   status);
  if(result == NULL) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  adopt.orphan();
  forwardsPartialTrie.orphan();
  backwardsTrie.orphan();
  return result;
}
|
||||
|
||||
|
||||
// -----------
|
||||
|
||||
/** Base-class constructor; performs no work. */
FilteredBreakIteratorBuilder::FilteredBreakIteratorBuilder() {}
|
||||
|
||||
@ -16,18 +364,23 @@ FilteredBreakIteratorBuilder::~FilteredBreakIteratorBuilder() {
|
||||
}
|
||||
|
||||
/**
 * Create a builder for the given locale.
 *
 * @param where  locale passed to the builder's constructor
 * @param status ICU error code. Nothing is created if it already holds a
 *               failure. Set to U_MEMORY_ALLOCATION_ERROR if allocation fails.
 * @return a new builder owned by the caller, or NULL whenever status holds a
 *         failure on return. A partially constructed builder is never handed
 *         out together with an error code.
 */
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(const Locale& where, UErrorCode& status) {
  if(U_FAILURE(status)) return NULL;

  LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder(where, status));
  if(ret.isNull()) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  if(U_FAILURE(status)) {
    return NULL; // the constructor reported an error; `ret` deletes the object
  }
  return ret.orphan();
}
|
||||
|
||||
|
||||
/**
 * Create an empty builder (no exceptions registered).
 *
 * @param status ICU error code. Nothing is created if it already holds a
 *               failure. Set to U_MEMORY_ALLOCATION_ERROR if allocation fails.
 * @return a new builder owned by the caller, or NULL on failure
 */
FilteredBreakIteratorBuilder *
FilteredBreakIteratorBuilder::createInstance(UErrorCode& status) {
  if(U_FAILURE(status)) return NULL;

  LocalPointer<FilteredBreakIteratorBuilder> ret(new SimpleFilteredBreakIteratorBuilder());
  if(ret.isNull()) {
    status = U_MEMORY_ALLOCATION_ERROR;
    return NULL;
  }
  return ret.orphan();
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif //#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
|
||||
|
@ -10,7 +10,7 @@
|
||||
|
||||
#include "unicode/brkiter.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/********************************************************************
|
||||
* Copyright (c) 1999-2013, International Business Machines
|
||||
* Copyright (c) 1999-2014, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************
|
||||
* Date Name Description
|
||||
@ -23,7 +23,9 @@
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
|
||||
#include "unicode/filteredbrk.h"
|
||||
#endif
|
||||
/**
|
||||
* API Test the RuleBasedBreakIterator class
|
||||
*/
|
||||
@ -643,8 +645,8 @@ void RBBIAPITest::TestRuleStatus() {
|
||||
//no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
|
||||
// changed UBRK_WORD_KANA to UBRK_WORD_IDEO
|
||||
u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
|
||||
// 012345678901234567 8 9 0
|
||||
// Katakana
|
||||
// 012345678901234567 8 9 0
|
||||
// Katakana
|
||||
str, 30);
|
||||
UnicodeString testString1(str);
|
||||
int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
|
||||
@ -878,7 +880,7 @@ void RBBIAPITest::TestRegistration() {
|
||||
BreakIterator* ja_char = BreakIterator::createCharacterInstance("ja_JP", status);
|
||||
BreakIterator* root_word = BreakIterator::createWordInstance("", status);
|
||||
BreakIterator* root_char = BreakIterator::createCharacterInstance("", status);
|
||||
|
||||
|
||||
if (status == U_MISSING_RESOURCE_ERROR || status == U_FILE_ACCESS_ERROR) {
|
||||
dataerrln("Error creating instances of break interactors - %s", u_errorName(status));
|
||||
|
||||
@ -886,7 +888,7 @@ void RBBIAPITest::TestRegistration() {
|
||||
delete ja_char;
|
||||
delete root_word;
|
||||
delete root_char;
|
||||
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1057,7 +1059,7 @@ void RBBIAPITest::TestRoundtripRules() {
|
||||
|
||||
// Try out the RuleBasedBreakIterator constructors that take RBBIDataHeader*
|
||||
// (these are protected so we access them via a local class RBBIWithProtectedFunctions).
|
||||
// This is just a sanity check, not a thorough test (e.g. we don't check that the
|
||||
// This is just a sanity check, not a thorough test (e.g. we don't check that the
|
||||
// first delete actually frees rulesCopy).
|
||||
void RBBIAPITest::TestCreateFromRBBIData() {
|
||||
// Get some handy RBBIData
|
||||
@ -1083,7 +1085,7 @@ void RBBIAPITest::TestCreateFromRBBIData() {
|
||||
uprv_free( rulesCopy );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Now try the non-adopting constructor
|
||||
brkItr = new RBBIWithProtectedFunctions(builtRules, RBBIWithProtectedFunctions::kDontAdopt, status);
|
||||
if ( U_SUCCESS(status) ) {
|
||||
@ -1168,7 +1170,7 @@ void RBBIAPITest::TestRefreshInputText() {
|
||||
TEST_ASSERT(7 == bi->next());
|
||||
TEST_ASSERT(8 == bi->next());
|
||||
TEST_ASSERT(UBRK_DONE == bi->next());
|
||||
|
||||
|
||||
utext_close(&ut1);
|
||||
utext_close(&ut2);
|
||||
}
|
||||
@ -1176,6 +1178,142 @@ void RBBIAPITest::TestRefreshInputText() {
|
||||
|
||||
}
|
||||
|
||||
/**
 * Debug helper: logs `ustr` with a pilcrow inserted at every break that
 * `brk` reports from its current position, followed by the list of break
 * offsets.
 *
 * @param brk  iterator to drain with next(); it is not reset here
 * @param ustr the text the iterator is assumed to be set on -- the helper
 *             does not check this
 * @param it   test object used for logging and error reporting
 */
static void prtbrks(BreakIterator* brk, const UnicodeString &ustr, IntlTest &it) {
  static const UChar PILCROW=0x00B6, CHSTR=0x3010, CHEND=0x3011; // lenticular brackets
  it.logln(UnicodeString("String:'")+ustr+UnicodeString("'"));

  // A string of length L can have at most L breaks after position 0.
  const int32_t capacity = ustr.length();
  int32_t *pos = new int32_t[capacity];
  int32_t posCount = 0;

  // calculate breaks up front, so we can print out
  // sans any debugging
  for(int32_t n = brk->next(); n != UBRK_DONE; n = brk->next()) {
    if(posCount >= capacity) {
      // Check before storing so the buffer is never overrun, and release it
      // before bailing out.
      it.errln("brk count exceeds string length!");
      delete [] pos;
      return;
    }
    pos[posCount++] = n;
  }

  // First line: the text with a pilcrow at each break, in lenticular brackets.
  UnicodeString out;
  out.append((UChar)CHSTR);
  int32_t prev = 0;
  for(int32_t i=0;i<posCount;i++) {
    int32_t n=pos[i];
    out.append(ustr.tempSubString(prev,n-prev));
    out.append((UChar)PILCROW);
    prev=n;
  }
  out.append(ustr.tempSubString(prev,ustr.length()-prev));
  out.append((UChar)CHEND);
  it.logln(out);

  // Second line: the numeric break offsets.
  out.remove();
  for(int32_t i=0;i<posCount;i++) {
    char tmp[100];
    sprintf(tmp,"%d ",pos[i]);
    out.append(UnicodeString(tmp));
  }
  it.logln(out);
  delete [] pos;
}
|
||||
|
||||
|
||||
/**
 * Exercises FilteredBreakIteratorBuilder in three scenarios over the same
 * English text: no exceptions, "Mr." suppressed, and both "Mr." and "Capt."
 * suppressed. Each scenario checks the sequence of positions returned by
 * next() and then logs the remaining breaks from the start of the text.
 */
void RBBIAPITest::TestFilteredBreakIteratorBuilder() {
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
  UErrorCode status = U_ZERO_ERROR;
  LocalPointer<FilteredBreakIteratorBuilder> builder;
  LocalPointer<BreakIterator> baseBI;
  LocalPointer<BreakIterator> filteredBI;

  const UnicodeString text("In the meantime Mr. Weston arrived with his small ship, which he had now recovered. Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge."); // (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
  const UnicodeString ABBR_MR("Mr.");
  const UnicodeString ABBR_CAPT("Capt.");

  // Scenario 1: no exceptions registered, every underlying break is reported.
  {
    // Breaks after: Mr. / recovered. / Capt. / Mr. / charge.
    static const int32_t kExpected[] = { 20, 84, 90, 181, 278 };
    const int32_t kExpectedCount = (int32_t)(sizeof(kExpected)/sizeof(kExpected[0]));

    logln("Constructing empty builder\n");
    builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
    TEST_ASSERT_SUCCESS(status);

    logln("Constructing base BI\n");
    baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Building new BI\n");
    filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Testing:");
    filteredBI->setText(text);
    for(int32_t k=0; k<kExpectedCount; k++) {
      TEST_ASSERT(kExpected[k] == filteredBI->next());
    }
    filteredBI->first();
    prtbrks(filteredBI.getAlias(), text, *this);
  }

  // Scenario 2: "Mr." suppressed; also checks the add/remove return values.
  {
    // Breaks after: recovered. / Capt. / charge.
    static const int32_t kExpected[] = { 84, 90, 278 };
    const int32_t kExpectedCount = (int32_t)(sizeof(kExpected)/sizeof(kExpected[0]));

    logln("Constructing empty builder\n");
    builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
    TEST_ASSERT_SUCCESS(status);

    logln("Adding Mr. as an exception\n");
    TEST_ASSERT(TRUE == builder->suppressBreakAfter(ABBR_MR, status));
    TEST_ASSERT(FALSE == builder->suppressBreakAfter(ABBR_MR, status)); // already have it
    TEST_ASSERT(TRUE == builder->unsuppressBreakAfter(ABBR_MR, status));
    TEST_ASSERT(FALSE == builder->unsuppressBreakAfter(ABBR_MR, status)); // already removed it
    TEST_ASSERT(TRUE == builder->suppressBreakAfter(ABBR_MR, status));
    TEST_ASSERT_SUCCESS(status);

    logln("Constructing base BI\n");
    baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Building new BI\n");
    filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Testing:");
    filteredBI->setText(text);
    for(int32_t k=0; k<kExpectedCount; k++) {
      TEST_ASSERT(kExpected[k] == filteredBI->next());
    }
    filteredBI->first();
    prtbrks(filteredBI.getAlias(), text, *this);
  }

  // Scenario 3: both "Mr." and "Capt." suppressed.
  {
    // Breaks after: recovered. / charge.
    static const int32_t kExpected[] = { 84, 278 };
    const int32_t kExpectedCount = (int32_t)(sizeof(kExpected)/sizeof(kExpected[0]));

    logln("Constructing empty builder\n");
    builder.adoptInstead(FilteredBreakIteratorBuilder::createInstance(status));
    TEST_ASSERT_SUCCESS(status);

    logln("Adding Mr. and Capt as an exception\n");
    TEST_ASSERT(TRUE == builder->suppressBreakAfter(ABBR_MR, status));
    TEST_ASSERT(TRUE == builder->suppressBreakAfter(ABBR_CAPT, status));
    TEST_ASSERT_SUCCESS(status);

    logln("Constructing base BI\n");
    baseBI.adoptInstead(BreakIterator::createSentenceInstance(Locale::getEnglish(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Building new BI\n");
    filteredBI.adoptInstead(builder->build(baseBI.orphan(), status));
    TEST_ASSERT_SUCCESS(status);

    logln("Testing:");
    filteredBI->setText(text);
    for(int32_t k=0; k<kExpectedCount; k++) {
      TEST_ASSERT(kExpected[k] == filteredBI->next());
    }
    filteredBI->first();
    prtbrks(filteredBI.getAlias(), text, *this);
  }

#else
  logln("Skipped- not: !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING");
#endif
}
|
||||
|
||||
//---------------------------------------------
|
||||
// runIndexedTest
|
||||
@ -1210,6 +1348,11 @@ void RBBIAPITest::runIndexedTest( int32_t index, UBool exec, const char* &name,
|
||||
#endif
|
||||
case 14: name = "TestRefreshInputText"; if (exec) TestRefreshInputText(); break;
|
||||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION && U_HAVE_STD_STRING
|
||||
case 15: name = "TestFilteredBreakIteratorBuilder"; if(exec) TestFilteredBreakIteratorBuilder(); break;
|
||||
#else
|
||||
case 15: name="skip"; break;
|
||||
#endif
|
||||
default: name = ""; break; // needed to end loop
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1999-2013 International Business Machines Corporation and
|
||||
* Copyright (c) 1999-2014 International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/************************************************************************
|
||||
@ -53,6 +53,7 @@ public:
|
||||
**/
|
||||
void TestIteration(void);
|
||||
|
||||
void TestFilteredBreakIteratorBuilder(void);
|
||||
|
||||
/**
|
||||
* Tests creating RuleBasedBreakIterator from rules strings.
|
||||
|
Loading…
Reference in New Issue
Block a user