ICU-3944 text access, RBBI access. Fix extra space allocation problem
X-SVN-Rev: 18060
This commit is contained in:
parent
80e01d0da1
commit
0f8bc50e81
@ -330,7 +330,8 @@ int32_t RuleBasedBreakIterator::first(void) {
|
||||
if (fText == NULL)
|
||||
return BreakIterator::DONE;
|
||||
|
||||
fText->first();
|
||||
//fText->first();
|
||||
fText->setToStart();
|
||||
return fText->getIndex();
|
||||
}
|
||||
|
||||
@ -1350,17 +1351,239 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// UText functions
|
||||
// UText functions As a temporary implementation, create a type of CharacterIterator
|
||||
// that works over UText, and let the RBBI engine continue to
|
||||
// work on CharacterIterator, which it always has.
|
||||
//
|
||||
// The permanent solution is to rework the RBBI engine to use
|
||||
// UText directly, which will be more efficient for all input
|
||||
// sources.
|
||||
//
|
||||
// This CharacterIterator implementation over UText is not complete,
|
||||
// it has only what is needed for RBBI, and is not intended
|
||||
// to ever become public.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
|
||||
class CharacterIteratorUT: public CharacterIterator {
|
||||
public:
|
||||
CharacterIteratorUT(UText *ut);
|
||||
virtual ~CharacterIteratorUT();
|
||||
|
||||
virtual CharacterIterator *clone() const;
|
||||
virtual UBool operator==(const ForwardCharacterIterator& that) const;
|
||||
virtual UChar setIndex(int32_t position);
|
||||
virtual UChar32 previous32(void);
|
||||
virtual UChar32 next32(void);
|
||||
virtual UBool hasNext();
|
||||
virtual UChar32 current32(void) const;
|
||||
virtual UBool hasPrevious();
|
||||
virtual int32_t move(int32_t delta, EOrigin origin);
|
||||
static UClassID getStaticClassID(void);
|
||||
virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
UText *fUText;
|
||||
virtual void resetTo(const UText *ut, UErrorCode *status);
|
||||
|
||||
private:
|
||||
CharacterIteratorUT();
|
||||
|
||||
// The following functions are not needed by RBBI,
|
||||
// but are pure virtual in CharacterIterator, so must be defined.
|
||||
// Only stubs are provided in this implementation.
|
||||
virtual int32_t hashCode(void) const {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar nextPostInc(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar32 next32PostInc(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar first(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar32 first32(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar last(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar32 last32(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar32 setIndex32(int32_t position) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar current(void) const {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar next(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual UChar previous(void) {U_ASSERT(FALSE); return 0;};
|
||||
virtual int32_t move32(int32_t delta, EOrigin origin) {U_ASSERT(FALSE); return 0;};
|
||||
virtual void getText(UnicodeString& result) {U_ASSERT(FALSE);};
|
||||
};
|
||||
|
||||
|
||||
|
||||
//
|
||||
// The following fields are inherited from CharacterIterator.
|
||||
// This implementation __MUST__ keep them current because of non-virtual inline
|
||||
// functions defined in CharacterIterator.
|
||||
// int32_t textLength; // length of the text.
|
||||
// int32_t pos; // current index position
|
||||
// int32_t begin; // starting index. Always 0 for us.
|
||||
// int32_t end; // ending index
|
||||
//
|
||||
// CharacterIterator was designed assuming that utf-16 indexing would be used,
|
||||
// but native indexing will pass through OK. This partial implementation only
|
||||
// provides the '32' flavored code point access, not UChar access.
|
||||
//
|
||||
|
||||
// CharacterIteratorUT declares getStaticClassID() and getDynamicClassID() but
// no hand-written bodies for them appear here; presumably this macro supplies
// those definitions (NOTE(review): confirm against the macro's definition).
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT);
|
||||
|
||||
CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
    // Keep a private clone of the caller's UText (clone requested with FALSE
    // as the third argument).
    // NOTE(review): cloneStatus is never examined, and a constructor has no
    // way to report it; a failed clone goes unnoticed here.
    UErrorCode cloneStatus = U_ZERO_ERROR;
    fUText = utext_clone(NULL, ut, FALSE, &cloneStatus);

    // Initialize the fields inherited from CharacterIterator: the range is
    // the whole text, and iteration starts at index zero.
    begin      = 0;
    pos        = 0;
    textLength = utext_nativeLength(ut);
    end        = textLength;
}
|
||||
|
||||
CharacterIteratorUT::CharacterIteratorUT() {
    // Private constructor: produces an iterator with no UText attached.
    // The inherited CharacterIterator fields describe an empty range.
    textLength = pos = begin = end = 0;
    fUText     = NULL;
}
|
||||
|
||||
CharacterIteratorUT::~CharacterIteratorUT() {
    // fUText is NULL for an iterator made by the private default constructor
    // that never had a text attached, so only close a UText that exists.
    if (fUText != NULL) {
        utext_close(fUText);
        fUText = NULL;     // leave no dangling pointer behind
    }
}
|
||||
|
||||
|
||||
//
//  clone()   Create a new CharacterIteratorUT over a clone of this iterator's
//            UText (utext_clone is called with TRUE as its third argument).
//
//            Returns NULL if the UText could not be cloned or the new iterator
//            could not be allocated; a partially constructed iterator is
//            never handed back.
//
//            The inherited position fields of the result are set to the start
//            of the text, as before.
//            NOTE(review): whether the cloned UText's own iteration position
//            is also at the start depends on utext_clone - confirm.
//
CharacterIterator *CharacterIteratorUT::clone() const {
    // Clone the text first, so that no iterator object exists (and no
    // destructor has to run on a NULL fUText) when cloning fails.
    UErrorCode status   = U_ZERO_ERROR;
    UText     *textCopy = utext_clone(NULL, fUText, TRUE, &status);
    if (U_FAILURE(status) || textCopy == NULL) {
        if (textCopy != NULL) {
            utext_close(textCopy);
        }
        return NULL;
    }

    CharacterIteratorUT *result = new CharacterIteratorUT();
    if (result == NULL) {
        // Allocation failed; don't leak the cloned text.
        utext_close(textCopy);
        return NULL;
    }

    result->fUText     = textCopy;
    result->textLength = utext_nativeLength(textCopy);
    result->pos        = 0;
    result->begin      = 0;
    result->end        = result->textLength;   // the clone's own length, not this->textLength
    return result;
}
|
||||
|
||||
//
//  operator==   Two iterators are equal when they have the same dynamic class
//               and their UTexts carry the same context pointer.
//               The iteration position is not part of the comparison.
//
UBool CharacterIteratorUT::operator==(const ForwardCharacterIterator& that) const {
    if (getDynamicClassID() == that.getDynamicClassID()) {
        // Same concrete class, so the downcast is safe.
        const CharacterIteratorUT &other = static_cast<const CharacterIteratorUT &>(that);
        return fUText->context == other.fUText->context;
    }
    return FALSE;
}
|
||||
|
||||
//
//  setIndex()   Move the iterator to the given native index.
//
//               The requested position is pinned to the range [begin, end]
//               before being handed to the UText, and pos is then re-read
//               from the UText (as move() does) so that the inherited inline
//               accessors always report the index the UText is really at.
//
//               Always returns 0xffff: RBBI doesn't use the return value, and
//               UText can't return a UChar easily.
//
UChar CharacterIteratorUT::setIndex(int32_t position) {
    pos = position;
    if (pos < begin) {
        pos = begin;
    } else if (pos > end) {
        pos = end;
    }
    utext_setNativeIndex(fUText, pos);
    pos = utext_getNativeIndex(fUText);     // keep pos in step with the UText
    return 0xffff;
}
|
||||
|
||||
//
//  previous32()   Step the UText back with UTEXT_PREVIOUS32 and return the
//                 value it produced.  A negative value from the UText is
//                 reported to the caller as 0xffff.
//
UChar32 CharacterIteratorUT::previous32(void) {
    const UChar32 cp = UTEXT_PREVIOUS32(fUText);

    // Refresh the inherited position field from the UText.
    // TODO: maybe optimize common case?
    pos = utext_getNativeIndex(fUText);

    return (cp < 0) ? 0x0000ffff : cp;
}
|
||||
|
||||
//
//  next32()   Advance the UText by one UTEXT_NEXT32 step, record the new
//             position, then report what is at that new position without
//             consuming it.  The look-ahead is done by a second UTEXT_NEXT32
//             that is undone with UTEXT_PREVIOUS32.
//
//             Returns 0x0000ffff when the look-ahead yields a negative value
//             (presumably: nothing left to read); in that case no step back
//             is made.
//
//             TODO: optimize.
//
UChar32 CharacterIteratorUT::next32(void) {
    // Step over the current position and remember where we landed.
    UTEXT_NEXT32(fUText);
    pos = utext_getNativeIndex(fUText);

    // Peek at the code point now under the iterator.
    const UChar32 peeked = UTEXT_NEXT32(fUText);
    if (peeked < 0) {
        return 0x0000ffff;
    }

    // Undo the peek so the UText stays at pos.
    UTEXT_PREVIOUS32(fUText);
    return peeked;
}
|
||||
|
||||
//
//  hasNext()   TRUE while pos is below end-1, FALSE otherwise.
//
//              What would really be best for RBBI is a hasNext32().
//
//              NOTE(review): the threshold is end-1, not end, so this already
//              reports FALSE when pos is on the last index.  Kept exactly as
//              written; confirm this is what the RBBI engine expects.
//
UBool CharacterIteratorUT::hasNext() {
    return (UBool)(pos < end-1);
}
|
||||
|
||||
//
//  current32()   Return what utext_current32() reports for the UText,
//                substituting 0x0000ffff for any negative value.
//
UChar32 CharacterIteratorUT::current32(void) const {
    const UChar32 cp = utext_current32(fUText);
    return (cp >= 0) ? cp : 0x0000ffff;
}
|
||||
|
||||
//
//  hasPrevious()   TRUE when the current position is past index zero.
//
UBool CharacterIteratorUT::hasPrevious() {
    return pos > 0;
}
|
||||
|
||||
//
//  move()   Reposition the iterator by delta, measured from the start of the
//           text (kStart), the current position (kCurrent) or the end of the
//           text (kEnd).
//
//           Only needed for the inherited inline implementation of
//           setToStart().
//
//           The target is pinned to [begin, end] and then handed to the
//           UText; pos is re-read from the UText afterwards so it reflects
//           where the UText actually ended up.
//
//           Returns the new position (pos), i.e. the position really reached
//           rather than the one that was asked for.
//
int32_t CharacterIteratorUT::move(int32_t delta, EOrigin origin) {
    int32_t target = pos;
    switch (origin) {
    case kStart:
        target = delta;
        break;
    case kCurrent:
        target = pos + delta;
        break;
    case kEnd:
        target = end + delta;
        break;
    default:
        // Unknown origin: leave the position where it is.
        U_ASSERT(FALSE);
        break;
    }

    if (target < begin) {
        target = begin;
    } else if (target > end) {
        target = end;
    }

    utext_setNativeIndex(fUText, target);
    pos = utext_getNativeIndex(fUText);    // align to cp boundary
    return pos;
}
|
||||
|
||||
|
||||
|
||||
void CharacterIteratorUT::resetTo(const UText *ut, UErrorCode *status) {
|
||||
// Reset this CharacterIteratorUT to use a new UText.
|
||||
fUText = utext_clone(fUText, ut, FALSE, status);
|
||||
utext_setNativeIndex(fUText, 0);
|
||||
textLength = utext_nativeLength(fUText);
|
||||
pos = 0;
|
||||
end = textLength;
|
||||
}
|
||||
|
||||
//
//  setText(UText)   Make this break iterator analyze the text in ut.
//
//                   ut      the text to analyze.  It is cloned by the
//                           CharacterIteratorUT that wraps it.
//                   status  ICU error code, in/out.  Nothing is done if it
//                           already indicates a failure on entry.  Set to
//                           U_ILLEGAL_ARGUMENT_ERROR for a NULL ut and to
//                           U_MEMORY_ALLOCATION_ERROR if the wrapping
//                           iterator can't be allocated.
//
//                   On success the iterator is positioned with first().
//
void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (ut == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    reset();

    if (fText != NULL &&
        fText->getDynamicClassID() == CharacterIteratorUT::getStaticClassID())
    {
        // The break iterator is already using a UText based character iterator.
        // Copy the new UText into the existing character iterator's UText.
        CharacterIteratorUT *utcr = (CharacterIteratorUT *)fText;
        utcr->resetTo(ut, &status);
    } else {
        // Some other kind of CharacterIterator (or none): replace it.
        delete fText;
        fText = new CharacterIteratorUT(ut);
        if (fText == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }

    if (U_FAILURE(status)) {
        return;
    }
    this->first();
}
|
||||
|
||||
|
||||
//
//  getUText()   Return a clone of the UText this break iterator is working
//               on.
//
//               fillIn  passed to utext_clone() as the destination of the
//                       clone; may be NULL.
//               status  ICU error code, in/out.
//
//               Returns the result of utext_clone(), or NULL when status
//               already indicates a failure, when no text has been set, or
//               when the current text is not held in a CharacterIteratorUT
//               (i.e. it was not supplied as a UText).
//
UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    if (U_FAILURE(status) || fText == NULL) {
        return NULL;
    }
    if (fText->getDynamicClassID() != CharacterIteratorUT::getStaticClassID()) {
        // The text is not UText based; there is nothing to clone.
        return NULL;
    }

    const CharacterIteratorUT *utcr = (const CharacterIteratorUT *)fText;
    return utext_clone(fillIn, utcr->fUText, FALSE, &status);
}
|
||||
|
||||
|
||||
|
@ -113,6 +113,11 @@ utext_setNativeIndex(UText *ut, int32_t index) {
|
||||
U_DRAFT UChar32 U_EXPORT2
|
||||
utext_current32(UText *ut) {
|
||||
UChar32 c = U_SENTINEL;
|
||||
if (ut->chunk.offset==ut->chunk.length) {
|
||||
// Current position is just off the end of the chunk.
|
||||
// Can also happen at startup, with a zero length chunk at zero offset.
|
||||
ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk);
|
||||
}
|
||||
if (ut->chunk.offset < ut->chunk.length) {
|
||||
c = ut->chunk.contents[ut->chunk.offset];
|
||||
if (U16_IS_SURROGATE(c)) {
|
||||
@ -429,7 +434,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
|
||||
*ut = emptyText;
|
||||
ut->flags |= UTEXT_HEAP_ALLOCATED;
|
||||
if (spaceRequired>0) {
|
||||
ut->extraSize = spaceRequired;
|
||||
ut->extraSize = extraSpace;
|
||||
ut->pExtra = &((ExtendedUText *)ut)->extension;
|
||||
}
|
||||
}
|
||||
@ -461,6 +466,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
ut->extraSize = extraSpace;
|
||||
ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -21,6 +21,7 @@
|
||||
#include "rbbidata.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/utext.h"
|
||||
|
||||
/**
|
||||
* API Test the RuleBasedBreakIterator class
|
||||
@ -292,6 +293,58 @@ void RBBIAPITest::TestGetSetAdoptText()
|
||||
errln((UnicodeString)"ERROR:4 error in adoptText ");
|
||||
}
|
||||
|
||||
// UText API
|
||||
//
|
||||
// Quick test to see if UText is working at all.
|
||||
//
|
||||
const char *s1 = "hello world";
|
||||
const char *s2 = "see ya";
|
||||
// 012345678901
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UText *ut = utext_openUTF8(NULL, s1, -1, &status);
|
||||
wordIter1->setText(ut, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
int32_t pos;
|
||||
pos = wordIter1->first();
|
||||
TEST_ASSERT(pos==0);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==5);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==6);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==11);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==UBRK_DONE);
|
||||
|
||||
status = U_ZERO_ERROR;
|
||||
UText *ut2 = utext_openUTF8(NULL, s2, -1, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
wordIter1->setText(ut2, status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
pos = wordIter1->first();
|
||||
TEST_ASSERT(pos==0);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==3);
|
||||
pos = wordIter1->next();
|
||||
TEST_ASSERT(pos==4);
|
||||
|
||||
pos = wordIter1->last();
|
||||
TEST_ASSERT(pos==6);
|
||||
pos = wordIter1->previous();
|
||||
TEST_ASSERT(pos==4);
|
||||
pos = wordIter1->previous();
|
||||
TEST_ASSERT(pos==3);
|
||||
pos = wordIter1->previous();
|
||||
TEST_ASSERT(pos==0);
|
||||
pos = wordIter1->previous();
|
||||
TEST_ASSERT(pos==UBRK_DONE);
|
||||
|
||||
utext_close(ut);
|
||||
utext_close(ut2);
|
||||
|
||||
delete wordIter1;
|
||||
delete charIter1;
|
||||
delete rb;
|
||||
|
Loading…
Reference in New Issue
Block a user