ICU-3944 text access, RBBI access. Fix extra space allocation problem

X-SVN-Rev: 18060
This commit is contained in:
Andy Heninger 2005-06-26 21:31:36 +00:00
parent 80e01d0da1
commit 0f8bc50e81
3 changed files with 288 additions and 6 deletions

View File

@ -330,7 +330,8 @@ int32_t RuleBasedBreakIterator::first(void) {
if (fText == NULL)
return BreakIterator::DONE;
fText->first();
//fText->first();
fText->setToStart();
return fText->getIndex();
}
@ -1350,17 +1351,239 @@ UBool RuleBasedBreakIterator::isDictionaryChar(UChar32 c) {
//-------------------------------------------------------------------------------
//
// UText functions
// UText functions As a temporary implementation, create a type of CharacterIterator
// that works over UText, and let the RBBI engine continue to
// work on CharacterIterator, which it always has.
//
// The permanent solution is to rework the RBBI engine to use
// UText directly, which will be more efficient for all input
// sources.
//
// This CharacterIterator implementation over UText is not complete,
// it has only what is needed for RBBI, and is not intended
// to ever become public.
//
//-------------------------------------------------------------------------------
//
// CharacterIteratorUT
//     A partial CharacterIterator implementation layered over a UText.
//     Only the code point ('32') flavored access functions that RBBI
//     actually calls are implemented; the remaining pure virtual functions
//     of CharacterIterator are assert-and-return-zero stubs.
//
class CharacterIteratorUT: public CharacterIterator {
public:
    CharacterIteratorUT(UText *ut);
    virtual ~CharacterIteratorUT();

    virtual CharacterIterator *clone() const;
    virtual UBool              operator==(const ForwardCharacterIterator& that) const;

    // Positioning and code point access used by the RBBI engine.
    virtual UChar    setIndex(int32_t position);
    virtual UChar32  previous32(void);
    virtual UChar32  next32(void);
    virtual UBool    hasNext();
    virtual UChar32  current32(void) const;
    virtual UBool    hasPrevious();
    virtual int32_t  move(int32_t delta, EOrigin origin);

    // ICU run time type identification.
    static  UClassID getStaticClassID(void);
    virtual UClassID getDynamicClassID(void) const;

    // The UText being iterated over.  Public because
    // RuleBasedBreakIterator::getUText() reads it directly.
    UText *fUText;

    // Re-target this iterator at a different UText.
    virtual void resetTo(const UText *ut, UErrorCode *status);

private:
    // Produces an iterator with no text; used by clone().
    CharacterIteratorUT();

    // The following functions are not needed by RBBI,
    // but are pure virtual in CharacterIterator, so must be defined.
    // Only stubs are provided in this implementation.
    virtual int32_t  hashCode(void) const                   {U_ASSERT(FALSE); return 0;}
    virtual UChar    nextPostInc(void)                      {U_ASSERT(FALSE); return 0;}
    virtual UChar32  next32PostInc(void)                    {U_ASSERT(FALSE); return 0;}
    virtual UChar    first(void)                            {U_ASSERT(FALSE); return 0;}
    virtual UChar32  first32(void)                          {U_ASSERT(FALSE); return 0;}
    virtual UChar    last(void)                             {U_ASSERT(FALSE); return 0;}
    virtual UChar32  last32(void)                           {U_ASSERT(FALSE); return 0;}
    virtual UChar32  setIndex32(int32_t position)           {U_ASSERT(FALSE); return 0;}
    virtual UChar    current(void) const                    {U_ASSERT(FALSE); return 0;}
    virtual UChar    next(void)                             {U_ASSERT(FALSE); return 0;}
    virtual UChar    previous(void)                         {U_ASSERT(FALSE); return 0;}
    virtual int32_t  move32(int32_t delta, EOrigin origin)  {U_ASSERT(FALSE); return 0;}
    virtual void     getText(UnicodeString& result)         {U_ASSERT(FALSE);}
};
//
// The following fields are inherited from CharacterIterator.
// This implementation __MUST__ keep them current because of non-virtual inline
// functions defined in CharacterIterator.
// int32_t textLength; // length of the text.
// int32_t pos; // current index position
// int32_t begin; // starting index. Always 0 for us.
// int32_t end; // ending index
//
// CharacterIterator was designed assuming that utf-16 indexing would be used,
// but native indexing will pass through OK. This partial implementation only
// provides the '32' flavored code point access, not UChar access.
//
// NOTE(review): presumably this macro supplies the definitions of
// getStaticClassID() and getDynamicClassID(), which the class declares but
// which have no other definition in this file - confirm against uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CharacterIteratorUT);
//
// Constructor.  Iterates over a private (deep == FALSE) clone of the
//   supplied UText; the caller keeps ownership of 'ut'.
//
//   A constructor cannot report an error code.  If the clone cannot be
//   made, fUText is left NULL and the iterator presents an empty text
//   (length 0) instead of advertising the length of a text that it has
//   no access to.  Callers can test fUText to detect the failure.
//
CharacterIteratorUT::CharacterIteratorUT(UText *ut) {
    UErrorCode status = U_ZERO_ERROR;
    fUText = utext_clone(NULL, ut, FALSE, &status);

    // Set the inherited CharacterIterator fields.
    // The length is taken from our own clone so that the fields can never
    // describe a UText that this object does not hold.
    textLength = (fUText != NULL) ? utext_nativeLength(fUText) : 0;
    pos        = 0;
    begin      = 0;
    end        = textLength;
}
//
// Private default constructor.  Creates an iterator with no UText and
//   with all of the inherited position fields zeroed.  Used by clone().
//
CharacterIteratorUT::CharacterIteratorUT()
    : fUText(NULL)
{
    textLength = 0;
    begin      = 0;
    end        = 0;
    pos        = 0;
}
//
// Destructor.  Closes the UText held by this iterator.
//
CharacterIteratorUT::~CharacterIteratorUT()
{
    utext_close(fUText);
}
//
// clone    Create a new iterator over a deep clone of this iterator's UText.
//
//          Returns NULL if the new iterator object cannot be allocated.
//          If the UText itself cannot be cloned, the returned iterator is
//          left in its default-constructed state (fUText as returned by
//          utext_clone, all position fields zero).
//
//          The new iterator's position is reset to zero; it does not
//          inherit the position of this iterator.
//
CharacterIterator *CharacterIteratorUT::clone() const {
    CharacterIteratorUT *copy = new CharacterIteratorUT();
    if (copy == NULL) {
        // Allocation failed; there is nothing to fill in.
        return NULL;
    }

    UErrorCode status = U_ZERO_ERROR;
    copy->fUText = utext_clone(NULL, fUText, TRUE, &status);
    if (U_SUCCESS(status)) {
        copy->textLength = utext_nativeLength(fUText);
        copy->pos        = 0;
        copy->begin      = 0;
        // Derive the end from the clone's own length rather than from
        // this object's cached textLength.
        copy->end        = copy->textLength;
    }
    return copy;
}
//
// operator==   Two iterators compare equal when they are both
//              CharacterIteratorUTs and their UTexts refer to the same
//              context pointer.  Iteration position is not compared.
//
UBool CharacterIteratorUT::operator==(const ForwardCharacterIterator& that) const {
    if (that.getDynamicClassID() == this->getDynamicClassID()) {
        // Same dynamic class, so the down-cast is safe.
        const CharacterIteratorUT *other = (const CharacterIteratorUT *)&that;
        return fUText->context == other->fUText->context;
    }
    return FALSE;
}
//
// setIndex     Move the iterator to the given native index.
//
//              The requested position is pinned to the range [begin, end]
//              before being handed to the UText.  The inherited 'pos' field
//              is then re-read from the UText, as move() does, so that it
//              reflects the position the UText actually settled on.
//
//              Always returns 0xffff; RBBI does not use the return value,
//              and UText cannot easily return a UChar.
//
UChar CharacterIteratorUT::setIndex(int32_t position) {
    if (position < begin) {
        position = begin;
    } else if (position > end) {
        position = end;
    }
    utext_setNativeIndex(fUText, position);
    pos = utext_getNativeIndex(fUText);   // keep pos in step with the UText
    return 0xffff;
}
//
// previous32   Step the UText back one code point, refresh the inherited
//              'pos' field from it, and return the code point stepped over.
//              A negative result from UText is reported as 0x0000ffff.
//
UChar32 CharacterIteratorUT::previous32(void) {
    const UChar32 cp = UTEXT_PREVIOUS32(fUText);
    pos = utext_getNativeIndex(fUText);  // TODO:  maybe optimize common case?
    return (cp < 0) ? 0x0000ffff : cp;
}
//
// next32   Advance one code point, record the new position in 'pos', and
//          return the code point found at that new position.
//
//          The code point is fetched with a second UTEXT_NEXT32; when that
//          fetch yields a real code point it is undone with UTEXT_PREVIOUS32.
//          When the fetch yields a negative value, 0x0000ffff is returned
//          and no step back is made.
//
UChar32 CharacterIteratorUT::next32(void) {
    // TODO:  optimize.
    UTEXT_NEXT32(fUText);                    // move to the next position
    pos = utext_getNativeIndex(fUText);

    UChar32 cp = UTEXT_NEXT32(fUText);       // look at the char that is there
    if (cp >= 0) {
        UTEXT_PREVIOUS32(fUText);            // undo the look-ahead
        return cp;
    }
    return 0x0000ffff;
}
//
// hasNext      TRUE while the current position is before end-1.
//              What would really be best for RBBI is a hasNext32()
//
UBool CharacterIteratorUT::hasNext() {
    if (pos < end-1) {
        return TRUE;
    }
    return FALSE;
}
//
// current32    Return the code point at the UText's current position,
//              without moving.  A negative result from UText is reported
//              as 0x0000ffff.
//
UChar32 CharacterIteratorUT::current32(void) const {
    const UChar32 cp = utext_current32(fUText);
    return (cp < 0) ? 0x0000ffff : cp;
}
//
// hasPrevious  TRUE when the current position is anywhere after index zero.
//
UBool CharacterIteratorUT::hasPrevious() {
    if (pos > 0) {
        return TRUE;
    }
    return FALSE;
}
//
// move     Reposition the iterator by 'delta' relative to 'origin'
//          (kStart, kCurrent or kEnd).
//          Only needed for the inherited inline implementation of setToStart().
//
//          The requested index is handed to the UText, and the inherited
//          'pos' field is then re-read from it.
//
//          Returns the resulting position, that is, the value now in 'pos',
//          not the raw requested index.  The two differ whenever the UText
//          adjusts the request, for example to a code point boundary.
//
int32_t CharacterIteratorUT::move(int32_t delta, EOrigin origin) {
    int32_t target = pos;
    switch (origin) {
    case kStart:
        target = delta;
        break;
    case kCurrent:
        target = pos + delta;
        break;
    case kEnd:
        target = end + delta;
        break;
    default:
        // Unknown origin: leave the position unchanged.
        U_ASSERT(FALSE);
        break;
    }

    utext_setNativeIndex(fUText, target);
    pos = utext_getNativeIndex(fUText);  // align to cp boundary
    return pos;
}
void CharacterIteratorUT::resetTo(const UText *ut, UErrorCode *status) {
// Reset this CharacterIteratorUT to use a new UText.
fUText = utext_clone(fUText, ut, FALSE, status);
utext_setNativeIndex(fUText, 0);
textLength = utext_nativeLength(fUText);
pos = 0;
end = textLength;
}
//
// setText(UText)   Set the text to be analyzed from a UText.
//
//   ut         The text.  It is cloned, not adopted; the caller keeps
//              ownership.
//   status     ICU error code, in/out.  Nothing is done if it already
//              indicates a failure on entry.  Set to
//              U_MEMORY_ALLOCATION_ERROR if a new character iterator cannot
//              be created, in which case the break iterator is left with
//              no text (fText == NULL).
//
//   On success the iterator is positioned at the first boundary.
//
void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    reset();
    if (fText != NULL &&
        fText->getDynamicClassID() == CharacterIteratorUT::getStaticClassID())
    {
        // The break iterator is already using a UText based character iterator.
        // Copy the new UText into the existing character iterator's UText.
        CharacterIteratorUT *utcr = (CharacterIteratorUT *)fText;
        utcr->resetTo(ut, &status);
    } else {
        // Drop the old iterator first, and clear the pointer so that it
        // never dangles if creating the replacement fails.
        delete fText;
        fText = NULL;

        CharacterIteratorUT *fresh = new CharacterIteratorUT(ut);
        if (fresh == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        } else if (fresh->fUText == NULL) {
            // The constructor has no way to report that its UText clone
            // failed; a NULL fUText is the only visible sign of it.
            delete fresh;
            status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            fText = fresh;
        }
    }

    if (U_FAILURE(status)) {
        // Do not try to iterate over text that could not be set up.
        return;
    }
    this->first();
}
//
// getUText     Get a UText for the text being analyzed.
//
//   fillIn     A UText to be reused for the result, or NULL to have a new
//              one created.  It is passed to utext_clone() as the
//              destination.
//   status     ICU error code, in/out.
//
//   Returns a clone (deep == FALSE) of the UText being iterated over, or
//   NULL if status already indicates a failure, if no text has been set,
//   or if the current text was not supplied as a UText.
//
UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const {
    UText *result = NULL;
    if (U_SUCCESS(status) && fText != NULL &&
        fText->getDynamicClassID() == CharacterIteratorUT::getStaticClassID())
    {
        // The text is held by a UText based character iterator;
        // hand back a clone of its UText.
        CharacterIteratorUT *utcr = (CharacterIteratorUT *)fText;
        result = utext_clone(fillIn, utcr->fUText, FALSE, &status);
    }
    return result;
}

View File

@ -113,6 +113,11 @@ utext_setNativeIndex(UText *ut, int32_t index) {
U_DRAFT UChar32 U_EXPORT2
utext_current32(UText *ut) {
UChar32 c = U_SENTINEL;
if (ut->chunk.offset==ut->chunk.length) {
// Current position is just off the end of the chunk.
// Can also happen at startup, with a zero length chunk at zero offset.
ut->access(ut, ut->chunk.nativeLimit, TRUE, &ut->chunk);
}
if (ut->chunk.offset < ut->chunk.length) {
c = ut->chunk.contents[ut->chunk.offset];
if (U16_IS_SURROGATE(c)) {
@ -429,7 +434,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
*ut = emptyText;
ut->flags |= UTEXT_HEAP_ALLOCATED;
if (spaceRequired>0) {
ut->extraSize = spaceRequired;
ut->extraSize = extraSpace;
ut->pExtra = &((ExtendedUText *)ut)->extension;
}
}
@ -461,6 +466,7 @@ utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
*status = U_MEMORY_ALLOCATION_ERROR;
} else {
ut->extraSize = extraSpace;
ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
}
}
}

View File

@ -21,6 +21,7 @@
#include "rbbidata.h"
#include "cstring.h"
#include "unicode/ustring.h"
#include "unicode/utext.h"
/**
* API Test the RuleBasedBreakIterator class
@ -292,6 +293,58 @@ void RBBIAPITest::TestGetSetAdoptText()
errln((UnicodeString)"ERROR:4 error in adoptText ");
}
// UText API
//
// Quick test to see if UText is working at all.
//
const char *s1 = "hello world";
const char *s2 = "see ya";
// 012345678901
status = U_ZERO_ERROR;
UText *ut = utext_openUTF8(NULL, s1, -1, &status);
wordIter1->setText(ut, status);
TEST_ASSERT_SUCCESS(status);
int32_t pos;
pos = wordIter1->first();
TEST_ASSERT(pos==0);
pos = wordIter1->next();
TEST_ASSERT(pos==5);
pos = wordIter1->next();
TEST_ASSERT(pos==6);
pos = wordIter1->next();
TEST_ASSERT(pos==11);
pos = wordIter1->next();
TEST_ASSERT(pos==UBRK_DONE);
status = U_ZERO_ERROR;
UText *ut2 = utext_openUTF8(NULL, s2, -1, &status);
TEST_ASSERT_SUCCESS(status);
wordIter1->setText(ut2, status);
TEST_ASSERT_SUCCESS(status);
pos = wordIter1->first();
TEST_ASSERT(pos==0);
pos = wordIter1->next();
TEST_ASSERT(pos==3);
pos = wordIter1->next();
TEST_ASSERT(pos==4);
pos = wordIter1->last();
TEST_ASSERT(pos==6);
pos = wordIter1->previous();
TEST_ASSERT(pos==4);
pos = wordIter1->previous();
TEST_ASSERT(pos==3);
pos = wordIter1->previous();
TEST_ASSERT(pos==0);
pos = wordIter1->previous();
TEST_ASSERT(pos==UBRK_DONE);
utext_close(ut);
utext_close(ut2);
delete wordIter1;
delete charIter1;
delete rb;