ICU-8167 make trie iterators nested classes inside their tries

X-SVN-Rev: 29272
This commit is contained in:
Markus Scherer 2011-01-06 18:40:26 +00:00
parent 3518ad81e3
commit 397d6f7372
12 changed files with 342 additions and 339 deletions

View File

@ -85,7 +85,7 @@ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
ucnv_ext.o ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o ucnvdisp.o ucnv_set.o ucnv_ct.o \
uresbund.o ures_cnv.o uresdata.o resbund.o resbund_cnv.o \
ucat.o locmap.o uloc.o locid.o locutil.o locavailable.o locdispnames.o loclikely.o locresdata.o \
bytestream.o stringpiece.o bytestrie.o \
bytestream.o stringpiece.o bytestrie.o bytestrieiterator.o \
ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \

View File

@ -21,6 +21,7 @@
*/
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "ustringtrie.h"
@ -29,7 +30,8 @@ U_NAMESPACE_BEGIN
class ByteSink;
class BytesTrieBuilder;
class BytesTrieIterator;
class CharString;
class UVector32;
/**
* Light-weight, non-const reader class for a BytesTrie.
@ -166,9 +168,95 @@ public:
*/
int32_t getNextBytes(ByteSink &out) const;
/**
 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
 *
 * An Iterator owns the heap-allocated str_ and stack_ objects and deletes
 * them in its destructor. It must therefore never be copied: the copy
 * constructor and the assignment operator are declared private and are
 * intentionally left without an implementation.
 */
class Iterator : public UMemory {
public:
    /**
     * Iterates from the root of a byte-serialized BytesTrie.
     * @param trieBytes The trie bytes.
     * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Iterates from the current state of the specified BytesTrie.
     * @param trie The trie whose state will be copied for iteration.
     * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Deletes the owned str_ and stack_ objects.
     */
    ~Iterator();

    /**
     * Resets this iterator to its initial state.
     * @return *this
     */
    Iterator &reset();

    /**
     * @return TRUE if there are more elements.
     */
    UBool hasNext() const;

    /**
     * Finds the next (byte sequence, value) pair if there is one.
     *
     * If the byte sequence is truncated to the maximum length and does not
     * have a real value, then the value is set to -1.
     * In this case, this "not a real value" is indistinguishable from
     * a real value of -1.
     * @param errorCode Standard ICU error code. If it indicates a failure
     *                  on input, the function returns FALSE immediately.
     * @return TRUE if there is another element.
     */
    UBool next(UErrorCode &errorCode);

    /**
     * @return The NUL-terminated byte sequence for the last successful next().
     */
    const StringPiece &getString() const { return sp_; }

    /**
     * @return The value for the last successful next().
     */
    int32_t getValue() const { return value_; }

private:
    // Not copyable: a compiler-generated copy would share str_ and stack_
    // with the original, and both destructors would then delete them.
    Iterator(const Iterator &other);  // no implementation
    Iterator &operator=(const Iterator &other);  // no implementation

    UBool truncateAndStop();

    const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);

    const uint8_t *bytes_;
    const uint8_t *pos_;
    const uint8_t *initialPos_;
    int32_t remainingMatchLength_;
    int32_t initialRemainingMatchLength_;

    CharString *str_;  // owned; deleted by the destructor
    StringPiece sp_;
    int32_t maxLength_;
    int32_t value_;

    // The stack stores pairs of integers for backtracking to another
    // outbound edge of a branch node.
    // The first integer is an offset from BytesTrie.bytes.
    // The second integer has the str_->length() from before the node in bits 15..0,
    // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
    // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
    // but the code looks more confusing that way.)
    UVector32 *stack_;  // owned; deleted by the destructor
};
private:
friend class BytesTrieBuilder;
friend class BytesTrieIterator;
inline void stop() {
pos_=NULL;

View File

@ -15,25 +15,51 @@
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytestrie.h"
#include "bytestrieiterator.h"
#include "charstr.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Root constructor: iteration starts at the first byte of the serialized trie
// (pos_ and initialPos_ both equal bytes_) with no pending linear-match node
// (both remaining-match lengths are -1).
// If errorCode already indicates a failure on input, the constructor returns
// early and leaves str_ and stack_ as NULL.
BytesTrieIterator::BytesTrieIterator(const void *trieBytes, int32_t maxStringLength,
UErrorCode &errorCode)
BytesTrie::Iterator::Iterator(const void *trieBytes, int32_t maxStringLength,
UErrorCode &errorCode)
: bytes_(reinterpret_cast<const uint8_t *>(trieBytes)),
pos_(bytes_), initialPos_(bytes_),
remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
maxLength_(maxStringLength), value_(0), stack_(errorCode) {}
str_(NULL), maxLength_(maxStringLength), value_(0), stack_(NULL) {
if(U_FAILURE(errorCode)) {
return;
}
// str_ and stack_ are pointers so that it's easy to turn bytestrie.h into
// a public API header for which we would want it to depend only on
// other public headers.
// Unlike BytesTrie itself, its Iterator performs memory allocations anyway
// via the CharString and UVector32 implementations, so this additional
// cost is minimal.
str_=new CharString();
stack_=new UVector32(errorCode);
// Report an allocation failure only if the UVector32 constructor has not
// already set a failure code of its own.
if(U_SUCCESS(errorCode) && (str_==NULL || stack_==NULL)) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
}
BytesTrieIterator::BytesTrieIterator(const BytesTrie &trie, int32_t maxStringLength,
UErrorCode &errorCode)
BytesTrie::Iterator::Iterator(const BytesTrie &trie, int32_t maxStringLength,
UErrorCode &errorCode)
: bytes_(trie.bytes_), pos_(trie.pos_), initialPos_(trie.pos_),
remainingMatchLength_(trie.remainingMatchLength_),
initialRemainingMatchLength_(trie.remainingMatchLength_),
maxLength_(maxStringLength), value_(0), stack_(errorCode) {
str_(NULL), maxLength_(maxStringLength), value_(0), stack_(NULL) {
if(U_FAILURE(errorCode)) {
return;
}
str_=new CharString();
stack_=new UVector32(errorCode);
if(U_FAILURE(errorCode)) {
return;
}
if(str_==NULL || stack_==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
if(length>=0) {
// Pending linear-match node, append remaining bytes to str.
@ -41,43 +67,52 @@ BytesTrieIterator::BytesTrieIterator(const BytesTrie &trie, int32_t maxStringLen
if(maxLength_>0 && length>maxLength_) {
length=maxLength_; // This will leave remainingMatchLength>=0 as a signal.
}
str_.append(reinterpret_cast<const char *>(pos_), length, errorCode);
str_->append(reinterpret_cast<const char *>(pos_), length, errorCode);
pos_+=length;
remainingMatchLength_-=length;
}
}
BytesTrieIterator &BytesTrieIterator::reset() {
// Frees the backtracking stack and the string buffer owned by this iterator.
// Either pointer may still be NULL when a constructor returned early;
// deleting a NULL pointer is a no-op.
BytesTrie::Iterator::~Iterator() {
    delete stack_;
    delete str_;
}
// Rewinds the iterator: restores the position and the pending match length
// that were recorded at construction, cuts str_ back to its first `length`
// bytes, and empties the backtracking stack. Returns *this.
BytesTrie::Iterator &
BytesTrie::Iterator::reset() {
pos_=initialPos_;
remainingMatchLength_=initialRemainingMatchLength_;
int32_t length=remainingMatchLength_+1; // Remaining match length.
// With a maximum string length set (maxLength_>0), keep at most that many
// bytes of the pending linear match.
if(maxLength_>0 && length>maxLength_) {
length=maxLength_;
}
str_.truncate(length);
str_->truncate(length);
// Advance past the bytes that stay in str_ and account for them in the
// remaining match length.
pos_+=length;
remainingMatchLength_-=length;
stack_.setSize(0);
stack_->setSize(0);
return *this;
}
UBool
BytesTrieIterator::next(UErrorCode &errorCode) {
BytesTrie::Iterator::hasNext() const { return pos_!=NULL || !stack_->isEmpty(); }
UBool
BytesTrie::Iterator::next(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return FALSE;
}
const uint8_t *pos=pos_;
if(pos==NULL) {
if(stack_.isEmpty()) {
if(stack_->isEmpty()) {
return FALSE;
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
int32_t stackSize=stack_.size();
int32_t length=stack_.elementAti(stackSize-1);
pos=bytes_+stack_.elementAti(stackSize-2);
stack_.setSize(stackSize-2);
str_.truncate(length&0xffff);
int32_t stackSize=stack_->size();
int32_t length=stack_->elementAti(stackSize-1);
pos=bytes_+stack_->elementAti(stackSize-2);
stack_->setSize(stackSize-2);
str_->truncate(length&0xffff);
length=(int32_t)((uint32_t)length>>16);
if(length>1) {
pos=branchNext(pos, length, errorCode);
@ -85,7 +120,7 @@ BytesTrieIterator::next(UErrorCode &errorCode) {
return TRUE; // Reached a final value.
}
} else {
str_.append((char)*pos++, errorCode);
str_->append((char)*pos++, errorCode);
}
}
if(remainingMatchLength_>=0) {
@ -99,15 +134,15 @@ BytesTrieIterator::next(UErrorCode &errorCode) {
// Deliver value for the byte sequence so far.
UBool isFinal=(UBool)(node&BytesTrie::kValueIsFinal);
value_=BytesTrie::readValue(pos, node>>1);
if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
if(isFinal || (maxLength_>0 && str_->length()==maxLength_)) {
pos_=NULL;
} else {
pos_=BytesTrie::skipValue(pos, node);
}
sp_.set(str_.data(), str_.length());
sp_.set(str_->data(), str_->length());
return TRUE;
}
if(maxLength_>0 && str_.length()==maxLength_) {
if(maxLength_>0 && str_->length()==maxLength_) {
return truncateAndStop();
}
if(node<BytesTrie::kMinLinearMatch) {
@ -121,25 +156,33 @@ BytesTrieIterator::next(UErrorCode &errorCode) {
} else {
// Linear-match node, append length bytes to str_.
int32_t length=node-BytesTrie::kMinLinearMatch+1;
if(maxLength_>0 && str_.length()+length>maxLength_) {
str_.append(reinterpret_cast<const char *>(pos),
maxLength_-str_.length(), errorCode);
if(maxLength_>0 && str_->length()+length>maxLength_) {
str_->append(reinterpret_cast<const char *>(pos),
maxLength_-str_->length(), errorCode);
return truncateAndStop();
}
str_.append(reinterpret_cast<const char *>(pos), length, errorCode);
str_->append(reinterpret_cast<const char *>(pos), length, errorCode);
pos+=length;
}
}
}
// Delivers the current contents of str_ as a truncated result:
// exposes it through sp_, sets the value to -1 because the truncated string
// has no real value, and clears pos_ so that the next call to next()
// continues from the backtracking stack. Always returns TRUE.
UBool
BytesTrie::Iterator::truncateAndStop() {
    sp_.set(str_->data(), str_->length());
    value_=-1;  // no real value for str
    pos_=NULL;
    return TRUE;
}
// Branch node, needs to take the first outbound edge and push state for the rest.
const uint8_t *
BytesTrieIterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
BytesTrie::Iterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode) {
while(length>BytesTrie::kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison byte
// Push state for the greater-or-equal edge.
stack_.addElement((int32_t)(BytesTrie::skipDelta(pos)-bytes_), errorCode);
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
stack_->addElement((int32_t)(BytesTrie::skipDelta(pos)-bytes_), errorCode);
stack_->addElement(((length-(length>>1))<<16)|str_->length(), errorCode);
// Follow the less-than edge.
length>>=1;
pos=BytesTrie::jumpByDelta(pos);
@ -151,12 +194,12 @@ BytesTrieIterator::branchNext(const uint8_t *pos, int32_t length, UErrorCode &er
UBool isFinal=(UBool)(node&BytesTrie::kValueIsFinal);
int32_t value=BytesTrie::readValue(pos, node>>1);
pos=BytesTrie::skipValue(pos, node);
stack_.addElement((int32_t)(pos-bytes_), errorCode);
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
str_.append((char)trieByte, errorCode);
stack_->addElement((int32_t)(pos-bytes_), errorCode);
stack_->addElement(((length-1)<<16)|str_->length(), errorCode);
str_->append((char)trieByte, errorCode);
if(isFinal) {
pos_=NULL;
sp_.set(str_.data(), str_.length());
sp_.set(str_->data(), str_->length());
value_=value;
return NULL;
} else {

View File

@ -401,6 +401,7 @@
<ClCompile Include="usprep.cpp" />
<ClCompile Include="bytestream.cpp" />
<ClCompile Include="bytestrie.cpp" />
<ClCompile Include="bytestrieiterator.cpp" />
<ClCompile Include="chariter.cpp" />
<ClCompile Include="charstr.cpp" />
<ClCompile Include="cstring.c" />

View File

@ -18,7 +18,6 @@
#include "unicode/stringpiece.h"
#include "bytestrie.h"
#include "bytestriebuilder.h"
#include "bytestrieiterator.h"
#include "intltest.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -63,7 +62,7 @@ public:
void checkNextWithState(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkNextString(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkIterator(const StringPiece &trieBytes, const StringAndValue data[], int32_t dataLength);
void checkIterator(BytesTrieIterator &iter, const StringAndValue data[], int32_t dataLength);
void checkIterator(BytesTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
};
extern IntlTest *createBytesTrieTest() {
@ -392,8 +391,8 @@ void BytesTrieTest::TestIteratorFromBranch() {
trie.next('a');
trie.next('n');
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
BytesTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trie) constructor")) {
BytesTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
@ -445,8 +444,8 @@ void BytesTrieTest::TestIteratorFromLinearMatch() {
trie.next('u');
trie.next('a');
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
BytesTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trie) constructor")) {
BytesTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
@ -468,8 +467,8 @@ void BytesTrieTest::TestTruncatingIteratorFromRoot() {
return; // buildTrie() reported an error
}
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
BytesTrieIterator iter(sp.data(), 4, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trie) constructor")) {
BytesTrie::Iterator iter(sp.data(), 4, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the first 4 characters
@ -525,8 +524,8 @@ void BytesTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
trie.next('b');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
// Truncate within the linear-match node.
BytesTrieIterator iter(trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trie) constructor")) {
BytesTrie::Iterator iter(trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
@ -556,8 +555,8 @@ void BytesTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
trie.next('c');
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
// Truncate after the linear-match node.
BytesTrieIterator iter(trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trie) constructor")) {
BytesTrie::Iterator iter(trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
@ -798,14 +797,14 @@ void BytesTrieTest::checkNextString(const StringPiece &trieBytes,
// Builds an iterator over the serialized trie from its root, with no maximum
// string length (0), and hands it to the iterator-based checkIterator()
// overload together with the expected data.
// Returns early if constructing the iterator reported an error.
void BytesTrieTest::checkIterator(const StringPiece &trieBytes,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
BytesTrieIterator iter(trieBytes.data(), 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrieIterator(trieBytes) constructor")) {
BytesTrie::Iterator iter(trieBytes.data(), 0, errorCode);
if(errorCode.logIfFailureAndReset("BytesTrie::Iterator(trieBytes) constructor")) {
return;
}
checkIterator(iter, data, dataLength);
}
void BytesTrieTest::checkIterator(BytesTrieIterator &iter,
void BytesTrieTest::checkIterator(BytesTrie::Iterator &iter,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
for(int32_t i=0; i<dataLength; ++i) {

View File

@ -18,7 +18,6 @@
#include "unicode/uniset.h"
#include "ucharstrie.h"
#include "ucharstriebuilder.h"
#include "ucharstrieiterator.h"
#include "intltest.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -69,7 +68,7 @@ public:
void checkNextWithState(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkNextString(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkIterator(const UnicodeString &trieUChars, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrieIterator &iter, const StringAndValue data[], int32_t dataLength);
void checkIterator(UCharsTrie::Iterator &iter, const StringAndValue data[], int32_t dataLength);
};
extern IntlTest *createUCharsTrieTest() {
@ -560,8 +559,8 @@ void UCharsTrieTest::TestIteratorFromBranch() {
trie.next(u_a);
trie.next(u_n);
IcuTestErrorCode errorCode(*this, "TestIteratorFromBranch()");
UCharsTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trie) constructor")) {
UCharsTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
@ -613,8 +612,8 @@ void UCharsTrieTest::TestIteratorFromLinearMatch() {
trie.next(u_u);
trie.next(u_a);
IcuTestErrorCode errorCode(*this, "TestIteratorFromLinearMatch()");
UCharsTrieIterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trie) constructor")) {
UCharsTrie::Iterator iter(trie, 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the suffixes
@ -636,8 +635,8 @@ void UCharsTrieTest::TestTruncatingIteratorFromRoot() {
return; // buildTrie() reported an error
}
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromRoot()");
UCharsTrieIterator iter(trieUChars.getBuffer(), 4, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trie) constructor")) {
UCharsTrie::Iterator iter(trieUChars.getBuffer(), 4, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
// Expected data: Same as in buildMonthsTrie(), except only the first 4 characters
@ -693,8 +692,8 @@ void UCharsTrieTest::TestTruncatingIteratorFromLinearMatchShort() {
trie.next(u_b);
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchShort()");
// Truncate within the linear-match node.
UCharsTrieIterator iter(trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trie) constructor")) {
UCharsTrie::Iterator iter(trie, 2, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
@ -724,8 +723,8 @@ void UCharsTrieTest::TestTruncatingIteratorFromLinearMatchLong() {
trie.next(u_c);
IcuTestErrorCode errorCode(*this, "TestTruncatingIteratorFromLinearMatchLong()");
// Truncate after the linear-match node.
UCharsTrieIterator iter(trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trie) constructor")) {
UCharsTrie::Iterator iter(trie, 3, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trie) constructor")) {
return;
}
static const StringAndValue expected[]={
@ -989,14 +988,14 @@ void UCharsTrieTest::checkNextString(const UnicodeString &trieUChars,
// Builds an iterator over the serialized trie from its root, with no maximum
// string length (0), and hands it to the iterator-based checkIterator()
// overload together with the expected data.
// Returns early if constructing the iterator reported an error.
void UCharsTrieTest::checkIterator(const UnicodeString &trieUChars,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
UCharsTrieIterator iter(trieUChars.getBuffer(), 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrieIterator(trieUChars) constructor")) {
UCharsTrie::Iterator iter(trieUChars.getBuffer(), 0, errorCode);
if(errorCode.logIfFailureAndReset("UCharsTrie::Iterator(trieUChars) constructor")) {
return;
}
checkIterator(iter, data, dataLength);
}
void UCharsTrieTest::checkIterator(UCharsTrieIterator &iter,
void UCharsTrieTest::checkIterator(UCharsTrie::Iterator &iter,
const StringAndValue data[], int32_t dataLength) {
IcuTestErrorCode errorCode(*this, "checkIterator()");
for(int32_t i=0; i<dataLength; ++i) {

View File

@ -52,7 +52,7 @@ LDFLAGS += $(LDFLAGSICUTOOLUTIL)
LIBS = $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS)
OBJECTS = filestrm.o package.o pkgitems.o swapimpl.o toolutil.o unewdata.o \
stringtriebuilder.o bytestriebuilder.o bytestrieiterator.o \
stringtriebuilder.o bytestriebuilder.o \
ucharstrie.o ucharstriebuilder.o ucharstrieiterator.o \
denseranges.o \
ucm.o ucmstate.o uoptions.o uparse.o \

View File

@ -1,126 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytestrieiterator.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov03
* created by: Markus W. Scherer
*/
#ifndef __BYTESTRIEITERATOR_H__
#define __BYTESTRIEITERATOR_H__
/**
* \file
* \brief C++ API: BytesTrie iterator for all of its (byte sequence, value) pairs.
*/
// Needed if and when we change the .dat package index to a BytesTrie,
// so that icupkg can work with an input package.
#include "unicode/utypes.h"
#include "unicode/stringpiece.h"
#include "bytestrie.h"
#include "charstr.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
* Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
*/
// Stand-alone iterator class over a BytesTrie.
// It holds its string buffer (str_) and backtracking stack (stack_) by value.
class U_TOOLUTIL_API BytesTrieIterator : public UMemory {
public:
/**
* Iterates from the root of a byte-serialized BytesTrie.
* @param trieBytes The trie bytes.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
BytesTrieIterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified BytesTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings/byte sequences.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
BytesTrieIterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Resets this iterator to its initial state.
*/
BytesTrieIterator &reset();
/**
* Finds the next (byte sequence, value) pair if there is one.
*
* If the byte sequence is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @return TRUE if there is another element.
*/
UBool next(UErrorCode &errorCode);
/**
* @return TRUE if there are more elements.
*/
// There is more to deliver while a current position exists or while the
// backtracking stack still holds saved branch states.
UBool hasNext() const { return pos_!=NULL || !stack_.isEmpty(); }
/**
* @return the NUL-terminated byte sequence for the last successful next()
*/
const StringPiece &getString() const { return sp_; }
/**
* @return the value for the last successful next()
*/
int32_t getValue() const { return value_; }
private:
// Delivers the current str_ as a truncated result: clears the current
// position, sets the value to -1, and exposes str_ through sp_.
UBool truncateAndStop() {
pos_=NULL;
value_=-1; // no real value for str
sp_.set(str_.data(), str_.length());
return TRUE;
}
const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
// Start of the serialized trie; stack offsets are relative to this pointer.
const uint8_t *bytes_;
// Current position in the trie, or NULL when there is no current position.
const uint8_t *pos_;
// Position to which reset() returns.
const uint8_t *initialPos_;
int32_t remainingMatchLength_;
// Pending match length to which reset() returns.
int32_t initialRemainingMatchLength_;
// Byte sequence accumulated so far.
CharString str_;
// View of str_ that getString() returns.
StringPiece sp_;
// Maximum string length; 0 means that full strings are returned.
int32_t maxLength_;
// Value returned by getValue().
int32_t value_;
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from BytesTrie.bytes.
// The second integer has the str.length() from before the node in bits 15..0,
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
// but the code looks more confusing that way.)
UVector32 stack_;
};
U_NAMESPACE_END
#endif // __BYTESTRIEITERATOR_H__

View File

@ -247,7 +247,6 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="bytestriebuilder.cpp" />
<ClCompile Include="bytestrieiterator.cpp" />
<ClCompile Include="denseranges.cpp" />
<ClCompile Include="filestrm.c" />
<ClCompile Include="filetools.cpp" />
@ -297,7 +296,6 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="bytestriebuilder.h" />
<ClInclude Include="bytestrieiterator.h" />
<ClInclude Include="denseranges.h" />
<ClInclude Include="filestrm.h" />
<ClInclude Include="filetools.h" />
@ -314,7 +312,6 @@
<ClInclude Include="ucbuf.h" />
<ClInclude Include="ucharstrie.h" />
<ClInclude Include="ucharstriebuilder.h" />
<ClInclude Include="ucharstrieiterator.h" />
<ClInclude Include="ucm.h" />
<ClInclude Include="unewdata.h" />
<ClInclude Include="uoptions.h" />

View File

@ -22,6 +22,7 @@
*/
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uobject.h"
#include "uassert.h"
#include "ustringtrie.h"
@ -29,7 +30,7 @@
U_NAMESPACE_BEGIN
class UCharsTrieBuilder;
class UCharsTrieIterator;
class UVector32;
/**
* Base class for objects to which Unicode characters and strings can be appended.
@ -229,9 +230,99 @@ public:
*/
int32_t getNextUChars(Appendable &out) const;
/**
 * Iterator for all of the (string, value) pairs in a UCharsTrie.
 *
 * An Iterator owns the heap-allocated stack_ object and deletes it in its
 * destructor. It must therefore never be copied: the copy constructor and
 * the assignment operator are declared private and are intentionally left
 * without an implementation.
 */
class Iterator : public UMemory {
public:
    /**
     * Iterates from the root of a UChar-serialized UCharsTrie.
     * @param trieUChars The trie UChars.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    Iterator(const UChar *trieUChars, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Iterates from the current state of the specified UCharsTrie.
     * @param trie The trie whose state will be copied for iteration.
     * @param maxStringLength If 0, the iterator returns full strings.
     *                        Otherwise, the iterator returns strings with this maximum length.
     * @param errorCode Standard ICU error code. Its input value must
     *                  pass the U_SUCCESS() test, or else the function returns
     *                  immediately. Check for U_FAILURE() on output or use with
     *                  function chaining. (See User Guide for details.)
     */
    Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);

    /**
     * Deletes the owned stack_ object.
     */
    ~Iterator();

    /**
     * Resets this iterator to its initial state.
     * @return *this
     */
    Iterator &reset();

    /**
     * @return TRUE if there are more elements.
     */
    UBool hasNext() const;

    /**
     * Finds the next (string, value) pair if there is one.
     *
     * If the string is truncated to the maximum length and does not
     * have a real value, then the value is set to -1.
     * In this case, this "not a real value" is indistinguishable from
     * a real value of -1.
     * @param errorCode Standard ICU error code. If it indicates a failure
     *                  on input, the function returns FALSE immediately.
     * @return TRUE if there is another element.
     */
    UBool next(UErrorCode &errorCode);

    /**
     * @return The string for the last successful next().
     */
    const UnicodeString &getString() const { return str_; }

    /**
     * @return The value for the last successful next().
     */
    int32_t getValue() const { return value_; }

private:
    // Not copyable: a compiler-generated copy would share stack_ with the
    // original, and both destructors would then delete it.
    Iterator(const Iterator &other);  // no implementation
    Iterator &operator=(const Iterator &other);  // no implementation

    // Delivers the current str_ as a truncated result: clears the current
    // position and sets the value to -1.
    UBool truncateAndStop() {
        pos_=NULL;
        value_=-1;  // no real value for str
        return TRUE;
    }

    const UChar *branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode);

    const UChar *uchars_;
    const UChar *pos_;
    const UChar *initialPos_;
    int32_t remainingMatchLength_;
    int32_t initialRemainingMatchLength_;
    UBool skipValue_;  // Skip intermediate value which was already delivered.

    UnicodeString str_;
    int32_t maxLength_;
    int32_t value_;

    // The stack stores pairs of integers for backtracking to another
    // outbound edge of a branch node.
    // The first integer is an offset from uchars_ (the start of the trie UChars).
    // The second integer has the str_.length() from before the node in bits 15..0,
    // and the remaining branch length in bits 31..16.
    // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
    // but the code looks more confusing that way.)
    UVector32 *stack_;  // owned; deleted by the destructor
};
private:
friend class UCharsTrieBuilder;
friend class UCharsTrieIterator;
inline void stop() {
pos_=NULL;

View File

@ -15,26 +15,50 @@
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "ucharstrie.h"
#include "ucharstrieiterator.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
// Root constructor: iteration starts at the first UChar of the serialized
// trie (pos_ and initialPos_ both equal uchars_) with no pending linear-match
// node (both remaining-match lengths are -1).
// If errorCode already indicates a failure on input, the constructor returns
// early and leaves stack_ as NULL.
UCharsTrieIterator::UCharsTrieIterator(const UChar *trieUChars, int32_t maxStringLength,
UErrorCode &errorCode)
UCharsTrie::Iterator::Iterator(const UChar *trieUChars, int32_t maxStringLength,
UErrorCode &errorCode)
: uchars_(trieUChars),
pos_(uchars_), initialPos_(uchars_),
remainingMatchLength_(-1), initialRemainingMatchLength_(-1),
skipValue_(FALSE),
maxLength_(maxStringLength), value_(0), stack_(errorCode) {}
maxLength_(maxStringLength), value_(0), stack_(NULL) {
if(U_FAILURE(errorCode)) {
return;
}
// stack_ is a pointer so that it's easy to turn ucharstrie.h into
// a public API header for which we would want it to depend only on
// other public headers.
// Unlike UCharsTrie itself, its Iterator performs memory allocations anyway
// via the UnicodeString and UVector32 implementations, so this additional
// cost is minimal.
stack_=new UVector32(errorCode);
// A NULL result means that the allocation itself failed.
if(stack_==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
}
UCharsTrieIterator::UCharsTrieIterator(const UCharsTrie &trie, int32_t maxStringLength,
UErrorCode &errorCode)
UCharsTrie::Iterator::Iterator(const UCharsTrie &trie, int32_t maxStringLength,
UErrorCode &errorCode)
: uchars_(trie.uchars_), pos_(trie.pos_), initialPos_(trie.pos_),
remainingMatchLength_(trie.remainingMatchLength_),
initialRemainingMatchLength_(trie.remainingMatchLength_),
skipValue_(FALSE),
maxLength_(maxStringLength), value_(0), stack_(errorCode) {
maxLength_(maxStringLength), value_(0), stack_(NULL) {
if(U_FAILURE(errorCode)) {
return;
}
stack_=new UVector32(errorCode);
if(U_FAILURE(errorCode)) {
return;
}
if(stack_==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
if(length>=0) {
// Pending linear-match node, append remaining UChars to str.
@ -48,7 +72,12 @@ UCharsTrieIterator::UCharsTrieIterator(const UCharsTrie &trie, int32_t maxString
}
}
UCharsTrieIterator &UCharsTrieIterator::reset() {
// Frees the backtracking stack owned by this iterator.
// stack_ may still be NULL when a constructor returned early;
// deleting a NULL pointer is a no-op.
UCharsTrie::Iterator::~Iterator() {
delete stack_;
}
UCharsTrie::Iterator &
UCharsTrie::Iterator::reset() {
pos_=initialPos_;
remainingMatchLength_=initialRemainingMatchLength_;
skipValue_=FALSE;
@ -59,26 +88,29 @@ UCharsTrieIterator &UCharsTrieIterator::reset() {
str_.truncate(length);
pos_+=length;
remainingMatchLength_-=length;
stack_.setSize(0);
stack_->setSize(0);
return *this;
}
UBool
UCharsTrieIterator::next(UErrorCode &errorCode) {
UCharsTrie::Iterator::hasNext() const { return pos_!=NULL || !stack_->isEmpty(); }
UBool
UCharsTrie::Iterator::next(UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return FALSE;
}
const UChar *pos=pos_;
if(pos==NULL) {
if(stack_.isEmpty()) {
if(stack_->isEmpty()) {
return FALSE;
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
int32_t stackSize=stack_.size();
int32_t length=stack_.elementAti(stackSize-1);
pos=uchars_+stack_.elementAti(stackSize-2);
stack_.setSize(stackSize-2);
int32_t stackSize=stack_->size();
int32_t length=stack_->elementAti(stackSize-1);
pos=uchars_+stack_->elementAti(stackSize-2);
stack_->setSize(stackSize-2);
str_.truncate(length&0xffff);
length=(int32_t)((uint32_t)length>>16);
if(length>1) {
@ -149,12 +181,12 @@ UCharsTrieIterator::next(UErrorCode &errorCode) {
// Branch node, needs to take the first outbound edge and push state for the rest.
const UChar *
UCharsTrieIterator::branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode) {
UCharsTrie::Iterator::branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode) {
while(length>UCharsTrie::kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison unit
// Push state for the greater-or-equal edge.
stack_.addElement((int32_t)(UCharsTrie::skipDelta(pos)-uchars_), errorCode);
stack_.addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
stack_->addElement((int32_t)(UCharsTrie::skipDelta(pos)-uchars_), errorCode);
stack_->addElement(((length-(length>>1))<<16)|str_.length(), errorCode);
// Follow the less-than edge.
length>>=1;
pos=UCharsTrie::jumpByDelta(pos);
@ -166,8 +198,8 @@ UCharsTrieIterator::branchNext(const UChar *pos, int32_t length, UErrorCode &err
UBool isFinal=(UBool)(node>>15);
int32_t value=UCharsTrie::readValue(pos, node&=0x7fff);
pos=UCharsTrie::skipValue(pos, node);
stack_.addElement((int32_t)(pos-uchars_), errorCode);
stack_.addElement(((length-1)<<16)|str_.length(), errorCode);
stack_->addElement((int32_t)(pos-uchars_), errorCode);
stack_->addElement(((length-1)<<16)|str_.length(), errorCode);
str_.append(trieUnit);
if(isFinal) {
pos_=NULL;

View File

@ -1,121 +0,0 @@
/*
*******************************************************************************
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucharstrieiterator.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010nov15
* created by: Markus W. Scherer
*/
#ifndef __UCHARSTRIEITERATOR_H__
#define __UCHARSTRIEITERATOR_H__
/**
* \file
* \brief C++ API: UCharsTrie iterator for all of its (string, value) pairs.
*/
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "ucharstrie.h"
#include "uvectr32.h"
U_NAMESPACE_BEGIN
/**
* Iterator for all of the (string, value) pairs in a UCharsTrie.
*
* Call next() repeatedly; after each call that returns TRUE,
* getString() and getValue() describe the pair that was just found.
* reset() returns the iterator to the state it had after construction.
*/
class U_TOOLUTIL_API UCharsTrieIterator : public UMemory {
public:
/**
* Iterates from the root of a UChar-serialized UCharsTrie.
* @param trieUChars The trie UChars.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
UCharsTrieIterator(const UChar *trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Iterates from the current state of the specified UCharsTrie.
* @param trie The trie whose state will be copied for iteration.
* @param maxStringLength If 0, the iterator returns full strings.
* Otherwise, the iterator returns strings with this maximum length.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
*/
UCharsTrieIterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
/**
* Resets this iterator to its initial state.
* @return *this
*/
UCharsTrieIterator &reset();
/**
* Finds the next (string, value) pair if there is one.
*
* If the string is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
* @param errorCode Standard ICU error code. If it already indicates
* a failure on input, the function returns FALSE immediately.
* @return TRUE if there is another element.
*/
UBool next(UErrorCode &errorCode);
/**
* Reports whether iteration can continue: either there is a current
* position in the trie, or there is saved branch state left on the stack.
* @return TRUE if there are more elements.
*/
UBool hasNext() const { return pos_!=NULL || !stack_.isEmpty(); }
/**
* @return the string for the last successful next()
*/
const UnicodeString &getString() const { return str_; }
/**
* @return the value for the last successful next()
*/
int32_t getValue() const { return value_; }
private:
// Stops following the current path: clears pos_ and stores -1 as the
// "not a real value" marker. Always returns TRUE.
// NOTE(review): presumably used by next() when str_ has reached
// maxLength_ -- confirm against the implementation.
UBool truncateAndStop() {
pos_=NULL;
value_=-1; // no real value for str
return TRUE;
}
// Handles a branch node: takes the first outbound edge and pushes
// backtracking state for the remaining edges onto stack_.
const UChar *branchNext(const UChar *pos, int32_t length, UErrorCode &errorCode);
// Start of the serialized trie; stack offsets are relative to this pointer.
const UChar *uchars_;
// Current position in the trie; NULL when the current path is finished
// and the next element (if any) must be resumed from stack_.
const UChar *pos_;
// Value of pos_ at construction time; restored by reset().
const UChar *initialPos_;
// Remaining length of a pending linear-match node, minus 1.
int32_t remainingMatchLength_;
// Value of remainingMatchLength_ at construction time; restored by reset().
int32_t initialRemainingMatchLength_;
UBool skipValue_; // Skip intermediate value which was already delivered.
// String for the current element; returned by getString().
UnicodeString str_;
// Maximum string length passed to the constructor (0 means no limit).
int32_t maxLength_;
// Value for the current element; returned by getValue().
int32_t value_;
// The stack stores pairs of integers for backtracking to another
// outbound edge of a branch node.
// The first integer is an offset from uchars_.
// The second integer has the str_.length() from before the node in bits 15..0,
// and the remaining branch length in bits 31..16.
// (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
// but the code looks more confusing that way.)
UVector32 stack_;
};
U_NAMESPACE_END
#endif // __UCHARSTRIEITERATOR_H__