ICU-8167 share as much code between builder versions and methods as possible

X-SVN-Rev: 29257
This commit is contained in:
Markus Scherer 2011-01-02 07:22:36 +00:00
parent e0c2a3c61c
commit 5e23f1c3e9
6 changed files with 642 additions and 885 deletions

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetriebuilder.cpp
@ -214,17 +214,7 @@ ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode)
errorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
if(buildOption==UDICTTRIE_BUILD_FAST) {
writeNode(0, elementsLength, 0);
} else /* UDICTTRIE_BUILD_SMALL */ {
createCompactBuilder(2*elementsLength, errorCode);
Node *root=makeNode(0, elementsLength, 0, errorCode);
if(U_SUCCESS(errorCode)) {
root->markRightEdgesFirst(-1);
root->write(*this);
}
deleteCompactBuilder();
}
DictTrieBuilder::build(buildOption, elementsLength, errorCode);
if(bytes==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
@ -233,322 +223,64 @@ ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode)
return result;
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length byteIndex.
void
ByteTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t byteIndex) {
UBool hasValue=FALSE;
int32_t value=0;
if(byteIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
writeValueAndFinal(value, TRUE); // final-value node
return;
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than byteIndex.
const ByteTrieElement &minElement=elements[start];
const ByteTrieElement &maxElement=elements[limit-1];
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
if(minByte==maxByte) {
// Linear-match node: All strings have the same character at byteIndex.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastByteIndex=byteIndex;
while(++lastByteIndex<minStringLength &&
minElement.charAt(lastByteIndex, strings)==
maxElement.charAt(lastByteIndex, strings)) {}
writeNode(start, limit, lastByteIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
const char *s=minElement.getString(strings).data();
int32_t length=lastByteIndex-byteIndex;
while(length>ByteTrie::kMaxLinearMatchLength) {
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
length-=ByteTrie::kMaxLinearMatchLength;
write(s+lastByteIndex, ByteTrie::kMaxLinearMatchLength);
write(ByteTrie::kMinLinearMatch+ByteTrie::kMaxLinearMatchLength-1);
}
write(s+byteIndex, length);
write(ByteTrie::kMinLinearMatch+length-1);
} else {
// Branch node.
int32_t length=0; // Number of different bytes at byteIndex.
int32_t i=start;
do {
char byte=elements[i++].charAt(byteIndex, strings);
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minByte!=maxByte.
writeBranchSubNode(start, limit, byteIndex, length);
write(--length);
if(length>=ByteTrie::kMinLinearMatch) {
write(0);
}
}
if(hasValue) {
writeValueAndFinal(value, FALSE);
}
// Returns the length of the string that belongs to element i.
int32_t
ByteTrieBuilder::getElementStringLength(int32_t i) const {
    const ByteTrieElement &element=elements[i];
    return element.getStringLength(strings);
}
// Writes the branch sub-node for the elements [start..limit[ at byteIndex,
// together with the sub-nodes that its bytes lead to.
// While there are more than kMaxBranchLinearSubNodeLength different bytes,
// the range is split on a middle byte; the remaining bytes are written as a
// list of byte-value pairs.
// start<limit && all strings longer than byteIndex &&
// length different bytes at byteIndex
void
ByteTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length) {
// One entry per split level created by the loop below, indexed by ltLength.
// NOTE(review): ltLength is not checked against the array size of 16 here.
char middleBytes[16];
int32_t lessThan[16];
int32_t ltLength=0;
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle byte.
// First, find the middle byte.
int32_t count=length/2;
int32_t i=start;
char byte;
// Skip over length/2 groups of elements that share the same byte at byteIndex.
do {
byte=elements[i++].charAt(byteIndex, strings);
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
writeBranchSubNode(start, i, byteIndex, length/2);
// Remember bytesLength right after the less-than branch was written;
// it is turned into a delta when the split-branch node is written below.
lessThan[ltLength]=bytesLength;
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each byte, find its elements array start and whether it has a final value.
int32_t starts[ByteTrie::kMaxBranchLinearSubNodeLength];
UBool final[ByteTrie::kMaxBranchLinearSubNodeLength-1];
int32_t byteNumber=0;
do {
int32_t i=starts[byteNumber]=start;
char byte=elements[i++].charAt(byteIndex, strings);
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
// Final: exactly one element has this byte, and its string ends right after it.
final[byteNumber]= start==i-1 && byteIndex+1==elements[start].getStringLength(strings);
start=i;
} while(++byteNumber<length-1);
// byteNumber==length-1, and the maxByte elements range is [start..limit[
starts[byteNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minByte sub-node first,
// then its jump delta would be larger.
// Instead we write the minByte sub-node last, for a shorter delta.
int32_t jumpTargets[ByteTrie::kMaxBranchLinearSubNodeLength-1];
do {
--byteNumber;
if(!final[byteNumber]) {
writeNode(starts[byteNumber], starts[byteNumber+1], byteIndex+1);
// bytesLength after writing the sub-node is the target of this byte's jump.
jumpTargets[byteNumber]=bytesLength;
}
} while(byteNumber>0);
// The maxByte sub-node is written as the very last one because we do
// not jump for it at all.
byteNumber=length-1;
writeNode(start, limit, byteIndex+1);
write((uint8_t)elements[start].charAt(byteIndex, strings));
// Write the rest of this node's byte-value pairs.
while(--byteNumber>=0) {
start=starts[byteNumber];
int32_t value;
if(final[byteNumber]) {
// Write the final value for the one string ending with this byte.
value=elements[start].getValue();
} else {
// Write the delta to the start position of the sub-node.
value=bytesLength-jumpTargets[byteNumber];
}
writeValueAndFinal(value, final[byteNumber]);
write((uint8_t)elements[start].charAt(byteIndex, strings));
}
// Write the split-branch nodes.
// They are written in the reverse of the order in which the splits were made.
while(ltLength>0) {
--ltLength;
writeDelta(bytesLength-lessThan[ltLength]); // less-than
write((uint8_t)middleBytes[ltLength]);
}
}
// Creates the Node object for the elements [start..limit[ at byteIndex
// and returns the node returned by the last registerNode()/registerFinalValue() call.
// Returns NULL immediately if errorCode already indicates a failure.
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length byteIndex.
DictTrieBuilder::Node *
ByteTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UBool hasValue=FALSE;
int32_t value=0;
if(byteIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
// Only this one string is left: The node is just its final value.
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
Node *node;
// Now all [start..limit[ strings are longer than byteIndex.
const ByteTrieElement &minElement=elements[start];
const ByteTrieElement &maxElement=elements[limit-1];
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
if(minByte==maxByte) {
// Linear-match node: All strings have the same character at byteIndex.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastByteIndex=byteIndex;
// Extend the match while the first and last strings agree.
while(++lastByteIndex<minStringLength &&
minElement.charAt(lastByteIndex, strings)==
maxElement.charAt(lastByteIndex, strings)) {}
Node *nextNode=makeNode(start, limit, lastByteIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
// The chunks are created from the end of the sequence towards its start,
// each one linking to the previously created node.
const char *s=minElement.getString(strings).data();
int32_t length=lastByteIndex-byteIndex;
while(length>ByteTrie::kMaxLinearMatchLength) {
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
length-=ByteTrie::kMaxLinearMatchLength;
node=new BTLinearMatchNode(
s+lastByteIndex,
ByteTrie::kMaxLinearMatchLength,
nextNode);
node=registerNode(node, errorCode);
nextNode=node;
}
node=new BTLinearMatchNode(s+byteIndex, length, nextNode);
} else {
// Branch node.
int32_t length=0; // Number of different bytes at byteIndex.
int32_t i=start;
do {
char byte=elements[i++].charAt(byteIndex, strings);
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minByte!=maxByte.
Node *subNode=makeBranchSubNode(start, limit, byteIndex, length, errorCode);
node=new BTBranchHeadNode(length, subNode);
}
// NOTE(review): the result of new is passed to registerNode() without a NULL check here;
// presumably registerNode() handles a NULL node -- confirm.
node=registerNode(node, errorCode);
if(hasValue) {
// Wrap the match/branch node in a node that carries the intermediate value.
node=registerNode(new BTValueNode(value, node), errorCode);
}
return node;
}
// Creates the Node objects for the branch on the elements [start..limit[ at byteIndex:
// zero or more split-branch nodes around one list-branch node.
// Returns NULL if errorCode indicates a failure on entry or after creating the
// less-than branches, or if the list node cannot be allocated.
// start<limit && all strings longer than byteIndex &&
// length different bytes at byteIndex
DictTrieBuilder::Node *
ByteTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
// One entry per split level created by the loop below, indexed by ltLength.
// NOTE(review): ltLength is not checked against the array size of 16 here.
char middleBytes[16];
Node *lessThan[16];
int32_t ltLength=0;
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle byte.
// First, find the middle byte.
int32_t count=length/2;
int32_t i=start;
char byte;
// Skip over length/2 groups of elements that share the same byte at byteIndex.
do {
byte=elements[i++].charAt(byteIndex, strings);
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
lessThan[ltLength]=makeBranchSubNode(start, i, byteIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
BTListBranchNode *listNode=new BTListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each byte, find its elements array start and whether it has a final value.
int32_t byteNumber=0;
do {
int32_t i=start;
char byte=elements[i++].charAt(byteIndex, strings);
while(byte==elements[i].charAt(byteIndex, strings)) {
++i;
}
// A single element whose string ends right after this byte contributes
// its value directly; otherwise the byte leads to a sub-node.
if(start==i-1 && byteIndex+1==elements[start].getStringLength(strings)) {
listNode->add((uint8_t)byte, elements[start].getValue());
} else {
listNode->add((uint8_t)byte, makeNode(start, i, byteIndex+1, errorCode));
}
start=i;
} while(++byteNumber<length-1);
// byteNumber==length-1, and the maxByte elements range is [start..limit[
char byte=elements[start].charAt(byteIndex, strings);
if(start==limit-1 && byteIndex+1==elements[start].getStringLength(strings)) {
listNode->add((uint8_t)byte, elements[start].getValue());
} else {
listNode->add((uint8_t)byte, makeNode(start, limit, byteIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
// They are created in the reverse of the order in which the splits were made,
// each one taking the previously registered node as its greater-or-equal branch.
while(ltLength>0) {
--ltLength;
node=registerNode(
new BTSplitBranchNode(middleBytes[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
// Writes this node's value as a final value and stores the result of that
// write call in offset.
void
ByteTrieBuilder::BTFinalValueNode::write(DictTrieBuilder &builder) {
    ByteTrieBuilder &byteBuilder=(ByteTrieBuilder &)builder;
    const int32_t written=byteBuilder.writeValueAndFinal(value, TRUE);
    offset=written;
}
UBool
ByteTrieBuilder::BTValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const BTValueNode &o=(const BTValueNode &)other;
return next==o.next;
// Returns the byte at byteIndex of element i's string,
// converted via uint8_t so that it is not sign-extended.
UChar
ByteTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    const uint8_t unit=(uint8_t)elements[i].charAt(byteIndex, strings);
    return unit;
}
int32_t
ByteTrieBuilder::BTValueNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
ByteTrieBuilder::getElementValue(int32_t i) const {
return elements[i].getValue();
}
void
ByteTrieBuilder::BTValueNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
next->write(builder);
offset=b.writeValueAndFinal(value, FALSE);
// Starting after byteIndex, returns the first index at which the strings of
// the elements first and last differ, or the length of the first element's
// string if that is reached before a difference is found.
int32_t
ByteTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    const ByteTrieElement &low=elements[first];
    const ByteTrieElement &high=elements[last];
    const int32_t lowLength=low.getStringLength(strings);
    for(;;) {
        ++byteIndex;
        if(byteIndex>=lowLength) {
            break;
        }
        if(low.charAt(byteIndex, strings)!=high.charAt(byteIndex, strings)) {
            break;
        }
    }
    return byteIndex;
}
// Counts the runs of equal bytes at byteIndex among the elements [start..limit[.
// At least one element is always examined, so the result is at least 1.
int32_t
ByteTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    int32_t distinct=0;  // Number of different bytes at byteIndex.
    int32_t pos=start;
    do {
        const char current=elements[pos].charAt(byteIndex, strings);
        ++pos;
        // Skip the remaining elements that share this byte.
        while(pos<limit && elements[pos].charAt(byteIndex, strings)==current) {
            ++pos;
        }
        ++distinct;
    } while(pos<limit);
    return distinct;
}
// Starting at element i, skips over runs of elements that share the same byte
// at byteIndex: one run if count<=1, otherwise count runs.
// Returns the index of the first element after the skipped runs.
// The inner scan has no upper bound of its own; it stops at the first element
// with a different byte.
int32_t
ByteTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    do {
        const char current=elements[i].charAt(byteIndex, strings);
        ++i;
        while(elements[i].charAt(byteIndex, strings)==current) {
            ++i;
        }
        --count;
    } while(count>0);
    return i;
}
// Starting at element i, returns the index of the first element whose byte at
// byteIndex differs from the given byte.
// The scan has no upper bound of its own; it stops at the first differing element.
int32_t
ByteTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    const char target=(char)byte;
    for(; elements[i].charAt(byteIndex, strings)==target; ++i) {}
    return i;
}
ByteTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
@ -573,74 +305,16 @@ ByteTrieBuilder::BTLinearMatchNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
next->write(builder);
b.write(s, length);
offset=b.write(minLinearMatch()+length-1);
offset=b.write(b.getMinLinearMatch()+length-1);
}
// Writes the sub-nodes of this list-branch node and then its byte-value pairs.
// offset is updated by each write of a unit in the final loop.
// NOTE(review): the first loop decrements byteNumber before using it as an index,
// so this appears to assume length>=2 -- confirm.
void
ByteTrieBuilder::BTListBranchNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minByte sub-node first,
// then its jump delta would be larger.
// Instead we write the minByte sub-node last, for a shorter delta.
int32_t byteNumber=length-1;
Node *rightEdge=equal[byteNumber];
// If the last byte has a final value (no sub-node), use this node's own firstEdgeNumber.
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--byteNumber;
if(equal[byteNumber]!=NULL) {
equal[byteNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(byteNumber>0);
// The maxByte sub-node is written as the very last one because we do
// not jump for it at all.
byteNumber=length-1;
if(rightEdge==NULL) {
b.writeValueAndFinal(values[byteNumber], TRUE);
} else {
rightEdge->write(builder);
}
b.write(units[byteNumber]);
// Write the rest of this node's byte-value pairs.
while(--byteNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[byteNumber]==NULL) {
// Write the final value for the one string ending with this byte.
value=values[byteNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
U_ASSERT(equal[byteNumber]->getOffset()>0);
value=b.bytesLength-equal[byteNumber]->getOffset();
isFinal=FALSE;
}
b.writeValueAndFinal(value, isFinal);
offset=b.write(units[byteNumber]);
}
}
// Writes both branches of this split-branch node, then the delta to the
// less-than branch, then the split unit. offset is set from the last write call.
void
ByteTrieBuilder::BTSplitBranchNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
// Encode the less-than branch first.
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
// Encode the greater-or-equal branch last because we do not jump for it at all.
greaterOrEqual->write(builder);
// Write this node.
// The delta is computed from b.bytesLength at this point and the less-than node's offset.
U_ASSERT(lessThan->getOffset()>0);
b.writeDelta(b.bytesLength-lessThan->getOffset()); // less-than
offset=b.write(unit);
}
void
ByteTrieBuilder::BTBranchHeadNode::write(DictTrieBuilder &builder) {
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
next->write(builder);
offset=b.write((length-1));
if(length>minLinearMatch()) {
offset=b.write(0);
}
// Allocates a BTLinearMatchNode for length bytes of element i's string,
// starting at byteIndex, followed by nextNode.
// The result of new is returned as is.
DictTrieBuilder::Node *
ByteTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
                                       Node *nextNode) const {
    const char *matchBytes=elements[i].getString(strings).data()+byteIndex;
    return new BTLinearMatchNode(matchBytes, length, nextNode);
}
UBool
@ -689,6 +363,11 @@ ByteTrieBuilder::write(const char *b, int32_t length) {
return bytesLength;
}
// Writes length bytes of element i's string, starting at byteIndex,
// and returns the result of the write() call.
int32_t
ByteTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    const char *elementBytes=elements[i].getString(strings).data();
    return write(elementBytes+byteIndex, length);
}
int32_t
ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
char intBytes[5];
@ -722,7 +401,17 @@ ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
}
int32_t
ByteTrieBuilder::writeDelta(int32_t i) {
ByteTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
    // Write the node's lead byte first; if there is an intermediate value,
    // write it afterwards as a non-final value and return that result instead.
    const int32_t nodeOffset=write(node);
    if(!hasValue) {
        return nodeOffset;
    }
    return writeValueAndFinal(value, FALSE);
}
int32_t
ByteTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
int32_t i=bytesLength-jumpTarget;
char intBytes[5];
int32_t length;
U_ASSERT(i>=0);

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: bytetriebuilder.h
@ -46,43 +46,21 @@ public:
}
private:
void writeNode(int32_t start, int32_t limit, int32_t byteIndex);
void writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);
virtual int32_t getElementStringLength(int32_t i) const;
virtual UChar getElementUnit(int32_t i, int32_t byteIndex) const;
virtual int32_t getElementValue(int32_t i) const;
Node *makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode);
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
int32_t length, UErrorCode &errorCode);
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const;
UBool ensureCapacity(int32_t length);
int32_t write(int32_t byte);
int32_t write(const char *b, int32_t length);
int32_t writeValueAndFinal(int32_t i, UBool final);
int32_t writeDelta(int32_t i);
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const;
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const;
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const;
// Compacting builder.
virtual UBool matchNodesCanHaveValues() const { return FALSE; }
// Indirect "friend" access.
// Nested classes cannot be friends of ByteTrie unless the whole header is included,
// at least with AIX xlC_r,
// so this Builder class, which is a friend, provides the necessary value.
static int32_t minLinearMatch() { return ByteTrie::kMinLinearMatch; }
class BTFinalValueNode : public FinalValueNode {
public:
BTFinalValueNode(int32_t v) : FinalValueNode(v) {}
virtual void write(DictTrieBuilder &builder);
};
class BTValueNode : public ValueNode {
public:
BTValueNode(int32_t v, Node *nextNode)
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(DictTrieBuilder &builder);
private:
Node *next;
};
virtual int32_t getMaxBranchLinearSubNodeLength() const { return ByteTrie::kMaxBranchLinearSubNodeLength; }
virtual int32_t getMinLinearMatch() const { return ByteTrie::kMinLinearMatch; }
virtual int32_t getMaxLinearMatchLength() const { return ByteTrie::kMaxLinearMatchLength; }
class BTLinearMatchNode : public LinearMatchNode {
public:
@ -93,26 +71,16 @@ private:
const char *s;
};
class BTListBranchNode : public ListBranchNode {
public:
BTListBranchNode() : ListBranchNode() {}
virtual void write(DictTrieBuilder &builder);
};
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
Node *nextNode) const;
class BTSplitBranchNode : public SplitBranchNode {
public:
BTSplitBranchNode(char middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: SplitBranchNode((uint8_t)middleUnit, lessThanNode, greaterOrEqualNode) {}
virtual void write(DictTrieBuilder &builder);
};
class BTBranchHeadNode : public BranchHeadNode {
public:
BTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
virtual void write(DictTrieBuilder &builder);
};
virtual Node *createFinalValueNode(int32_t value) const { return new BTFinalValueNode(value); }
UBool ensureCapacity(int32_t length);
virtual int32_t write(int32_t byte);
int32_t write(const char *b, int32_t length);
virtual int32_t writeElementUnits(int32_t i, int32_t byteIndex, int32_t length);
virtual int32_t writeValueAndFinal(int32_t i, UBool final);
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
virtual int32_t writeDeltaTo(int32_t jumpTarget);
CharString strings;
ByteTrieElement *elements;

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: dicttriebuilder.cpp
@ -63,6 +63,257 @@ DictTrieBuilder::deleteCompactBuilder() {
nodes=NULL;
}
// Builds the trie for elements [0..elementsLength[.
// UDICTTRIE_BUILD_FAST writes the nodes directly via writeNode().
// Any other option (UDICTTRIE_BUILD_SMALL) first creates Node objects via
// makeNode() inside a compact builder, writes them only if errorCode
// indicates success, and always deletes the compact builder afterwards.
void
DictTrieBuilder::build(UDictTrieBuildOption buildOption, int32_t elementsLength,
                       UErrorCode &errorCode) {
    if(buildOption==UDICTTRIE_BUILD_FAST) {
        writeNode(0, elementsLength, 0);
        return;
    }
    /* UDICTTRIE_BUILD_SMALL */
    createCompactBuilder(2*elementsLength, errorCode);
    Node *rootNode=makeNode(0, elementsLength, 0, errorCode);
    if(U_SUCCESS(errorCode)) {
        rootNode->markRightEdgesFirst(-1);
        rootNode->write(*this);
    }
    deleteCompactBuilder();
}
// Writes the node for the elements [start..limit[ at unitIndex, including
// everything it leads to, and returns the result of the last write call
// made for this node (callers use it as a jump target).
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
int32_t
DictTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
UBool hasValue=FALSE;
int32_t value=0;
// Lead value for this node; assigned in both branches below and passed to writeValueAndType().
int32_t type;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
return writeValueAndFinal(value, TRUE); // final-value node
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
// Write what follows the matched units before the match node itself.
writeNode(start, limit, lastUnitIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
// The chunks are written from the end of the sequence towards its start.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
write(getMinLinearMatch()+maxLinearMatchLength-1);
}
writeElementUnits(start, unitIndex, length);
type=getMinLinearMatch()+length-1;
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
writeBranchSubNode(start, limit, unitIndex, length);
// If length-1 is below getMinLinearMatch() it is used directly as the type;
// otherwise length-1 is written as a separate unit and the type is 0.
if(--length<getMinLinearMatch()) {
type=length;
} else {
write(length);
type=0;
}
}
return writeValueAndType(hasValue, value, type);
}
// Writes the branch sub-node for the elements [start..limit[ at unitIndex,
// together with the sub-nodes that its units lead to, and returns the result
// of the last write() call.
// While there are more than getMaxBranchLinearSubNodeLength() different units,
// the range is split on a middle unit; the remaining units are written as a
// list of unit-value pairs.
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
int32_t
DictTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
// One entry per split level created by the loop below, indexed by ltLength.
// NOTE(review): ltLength is not checked against kMaxSplitBranchLevels here.
UChar middleUnits[kMaxSplitBranchLevels];
int32_t lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Encode the less-than branch first.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
// The returned value is used below as the jump target for the less-than branch.
lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each unit, find its elements array start and whether it has a final value.
// NOTE(review): the arrays are sized by kMaxBranchLinearSubNodeLength while the loop
// above is bounded by the virtual getMaxBranchLinearSubNodeLength();
// presumably the latter never exceeds the former -- confirm.
int32_t starts[kMaxBranchLinearSubNodeLength];
UBool final[kMaxBranchLinearSubNodeLength-1];
int32_t unitNumber=0;
do {
int32_t i=starts[unitNumber]=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
// Final: exactly one element has this unit, and its string ends right after it.
final[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
starts[unitNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
do {
--unitNumber;
if(!final[unitNumber]) {
jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
writeNode(start, limit, unitIndex+1);
int32_t offset=write(getElementUnit(start, unitIndex));
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
start=starts[unitNumber];
int32_t value;
if(final[unitNumber]) {
// Write the final value for the one string ending with this unit.
value=getElementValue(start);
} else {
// Write the delta to the start position of the sub-node.
value=offset-jumpTargets[unitNumber];
}
writeValueAndFinal(value, final[unitNumber]);
offset=write(getElementUnit(start, unitIndex));
}
// Write the split-branch nodes.
// They are written in the reverse of the order in which the splits were made.
while(ltLength>0) {
--ltLength;
writeDeltaTo(lessThan[ltLength]);
offset=write(middleUnits[ltLength]);
}
return offset;
}
// Creates the Node object for the elements [start..limit[ at unitIndex
// and returns the node returned by registerNode()/registerFinalValue().
// Returns NULL immediately if errorCode already indicates a failure.
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
DictTrieBuilder::Node *
DictTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UBool hasValue=FALSE;
int32_t value=0;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
// Only this one string is left: The node is just its final value.
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
Node *node;
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
// The chunks are created from the end of the sequence towards its start,
// each one linking to the previously registered node.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
nextNode=registerNode(node, errorCode);
}
node=createLinearMatchNode(start, unitIndex, length, nextNode);
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
node=new BranchHeadNode(length, subNode);
}
// Attach the intermediate value: either directly on the match/branch node,
// or via a separate IntermediateValueNode in front of the registered node.
if(hasValue && node!=NULL) {
if(matchNodesCanHaveValues()) {
((ValueNode *)node)->setValue(value);
} else {
node=new IntermediateValueNode(value, registerNode(node, errorCode));
}
}
// NOTE(review): node may be NULL here if an allocation failed;
// presumably registerNode() handles that -- confirm.
return registerNode(node, errorCode);
}
// Creates the Node objects for the branch on the elements [start..limit[ at unitIndex:
// zero or more split-branch nodes around one list-branch node.
// Returns NULL if errorCode indicates a failure on entry or after creating the
// less-than branches, or if the list node cannot be allocated.
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
DictTrieBuilder::Node *
DictTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
// One entry per split level created by the loop below, indexed by ltLength.
// NOTE(review): ltLength is not checked against kMaxSplitBranchLevels here.
UChar middleUnits[kMaxSplitBranchLevels];
Node *lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Create the less-than branch.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
ListBranchNode *listNode=new ListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t unitNumber=0;
do {
int32_t i=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
// A single element whose string ends right after this unit contributes
// its value directly; otherwise the unit leads to a sub-node.
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
}
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
UChar unit=getElementUnit(start, unitIndex);
if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
// They are created in the reverse of the order in which the splits were made,
// each one taking the previously registered node as its greater-or-equal branch.
while(ltLength>0) {
--ltLength;
node=registerNode(
new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
DictTrieBuilder::Node *
DictTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
@ -102,7 +353,7 @@ DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
if(old!=NULL) {
return (Node *)old->key.pointer;
}
Node *newNode=createFinalValueNode(value);
Node *newNode=new FinalValueNode(value);
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
@ -121,19 +372,25 @@ DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
return newNode;
}
UBool DictTrieBuilder::hashNode(const void *node) {
UBool
DictTrieBuilder::hashNode(const void *node) {
return ((const Node *)node)->hashCode();
}
UBool DictTrieBuilder::equalNodes(const void *left, const void *right) {
UBool
DictTrieBuilder::equalNodes(const void *left, const void *right) {
return *(const Node *)left==*(const Node *)right;
}
UBool DictTrieBuilder::Node::operator==(const Node &other) const {
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder)
UBool
DictTrieBuilder::Node::operator==(const Node &other) const {
return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
}
int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
int32_t
DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber;
}
@ -142,28 +399,25 @@ int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder::Node)
UBool DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
UBool
DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
// Not:
// if(!Node::operator==(other)) {
// return FALSE;
// }
// because registerFinalValue() compares a stack-allocated FinalValueNode
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
// with the specific builder's subclass of FinalValueNode,
// and !Node::operator==(other) will always be false for that because it
// compares the typeid's.
// This workaround assumes that the subclass does not add fields that need to be compared.
if(hash!=other.hashCode()) {
if(!Node::operator==(other)) {
return FALSE;
}
const FinalValueNode *o=dynamic_cast<const FinalValueNode *>(&other);
return o!=NULL && value==o->value;
const FinalValueNode &o=(const FinalValueNode &)other;
return value==o.value;
}
UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
// Writes this node's value as a final value via the builder
// and records the builder's return value as this node's offset.
void
DictTrieBuilder::FinalValueNode::write(DictTrieBuilder &builder) {
    const int32_t writtenOffset=builder.writeValueAndFinal(value, TRUE);
    offset=writtenOffset;
}
UBool
DictTrieBuilder::ValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
@ -174,7 +428,34 @@ UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
return hasValue==o.hasValue && (!hasValue || value==o.value);
}
UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
// Two intermediate-value nodes are equal if they are the same object, or if
// ValueNode::operator==() accepts them and they point to the same next node.
UBool
DictTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
    if(this!=&other) {
        if(!ValueNode::operator==(other)) {
            return FALSE;
        }
        // The cast is performed only after the base-class comparison succeeded.
        const IntermediateValueNode &that=(const IntermediateValueNode &)other;
        return next==that.next;
    }
    return TRUE;
}
// Marks this node with an edge number unless it already has a non-zero offset.
// The edge number is obtained by forwarding the call to the next node;
// the (possibly updated) edge number is returned.
int32_t
DictTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        // Already marked: leave the offset alone and pass the number through.
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
// Writes the next node first, then this node's value as a non-final value.
// The builder's return value for the value write becomes this node's offset.
void
DictTrieBuilder::IntermediateValueNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    const int32_t writtenOffset=builder.writeValueAndFinal(value, FALSE);
    offset=writtenOffset;
}
UBool
DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
@ -185,14 +466,16 @@ UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
return length==o.length && next==o.next;
}
int32_t DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
// Marks this node with an edge number unless it already has a non-zero offset.
// The number comes from the next node's markRightEdgesFirst();
// the (possibly updated) edge number is returned.
int32_t
DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
UBool
DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
@ -208,7 +491,8 @@ UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
return TRUE;
}
int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
int32_t
DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
int32_t step=0;
@ -226,7 +510,51 @@ int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber)
return edgeNumber;
}
UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
// Writes this list-branch node through the builder:
// first the sub-nodes of all units except the last one (in reverse order),
// then the last unit's sub-node or final value, and finally the
// unit-value pairs from the last unit down to the first.
// offset is updated with the return value of each builder.write(unit) call,
// so after the function it holds the result of the last unit written.
// NOTE(review): the do-while loop below decrements unitNumber before the first
// equal[] access, so this appears to assume length>=2 -- confirm against the callers.
void
DictTrieBuilder::ListBranchNode::write(DictTrieBuilder &builder) {
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t unitNumber=length-1;
Node *rightEdge=equal[unitNumber];
// Without a right-edge sub-node (the last unit has a final value),
// firstEdgeNumber is used as the right-edge number.
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--unitNumber;
// A NULL entry means that this unit has a final value and no sub-node to write.
if(equal[unitNumber]!=NULL) {
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==NULL) {
builder.writeValueAndFinal(values[unitNumber], TRUE);
} else {
rightEdge->write(builder);
}
offset=builder.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[unitNumber]==NULL) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
// offset still holds the result of the most recent builder.write(unit).
U_ASSERT(equal[unitNumber]->getOffset()>0);
value=offset-equal[unitNumber]->getOffset();
isFinal=FALSE;
}
builder.writeValueAndFinal(value, isFinal);
offset=builder.write(units[unitNumber]);
}
}
UBool
DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
@ -237,7 +565,8 @@ UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
}
int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
int32_t
DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
@ -246,7 +575,20 @@ int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber
return edgeNumber;
}
UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
// Writes this split-branch node through the builder:
// the less-than sub-node (via writeUnlessInsideRightEdge()), then the
// greater-or-equal sub-node, then the delta to the less-than sub-node,
// and finally the split unit. offset is set to the return value of the
// final builder.write(unit) call.
void
DictTrieBuilder::SplitBranchNode::write(DictTrieBuilder &builder) {
// Encode the less-than branch first.
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
// Encode the greater-or-equal branch last because we do not jump for it at all.
greaterOrEqual->write(builder);
// Write this node.
// The less-than sub-node is expected to have a positive offset at this point.
U_ASSERT(lessThan->getOffset()>0);
builder.writeDeltaTo(lessThan->getOffset()); // less-than
offset=builder.write(unit);
}
UBool
DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
@ -257,11 +599,23 @@ UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
return length==o.length && next==o.next;
}
int32_t DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
// Marks this node with an edge number unless it already has a non-zero offset.
// The number is produced by the branch sub-node (next);
// the (possibly updated) edge number is returned.
int32_t
DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
    if(offset!=0) {
        return edgeNumber;
    }
    edgeNumber=next->markRightEdgesFirst(edgeNumber);
    offset=edgeNumber;
    return edgeNumber;
}
// Writes the branch sub-node first, then this head node.
// If length does not exceed the builder's getMinLinearMatch(), length-1 is
// passed as the node argument of writeValueAndType().
// Otherwise length-1 is written as a separate unit and 0 is passed instead.
// offset receives the return value of writeValueAndType().
void
DictTrieBuilder::BranchHeadNode::write(DictTrieBuilder &builder) {
    next->write(builder);
    int32_t nodeType=length-1;
    if(length>builder.getMinLinearMatch()) {
        // Too large for the lead unit: emit the length on its own.
        builder.write(nodeType);
        nodeType=0;
    }
    offset=builder.writeValueAndType(hasValue, value, nodeType);
}
U_NAMESPACE_END

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: dicttriebuilder.h
@ -28,7 +28,7 @@ enum UDictTrieBuildOption {
U_NAMESPACE_BEGIN
class U_TOOLUTIL_API DictTrieBuilder : public UMemory {
class U_TOOLUTIL_API DictTrieBuilder : public UObject {
public:
/** @internal */
static UBool hashNode(const void *node);
@ -39,11 +39,46 @@ protected:
DictTrieBuilder();
virtual ~DictTrieBuilder();
class Node;
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
void deleteCompactBuilder();
void build(UDictTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
int32_t writeNode(int32_t start, int32_t limit, int32_t byteIndex);
int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);
class Node;
Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode);
virtual int32_t getElementStringLength(int32_t i) const = 0;
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
virtual int32_t getElementValue(int32_t i) const = 0;
// Finds the first unit index after this one where
// the first and last element have different units again.
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
// Counts the number of different units at unitIndex among the [start..limit[ elements.
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
virtual UBool matchNodesCanHaveValues() const = 0;
virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
virtual int32_t getMinLinearMatch() const = 0;
virtual int32_t getMaxLinearMatchLength() const = 0;
// max(ByteTrie::kMaxBranchLinearSubNodeLength, UCharTrie::kMaxBranchLinearSubNodeLength).
static const int32_t kMaxBranchLinearSubNodeLength=5;
// Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
// log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
static const int32_t kMaxSplitBranchLevels=14;
/**
* Makes sure that there is only one unique node registered that is
* equivalent to newNode.
@ -81,8 +116,6 @@ protected:
* a Node pointer, or before setting a new UErrorCode.
*/
virtual Node *createFinalValueNode(int32_t value) const = 0;
// Hash set of nodes, maps from nodes to integer 1.
UHashtable *nodes;
@ -146,12 +179,17 @@ protected:
virtual UClassID getDynamicClassID() const;
};
// This class should not be overridden because
// registerFinalValue() compares a stack-allocated FinalValueNode
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
// with the input node, and the
// !Node::operator==(other) used inside FinalValueNode::operator==(other)
// will be false if the typeid's are different.
class FinalValueNode : public Node {
public:
FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
virtual UBool operator==(const Node &other) const;
// Dummy default implementation, must be overridden for real writing.
virtual void write(DictTrieBuilder & /*builder*/) {}
virtual void write(DictTrieBuilder &builder);
protected:
int32_t value;
};
@ -170,6 +208,17 @@ protected:
int32_t value;
};
// A ValueNode that carries a value and a pointer to a following node.
class IntermediateValueNode : public ValueNode {
public:
// Initializes the ValueNode hash from a class-specific constant combined with
// the hash code of nextNode, stores nextNode, and sets the value via setValue(v).
IntermediateValueNode(int32_t v, Node *nextNode)
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
// Equal if ValueNode::operator==() accepts the other node and both have the same next pointer.
virtual UBool operator==(const Node &other) const;
// Forwards the edge numbering to the next node unless this node is already marked.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
// Writes the next node, then this node's value as a non-final value.
virtual void write(DictTrieBuilder &builder);
protected:
// The node that follows this value; the pointer is stored as given to the constructor.
Node *next;
};
class LinearMatchNode : public ValueNode {
public:
LinearMatchNode(int32_t len, Node *nextNode)
@ -194,6 +243,7 @@ protected:
ListBranchNode() : BranchNode(0x444444), length(0) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(DictTrieBuilder &builder);
// Adds a unit with a final value.
void add(int32_t c, int32_t value) {
units[length]=(UChar)c;
@ -211,11 +261,10 @@ protected:
hash=(hash*37+c)*37+hashCode(node);
}
protected:
// TODO: 10 -> max(BT/UCT max list lengths)
Node *equal[10]; // NULL means "has final value".
Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
int32_t length;
int32_t values[10];
UChar units[10];
int32_t values[kMaxBranchLinearSubNodeLength];
UChar units[kMaxBranchLinearSubNodeLength];
};
class SplitBranchNode : public BranchNode {
@ -226,6 +275,7 @@ protected:
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(DictTrieBuilder &builder);
protected:
UChar unit;
Node *lessThan;
@ -240,10 +290,24 @@ protected:
length(len), next(subNode) {}
virtual UBool operator==(const Node &other) const;
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
virtual void write(DictTrieBuilder &builder);
protected:
int32_t length;
Node *next; // A branch sub-node.
};
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
Node *nextNode) const = 0;
virtual int32_t write(int32_t byte) = 0;
virtual int32_t writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) = 0;
virtual int32_t writeValueAndFinal(int32_t i, UBool final) = 0;
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
private:
// No ICU "poor man's RTTI" for this class nor its subclasses.
virtual UClassID getDynamicClassID() const;
};
U_NAMESPACE_END

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartriebuilder.cpp
@ -179,17 +179,7 @@ UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result,
errorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
if(buildOption==UDICTTRIE_BUILD_FAST) {
writeNode(0, elementsLength, 0);
} else /* UDICTTRIE_BUILD_SMALL */ {
createCompactBuilder(2*elementsLength, errorCode);
Node *root=makeNode(0, elementsLength, 0, errorCode);
if(U_SUCCESS(errorCode)) {
root->markRightEdgesFirst(-1);
root->write(*this);
}
deleteCompactBuilder();
}
DictTrieBuilder::build(buildOption, elementsLength, errorCode);
if(uchars==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
} else {
@ -198,295 +188,63 @@ UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result,
return result;
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
void
UCharTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
UBool hasValue=FALSE;
int32_t value=0;
int32_t type;
if(unitIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
writeValueAndFinal(value, TRUE); // final-value node
return;
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than unitIndex.
const UCharTrieElement &minElement=elements[start];
const UCharTrieElement &maxElement=elements[limit-1];
int32_t minUnit=minElement.charAt(unitIndex, strings);
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastUnitIndex=unitIndex;
while(++lastUnitIndex<minStringLength &&
minElement.charAt(lastUnitIndex, strings)==
maxElement.charAt(lastUnitIndex, strings)) {}
writeNode(start, limit, lastUnitIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
const UChar *s=minElement.getString(strings).getBuffer();
int32_t length=lastUnitIndex-unitIndex;
while(length>UCharTrie::kMaxLinearMatchLength) {
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
length-=UCharTrie::kMaxLinearMatchLength;
write(s+lastUnitIndex, UCharTrie::kMaxLinearMatchLength);
write(UCharTrie::kMinLinearMatch+UCharTrie::kMaxLinearMatchLength-1);
}
write(s+unitIndex, length);
type=UCharTrie::kMinLinearMatch+length-1;
} else {
// Branch node.
int32_t length=0; // Number of different units at unitIndex.
int32_t i=start;
do {
UChar unit=elements[i++].charAt(unitIndex, strings);
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minUnit!=maxUnit.
writeBranchSubNode(start, limit, unitIndex, length);
if(--length<UCharTrie::kMinLinearMatch) {
type=length;
} else {
write(length);
type=0;
}
}
writeValueAndType(hasValue, value, type);
// Returns the string length of the i-th element,
// as reported by the element for the shared strings storage.
int32_t
UCharTrieBuilder::getElementStringLength(int32_t i) const {
    const UCharTrieElement &element=elements[i];
    return element.getStringLength(strings);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
void
UCharTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
UChar middleUnits[16];
int32_t lessThan[16];
int32_t ltLength=0;
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t count=length/2;
int32_t i=start;
UChar unit;
do {
unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
} while(--count>0);
// Encode the less-than branch first.
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
writeBranchSubNode(start, i, unitIndex, length/2);
lessThan[ltLength]=ucharsLength;
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t starts[UCharTrie::kMaxBranchLinearSubNodeLength];
UBool final[UCharTrie::kMaxBranchLinearSubNodeLength-1];
int32_t unitNumber=0;
// Returns the unit at unitIndex in the string of the i-th element.
UChar
UCharTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
    const UCharTrieElement &element=elements[i];
    return element.charAt(unitIndex, strings);
}
// Returns the value stored in the i-th element.
int32_t
UCharTrieBuilder::getElementValue(int32_t i) const {
    const int32_t elementValue=elements[i].getValue();
    return elementValue;
}
// Starting after unitIndex, advances while the first and last elements have
// the same unit and the index stays below the first element's string length.
// Returns the index where the scan stopped.
int32_t
UCharTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
    const UCharTrieElement &firstElement=elements[first];
    const UCharTrieElement &lastElement=elements[last];
    const int32_t minStringLength=firstElement.getStringLength(strings);
    for(++unitIndex; unitIndex<minStringLength; ++unitIndex) {
        if(firstElement.charAt(unitIndex, strings)!=lastElement.charAt(unitIndex, strings)) {
            break;
        }
    }
    return unitIndex;
}
// Counts the runs of equal units at unitIndex among the [start..limit[ elements.
// The element at start is always examined, so at least one run is counted.
int32_t
UCharTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
    int32_t runCount=0;
    int32_t index=start;
    do {
        // Take the unit of the run's first element, then skip the rest of the run.
        const UChar runUnit=elements[index].charAt(unitIndex, strings);
        for(++index; index<limit && runUnit==elements[index].charAt(unitIndex, strings); ++index) {
        }
        ++runCount;
    } while(index<limit);
    return runCount;
}
int32_t
UCharTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
do {
int32_t i=starts[unitNumber]=start;
UChar unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
final[unitNumber]= start==i-1 && unitIndex+1==elements[start].getStringLength(strings);
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
starts[unitNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t jumpTargets[UCharTrie::kMaxBranchLinearSubNodeLength-1];
do {
--unitNumber;
if(!final[unitNumber]) {
writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
jumpTargets[unitNumber]=ucharsLength;
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
writeNode(start, limit, unitIndex+1);
write(elements[start].charAt(unitIndex, strings));
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
start=starts[unitNumber];
int32_t value;
if(final[unitNumber]) {
// Write the final value for the one string ending with this unit.
value=elements[start].getValue();
} else {
// Write the delta to the start position of the sub-node.
value=ucharsLength-jumpTargets[unitNumber];
}
writeValueAndFinal(value, final[unitNumber]);
write(elements[start].charAt(unitIndex, strings));
}
// Write the split-branch nodes.
while(ltLength>0) {
--ltLength;
writeDelta(ucharsLength-lessThan[ltLength]); // less-than
write(middleUnits[ltLength]);
}
} while(--count>0);
return i;
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
DictTrieBuilder::Node *
UCharTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
int32_t
UCharTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
UBool hasValue=FALSE;
int32_t value=0;
if(unitIndex==elements[start].getStringLength(strings)) {
// An intermediate or final value.
value=elements[start++].getValue();
if(start==limit) {
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
ValueNode *node;
// Now all [start..limit[ strings are longer than unitIndex.
const UCharTrieElement &minElement=elements[start];
const UCharTrieElement &maxElement=elements[limit-1];
int32_t minUnit=minElement.charAt(unitIndex, strings);
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t minStringLength=minElement.getStringLength(strings);
int32_t lastUnitIndex=unitIndex;
while(++lastUnitIndex<minStringLength &&
minElement.charAt(lastUnitIndex, strings)==
maxElement.charAt(lastUnitIndex, strings)) {}
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
const UChar *s=minElement.getString(strings).getBuffer();
int32_t length=lastUnitIndex-unitIndex;
while(length>UCharTrie::kMaxLinearMatchLength) {
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
length-=UCharTrie::kMaxLinearMatchLength;
node=new UCTLinearMatchNode(
s+lastUnitIndex,
UCharTrie::kMaxLinearMatchLength,
nextNode);
node=(ValueNode *)registerNode(node, errorCode);
nextNode=node;
}
node=new UCTLinearMatchNode(s+unitIndex, length, nextNode);
} else {
// Branch node.
int32_t length=0; // Number of different units at unitIndex.
int32_t i=start;
do {
UChar unit=elements[i++].charAt(unitIndex, strings);
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
++length;
} while(i<limit);
// length>=2 because minUnit!=maxUnit.
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
node=new UCTBranchHeadNode(length, subNode);
}
if(hasValue && node!=NULL) {
node->setValue(value);
}
return registerNode(node, errorCode);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
DictTrieBuilder::Node *
UCharTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UChar middleUnits[16];
Node *lessThan[16];
int32_t ltLength=0;
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t count=length/2;
int32_t i=start;
UChar unit;
do {
unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
} while(--count>0);
// Create the less-than branch.
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
UCTListBranchNode *listNode=new UCTListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t unitNumber=0;
do {
int32_t i=start;
UChar unit=elements[i++].charAt(unitIndex, strings);
while(unit==elements[i].charAt(unitIndex, strings)) {
++i;
}
if(start==i-1 && unitIndex+1==elements[start].getStringLength(strings)) {
listNode->add(unit, elements[start].getValue());
} else {
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
}
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
UChar unit=elements[start].charAt(unitIndex, strings);
if(start==limit-1 && unitIndex+1==elements[start].getStringLength(strings)) {
listNode->add(unit, elements[start].getValue());
} else {
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
while(ltLength>0) {
--ltLength;
node=registerNode(
new UCTSplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
void
UCharTrieBuilder::UCTFinalValueNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
offset=b.writeValueAndFinal(value, TRUE);
return i;
}
UCharTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
@ -511,76 +269,16 @@ UCharTrieBuilder::UCTLinearMatchNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
next->write(builder);
b.write(s, length);
offset=b.writeValueAndType(hasValue, value, minLinearMatch()+length-1);
offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
}
void
UCharTrieBuilder::UCTListBranchNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t unitNumber=length-1;
Node *rightEdge=equal[unitNumber];
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--unitNumber;
if(equal[unitNumber]!=NULL) {
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==NULL) {
b.writeValueAndFinal(values[unitNumber], TRUE);
} else {
rightEdge->write(builder);
}
b.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[unitNumber]==NULL) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
U_ASSERT(equal[unitNumber]->getOffset()>0);
value=b.ucharsLength-equal[unitNumber]->getOffset();
isFinal=FALSE;
}
b.writeValueAndFinal(value, isFinal);
offset=b.write(units[unitNumber]);
}
}
void
UCharTrieBuilder::UCTSplitBranchNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
// Encode the less-than branch first.
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
// Encode the greater-or-equal branch last because we do not jump for it at all.
greaterOrEqual->write(builder);
// Write this node.
U_ASSERT(lessThan->getOffset()>0);
b.writeDelta(b.ucharsLength-lessThan->getOffset()); // less-than
offset=b.write(unit);
}
void
UCharTrieBuilder::UCTBranchHeadNode::write(DictTrieBuilder &builder) {
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
next->write(builder);
if(length<=minLinearMatch()) {
offset=b.writeValueAndType(hasValue, value, length-1);
} else {
b.write(length-1);
offset=b.writeValueAndType(hasValue, value, 0);
}
// Allocates a UCTLinearMatchNode for length units of the i-th element's string,
// starting at unitIndex, followed by nextNode.
// The result of operator new is returned as is.
DictTrieBuilder::Node *
UCharTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
                                        Node *nextNode) const {
    const UChar *matchUnits=elements[i].getString(strings).getBuffer()+unitIndex;
    return new UCTLinearMatchNode(matchUnits, length, nextNode);
}
UBool
@ -629,6 +327,11 @@ UCharTrieBuilder::write(const UChar *s, int32_t length) {
return ucharsLength;
}
// Writes length units of the i-th element's string, starting at unitIndex,
// and returns the result of the underlying write() call.
int32_t
UCharTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    const UChar *unitsStart=elements[i].getString(strings).getBuffer()+unitIndex;
    return write(unitsStart, length);
}
int32_t
UCharTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
UChar intUnits[3];
@ -675,7 +378,8 @@ UCharTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node)
}
int32_t
UCharTrieBuilder::writeDelta(int32_t i) {
UCharTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
int32_t i=ucharsLength-jumpTarget;
UChar intUnits[3];
int32_t length;
U_ASSERT(i>=0);

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: uchartriebuilder.h
@ -45,33 +45,21 @@ public:
}
private:
void writeNode(int32_t start, int32_t limit, int32_t unitIndex);
void writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
virtual int32_t getElementStringLength(int32_t i) const;
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const;
virtual int32_t getElementValue(int32_t i) const;
Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode);
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const;
UBool ensureCapacity(int32_t length);
int32_t write(int32_t unit);
int32_t write(const UChar *s, int32_t length);
int32_t writeValueAndFinal(int32_t i, UBool final);
int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
int32_t writeDelta(int32_t i);
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const;
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const;
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const;
// Compacting builder.
virtual UBool matchNodesCanHaveValues() const { return TRUE; }
// Indirect "friend" access.
// Nested classes cannot be friends of UCharTrie unless the whole header is included,
// at least with AIX xlC_r,
// so this Builder class, which is a friend, provides the necessary value.
static int32_t minLinearMatch() { return UCharTrie::kMinLinearMatch; }
class UCTFinalValueNode : public FinalValueNode {
public:
UCTFinalValueNode(int32_t v) : FinalValueNode(v) {}
virtual void write(DictTrieBuilder &builder);
};
virtual int32_t getMaxBranchLinearSubNodeLength() const { return UCharTrie::kMaxBranchLinearSubNodeLength; }
virtual int32_t getMinLinearMatch() const { return UCharTrie::kMinLinearMatch; }
virtual int32_t getMaxLinearMatchLength() const { return UCharTrie::kMaxLinearMatchLength; }
class UCTLinearMatchNode : public LinearMatchNode {
public:
@ -82,26 +70,16 @@ private:
const UChar *s;
};
class UCTListBranchNode : public ListBranchNode {
public:
UCTListBranchNode() : ListBranchNode() {}
virtual void write(DictTrieBuilder &builder);
};
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
Node *nextNode) const;
class UCTSplitBranchNode : public SplitBranchNode {
public:
UCTSplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
: SplitBranchNode(middleUnit, lessThanNode, greaterOrEqualNode) {}
virtual void write(DictTrieBuilder &builder);
};
class UCTBranchHeadNode : public BranchHeadNode {
public:
UCTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
virtual void write(DictTrieBuilder &builder);
};
virtual Node *createFinalValueNode(int32_t value) const { return new UCTFinalValueNode(value); }
UBool ensureCapacity(int32_t length);
virtual int32_t write(int32_t unit);
int32_t write(const UChar *s, int32_t length);
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length);
virtual int32_t writeValueAndFinal(int32_t i, UBool final);
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
virtual int32_t writeDeltaTo(int32_t jumpTarget);
UnicodeString strings;
UCharTrieElement *elements;