ICU-8167 share as much code between builder versions and methods as possible
X-SVN-Rev: 29257
This commit is contained in:
parent
e0c2a3c61c
commit
5e23f1c3e9
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetriebuilder.cpp
|
||||
@ -214,17 +214,7 @@ ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode)
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return result;
|
||||
}
|
||||
if(buildOption==UDICTTRIE_BUILD_FAST) {
|
||||
writeNode(0, elementsLength, 0);
|
||||
} else /* UDICTTRIE_BUILD_SMALL */ {
|
||||
createCompactBuilder(2*elementsLength, errorCode);
|
||||
Node *root=makeNode(0, elementsLength, 0, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
root->markRightEdgesFirst(-1);
|
||||
root->write(*this);
|
||||
}
|
||||
deleteCompactBuilder();
|
||||
}
|
||||
DictTrieBuilder::build(buildOption, elementsLength, errorCode);
|
||||
if(bytes==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
@ -233,322 +223,64 @@ ByteTrieBuilder::build(UDictTrieBuildOption buildOption, UErrorCode &errorCode)
|
||||
return result;
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length byteIndex.
|
||||
void
|
||||
ByteTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t byteIndex) {
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(byteIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
writeValueAndFinal(value, TRUE); // final-value node
|
||||
return;
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than byteIndex.
|
||||
const ByteTrieElement &minElement=elements[start];
|
||||
const ByteTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
|
||||
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
|
||||
if(minByte==maxByte) {
|
||||
// Linear-match node: All strings have the same character at byteIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastByteIndex=byteIndex;
|
||||
while(++lastByteIndex<minStringLength &&
|
||||
minElement.charAt(lastByteIndex, strings)==
|
||||
maxElement.charAt(lastByteIndex, strings)) {}
|
||||
writeNode(start, limit, lastByteIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const char *s=minElement.getString(strings).data();
|
||||
int32_t length=lastByteIndex-byteIndex;
|
||||
while(length>ByteTrie::kMaxLinearMatchLength) {
|
||||
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
|
||||
length-=ByteTrie::kMaxLinearMatchLength;
|
||||
write(s+lastByteIndex, ByteTrie::kMaxLinearMatchLength);
|
||||
write(ByteTrie::kMinLinearMatch+ByteTrie::kMaxLinearMatchLength-1);
|
||||
}
|
||||
write(s+byteIndex, length);
|
||||
write(ByteTrie::kMinLinearMatch+length-1);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different bytes at byteIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minByte!=maxByte.
|
||||
writeBranchSubNode(start, limit, byteIndex, length);
|
||||
write(--length);
|
||||
if(length>=ByteTrie::kMinLinearMatch) {
|
||||
write(0);
|
||||
}
|
||||
}
|
||||
if(hasValue) {
|
||||
writeValueAndFinal(value, FALSE);
|
||||
}
|
||||
// Returns the string length of element i, as reported by
// ByteTrieElement::getStringLength() for the shared strings buffer.
int32_t
ByteTrieBuilder::getElementStringLength(int32_t i) const {
    const ByteTrieElement &element=elements[i];
    return element.getStringLength(strings);
}
|
||||
|
||||
// start<limit && all strings longer than byteIndex &&
|
||||
// length different bytes at byteIndex
|
||||
void
|
||||
ByteTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length) {
|
||||
char middleBytes[16];
|
||||
int32_t lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle byte.
|
||||
// First, find the middle byte.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
char byte;
|
||||
do {
|
||||
byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
|
||||
writeBranchSubNode(start, i, byteIndex, length/2);
|
||||
lessThan[ltLength]=bytesLength;
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each byte, find its elements array start and whether it has a final value.
|
||||
int32_t starts[ByteTrie::kMaxBranchLinearSubNodeLength];
|
||||
UBool final[ByteTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
int32_t byteNumber=0;
|
||||
do {
|
||||
int32_t i=starts[byteNumber]=start;
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
final[byteNumber]= start==i-1 && byteIndex+1==elements[start].getStringLength(strings);
|
||||
start=i;
|
||||
} while(++byteNumber<length-1);
|
||||
// byteNumber==length-1, and the maxByte elements range is [start..limit[
|
||||
starts[byteNumber]=start;
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minByte sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minByte sub-node last, for a shorter delta.
|
||||
int32_t jumpTargets[ByteTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
do {
|
||||
--byteNumber;
|
||||
if(!final[byteNumber]) {
|
||||
writeNode(starts[byteNumber], starts[byteNumber+1], byteIndex+1);
|
||||
jumpTargets[byteNumber]=bytesLength;
|
||||
}
|
||||
} while(byteNumber>0);
|
||||
// The maxByte sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
byteNumber=length-1;
|
||||
writeNode(start, limit, byteIndex+1);
|
||||
write((uint8_t)elements[start].charAt(byteIndex, strings));
|
||||
// Write the rest of this node's byte-value pairs.
|
||||
while(--byteNumber>=0) {
|
||||
start=starts[byteNumber];
|
||||
int32_t value;
|
||||
if(final[byteNumber]) {
|
||||
// Write the final value for the one string ending with this byte.
|
||||
value=elements[start].getValue();
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
value=bytesLength-jumpTargets[byteNumber];
|
||||
}
|
||||
writeValueAndFinal(value, final[byteNumber]);
|
||||
write((uint8_t)elements[start].charAt(byteIndex, strings));
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
writeDelta(bytesLength-lessThan[ltLength]); // less-than
|
||||
write((uint8_t)middleBytes[ltLength]);
|
||||
}
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length byteIndex.
|
||||
DictTrieBuilder::Node *
|
||||
ByteTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(byteIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
return registerFinalValue(value, errorCode);
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
Node *node;
|
||||
// Now all [start..limit[ strings are longer than byteIndex.
|
||||
const ByteTrieElement &minElement=elements[start];
|
||||
const ByteTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minByte=(uint8_t)minElement.charAt(byteIndex, strings);
|
||||
int32_t maxByte=(uint8_t)maxElement.charAt(byteIndex, strings);
|
||||
if(minByte==maxByte) {
|
||||
// Linear-match node: All strings have the same character at byteIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastByteIndex=byteIndex;
|
||||
while(++lastByteIndex<minStringLength &&
|
||||
minElement.charAt(lastByteIndex, strings)==
|
||||
maxElement.charAt(lastByteIndex, strings)) {}
|
||||
Node *nextNode=makeNode(start, limit, lastByteIndex, errorCode);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const char *s=minElement.getString(strings).data();
|
||||
int32_t length=lastByteIndex-byteIndex;
|
||||
while(length>ByteTrie::kMaxLinearMatchLength) {
|
||||
lastByteIndex-=ByteTrie::kMaxLinearMatchLength;
|
||||
length-=ByteTrie::kMaxLinearMatchLength;
|
||||
node=new BTLinearMatchNode(
|
||||
s+lastByteIndex,
|
||||
ByteTrie::kMaxLinearMatchLength,
|
||||
nextNode);
|
||||
node=registerNode(node, errorCode);
|
||||
nextNode=node;
|
||||
}
|
||||
node=new BTLinearMatchNode(s+byteIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different bytes at byteIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(i<limit && byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minByte!=maxByte.
|
||||
Node *subNode=makeBranchSubNode(start, limit, byteIndex, length, errorCode);
|
||||
node=new BTBranchHeadNode(length, subNode);
|
||||
}
|
||||
node=registerNode(node, errorCode);
|
||||
if(hasValue) {
|
||||
node=registerNode(new BTValueNode(value, node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than byteIndex &&
|
||||
// length different bytes at byteIndex
|
||||
DictTrieBuilder::Node *
|
||||
ByteTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
|
||||
int32_t length, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
char middleBytes[16];
|
||||
Node *lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>ByteTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle byte.
|
||||
// First, find the middle byte.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
char byte;
|
||||
do {
|
||||
byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
byte=middleBytes[ltLength]=elements[i].charAt(byteIndex, strings); // middle byte
|
||||
lessThan[ltLength]=makeBranchSubNode(start, i, byteIndex, length/2, errorCode);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
BTListBranchNode *listNode=new BTListBranchNode();
|
||||
if(listNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
// For each byte, find its elements array start and whether it has a final value.
|
||||
int32_t byteNumber=0;
|
||||
do {
|
||||
int32_t i=start;
|
||||
char byte=elements[i++].charAt(byteIndex, strings);
|
||||
while(byte==elements[i].charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
if(start==i-1 && byteIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add((uint8_t)byte, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add((uint8_t)byte, makeNode(start, i, byteIndex+1, errorCode));
|
||||
}
|
||||
start=i;
|
||||
} while(++byteNumber<length-1);
|
||||
// byteNumber==length-1, and the maxByte elements range is [start..limit[
|
||||
char byte=elements[start].charAt(byteIndex, strings);
|
||||
if(start==limit-1 && byteIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add((uint8_t)byte, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add((uint8_t)byte, makeNode(start, limit, byteIndex+1, errorCode));
|
||||
}
|
||||
Node *node=registerNode(listNode, errorCode);
|
||||
// Create the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new BTSplitBranchNode(middleBytes[ltLength], lessThan[ltLength], node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTFinalValueNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
offset=b.writeValueAndFinal(value, TRUE);
|
||||
}
|
||||
|
||||
UBool
|
||||
ByteTrieBuilder::BTValueNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
if(!ValueNode::operator==(other)) {
|
||||
return FALSE;
|
||||
}
|
||||
const BTValueNode &o=(const BTValueNode &)other;
|
||||
return next==o.next;
|
||||
// Returns the byte at byteIndex of element i's string, as an unsigned 8-bit
// value widened to UChar.
UChar
ByteTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
    const uint8_t unit=(uint8_t)elements[i].charAt(byteIndex, strings);
    return unit;
}
|
||||
|
||||
int32_t
|
||||
ByteTrieBuilder::BTValueNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
|
||||
}
|
||||
return edgeNumber;
|
||||
ByteTrieBuilder::getElementValue(int32_t i) const {
|
||||
return elements[i].getValue();
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTValueNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
offset=b.writeValueAndFinal(value, FALSE);
|
||||
// Starting just after byteIndex, returns the first index at which the strings
// of elements `first` and `last` differ, or the length of the `first` string
// if no difference is found before that.
int32_t
ByteTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
    const ByteTrieElement &lowElement=elements[first];
    const ByteTrieElement &highElement=elements[last];
    const int32_t lowLength=lowElement.getStringLength(strings);
    int32_t index=byteIndex+1;
    for(; index<lowLength; ++index) {
        if(lowElement.charAt(index, strings)!=highElement.charAt(index, strings)) {
            break;
        }
    }
    return index;
}
|
||||
|
||||
// Counts the runs of equal bytes at byteIndex among the elements [start..limit[,
// which is the number of different bytes when the elements are sorted.
// The first element is always examined, so the result is at least 1.
int32_t
ByteTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
    int32_t distinct=0;
    int32_t pos=start;
    do {
        const char current=elements[pos].charAt(byteIndex, strings);
        // Skip the rest of the run that shares this byte.
        for(++pos; pos<limit; ++pos) {
            if(current!=elements[pos].charAt(byteIndex, strings)) {
                break;
            }
        }
        ++distinct;
    } while(pos<limit);
    return distinct;
}
|
||||
|
||||
// Starting at element i, skips `count` runs of elements that share the same
// byte at byteIndex (at least one run is always skipped) and returns the
// index of the element that follows.
// There is no upper bound check: an element with a different byte must
// follow each skipped run.
int32_t
ByteTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
    for(;;) {
        const char current=elements[i].charAt(byteIndex, strings);
        do {
            ++i;
        } while(current==elements[i].charAt(byteIndex, strings));
        if(--count<=0) {
            return i;
        }
    }
}
|
||||
|
||||
// Returns the index of the first element at or after i whose byte at
// byteIndex differs from `byte` (compared as char).
// There is no upper bound check: such an element must exist.
int32_t
ByteTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
    const char target=(char)byte;
    for(; elements[i].charAt(byteIndex, strings)==target; ++i) {}
    return i;
}
|
||||
|
||||
ByteTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
|
||||
@ -573,74 +305,16 @@ ByteTrieBuilder::BTLinearMatchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
b.write(s, length);
|
||||
offset=b.write(minLinearMatch()+length-1);
|
||||
offset=b.write(b.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTListBranchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minByte sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minByte sub-node last, for a shorter delta.
|
||||
int32_t byteNumber=length-1;
|
||||
Node *rightEdge=equal[byteNumber];
|
||||
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
|
||||
do {
|
||||
--byteNumber;
|
||||
if(equal[byteNumber]!=NULL) {
|
||||
equal[byteNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
|
||||
}
|
||||
} while(byteNumber>0);
|
||||
// The maxByte sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
byteNumber=length-1;
|
||||
if(rightEdge==NULL) {
|
||||
b.writeValueAndFinal(values[byteNumber], TRUE);
|
||||
} else {
|
||||
rightEdge->write(builder);
|
||||
}
|
||||
b.write(units[byteNumber]);
|
||||
// Write the rest of this node's byte-value pairs.
|
||||
while(--byteNumber>=0) {
|
||||
int32_t value;
|
||||
UBool isFinal;
|
||||
if(equal[byteNumber]==NULL) {
|
||||
// Write the final value for the one string ending with this byte.
|
||||
value=values[byteNumber];
|
||||
isFinal=TRUE;
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
U_ASSERT(equal[byteNumber]->getOffset()>0);
|
||||
value=b.bytesLength-equal[byteNumber]->getOffset();
|
||||
isFinal=FALSE;
|
||||
}
|
||||
b.writeValueAndFinal(value, isFinal);
|
||||
offset=b.write(units[byteNumber]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTSplitBranchNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
// Encode the less-than branch first.
|
||||
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
|
||||
// Encode the greater-or-equal branch last because we do not jump for it at all.
|
||||
greaterOrEqual->write(builder);
|
||||
// Write this node.
|
||||
U_ASSERT(lessThan->getOffset()>0);
|
||||
b.writeDelta(b.bytesLength-lessThan->getOffset()); // less-than
|
||||
offset=b.write(unit);
|
||||
}
|
||||
|
||||
void
|
||||
ByteTrieBuilder::BTBranchHeadNode::write(DictTrieBuilder &builder) {
|
||||
ByteTrieBuilder &b=(ByteTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
offset=b.write((length-1));
|
||||
if(length>minLinearMatch()) {
|
||||
offset=b.write(0);
|
||||
}
|
||||
// Allocates a BTLinearMatchNode for `length` bytes of element i's string,
// starting at byteIndex, followed by nextNode.
// The result of `new` is returned unchecked.
DictTrieBuilder::Node *
ByteTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
                                       Node *nextNode) const {
    const char *matchBytes=elements[i].getString(strings).data()+byteIndex;
    return new BTLinearMatchNode(matchBytes, length, nextNode);
}
|
||||
|
||||
UBool
|
||||
@ -689,6 +363,11 @@ ByteTrieBuilder::write(const char *b, int32_t length) {
|
||||
return bytesLength;
|
||||
}
|
||||
|
||||
// Writes `length` bytes of element i's string, starting at byteIndex,
// and returns what write(const char *, int32_t) returns.
int32_t
ByteTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
    const char *unitStart=elements[i].getString(strings).data()+byteIndex;
    return write(unitStart, length);
}
|
||||
|
||||
int32_t
|
||||
ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
|
||||
char intBytes[5];
|
||||
@ -722,7 +401,17 @@ ByteTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
|
||||
}
|
||||
|
||||
int32_t
|
||||
ByteTrieBuilder::writeDelta(int32_t i) {
|
||||
ByteTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
|
||||
int32_t offset=write(node);
|
||||
if(hasValue) {
|
||||
offset=writeValueAndFinal(value, FALSE);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
int32_t
|
||||
ByteTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
|
||||
int32_t i=bytesLength-jumpTarget;
|
||||
char intBytes[5];
|
||||
int32_t length;
|
||||
U_ASSERT(i>=0);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: bytetriebuilder.h
|
||||
@ -46,43 +46,21 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void writeNode(int32_t start, int32_t limit, int32_t byteIndex);
|
||||
void writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);
|
||||
virtual int32_t getElementStringLength(int32_t i) const;
|
||||
virtual UChar getElementUnit(int32_t i, int32_t byteIndex) const;
|
||||
virtual int32_t getElementValue(int32_t i) const;
|
||||
|
||||
Node *makeNode(int32_t start, int32_t limit, int32_t byteIndex, UErrorCode &errorCode);
|
||||
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex,
|
||||
int32_t length, UErrorCode &errorCode);
|
||||
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const;
|
||||
|
||||
UBool ensureCapacity(int32_t length);
|
||||
int32_t write(int32_t byte);
|
||||
int32_t write(const char *b, int32_t length);
|
||||
int32_t writeValueAndFinal(int32_t i, UBool final);
|
||||
int32_t writeDelta(int32_t i);
|
||||
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const;
|
||||
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const;
|
||||
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const;
|
||||
|
||||
// Compacting builder.
|
||||
virtual UBool matchNodesCanHaveValues() const { return FALSE; }
|
||||
|
||||
// Indirect "friend" access.
|
||||
// Nested classes cannot be friends of ByteTrie unless the whole header is included,
|
||||
// at least with AIX xlC_r,
|
||||
// so this Builder class, which is a friend, provides the necessary value.
|
||||
static int32_t minLinearMatch() { return ByteTrie::kMinLinearMatch; }
|
||||
|
||||
class BTFinalValueNode : public FinalValueNode {
|
||||
public:
|
||||
BTFinalValueNode(int32_t v) : FinalValueNode(v) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
|
||||
class BTValueNode : public ValueNode {
|
||||
public:
|
||||
BTValueNode(int32_t v, Node *nextNode)
|
||||
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
private:
|
||||
Node *next;
|
||||
};
|
||||
virtual int32_t getMaxBranchLinearSubNodeLength() const { return ByteTrie::kMaxBranchLinearSubNodeLength; }
|
||||
virtual int32_t getMinLinearMatch() const { return ByteTrie::kMinLinearMatch; }
|
||||
virtual int32_t getMaxLinearMatchLength() const { return ByteTrie::kMaxLinearMatchLength; }
|
||||
|
||||
class BTLinearMatchNode : public LinearMatchNode {
|
||||
public:
|
||||
@ -93,26 +71,16 @@ private:
|
||||
const char *s;
|
||||
};
|
||||
|
||||
class BTListBranchNode : public ListBranchNode {
|
||||
public:
|
||||
BTListBranchNode() : ListBranchNode() {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
|
||||
Node *nextNode) const;
|
||||
|
||||
class BTSplitBranchNode : public SplitBranchNode {
|
||||
public:
|
||||
BTSplitBranchNode(char middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
|
||||
: SplitBranchNode((uint8_t)middleUnit, lessThanNode, greaterOrEqualNode) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
|
||||
class BTBranchHeadNode : public BranchHeadNode {
|
||||
public:
|
||||
BTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
|
||||
virtual Node *createFinalValueNode(int32_t value) const { return new BTFinalValueNode(value); }
|
||||
UBool ensureCapacity(int32_t length);
|
||||
virtual int32_t write(int32_t byte);
|
||||
int32_t write(const char *b, int32_t length);
|
||||
virtual int32_t writeElementUnits(int32_t i, int32_t byteIndex, int32_t length);
|
||||
virtual int32_t writeValueAndFinal(int32_t i, UBool final);
|
||||
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
|
||||
virtual int32_t writeDeltaTo(int32_t jumpTarget);
|
||||
|
||||
CharString strings;
|
||||
ByteTrieElement *elements;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: dicttriebuilder.cpp
|
||||
@ -63,6 +63,257 @@ DictTrieBuilder::deleteCompactBuilder() {
|
||||
nodes=NULL;
|
||||
}
|
||||
|
||||
void
|
||||
DictTrieBuilder::build(UDictTrieBuildOption buildOption, int32_t elementsLength,
|
||||
UErrorCode &errorCode) {
|
||||
if(buildOption==UDICTTRIE_BUILD_FAST) {
|
||||
writeNode(0, elementsLength, 0);
|
||||
} else /* UDICTTRIE_BUILD_SMALL */ {
|
||||
createCompactBuilder(2*elementsLength, errorCode);
|
||||
Node *root=makeNode(0, elementsLength, 0, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
root->markRightEdgesFirst(-1);
|
||||
root->write(*this);
|
||||
}
|
||||
deleteCompactBuilder();
|
||||
}
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
int32_t
|
||||
DictTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
int32_t type;
|
||||
if(unitIndex==getElementStringLength(start)) {
|
||||
// An intermediate or final value.
|
||||
value=getElementValue(start++);
|
||||
if(start==limit) {
|
||||
return writeValueAndFinal(value, TRUE); // final-value node
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
int32_t minUnit=getElementUnit(start, unitIndex);
|
||||
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
|
||||
writeNode(start, limit, lastUnitIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
|
||||
while(length>maxLinearMatchLength) {
|
||||
lastUnitIndex-=maxLinearMatchLength;
|
||||
length-=maxLinearMatchLength;
|
||||
writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
|
||||
write(getMinLinearMatch()+maxLinearMatchLength-1);
|
||||
}
|
||||
writeElementUnits(start, unitIndex, length);
|
||||
type=getMinLinearMatch()+length-1;
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=countElementUnits(start, limit, unitIndex);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
writeBranchSubNode(start, limit, unitIndex, length);
|
||||
if(--length<getMinLinearMatch()) {
|
||||
type=length;
|
||||
} else {
|
||||
write(length);
|
||||
type=0;
|
||||
}
|
||||
}
|
||||
return writeValueAndType(hasValue, value, type);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
int32_t
|
||||
DictTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
|
||||
UChar middleUnits[kMaxSplitBranchLevels];
|
||||
int32_t lessThan[kMaxSplitBranchLevels];
|
||||
int32_t ltLength=0;
|
||||
while(length>getMaxBranchLinearSubNodeLength()) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
|
||||
// Encode the less-than branch first.
|
||||
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
|
||||
lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int32_t starts[kMaxBranchLinearSubNodeLength];
|
||||
UBool final[kMaxBranchLinearSubNodeLength-1];
|
||||
int32_t unitNumber=0;
|
||||
do {
|
||||
int32_t i=starts[unitNumber]=start;
|
||||
UChar unit=getElementUnit(i++, unitIndex);
|
||||
i=indexOfElementWithNextUnit(i, unitIndex, unit);
|
||||
final[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
starts[unitNumber]=start;
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
|
||||
do {
|
||||
--unitNumber;
|
||||
if(!final[unitNumber]) {
|
||||
jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
writeNode(start, limit, unitIndex+1);
|
||||
int32_t offset=write(getElementUnit(start, unitIndex));
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
start=starts[unitNumber];
|
||||
int32_t value;
|
||||
if(final[unitNumber]) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=getElementValue(start);
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
value=offset-jumpTargets[unitNumber];
|
||||
}
|
||||
writeValueAndFinal(value, final[unitNumber]);
|
||||
offset=write(getElementUnit(start, unitIndex));
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
writeDeltaTo(lessThan[ltLength]);
|
||||
offset=write(middleUnits[ltLength]);
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
// Builds the node for the elements [start..limit[ at string position unitIndex.
// Returns NULL immediately if errorCode already indicates a failure; otherwise
// returns the result of registerFinalValue() or registerNode().
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
DictTrieBuilder::Node *
|
||||
DictTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
// The first element's string ends exactly at unitIndex, so it supplies a value here.
if(unitIndex==getElementStringLength(start)) {
|
||||
// An intermediate or final value.
|
||||
value=getElementValue(start++);
|
||||
// No other element remains: the whole node is just this final value.
if(start==limit) {
|
||||
return registerFinalValue(value, errorCode);
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
Node *node;
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
int32_t minUnit=getElementUnit(start, unitIndex);
|
||||
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
|
||||
// Build the node that follows the matched sequence before the match nodes themselves.
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
|
||||
// Break the linear-match sequence into chunks of at most getMaxLinearMatchLength() units.
// Chunks are created from the end of the sequence back toward unitIndex,
// each one pointing at the previously registered chunk (or at nextNode).
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
|
||||
while(length>maxLinearMatchLength) {
|
||||
lastUnitIndex-=maxLinearMatchLength;
|
||||
length-=maxLinearMatchLength;
|
||||
node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
|
||||
nextNode=registerNode(node, errorCode);
|
||||
}
|
||||
// The remaining (first) chunk starts at unitIndex; it is registered at the end of this function.
node=createLinearMatchNode(start, unitIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=countElementUnits(start, limit, unitIndex);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
|
||||
node=new BranchHeadNode(length, subNode);
|
||||
}
|
||||
// Attach the intermediate value, unless node creation above yielded NULL.
if(hasValue && node!=NULL) {
|
||||
// Either the match/branch node carries the value itself, or the node is
// registered and wrapped in a separate IntermediateValueNode.
if(matchNodesCanHaveValues()) {
|
||||
// NOTE(review): assumes every node created above derives from ValueNode — confirm.
((ValueNode *)node)->setValue(value);
|
||||
} else {
|
||||
node=new IntermediateValueNode(value, registerNode(node, errorCode));
|
||||
}
|
||||
}
|
||||
return registerNode(node, errorCode);
|
||||
}
|
||||
|
||||
// Builds the sub-node structure of a branch over the elements [start..limit[:
// split-branch nodes while more than getMaxBranchLinearSubNodeLength() units remain,
// then one list-branch node for the rest.
// Returns NULL if errorCode indicates a failure on entry, after the split loop,
// or when the ListBranchNode allocation fails.
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
DictTrieBuilder::Node *
|
||||
DictTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
|
||||
int32_t length, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
// Per split level: the middle unit and the node of the less-than half.
UChar middleUnits[kMaxSplitBranchLevels];
|
||||
Node *lessThan[kMaxSplitBranchLevels];
|
||||
int32_t ltLength=0;
|
||||
while(length>getMaxBranchLinearSubNodeLength()) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
|
||||
// Create the less-than branch.
|
||||
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
|
||||
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// A recursive call above may have set errorCode.
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
ListBranchNode *listNode=new ListBranchNode();
|
||||
if(listNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
// For each unit except the last, find its elements range [start..i[ and add either
// its final value or the sub-node built for that range.
|
||||
int32_t unitNumber=0;
|
||||
do {
|
||||
int32_t i=start;
|
||||
UChar unit=getElementUnit(i++, unitIndex);
|
||||
i=indexOfElementWithNextUnit(i, unitIndex, unit);
|
||||
// Exactly one element has this unit and its string ends right after it: final value.
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
|
||||
listNode->add(unit, getElementValue(start));
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
|
||||
}
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
UChar unit=getElementUnit(start, unitIndex);
|
||||
if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
|
||||
listNode->add(unit, getElementValue(start));
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
|
||||
}
|
||||
Node *node=registerNode(listNode, errorCode);
|
||||
// Create the split-branch nodes.
// They are created innermost first: each one gets the node built so far
// as its greater-or-equal branch.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
DictTrieBuilder::Node *
|
||||
DictTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
@ -102,7 +353,7 @@ DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
|
||||
if(old!=NULL) {
|
||||
return (Node *)old->key.pointer;
|
||||
}
|
||||
Node *newNode=createFinalValueNode(value);
|
||||
Node *newNode=new FinalValueNode(value);
|
||||
if(newNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
@ -121,19 +372,25 @@ DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
|
||||
return newNode;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::hashNode(const void *node) {
|
||||
UBool
|
||||
DictTrieBuilder::hashNode(const void *node) {
|
||||
return ((const Node *)node)->hashCode();
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::equalNodes(const void *left, const void *right) {
|
||||
UBool
|
||||
DictTrieBuilder::equalNodes(const void *left, const void *right) {
|
||||
return *(const Node *)left==*(const Node *)right;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::Node::operator==(const Node &other) const {
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder)
|
||||
|
||||
UBool
|
||||
DictTrieBuilder::Node::operator==(const Node &other) const {
|
||||
return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
|
||||
}
|
||||
|
||||
int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
int32_t
|
||||
DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
offset=edgeNumber;
|
||||
}
|
||||
@ -142,28 +399,25 @@ int32_t DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
|
||||
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder::Node)
|
||||
|
||||
UBool DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
|
||||
UBool
|
||||
DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
// Not:
|
||||
// if(!Node::operator==(other)) {
|
||||
// return FALSE;
|
||||
// }
|
||||
// because registerFinalValue() compares a stack-allocated FinalValueNode
|
||||
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
|
||||
// with the specific builder's subclass of FinalValueNode,
|
||||
// and !Node::operator==(other) will always be false for that because it
|
||||
// compares the typeid's.
|
||||
// This workaround assumes that the subclass does not add fields that need to be compared.
|
||||
if(hash!=other.hashCode()) {
|
||||
if(!Node::operator==(other)) {
|
||||
return FALSE;
|
||||
}
|
||||
const FinalValueNode *o=dynamic_cast<const FinalValueNode *>(&other);
|
||||
return o!=NULL && value==o->value;
|
||||
const FinalValueNode &o=(const FinalValueNode &)other;
|
||||
return value==o.value;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
|
||||
// Writes this node's value through builder.writeValueAndFinal() with the final flag set
// and stores the returned int32_t in offset.
void
|
||||
DictTrieBuilder::FinalValueNode::write(DictTrieBuilder &builder) {
|
||||
offset=builder.writeValueAndFinal(value, TRUE);
|
||||
}
|
||||
|
||||
UBool
|
||||
DictTrieBuilder::ValueNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
@ -174,7 +428,34 @@ UBool DictTrieBuilder::ValueNode::operator==(const Node &other) const {
|
||||
return hasValue==o.hasValue && (!hasValue || value==o.value);
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
|
||||
// Returns TRUE if other is this same object, or if ValueNode::operator==() accepts it
// and both nodes have the same next pointer.
UBool
|
||||
DictTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
if(!ValueNode::operator==(other)) {
|
||||
return FALSE;
|
||||
}
|
||||
// NOTE(review): the unchecked cast presumably relies on the base-class comparison
// having rejected nodes of a different dynamic type — confirm.
const IntermediateValueNode &o=(const IntermediateValueNode &)other;
|
||||
// Pointer comparison of the next nodes.
return next==o.next;
|
||||
}
|
||||
|
||||
// If offset is still 0, forwards to next->markRightEdgesFirst() and stores the
// returned edge number in offset. Returns the resulting edge number, or the
// unchanged input when offset was already nonzero.
int32_t
|
||||
DictTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
|
||||
}
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
// Writes the next node first, then this node's value with the final flag cleared;
// offset is set to the result of writeValueAndFinal().
void
|
||||
DictTrieBuilder::IntermediateValueNode::write(DictTrieBuilder &builder) {
|
||||
next->write(builder);
|
||||
offset=builder.writeValueAndFinal(value, FALSE);
|
||||
}
|
||||
|
||||
UBool
|
||||
DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
@ -185,14 +466,16 @@ UBool DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
|
||||
return length==o.length && next==o.next;
|
||||
}
|
||||
|
||||
int32_t DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
// If offset is still 0, forwards to next->markRightEdgesFirst() and stores the
// returned edge number in offset. Returns the resulting edge number, or the
// unchanged input when offset was already nonzero.
int32_t
|
||||
DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
|
||||
}
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
|
||||
UBool
|
||||
DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
@ -208,7 +491,8 @@ UBool DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
int32_t
|
||||
DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
firstEdgeNumber=edgeNumber;
|
||||
int32_t step=0;
|
||||
@ -226,7 +510,51 @@ int32_t DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber)
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
|
||||
// Writes this list-branch node through builder: first the sub-nodes of all units
// but the last, then the last unit's sub-node or final value, then the
// unit-value pairs from the last unit down to the first.
// offset ends up as the result of the last builder.write(unit) call.
// NOTE(review): the do-while below indexes equal[length-2] unconditionally,
// so this presumably relies on length>=2 — confirm with the callers.
void
|
||||
DictTrieBuilder::ListBranchNode::write(DictTrieBuilder &builder) {
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t unitNumber=length-1;
|
||||
Node *rightEdge=equal[unitNumber];
|
||||
// Without a sub-node for the last unit, firstEdgeNumber is used as the right-edge number.
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
|
||||
do {
|
||||
--unitNumber;
|
||||
// Units with a final value (NULL entry) have no sub-node to write.
if(equal[unitNumber]!=NULL) {
|
||||
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
if(rightEdge==NULL) {
|
||||
builder.writeValueAndFinal(values[unitNumber], TRUE);
|
||||
} else {
|
||||
rightEdge->write(builder);
|
||||
}
|
||||
offset=builder.write(units[unitNumber]);
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
int32_t value;
|
||||
UBool isFinal;
|
||||
if(equal[unitNumber]==NULL) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=values[unitNumber];
|
||||
isFinal=TRUE;
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
U_ASSERT(equal[unitNumber]->getOffset()>0);
|
||||
// offset is the value returned by the most recent builder.write() of a unit.
value=offset-equal[unitNumber]->getOffset();
|
||||
isFinal=FALSE;
|
||||
}
|
||||
builder.writeValueAndFinal(value, isFinal);
|
||||
offset=builder.write(units[unitNumber]);
|
||||
}
|
||||
}
|
||||
|
||||
UBool
|
||||
DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
@ -237,7 +565,8 @@ UBool DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
|
||||
return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
|
||||
}
|
||||
|
||||
int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
int32_t
|
||||
DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
firstEdgeNumber=edgeNumber;
|
||||
edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
|
||||
@ -246,7 +575,20 @@ int32_t DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
|
||||
// Writes both branches, then this node's delta to the less-than branch and its unit.
// offset is set to the result of builder.write(unit).
void
|
||||
DictTrieBuilder::SplitBranchNode::write(DictTrieBuilder &builder) {
|
||||
// Encode the less-than branch first.
|
||||
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
|
||||
// Encode the greater-or-equal branch last because we do not jump for it at all.
|
||||
greaterOrEqual->write(builder);
|
||||
// Write this node.
|
||||
// The less-than branch must have a positive offset by now.
U_ASSERT(lessThan->getOffset()>0);
|
||||
builder.writeDeltaTo(lessThan->getOffset()); // less-than
|
||||
offset=builder.write(unit);
|
||||
}
|
||||
|
||||
UBool
|
||||
DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
|
||||
if(this==&other) {
|
||||
return TRUE;
|
||||
}
|
||||
@ -257,11 +599,23 @@ UBool DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
|
||||
return length==o.length && next==o.next;
|
||||
}
|
||||
|
||||
int32_t DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
// If offset is still 0, forwards to next->markRightEdgesFirst() and stores the
// returned edge number in offset. Returns the resulting edge number, or the
// unchanged input when offset was already nonzero.
int32_t
|
||||
DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
|
||||
if(offset==0) {
|
||||
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
|
||||
}
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
// Writes the branch sub-node first, then this head node.
// offset is set to the result of builder.writeValueAndType().
void
|
||||
DictTrieBuilder::BranchHeadNode::write(DictTrieBuilder &builder) {
|
||||
next->write(builder);
|
||||
// Up to getMinLinearMatch() units, length-1 is passed as the node argument of
// writeValueAndType(); otherwise length-1 is written on its own and 0 is passed.
if(length<=builder.getMinLinearMatch()) {
|
||||
offset=builder.writeValueAndType(hasValue, value, length-1);
|
||||
} else {
|
||||
builder.write(length-1);
|
||||
offset=builder.writeValueAndType(hasValue, value, 0);
|
||||
}
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: dicttriebuilder.h
|
||||
@ -28,7 +28,7 @@ enum UDictTrieBuildOption {
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class U_TOOLUTIL_API DictTrieBuilder : public UMemory {
|
||||
class U_TOOLUTIL_API DictTrieBuilder : public UObject {
|
||||
public:
|
||||
/** @internal */
|
||||
static UBool hashNode(const void *node);
|
||||
@ -39,11 +39,46 @@ protected:
|
||||
DictTrieBuilder();
|
||||
virtual ~DictTrieBuilder();
|
||||
|
||||
class Node;
|
||||
|
||||
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
|
||||
void deleteCompactBuilder();
|
||||
|
||||
void build(UDictTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
|
||||
|
||||
int32_t writeNode(int32_t start, int32_t limit, int32_t byteIndex);
|
||||
int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t byteIndex, int32_t length);
|
||||
|
||||
class Node;
|
||||
|
||||
Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
|
||||
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
|
||||
int32_t length, UErrorCode &errorCode);
|
||||
|
||||
virtual int32_t getElementStringLength(int32_t i) const = 0;
|
||||
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
|
||||
virtual int32_t getElementValue(int32_t i) const = 0;
|
||||
|
||||
// Finds the first unit index after this one where
|
||||
// the first and last element have different units again.
|
||||
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
|
||||
|
||||
// Number of different units at unitIndex.
|
||||
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
|
||||
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
|
||||
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
|
||||
|
||||
virtual UBool matchNodesCanHaveValues() const = 0;
|
||||
|
||||
virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
|
||||
virtual int32_t getMinLinearMatch() const = 0;
|
||||
virtual int32_t getMaxLinearMatchLength() const = 0;
|
||||
|
||||
// max(ByteTrie::kMaxBranchLinearSubNodeLength, UCharTrie::kMaxBranchLinearSubNodeLength).
|
||||
static const int32_t kMaxBranchLinearSubNodeLength=5;
|
||||
|
||||
// Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
|
||||
// log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
|
||||
static const int32_t kMaxSplitBranchLevels=14;
|
||||
|
||||
/**
|
||||
* Makes sure that there is only one unique node registered that is
|
||||
* equivalent to newNode.
|
||||
@ -81,8 +116,6 @@ protected:
|
||||
* a Node pointer, or before setting a new UErrorCode.
|
||||
*/
|
||||
|
||||
virtual Node *createFinalValueNode(int32_t value) const = 0;
|
||||
|
||||
// Hash set of nodes, maps from nodes to integer 1.
|
||||
UHashtable *nodes;
|
||||
|
||||
@ -146,12 +179,17 @@ protected:
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
};
|
||||
|
||||
// This class should not be overridden because
|
||||
// registerFinalValue() compares a stack-allocated FinalValueNode
|
||||
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
|
||||
// with the input node, and the
|
||||
// !Node::operator==(other) used inside FinalValueNode::operator==(other)
|
||||
// will be false if the typeid's are different.
|
||||
class FinalValueNode : public Node {
|
||||
public:
|
||||
FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
// Dummy default implementation, must be overridden for real writing.
|
||||
virtual void write(DictTrieBuilder & /*builder*/) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
protected:
|
||||
int32_t value;
|
||||
};
|
||||
@ -170,6 +208,17 @@ protected:
|
||||
int32_t value;
|
||||
};
|
||||
|
||||
// A ValueNode that holds a value plus a pointer to the node that follows it.
class IntermediateValueNode : public ValueNode {
|
||||
public:
|
||||
// The initial hash combines a class-specific constant with hashCode(nextNode);
// the value is set via setValue().
IntermediateValueNode(int32_t v, Node *nextNode)
|
||||
: ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
protected:
|
||||
// The node that follows this value.
Node *next;
|
||||
};
|
||||
|
||||
class LinearMatchNode : public ValueNode {
|
||||
public:
|
||||
LinearMatchNode(int32_t len, Node *nextNode)
|
||||
@ -194,6 +243,7 @@ protected:
|
||||
ListBranchNode() : BranchNode(0x444444), length(0) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
// Adds a unit with a final value.
|
||||
void add(int32_t c, int32_t value) {
|
||||
units[length]=(UChar)c;
|
||||
@ -211,11 +261,10 @@ protected:
|
||||
hash=(hash*37+c)*37+hashCode(node);
|
||||
}
|
||||
protected:
|
||||
// TODO: 10 -> max(BT/UCT max list lengths)
|
||||
Node *equal[10]; // NULL means "has final value".
|
||||
Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
|
||||
int32_t length;
|
||||
int32_t values[10];
|
||||
UChar units[10];
|
||||
int32_t values[kMaxBranchLinearSubNodeLength];
|
||||
UChar units[kMaxBranchLinearSubNodeLength];
|
||||
};
|
||||
|
||||
class SplitBranchNode : public BranchNode {
|
||||
@ -226,6 +275,7 @@ protected:
|
||||
unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
protected:
|
||||
UChar unit;
|
||||
Node *lessThan;
|
||||
@ -240,10 +290,24 @@ protected:
|
||||
length(len), next(subNode) {}
|
||||
virtual UBool operator==(const Node &other) const;
|
||||
virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
protected:
|
||||
int32_t length;
|
||||
Node *next; // A branch sub-node.
|
||||
};
|
||||
|
||||
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
|
||||
Node *nextNode) const = 0;
|
||||
|
||||
virtual int32_t write(int32_t byte) = 0;
|
||||
virtual int32_t writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) = 0;
|
||||
virtual int32_t writeValueAndFinal(int32_t i, UBool final) = 0;
|
||||
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
|
||||
virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
|
||||
|
||||
private:
|
||||
// No ICU "poor man's RTTI" for this class nor its subclasses.
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartriebuilder.cpp
|
||||
@ -179,17 +179,7 @@ UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result,
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return result;
|
||||
}
|
||||
if(buildOption==UDICTTRIE_BUILD_FAST) {
|
||||
writeNode(0, elementsLength, 0);
|
||||
} else /* UDICTTRIE_BUILD_SMALL */ {
|
||||
createCompactBuilder(2*elementsLength, errorCode);
|
||||
Node *root=makeNode(0, elementsLength, 0, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
root->markRightEdgesFirst(-1);
|
||||
root->write(*this);
|
||||
}
|
||||
deleteCompactBuilder();
|
||||
}
|
||||
DictTrieBuilder::build(buildOption, elementsLength, errorCode);
|
||||
if(uchars==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
} else {
|
||||
@ -198,295 +188,63 @@ UCharTrieBuilder::build(UDictTrieBuildOption buildOption, UnicodeString &result,
|
||||
return result;
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
void
|
||||
UCharTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
int32_t type;
|
||||
if(unitIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
writeValueAndFinal(value, TRUE); // final-value node
|
||||
return;
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
const UCharTrieElement &minElement=elements[start];
|
||||
const UCharTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minUnit=minElement.charAt(unitIndex, strings);
|
||||
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastUnitIndex=unitIndex;
|
||||
while(++lastUnitIndex<minStringLength &&
|
||||
minElement.charAt(lastUnitIndex, strings)==
|
||||
maxElement.charAt(lastUnitIndex, strings)) {}
|
||||
writeNode(start, limit, lastUnitIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const UChar *s=minElement.getString(strings).getBuffer();
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
while(length>UCharTrie::kMaxLinearMatchLength) {
|
||||
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
|
||||
length-=UCharTrie::kMaxLinearMatchLength;
|
||||
write(s+lastUnitIndex, UCharTrie::kMaxLinearMatchLength);
|
||||
write(UCharTrie::kMinLinearMatch+UCharTrie::kMaxLinearMatchLength-1);
|
||||
}
|
||||
write(s+unitIndex, length);
|
||||
type=UCharTrie::kMinLinearMatch+length-1;
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different units at unitIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
writeBranchSubNode(start, limit, unitIndex, length);
|
||||
if(--length<UCharTrie::kMinLinearMatch) {
|
||||
type=length;
|
||||
} else {
|
||||
write(length);
|
||||
type=0;
|
||||
}
|
||||
}
|
||||
writeValueAndType(hasValue, value, type);
|
||||
// Returns the string length of element i, as reported by getStringLength(strings).
int32_t
|
||||
UCharTrieBuilder::getElementStringLength(int32_t i) const {
|
||||
return elements[i].getStringLength(strings);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
void
|
||||
UCharTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
|
||||
UChar middleUnits[16];
|
||||
int32_t lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
UChar unit;
|
||||
do {
|
||||
unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Encode the less-than branch first.
|
||||
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
|
||||
writeBranchSubNode(start, i, unitIndex, length/2);
|
||||
lessThan[ltLength]=ucharsLength;
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int32_t starts[UCharTrie::kMaxBranchLinearSubNodeLength];
|
||||
UBool final[UCharTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
int32_t unitNumber=0;
|
||||
// Returns the unit at unitIndex of element i's string, via charAt(unitIndex, strings).
UChar
|
||||
UCharTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
|
||||
return elements[i].charAt(unitIndex, strings);
|
||||
}
|
||||
|
||||
// Returns the value stored in element i.
int32_t
|
||||
UCharTrieBuilder::getElementValue(int32_t i) const {
|
||||
return elements[i].getValue();
|
||||
}
|
||||
|
||||
// Starting after unitIndex, advances while the index is below the first element's
// string length and the first and last elements have the same unit there.
// Returns the index at which that stops.
int32_t
|
||||
UCharTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
|
||||
const UCharTrieElement &firstElement=elements[first];
|
||||
const UCharTrieElement &lastElement=elements[last];
|
||||
int32_t minStringLength=firstElement.getStringLength(strings);
|
||||
// The pre-increment skips unitIndex itself; the loop body is intentionally empty.
while(++unitIndex<minStringLength &&
|
||||
firstElement.charAt(unitIndex, strings)==
|
||||
lastElement.charAt(unitIndex, strings)) {}
|
||||
return unitIndex;
|
||||
}
|
||||
|
||||
// Counts the runs of equal units at unitIndex among the elements [start..limit[.
// The do-while reads elements[start] before any bounds test.
// NOTE(review): presumably requires start<limit — confirm with the callers.
int32_t
|
||||
UCharTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
|
||||
int32_t length=0; // Number of different units at unitIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
// Skip the remaining elements that have the same unit.
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
return length;
|
||||
}
|
||||
|
||||
int32_t
|
||||
UCharTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
|
||||
do {
|
||||
int32_t i=starts[unitNumber]=start;
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
final[unitNumber]= start==i-1 && unitIndex+1==elements[start].getStringLength(strings);
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
starts[unitNumber]=start;
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t jumpTargets[UCharTrie::kMaxBranchLinearSubNodeLength-1];
|
||||
do {
|
||||
--unitNumber;
|
||||
if(!final[unitNumber]) {
|
||||
writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
|
||||
jumpTargets[unitNumber]=ucharsLength;
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
writeNode(start, limit, unitIndex+1);
|
||||
write(elements[start].charAt(unitIndex, strings));
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
start=starts[unitNumber];
|
||||
int32_t value;
|
||||
if(final[unitNumber]) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=elements[start].getValue();
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
value=ucharsLength-jumpTargets[unitNumber];
|
||||
}
|
||||
writeValueAndFinal(value, final[unitNumber]);
|
||||
write(elements[start].charAt(unitIndex, strings));
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
writeDelta(ucharsLength-lessThan[ltLength]); // less-than
|
||||
write(middleUnits[ltLength]);
|
||||
}
|
||||
} while(--count>0);
|
||||
return i;
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
DictTrieBuilder::Node *
|
||||
UCharTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
int32_t
|
||||
UCharTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
UBool hasValue=FALSE;
|
||||
int32_t value=0;
|
||||
if(unitIndex==elements[start].getStringLength(strings)) {
|
||||
// An intermediate or final value.
|
||||
value=elements[start++].getValue();
|
||||
if(start==limit) {
|
||||
return registerFinalValue(value, errorCode);
|
||||
}
|
||||
hasValue=TRUE;
|
||||
}
|
||||
ValueNode *node;
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
const UCharTrieElement &minElement=elements[start];
|
||||
const UCharTrieElement &maxElement=elements[limit-1];
|
||||
int32_t minUnit=minElement.charAt(unitIndex, strings);
|
||||
int32_t maxUnit=maxElement.charAt(unitIndex, strings);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int32_t minStringLength=minElement.getStringLength(strings);
|
||||
int32_t lastUnitIndex=unitIndex;
|
||||
while(++lastUnitIndex<minStringLength &&
|
||||
minElement.charAt(lastUnitIndex, strings)==
|
||||
maxElement.charAt(lastUnitIndex, strings)) {}
|
||||
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
const UChar *s=minElement.getString(strings).getBuffer();
|
||||
int32_t length=lastUnitIndex-unitIndex;
|
||||
while(length>UCharTrie::kMaxLinearMatchLength) {
|
||||
lastUnitIndex-=UCharTrie::kMaxLinearMatchLength;
|
||||
length-=UCharTrie::kMaxLinearMatchLength;
|
||||
node=new UCTLinearMatchNode(
|
||||
s+lastUnitIndex,
|
||||
UCharTrie::kMaxLinearMatchLength,
|
||||
nextNode);
|
||||
node=(ValueNode *)registerNode(node, errorCode);
|
||||
nextNode=node;
|
||||
}
|
||||
node=new UCTLinearMatchNode(s+unitIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int32_t length=0; // Number of different units at unitIndex.
|
||||
int32_t i=start;
|
||||
do {
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
|
||||
node=new UCTBranchHeadNode(length, subNode);
|
||||
}
|
||||
if(hasValue && node!=NULL) {
|
||||
node->setValue(value);
|
||||
}
|
||||
return registerNode(node, errorCode);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
DictTrieBuilder::Node *
|
||||
UCharTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
|
||||
int32_t length, UErrorCode &errorCode) {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UChar middleUnits[16];
|
||||
Node *lessThan[16];
|
||||
int32_t ltLength=0;
|
||||
while(length>UCharTrie::kMaxBranchLinearSubNodeLength) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int32_t count=length/2;
|
||||
int32_t i=start;
|
||||
UChar unit;
|
||||
do {
|
||||
unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
} while(--count>0);
|
||||
// Create the less-than branch.
|
||||
unit=middleUnits[ltLength]=elements[i].charAt(unitIndex, strings); // middle unit
|
||||
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
length=length-length/2;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return NULL;
|
||||
}
|
||||
UCTListBranchNode *listNode=new UCTListBranchNode();
|
||||
if(listNode==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int32_t unitNumber=0;
|
||||
do {
|
||||
int32_t i=start;
|
||||
UChar unit=elements[i++].charAt(unitIndex, strings);
|
||||
while(unit==elements[i].charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
if(start==i-1 && unitIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add(unit, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
|
||||
}
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
UChar unit=elements[start].charAt(unitIndex, strings);
|
||||
if(start==limit-1 && unitIndex+1==elements[start].getStringLength(strings)) {
|
||||
listNode->add(unit, elements[start].getValue());
|
||||
} else {
|
||||
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
|
||||
}
|
||||
Node *node=registerNode(listNode, errorCode);
|
||||
// Create the split-branch nodes.
|
||||
while(ltLength>0) {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new UCTSplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTFinalValueNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
offset=b.writeValueAndFinal(value, TRUE);
|
||||
return i;
|
||||
}
|
||||
|
||||
UCharTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
|
||||
@ -511,76 +269,16 @@ UCharTrieBuilder::UCTLinearMatchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
b.write(s, length);
|
||||
offset=b.writeValueAndType(hasValue, value, minLinearMatch()+length-1);
|
||||
offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTListBranchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int32_t unitNumber=length-1;
|
||||
Node *rightEdge=equal[unitNumber];
|
||||
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
|
||||
do {
|
||||
--unitNumber;
|
||||
if(equal[unitNumber]!=NULL) {
|
||||
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
if(rightEdge==NULL) {
|
||||
b.writeValueAndFinal(values[unitNumber], TRUE);
|
||||
} else {
|
||||
rightEdge->write(builder);
|
||||
}
|
||||
b.write(units[unitNumber]);
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
while(--unitNumber>=0) {
|
||||
int32_t value;
|
||||
UBool isFinal;
|
||||
if(equal[unitNumber]==NULL) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
value=values[unitNumber];
|
||||
isFinal=TRUE;
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
U_ASSERT(equal[unitNumber]->getOffset()>0);
|
||||
value=b.ucharsLength-equal[unitNumber]->getOffset();
|
||||
isFinal=FALSE;
|
||||
}
|
||||
b.writeValueAndFinal(value, isFinal);
|
||||
offset=b.write(units[unitNumber]);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTSplitBranchNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
// Encode the less-than branch first.
|
||||
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
|
||||
// Encode the greater-or-equal branch last because we do not jump for it at all.
|
||||
greaterOrEqual->write(builder);
|
||||
// Write this node.
|
||||
U_ASSERT(lessThan->getOffset()>0);
|
||||
b.writeDelta(b.ucharsLength-lessThan->getOffset()); // less-than
|
||||
offset=b.write(unit);
|
||||
}
|
||||
|
||||
void
|
||||
UCharTrieBuilder::UCTBranchHeadNode::write(DictTrieBuilder &builder) {
|
||||
UCharTrieBuilder &b=(UCharTrieBuilder &)builder;
|
||||
next->write(builder);
|
||||
if(length<=minLinearMatch()) {
|
||||
offset=b.writeValueAndType(hasValue, value, length-1);
|
||||
} else {
|
||||
b.write(length-1);
|
||||
offset=b.writeValueAndType(hasValue, value, 0);
|
||||
}
|
||||
// Allocates a UCTLinearMatchNode for `length` units of element i's string,
// starting at unitIndex, and links it to nextNode.
// The node receives a pointer into the buffer returned by the element's
// getString(strings); no units are copied here.
DictTrieBuilder::Node *
UCharTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
                                        Node *nextNode) const {
    // Bound to a const reference so the string object stays alive for the
    // whole constructor call, as it did inside the original single expression.
    const UnicodeString &elementString=elements[i].getString(strings);
    const UChar *firstUnit=elementString.getBuffer()+unitIndex;
    return new UCTLinearMatchNode(firstUnit, length, nextNode);
}
|
||||
|
||||
UBool
|
||||
@ -629,6 +327,11 @@ UCharTrieBuilder::write(const UChar *s, int32_t length) {
|
||||
return ucharsLength;
|
||||
}
|
||||
|
||||
// Writes `length` units of element i's string, starting at unitIndex, via
// write(const UChar *, int32_t) and returns that call's result.
int32_t
UCharTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
    // Bound to a const reference so the string object outlives the write() call.
    const UnicodeString &elementString=elements[i].getString(strings);
    const UChar *firstUnit=elementString.getBuffer()+unitIndex;
    return write(firstUnit, length);
}
|
||||
|
||||
int32_t
|
||||
UCharTrieBuilder::writeValueAndFinal(int32_t i, UBool final) {
|
||||
UChar intUnits[3];
|
||||
@ -675,7 +378,8 @@ UCharTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node)
|
||||
}
|
||||
|
||||
int32_t
|
||||
UCharTrieBuilder::writeDelta(int32_t i) {
|
||||
UCharTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
|
||||
int32_t i=ucharsLength-jumpTarget;
|
||||
UChar intUnits[3];
|
||||
int32_t length;
|
||||
U_ASSERT(i>=0);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Copyright (C) 2010-2011, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: uchartriebuilder.h
|
||||
@ -45,33 +45,21 @@ public:
|
||||
}
|
||||
|
||||
private:
|
||||
void writeNode(int32_t start, int32_t limit, int32_t unitIndex);
|
||||
void writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
|
||||
virtual int32_t getElementStringLength(int32_t i) const;
|
||||
virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const;
|
||||
virtual int32_t getElementValue(int32_t i) const;
|
||||
|
||||
Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
|
||||
Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
|
||||
int32_t length, UErrorCode &errorCode);
|
||||
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const;
|
||||
|
||||
UBool ensureCapacity(int32_t length);
|
||||
int32_t write(int32_t unit);
|
||||
int32_t write(const UChar *s, int32_t length);
|
||||
int32_t writeValueAndFinal(int32_t i, UBool final);
|
||||
int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
|
||||
int32_t writeDelta(int32_t i);
|
||||
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const;
|
||||
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const;
|
||||
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const;
|
||||
|
||||
// Compacting builder.
|
||||
// Always answers TRUE for this builder.
virtual UBool matchNodesCanHaveValues() const {
    return TRUE;
}
|
||||
|
||||
// Indirect "friend" access.
|
||||
// Nested classes cannot be friends of UCharTrie unless the whole header is included,
|
||||
// at least with AIX xlC_r,
|
||||
// so this Builder class, which is a friend, provides the necessary value.
|
||||
static int32_t minLinearMatch() { return UCharTrie::kMinLinearMatch; }
|
||||
|
||||
class UCTFinalValueNode : public FinalValueNode {
|
||||
public:
|
||||
UCTFinalValueNode(int32_t v) : FinalValueNode(v) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
// Reports UCharTrie::kMaxBranchLinearSubNodeLength.
virtual int32_t getMaxBranchLinearSubNodeLength() const {
    return UCharTrie::kMaxBranchLinearSubNodeLength;
}
|
||||
// Reports UCharTrie::kMinLinearMatch.
virtual int32_t getMinLinearMatch() const {
    return UCharTrie::kMinLinearMatch;
}
|
||||
// Reports UCharTrie::kMaxLinearMatchLength.
virtual int32_t getMaxLinearMatchLength() const {
    return UCharTrie::kMaxLinearMatchLength;
}
|
||||
|
||||
class UCTLinearMatchNode : public LinearMatchNode {
|
||||
public:
|
||||
@ -82,26 +70,16 @@ private:
|
||||
const UChar *s;
|
||||
};
|
||||
|
||||
class UCTListBranchNode : public ListBranchNode {
|
||||
public:
|
||||
UCTListBranchNode() : ListBranchNode() {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
|
||||
Node *nextNode) const;
|
||||
|
||||
class UCTSplitBranchNode : public SplitBranchNode {
|
||||
public:
|
||||
UCTSplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
|
||||
: SplitBranchNode(middleUnit, lessThanNode, greaterOrEqualNode) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
|
||||
class UCTBranchHeadNode : public BranchHeadNode {
|
||||
public:
|
||||
UCTBranchHeadNode(int32_t len, Node *subNode) : BranchHeadNode(len, subNode) {}
|
||||
virtual void write(DictTrieBuilder &builder);
|
||||
};
|
||||
|
||||
// Allocates a UCTFinalValueNode holding `value` and returns it as a Node pointer.
// NOTE(review): presumably the result can be NULL when allocation fails, as
// other `new` results in this builder are NULL-checked -- confirm at call sites.
virtual Node *createFinalValueNode(int32_t value) const {
    return new UCTFinalValueNode(value);
}
|
||||
UBool ensureCapacity(int32_t length);
|
||||
virtual int32_t write(int32_t unit);
|
||||
int32_t write(const UChar *s, int32_t length);
|
||||
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length);
|
||||
virtual int32_t writeValueAndFinal(int32_t i, UBool final);
|
||||
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node);
|
||||
virtual int32_t writeDeltaTo(int32_t jumpTarget);
|
||||
|
||||
UnicodeString strings;
|
||||
UCharTrieElement *elements;
|
||||
|
Loading…
Reference in New Issue
Block a user