scuffed-code/icu4c/source/tools/toolutil/dicttriebuilder.cpp
2011-01-02 07:22:36 +00:00

622 lines
20 KiB
C++

/*
*******************************************************************************
* Copyright (C) 2010-2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: dicttriebuilder.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010dec24
* created by: Markus W. Scherer
*
* Base class for dictionary-trie builder classes.
*/
#include <typeinfo> // for 'typeid' to work
#include "unicode/utypes.h"
#include "dicttriebuilder.h"
#include "uassert.h"
#include "uhash.h"
U_CDECL_BEGIN
static int32_t U_CALLCONV
hashDictTrieNode(const UHashTok key) {
return U_NAMESPACE_QUALIFIER DictTrieBuilder::hashNode(key.pointer);
}
static UBool U_CALLCONV
equalDictTrieNodes(const UHashTok key1, const UHashTok key2) {
return U_NAMESPACE_QUALIFIER DictTrieBuilder::equalNodes(key1.pointer, key2.pointer);
}
U_CDECL_END
U_NAMESPACE_BEGIN
DictTrieBuilder::DictTrieBuilder() : nodes(NULL) {}
DictTrieBuilder::~DictTrieBuilder() {
deleteCompactBuilder();
}
void
DictTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return;
}
nodes=uhash_openSize(hashDictTrieNode, equalDictTrieNodes, NULL,
sizeGuess, &errorCode);
if(U_SUCCESS(errorCode) && nodes==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
if(U_SUCCESS(errorCode)) {
uhash_setKeyDeleter(nodes, uhash_deleteUObject);
}
}
void
DictTrieBuilder::deleteCompactBuilder() {
uhash_close(nodes);
nodes=NULL;
}
void
DictTrieBuilder::build(UDictTrieBuildOption buildOption, int32_t elementsLength,
UErrorCode &errorCode) {
if(buildOption==UDICTTRIE_BUILD_FAST) {
writeNode(0, elementsLength, 0);
} else /* UDICTTRIE_BUILD_SMALL */ {
createCompactBuilder(2*elementsLength, errorCode);
Node *root=makeNode(0, elementsLength, 0, errorCode);
if(U_SUCCESS(errorCode)) {
root->markRightEdgesFirst(-1);
root->write(*this);
}
deleteCompactBuilder();
}
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
int32_t
DictTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
UBool hasValue=FALSE;
int32_t value=0;
int32_t type;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
return writeValueAndFinal(value, TRUE); // final-value node
}
hasValue=TRUE;
}
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
writeNode(start, limit, lastUnitIndex);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
write(getMinLinearMatch()+maxLinearMatchLength-1);
}
writeElementUnits(start, unitIndex, length);
type=getMinLinearMatch()+length-1;
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
writeBranchSubNode(start, limit, unitIndex, length);
if(--length<getMinLinearMatch()) {
type=length;
} else {
write(length);
type=0;
}
}
return writeValueAndType(hasValue, value, type);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
int32_t
DictTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
UChar middleUnits[kMaxSplitBranchLevels];
int32_t lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Encode the less-than branch first.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t starts[kMaxBranchLinearSubNodeLength];
UBool final[kMaxBranchLinearSubNodeLength-1];
int32_t unitNumber=0;
do {
int32_t i=starts[unitNumber]=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
final[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
starts[unitNumber]=start;
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
do {
--unitNumber;
if(!final[unitNumber]) {
jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
writeNode(start, limit, unitIndex+1);
int32_t offset=write(getElementUnit(start, unitIndex));
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
start=starts[unitNumber];
int32_t value;
if(final[unitNumber]) {
// Write the final value for the one string ending with this unit.
value=getElementValue(start);
} else {
// Write the delta to the start position of the sub-node.
value=offset-jumpTargets[unitNumber];
}
writeValueAndFinal(value, final[unitNumber]);
offset=write(getElementUnit(start, unitIndex));
}
// Write the split-branch nodes.
while(ltLength>0) {
--ltLength;
writeDeltaTo(lessThan[ltLength]);
offset=write(middleUnits[ltLength]);
}
return offset;
}
// Requires start<limit,
// and all strings of the [start..limit[ elements must be sorted and
// have a common prefix of length unitIndex.
DictTrieBuilder::Node *
DictTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UBool hasValue=FALSE;
int32_t value=0;
if(unitIndex==getElementStringLength(start)) {
// An intermediate or final value.
value=getElementValue(start++);
if(start==limit) {
return registerFinalValue(value, errorCode);
}
hasValue=TRUE;
}
Node *node;
// Now all [start..limit[ strings are longer than unitIndex.
int32_t minUnit=getElementUnit(start, unitIndex);
int32_t maxUnit=getElementUnit(limit-1, unitIndex);
if(minUnit==maxUnit) {
// Linear-match node: All strings have the same character at unitIndex.
int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
int32_t length=lastUnitIndex-unitIndex;
int32_t maxLinearMatchLength=getMaxLinearMatchLength();
while(length>maxLinearMatchLength) {
lastUnitIndex-=maxLinearMatchLength;
length-=maxLinearMatchLength;
node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
nextNode=registerNode(node, errorCode);
}
node=createLinearMatchNode(start, unitIndex, length, nextNode);
} else {
// Branch node.
int32_t length=countElementUnits(start, limit, unitIndex);
// length>=2 because minUnit!=maxUnit.
Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
node=new BranchHeadNode(length, subNode);
}
if(hasValue && node!=NULL) {
if(matchNodesCanHaveValues()) {
((ValueNode *)node)->setValue(value);
} else {
node=new IntermediateValueNode(value, registerNode(node, errorCode));
}
}
return registerNode(node, errorCode);
}
// start<limit && all strings longer than unitIndex &&
// length different units at unitIndex
DictTrieBuilder::Node *
DictTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
int32_t length, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
UChar middleUnits[kMaxSplitBranchLevels];
Node *lessThan[kMaxSplitBranchLevels];
int32_t ltLength=0;
while(length>getMaxBranchLinearSubNodeLength()) {
// Branch on the middle unit.
// First, find the middle unit.
int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
// Create the less-than branch.
middleUnits[ltLength]=getElementUnit(i, unitIndex); // middle unit
lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
++ltLength;
// Continue for the greater-or-equal branch.
start=i;
length=length-length/2;
}
if(U_FAILURE(errorCode)) {
return NULL;
}
ListBranchNode *listNode=new ListBranchNode();
if(listNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// For each unit, find its elements array start and whether it has a final value.
int32_t unitNumber=0;
do {
int32_t i=start;
UChar unit=getElementUnit(i++, unitIndex);
i=indexOfElementWithNextUnit(i, unitIndex, unit);
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
}
start=i;
} while(++unitNumber<length-1);
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
UChar unit=getElementUnit(start, unitIndex);
if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
listNode->add(unit, getElementValue(start));
} else {
listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
}
Node *node=registerNode(listNode, errorCode);
// Create the split-branch nodes.
while(ltLength>0) {
--ltLength;
node=registerNode(
new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
}
return node;
}
DictTrieBuilder::Node *
DictTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
const UHashElement *old=uhash_find(nodes, newNode);
if(old!=NULL) {
delete newNode;
return (Node *)old->key.pointer;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
return newNode;
}
DictTrieBuilder::Node *
DictTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) {
return NULL;
}
FinalValueNode key(value);
const UHashElement *old=uhash_find(nodes, &key);
if(old!=NULL) {
return (Node *)old->key.pointer;
}
Node *newNode=new FinalValueNode(value);
if(newNode==NULL) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// If uhash_puti() returns a non-zero value from an equivalent, previously
// registered node, then uhash_find() failed to find that and we will leak newNode.
#if !U_RELEASE
int32_t oldValue= // Only in debug mode to avoid a compiler warning about unused oldValue.
#endif
uhash_puti(nodes, newNode, 1, &errorCode);
U_ASSERT(oldValue==0);
if(U_FAILURE(errorCode)) {
delete newNode;
return NULL;
}
return newNode;
}
UBool
DictTrieBuilder::hashNode(const void *node) {
return ((const Node *)node)->hashCode();
}
UBool
DictTrieBuilder::equalNodes(const void *left, const void *right) {
return *(const Node *)left==*(const Node *)right;
}
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder)
UBool
DictTrieBuilder::Node::operator==(const Node &other) const {
return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
}
int32_t
DictTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber;
}
return edgeNumber;
}
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(DictTrieBuilder::Node)
UBool
DictTrieBuilder::FinalValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const FinalValueNode &o=(const FinalValueNode &)other;
return value==o.value;
}
void
DictTrieBuilder::FinalValueNode::write(DictTrieBuilder &builder) {
offset=builder.writeValueAndFinal(value, TRUE);
}
UBool
DictTrieBuilder::ValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const ValueNode &o=(const ValueNode &)other;
return hasValue==o.hasValue && (!hasValue || value==o.value);
}
UBool
DictTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const IntermediateValueNode &o=(const IntermediateValueNode &)other;
return next==o.next;
}
int32_t
DictTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
void
DictTrieBuilder::IntermediateValueNode::write(DictTrieBuilder &builder) {
next->write(builder);
offset=builder.writeValueAndFinal(value, FALSE);
}
UBool
DictTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const LinearMatchNode &o=(const LinearMatchNode &)other;
return length==o.length && next==o.next;
}
int32_t
DictTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
UBool
DictTrieBuilder::ListBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const ListBranchNode &o=(const ListBranchNode &)other;
for(int32_t i=0; i<length; ++i) {
if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
return FALSE;
}
}
return TRUE;
}
int32_t
DictTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
int32_t step=0;
int32_t i=length;
do {
Node *edge=equal[--i];
if(edge!=NULL) {
edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
}
// For all but the rightmost edge, decrement the edge number.
step=1;
} while(i>0);
offset=edgeNumber;
}
return edgeNumber;
}
void
DictTrieBuilder::ListBranchNode::write(DictTrieBuilder &builder) {
// Write the sub-nodes in reverse order: The jump lengths are deltas from
// after their own positions, so if we wrote the minUnit sub-node first,
// then its jump delta would be larger.
// Instead we write the minUnit sub-node last, for a shorter delta.
int32_t unitNumber=length-1;
Node *rightEdge=equal[unitNumber];
int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
do {
--unitNumber;
if(equal[unitNumber]!=NULL) {
equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
}
} while(unitNumber>0);
// The maxUnit sub-node is written as the very last one because we do
// not jump for it at all.
unitNumber=length-1;
if(rightEdge==NULL) {
builder.writeValueAndFinal(values[unitNumber], TRUE);
} else {
rightEdge->write(builder);
}
offset=builder.write(units[unitNumber]);
// Write the rest of this node's unit-value pairs.
while(--unitNumber>=0) {
int32_t value;
UBool isFinal;
if(equal[unitNumber]==NULL) {
// Write the final value for the one string ending with this unit.
value=values[unitNumber];
isFinal=TRUE;
} else {
// Write the delta to the start position of the sub-node.
U_ASSERT(equal[unitNumber]->getOffset()>0);
value=offset-equal[unitNumber]->getOffset();
isFinal=FALSE;
}
builder.writeValueAndFinal(value, isFinal);
offset=builder.write(units[unitNumber]);
}
}
UBool
DictTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!Node::operator==(other)) {
return FALSE;
}
const SplitBranchNode &o=(const SplitBranchNode &)other;
return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
}
int32_t
DictTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
firstEdgeNumber=edgeNumber;
edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
}
return edgeNumber;
}
void
DictTrieBuilder::SplitBranchNode::write(DictTrieBuilder &builder) {
// Encode the less-than branch first.
lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
// Encode the greater-or-equal branch last because we do not jump for it at all.
greaterOrEqual->write(builder);
// Write this node.
U_ASSERT(lessThan->getOffset()>0);
builder.writeDeltaTo(lessThan->getOffset()); // less-than
offset=builder.write(unit);
}
UBool
DictTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
if(this==&other) {
return TRUE;
}
if(!ValueNode::operator==(other)) {
return FALSE;
}
const BranchHeadNode &o=(const BranchHeadNode &)other;
return length==o.length && next==o.next;
}
int32_t
DictTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
if(offset==0) {
offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
}
return edgeNumber;
}
void
DictTrieBuilder::BranchHeadNode::write(DictTrieBuilder &builder) {
next->write(builder);
if(length<=builder.getMinLinearMatch()) {
offset=builder.writeValueAndType(hasValue, value, length-1);
} else {
builder.write(length-1);
offset=builder.writeValueAndType(hasValue, value, 0);
}
}
U_NAMESPACE_END