ICU-8167 faster, smaller trie builder; adds to dynamic trie rather than list of (string, value) pairs
X-SVN-Rev: 29350
This commit is contained in:
parent
e99eaf0aca
commit
f9b8fd9ad8
@ -11,7 +11,6 @@
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Builder class for BytesTrie.
|
||||
@ -26,6 +25,20 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
*/
|
||||
public BytesTrieBuilder() {}
|
||||
|
||||
// Used in add() to wrap the bytes into a CharSequence for StringTrieBuilder.addImpl().
|
||||
private static final class BytesAsCharSequence implements CharSequence {
|
||||
public BytesAsCharSequence(byte[] sequence, int length) {
|
||||
s=sequence;
|
||||
len=length;
|
||||
}
|
||||
public char charAt(int i) { return (char)(s[i]&0xff); }
|
||||
public int length() { return len; }
|
||||
public CharSequence subSequence(int start, int end) { return null; }
|
||||
|
||||
private byte[] s;
|
||||
private int len;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a (byte sequence, value) pair.
|
||||
* The byte sequence must be unique.
|
||||
@ -37,26 +50,7 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
* @return this
|
||||
*/
|
||||
public BytesTrieBuilder add(byte[] sequence, int length, int value) {
|
||||
if(bytesLength>0) {
|
||||
// Cannot add elements after building.
|
||||
throw new IllegalStateException("Cannot add (string, value) pairs after build().");
|
||||
}
|
||||
// Binary search in the sorted elements array to find
|
||||
// a) whether the byte sequence is a duplicate, and, if not,
|
||||
// b) the insertion index.
|
||||
int start=0, limit=elements.size();
|
||||
while(start<limit) {
|
||||
int i=(start+limit)/2;
|
||||
int diff=elements.get(i).compareStringTo(sequence, length, strings);
|
||||
if(diff<0) { // elements[i]<sequence
|
||||
start=i+1;
|
||||
} else if(diff==0) {
|
||||
throw new IllegalArgumentException("Duplicate string.");
|
||||
} else { // elements[i]>sequence
|
||||
limit=i;
|
||||
}
|
||||
}
|
||||
elements.add(start, new Element(sequence, length, value, this));
|
||||
addImpl(new BytesAsCharSequence(sequence, length), value);
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -72,7 +66,7 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
* @return A new BytesTrie for the add()ed data.
|
||||
*/
|
||||
public BytesTrie build(StringTrieBuilder.Option buildOption) {
|
||||
buildImpl(buildOption);
|
||||
buildBytes(buildOption);
|
||||
return new BytesTrie(bytes, bytes.length-bytesLength);
|
||||
}
|
||||
|
||||
@ -92,27 +86,16 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
* The buffer is not read-only and array() can be called.
|
||||
*/
|
||||
public ByteBuffer buildByteBuffer(StringTrieBuilder.Option buildOption) {
|
||||
buildImpl(buildOption);
|
||||
buildBytes(buildOption);
|
||||
return ByteBuffer.wrap(bytes, bytes.length-bytesLength, bytesLength);
|
||||
}
|
||||
|
||||
private void buildImpl(StringTrieBuilder.Option buildOption) {
|
||||
if(bytesLength>0) {
|
||||
// Already built.
|
||||
return;
|
||||
}
|
||||
if(elements.isEmpty()) {
|
||||
throw new IndexOutOfBoundsException("No (string, value) pairs were added.");
|
||||
}
|
||||
private void buildBytes(StringTrieBuilder.Option buildOption) {
|
||||
// Create and byte-serialize the trie for the elements.
|
||||
int capacity=stringsLength;
|
||||
if(capacity<1024) {
|
||||
capacity=1024;
|
||||
if(bytes==null) {
|
||||
bytes=new byte[1024];
|
||||
}
|
||||
if(bytes==null || bytes.length<capacity) {
|
||||
bytes=new byte[capacity];
|
||||
}
|
||||
build(buildOption, elements.size());
|
||||
buildImpl(buildOption);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -121,167 +104,18 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
* @return this
|
||||
*/
|
||||
public BytesTrieBuilder clear() {
|
||||
stringsLength=0;
|
||||
elements.clear();
|
||||
clearImpl();
|
||||
bytes=null;
|
||||
bytesLength=0;
|
||||
return this;
|
||||
}
|
||||
|
||||
private static final class Element {
|
||||
public Element(byte[] sequence, int length, int val, BytesTrieBuilder builder) {
|
||||
if(length>0xffff) {
|
||||
// Too long: We store the length in 1 or 2 bytes.
|
||||
throw new IndexOutOfBoundsException("The maximum byte sequence length is 0xffff.");
|
||||
}
|
||||
int offset=builder.stringsLength;
|
||||
if(length>0xff) {
|
||||
offset=~offset;
|
||||
builder.stringsAppend((byte)(length>>8));
|
||||
}
|
||||
builder.stringsAppend((byte)length);
|
||||
stringOffset=offset;
|
||||
value=val;
|
||||
builder.stringsAppend(sequence, length);
|
||||
}
|
||||
|
||||
public int getStringLength(byte[] strings) /*const*/ {
|
||||
int offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
return strings[offset]&0xff;
|
||||
} else {
|
||||
offset=~offset;
|
||||
return ((strings[offset]&0xff)<<8)|(strings[offset+1]&0xff);
|
||||
}
|
||||
}
|
||||
|
||||
public byte charAt(int index, byte[] strings) /*const*/ { return strings[getStringOffset(strings)+index]; }
|
||||
|
||||
public int getValue() /*const*/ { return value; }
|
||||
|
||||
public int compareStringTo(byte[] other, int otherLength, byte[] strings) /*const*/ {
|
||||
int thisOffset=stringOffset;
|
||||
int thisLength;
|
||||
if(thisOffset>=0) {
|
||||
thisLength=strings[thisOffset++]&0xff;
|
||||
} else {
|
||||
thisOffset=~thisOffset;
|
||||
thisLength=((strings[thisOffset]&0xff)<<8)|(strings[thisOffset+1]&0xff);
|
||||
thisOffset+=2;
|
||||
}
|
||||
int lengthDiff=thisLength-otherLength;
|
||||
int commonLength;
|
||||
if(lengthDiff<=0) {
|
||||
commonLength=thisLength;
|
||||
} else {
|
||||
commonLength=otherLength;
|
||||
}
|
||||
int otherOffset=0;
|
||||
while(commonLength>0) {
|
||||
int diff=(strings[thisOffset++]&0xff)-(other[otherOffset++]&0xff);
|
||||
if(diff!=0) {
|
||||
return diff;
|
||||
}
|
||||
--commonLength;
|
||||
}
|
||||
return lengthDiff;
|
||||
}
|
||||
|
||||
private int getStringOffset(byte[] strings) /*const*/ { // C++: const char *data(strings)
|
||||
int offset=stringOffset;
|
||||
if(offset>=0) {
|
||||
++offset;
|
||||
} else {
|
||||
offset=~offset+2;
|
||||
}
|
||||
return offset;
|
||||
}
|
||||
|
||||
// If the stringOffset is non-negative, then the first strings byte contains
|
||||
// the string length.
|
||||
// If the stringOffset is negative, then the first two strings bytes contain
|
||||
// the string length (big-endian), and the offset needs to be bit-inverted.
|
||||
// (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
|
||||
private int stringOffset;
|
||||
private int value;
|
||||
};
|
||||
|
||||
protected int getElementStringLength(int i) /*const*/ {
|
||||
return elements.get(i).getStringLength(strings);
|
||||
}
|
||||
protected char getElementUnit(int i, int byteIndex) /*const*/ {
|
||||
return (char)(elements.get(i).charAt(byteIndex, strings)&0xff);
|
||||
}
|
||||
protected int getElementValue(int i) /*const*/ {
|
||||
return elements.get(i).getValue();
|
||||
}
|
||||
|
||||
protected int getLimitOfLinearMatch(int first, int last, int byteIndex) /*const*/ {
|
||||
Element firstElement=elements.get(first);
|
||||
Element lastElement=elements.get(last);
|
||||
int minStringLength=firstElement.getStringLength(strings);
|
||||
while(++byteIndex<minStringLength &&
|
||||
firstElement.charAt(byteIndex, strings)==
|
||||
lastElement.charAt(byteIndex, strings)) {}
|
||||
return byteIndex;
|
||||
}
|
||||
|
||||
protected int countElementUnits(int start, int limit, int byteIndex) /*const*/ {
|
||||
int length=0; // Number of different bytes at byteIndex.
|
||||
int i=start;
|
||||
do {
|
||||
starts_.add(i);
|
||||
byte b=elements.get(i++).charAt(byteIndex, strings);
|
||||
while(i<limit && b==elements.get(i).charAt(byteIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
return length;
|
||||
}
|
||||
|
||||
protected boolean matchNodesCanHaveValues() /*const*/ { return false; }
|
||||
|
||||
protected int getMaxBranchLinearSubNodeLength() /*const*/ { return BytesTrie.kMaxBranchLinearSubNodeLength; }
|
||||
protected int getMinLinearMatch() /*const*/ { return BytesTrie.kMinLinearMatch; }
|
||||
protected int getMaxLinearMatchLength() /*const*/ { return BytesTrie.kMaxLinearMatchLength; }
|
||||
|
||||
private static final class BTLinearMatchNode extends LinearMatchNode {
|
||||
public BTLinearMatchNode(int offset, int len, Node nextNode, BytesTrieBuilder b) {
|
||||
super(len, nextNode);
|
||||
btBuilder=b;
|
||||
stringOffset=offset;
|
||||
hash=hash*37+b.stringHashCode(offset, len);
|
||||
}
|
||||
public boolean equals(Object other) /*const*/ {
|
||||
if(this==other) {
|
||||
return true;
|
||||
}
|
||||
if(!super.equals(other)) {
|
||||
return false;
|
||||
}
|
||||
BTLinearMatchNode o=(BTLinearMatchNode)other;
|
||||
return btBuilder.stringsAreEqual(stringOffset, length, o.stringOffset, o.length);
|
||||
}
|
||||
public void write(StringTrieBuilder builder) {
|
||||
BytesTrieBuilder b=(BytesTrieBuilder)builder;
|
||||
next.write(builder);
|
||||
b.write(stringOffset, length);
|
||||
offset=b.write(b.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
private BytesTrieBuilder btBuilder;
|
||||
private int stringOffset;
|
||||
}
|
||||
|
||||
protected Node createLinearMatchNode(int i, int byteIndex, int length, Node nextNode) /*const*/ {
|
||||
return new BTLinearMatchNode(
|
||||
elements.get(i).getStringOffset(strings)+byteIndex,
|
||||
length,
|
||||
nextNode,
|
||||
this);
|
||||
}
|
||||
|
||||
private void ensureCapacity(int length) {
|
||||
if(length>bytes.length) {
|
||||
int newCapacity=bytes.length;
|
||||
@ -301,11 +135,15 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
bytes[bytes.length-bytesLength]=(byte)b;
|
||||
return bytesLength;
|
||||
}
|
||||
private int write(int offset, int length) {
|
||||
protected int write(int offset, int length) {
|
||||
int newLength=bytesLength+length;
|
||||
ensureCapacity(newLength);
|
||||
bytesLength=newLength;
|
||||
System.arraycopy(strings, offset, bytes, bytes.length-bytesLength, length);
|
||||
int bytesOffset=bytes.length-bytesLength;
|
||||
while(length>0) {
|
||||
bytes[bytesOffset++]=(byte)strings.charAt(offset++);
|
||||
--length;
|
||||
}
|
||||
return bytesLength;
|
||||
}
|
||||
private int write(byte[] b, int length) {
|
||||
@ -315,9 +153,6 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
System.arraycopy(b, 0, bytes, bytes.length-bytesLength, length);
|
||||
return bytesLength;
|
||||
}
|
||||
protected int writeElementUnits(int i, int byteIndex, int length) {
|
||||
return write(elements.get(i).getStringOffset(strings)+byteIndex, length);
|
||||
}
|
||||
|
||||
// For writeValueAndFinal() and writeDeltaTo().
|
||||
private final byte[] intBytes=new byte[5];
|
||||
@ -392,56 +227,6 @@ public final class BytesTrieBuilder extends StringTrieBuilder {
|
||||
return write(intBytes, length);
|
||||
}
|
||||
|
||||
private void ensureStringsCapacity(int length) {
|
||||
if(strings==null) {
|
||||
strings=new byte[Math.max(1024, 2*length)];
|
||||
} else if(length>strings.length) {
|
||||
byte[] newStrings=new byte[Math.max(4*strings.length, 2*length)];
|
||||
System.arraycopy(strings, 0, newStrings, 0, stringsLength);
|
||||
strings=newStrings;
|
||||
}
|
||||
}
|
||||
|
||||
private void stringsAppend(byte b) {
|
||||
ensureStringsCapacity(stringsLength+1);
|
||||
strings[stringsLength++]=b;
|
||||
}
|
||||
|
||||
private void stringsAppend(byte[] b, int length) {
|
||||
ensureStringsCapacity(stringsLength+length);
|
||||
System.arraycopy(b, 0, strings, stringsLength, length);
|
||||
stringsLength+=length;
|
||||
}
|
||||
|
||||
private boolean stringsAreEqual(int offset1, int length1, int offset2, int length2) {
|
||||
if(length1!=length2) {
|
||||
return false;
|
||||
}
|
||||
if(offset1==offset2) {
|
||||
return true;
|
||||
}
|
||||
while(length1>0) {
|
||||
if(strings[offset1++]!=strings[offset2++]) {
|
||||
return false;
|
||||
}
|
||||
--length1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private int stringHashCode(int offset, int length) {
|
||||
int hash=0;
|
||||
while(length>0) {
|
||||
hash=hash*37+(strings[offset++]&0xff);
|
||||
--length;
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
private byte[] strings;
|
||||
private int stringsLength;
|
||||
private ArrayList<Element> elements=new ArrayList<Element>();
|
||||
|
||||
// Byte serialization of the trie.
|
||||
// Grows from the back: bytesLength measures from the end of the buffer!
|
||||
private byte[] bytes;
|
||||
|
@ -11,7 +11,6 @@
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import java.nio.CharBuffer;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Builder class for CharsTrie.
|
||||
@ -36,26 +35,7 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
* @return this
|
||||
*/
|
||||
public CharsTrieBuilder add(CharSequence s, int value) {
|
||||
if(charsLength>0) {
|
||||
// Cannot add elements after building.
|
||||
throw new IllegalStateException("Cannot add (string, value) pairs after build().");
|
||||
}
|
||||
// Binary search in the sorted elements array to find
|
||||
// a) whether the string is a duplicate, and, if not,
|
||||
// b) the insertion index.
|
||||
int start=0, limit=elements.size();
|
||||
while(start<limit) {
|
||||
int i=(start+limit)/2;
|
||||
int diff=elements.get(i).compareStringTo(s, strings);
|
||||
if(diff<0) { // elements[i]<sequence
|
||||
start=i+1;
|
||||
} else if(diff==0) {
|
||||
throw new IllegalArgumentException("Duplicate string.");
|
||||
} else { // elements[i]>s
|
||||
limit=i;
|
||||
}
|
||||
}
|
||||
elements.add(start, new Element(s, value, strings));
|
||||
addImpl(s, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -84,27 +64,16 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
* @return A CharSequence with the char-serialized CharsTrie for the add()ed data.
|
||||
*/
|
||||
public CharSequence buildCharSequence(StringTrieBuilder.Option buildOption) {
|
||||
buildImpl(buildOption);
|
||||
buildChars(buildOption);
|
||||
return CharBuffer.wrap(chars, chars.length-charsLength, charsLength);
|
||||
}
|
||||
|
||||
public void buildImpl(StringTrieBuilder.Option buildOption) {
|
||||
if(charsLength>0) {
|
||||
// Already built.
|
||||
return;
|
||||
}
|
||||
if(elements.isEmpty()) {
|
||||
throw new IndexOutOfBoundsException("No (string, value) pairs were added.");
|
||||
}
|
||||
public void buildChars(StringTrieBuilder.Option buildOption) {
|
||||
// Create and char-serialize the trie for the elements.
|
||||
int capacity=strings.length();
|
||||
if(capacity<1024) {
|
||||
capacity=1024;
|
||||
if(chars==null) {
|
||||
chars=new char[1024];
|
||||
}
|
||||
if(chars==null || chars.length<capacity) {
|
||||
chars=new char[capacity];
|
||||
}
|
||||
build(buildOption, elements.size());
|
||||
buildImpl(buildOption);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -113,144 +82,18 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
* @return this
|
||||
*/
|
||||
public CharsTrieBuilder clear() {
|
||||
strings.setLength(0);
|
||||
elements.clear();
|
||||
clearImpl();
|
||||
chars=null;
|
||||
charsLength=0;
|
||||
return this;
|
||||
}
|
||||
|
||||
private static final class Element {
|
||||
public Element(CharSequence s, int val, StringBuilder strings) {
|
||||
int length=s.length();
|
||||
if(length>0xffff) {
|
||||
// Too long: We store the length in 1 unit.
|
||||
throw new IndexOutOfBoundsException("The maximum string length is 0xffff.");
|
||||
}
|
||||
stringOffset=strings.length();
|
||||
strings.append((char)length);
|
||||
value=val;
|
||||
strings.append(s);
|
||||
}
|
||||
|
||||
public int getStringLength(CharSequence strings) /*const*/ {
|
||||
return strings.charAt(stringOffset);
|
||||
}
|
||||
|
||||
public char charAt(int index, CharSequence strings) /*const*/ {
|
||||
return strings.charAt(stringOffset+1+index);
|
||||
}
|
||||
|
||||
public int getValue() /*const*/ { return value; }
|
||||
|
||||
public int compareStringTo(CharSequence other, CharSequence strings) /*const*/ {
|
||||
// TODO: Add CharSequence comparison function to UTF16 class.
|
||||
int thisOffset=stringOffset;
|
||||
int thisLength=strings.charAt(thisOffset++);
|
||||
int otherLength=other.length();
|
||||
int lengthDiff=thisLength-otherLength;
|
||||
int commonLength;
|
||||
if(lengthDiff<=0) {
|
||||
commonLength=thisLength;
|
||||
} else {
|
||||
commonLength=otherLength;
|
||||
}
|
||||
int otherOffset=0;
|
||||
while(commonLength>0) {
|
||||
int diff=(int)strings.charAt(thisOffset++)-(int)other.charAt(otherOffset++);
|
||||
if(diff!=0) {
|
||||
return diff;
|
||||
}
|
||||
--commonLength;
|
||||
}
|
||||
return lengthDiff;
|
||||
}
|
||||
|
||||
private int getStringOffset(CharSequence strings) /*const*/ { return stringOffset+1; }
|
||||
|
||||
// The first strings unit contains the string length.
|
||||
// (Compared with a stringLength field here, this saves 2 bytes per string.)
|
||||
private int stringOffset;
|
||||
private int value;
|
||||
}
|
||||
|
||||
protected int getElementStringLength(int i) /*const*/ {
|
||||
return elements.get(i).getStringLength(strings);
|
||||
}
|
||||
protected char getElementUnit(int i, int unitIndex) /*const*/ {
|
||||
return elements.get(i).charAt(unitIndex, strings);
|
||||
}
|
||||
protected int getElementValue(int i) /*const*/ {
|
||||
return elements.get(i).getValue();
|
||||
}
|
||||
|
||||
protected int getLimitOfLinearMatch(int first, int last, int unitIndex) /*const*/ {
|
||||
Element firstElement=elements.get(first);
|
||||
Element lastElement=elements.get(last);
|
||||
int minStringLength=firstElement.getStringLength(strings);
|
||||
while(++unitIndex<minStringLength &&
|
||||
firstElement.charAt(unitIndex, strings)==
|
||||
lastElement.charAt(unitIndex, strings)) {}
|
||||
return unitIndex;
|
||||
}
|
||||
|
||||
protected int countElementUnits(int start, int limit, int unitIndex) /*const*/ {
|
||||
int length=0; // Number of different units at unitIndex.
|
||||
int i=start;
|
||||
do {
|
||||
starts_.add(i);
|
||||
char unit=elements.get(i++).charAt(unitIndex, strings);
|
||||
while(i<limit && unit==elements.get(i).charAt(unitIndex, strings)) {
|
||||
++i;
|
||||
}
|
||||
++length;
|
||||
} while(i<limit);
|
||||
return length;
|
||||
}
|
||||
|
||||
protected boolean matchNodesCanHaveValues() /*const*/ { return true; }
|
||||
|
||||
protected int getMaxBranchLinearSubNodeLength() /*const*/ { return CharsTrie.kMaxBranchLinearSubNodeLength; }
|
||||
protected int getMinLinearMatch() /*const*/ { return CharsTrie.kMinLinearMatch; }
|
||||
protected int getMaxLinearMatchLength() /*const*/ { return CharsTrie.kMaxLinearMatchLength; }
|
||||
|
||||
private static final class UCTLinearMatchNode extends LinearMatchNode {
|
||||
public UCTLinearMatchNode(int offset, int len, Node nextNode, CharsTrieBuilder b) {
|
||||
super(len, nextNode);
|
||||
ctBuilder=b;
|
||||
stringOffset=offset;
|
||||
hash=hash*37+b.stringHashCode(offset, len);
|
||||
}
|
||||
public boolean equals(Object other) /*const*/ {
|
||||
if(this==other) {
|
||||
return true;
|
||||
}
|
||||
if(!super.equals(other)) {
|
||||
return false;
|
||||
}
|
||||
UCTLinearMatchNode o=(UCTLinearMatchNode)other;
|
||||
return ctBuilder.stringsAreEqual(stringOffset, length, o.stringOffset, o.length);
|
||||
}
|
||||
public void write(StringTrieBuilder builder) {
|
||||
CharsTrieBuilder b=(CharsTrieBuilder)builder;
|
||||
next.write(builder);
|
||||
b.write(stringOffset, length);
|
||||
offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
private CharsTrieBuilder ctBuilder;
|
||||
private int stringOffset;
|
||||
};
|
||||
|
||||
protected Node createLinearMatchNode(int i, int unitIndex, int length,
|
||||
Node nextNode) /*const*/ {
|
||||
return new UCTLinearMatchNode(
|
||||
elements.get(i).getStringOffset(strings)+unitIndex,
|
||||
length,
|
||||
nextNode,
|
||||
this);
|
||||
}
|
||||
|
||||
private void ensureCapacity(int length) {
|
||||
if(length>chars.length) {
|
||||
int newCapacity=chars.length;
|
||||
@ -270,7 +113,7 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
chars[chars.length-charsLength]=(char)unit;
|
||||
return charsLength;
|
||||
}
|
||||
private int write(int offset, int length) {
|
||||
protected int write(int offset, int length) {
|
||||
int newLength=charsLength+length;
|
||||
ensureCapacity(newLength);
|
||||
charsLength=newLength;
|
||||
@ -288,9 +131,6 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
System.arraycopy(s, 0, chars, chars.length-charsLength, length);
|
||||
return charsLength;
|
||||
}
|
||||
protected int writeElementUnits(int i, int unitIndex, int length) {
|
||||
return write(elements.get(i).getStringOffset(strings)+unitIndex, length);
|
||||
}
|
||||
|
||||
// For writeValueAndFinal(), writeValueAndType() and writeDeltaTo().
|
||||
private final char[] intUnits=new char[3];
|
||||
@ -356,35 +196,6 @@ public final class CharsTrieBuilder extends StringTrieBuilder {
|
||||
return write(intUnits, length);
|
||||
}
|
||||
|
||||
private boolean stringsAreEqual(int offset1, int length1, int offset2, int length2) {
|
||||
// TODO: Make/use public function; see Normalizer2Impl.UTF16Plus.
|
||||
if(length1!=length2) {
|
||||
return false;
|
||||
}
|
||||
if(offset1==offset2) {
|
||||
return true;
|
||||
}
|
||||
while(length1>0) {
|
||||
if(strings.charAt(offset1++)!=strings.charAt(offset2++)) {
|
||||
return false;
|
||||
}
|
||||
--length1;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private int stringHashCode(int offset, int length) {
|
||||
int hash=0;
|
||||
while(length>0) {
|
||||
hash=hash*37+strings.charAt(offset++);
|
||||
--length;
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
private StringBuilder strings=new StringBuilder();
|
||||
private ArrayList<Element> elements=new ArrayList<Element>();
|
||||
|
||||
// char serialization of the trie.
|
||||
// Grows from the back: charsLength measures from the end of the buffer!
|
||||
private char[] chars;
|
||||
|
@ -42,282 +42,60 @@ public abstract class StringTrieBuilder {
|
||||
|
||||
protected StringTrieBuilder() {}
|
||||
|
||||
protected final void createCompactBuilder(int sizeGuess) {
|
||||
nodes=new HashMap<Node, Node>(sizeGuess);
|
||||
lookupFinalValueNode=new FinalValueNode(0);
|
||||
}
|
||||
protected final void deleteCompactBuilder() {
|
||||
nodes=null;
|
||||
lookupFinalValueNode=null;
|
||||
}
|
||||
|
||||
protected final void build(Option buildOption, int elementsLength) {
|
||||
if(buildOption==Option.FAST) {
|
||||
writeNode(0, elementsLength, 0);
|
||||
} else /* Option.SMALL */ {
|
||||
createCompactBuilder(2*elementsLength);
|
||||
Node root=makeNode(0, elementsLength, 0);
|
||||
root.markRightEdgesFirst(-1);
|
||||
root.write(this);
|
||||
deleteCompactBuilder();
|
||||
protected void addImpl(CharSequence s, int value) {
|
||||
if(state!=State.ADDING) {
|
||||
// Cannot add elements after building.
|
||||
throw new IllegalStateException("Cannot add (string, value) pairs after build().");
|
||||
}
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
protected int writeNode(int start, int limit, int unitIndex) {
|
||||
boolean hasValue=false;
|
||||
int value=0;
|
||||
int type;
|
||||
if(unitIndex==getElementStringLength(start)) {
|
||||
// An intermediate or final value.
|
||||
value=getElementValue(start++);
|
||||
if(start==limit) {
|
||||
return writeValueAndFinal(value, true); // final-value node
|
||||
}
|
||||
hasValue=true;
|
||||
if(s.length()>0xffff) {
|
||||
// Too long: Limited by iterator internals, and by builder recursion depth.
|
||||
throw new IndexOutOfBoundsException("The maximum string length is 0xffff.");
|
||||
}
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
int minUnit=getElementUnit(start, unitIndex);
|
||||
int maxUnit=getElementUnit(limit-1, unitIndex);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
|
||||
writeNode(start, limit, lastUnitIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
int length=lastUnitIndex-unitIndex;
|
||||
int maxLinearMatchLength=getMaxLinearMatchLength();
|
||||
while(length>maxLinearMatchLength) {
|
||||
lastUnitIndex-=maxLinearMatchLength;
|
||||
length-=maxLinearMatchLength;
|
||||
writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
|
||||
write(getMinLinearMatch()+maxLinearMatchLength-1);
|
||||
}
|
||||
writeElementUnits(start, unitIndex, length);
|
||||
type=getMinLinearMatch()+length-1;
|
||||
if(root==null) {
|
||||
root=createSuffixNode(s, 0, value);
|
||||
} else {
|
||||
// Branch node.
|
||||
int startsOffset=starts_.size();
|
||||
int length=countElementUnits(start, limit, unitIndex);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
writeBranchSubNode(start, limit, unitIndex, length, startsOffset);
|
||||
starts_.truncate(startsOffset);
|
||||
if(--length<getMinLinearMatch()) {
|
||||
type=length;
|
||||
} else {
|
||||
write(length);
|
||||
type=0;
|
||||
}
|
||||
root=root.add(this, s, 0, value);
|
||||
}
|
||||
return writeValueAndType(hasValue, value, type);
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
protected int writeBranchSubNode(int start, int limit, int unitIndex, int length,
|
||||
int startsOffset) {
|
||||
int ltLength=0;
|
||||
while(length>getMaxBranchLinearSubNodeLength()) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int i=starts_.get(startsOffset+length/2);
|
||||
// Encode the less-than branch first.
|
||||
middleUnits_.append(getElementUnit(i, unitIndex)); // middle unit
|
||||
jumpTargets_.add(writeBranchSubNode(start, i, unitIndex, length/2, startsOffset));
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
startsOffset+=length/2;
|
||||
length=length-length/2;
|
||||
}
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int isFinal=0;
|
||||
int unitNumber=0;
|
||||
do {
|
||||
int i=starts_.get(startsOffset+unitNumber+1);
|
||||
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
|
||||
isFinal|=(1<<unitNumber);
|
||||
protected final void buildImpl(Option buildOption) {
|
||||
switch(state) {
|
||||
case ADDING:
|
||||
if(root==null) {
|
||||
throw new IndexOutOfBoundsException("No (string, value) pairs were added.");
|
||||
}
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
|
||||
// Write the sub-nodes in reverse order: The jump lengths are deltas from
|
||||
// after their own positions, so if we wrote the minUnit sub-node first,
|
||||
// then its jump delta would be larger.
|
||||
// Instead we write the minUnit sub-node last, for a shorter delta.
|
||||
int jumpTargetsOffset=jumpTargets_.size();
|
||||
do {
|
||||
--unitNumber;
|
||||
if((isFinal&(1<<unitNumber))==0) {
|
||||
jumpTargets_.add(writeNode(starts_.get(startsOffset+unitNumber),
|
||||
starts_.get(startsOffset+unitNumber+1), unitIndex+1));
|
||||
}
|
||||
} while(unitNumber>0);
|
||||
// The maxUnit sub-node is written as the very last one because we do
|
||||
// not jump for it at all.
|
||||
unitNumber=length-1;
|
||||
writeNode(start, limit, unitIndex+1);
|
||||
int offset=write(getElementUnit(start, unitIndex));
|
||||
// Write the rest of this node's unit-value pairs.
|
||||
int jumpTargetsIndex=jumpTargetsOffset;
|
||||
while(--unitNumber>=0) {
|
||||
start=starts_.get(startsOffset+unitNumber);
|
||||
if((isFinal&(1<<unitNumber))!=0) {
|
||||
// Write the final value for the one string ending with this unit.
|
||||
writeValueAndFinal(getElementValue(start), true);
|
||||
} else {
|
||||
// Write the delta to the start position of the sub-node.
|
||||
writeValueAndFinal(offset-jumpTargets_.get(jumpTargetsIndex++), false);
|
||||
}
|
||||
offset=write(getElementUnit(start, unitIndex));
|
||||
state=State.BUILDING;
|
||||
break;
|
||||
case BUILDING:
|
||||
// Building must have failed.
|
||||
throw new IndexOutOfBoundsException("Builder failed and must be clear()ed.");
|
||||
case BUILT:
|
||||
return; // Nothing more to do.
|
||||
}
|
||||
// Write the split-branch nodes.
|
||||
int lessThanIndex=jumpTargetsOffset;
|
||||
if(ltLength>0) {
|
||||
int middleUnitsIndex=middleUnits_.length();
|
||||
do {
|
||||
--ltLength;
|
||||
writeDeltaTo(jumpTargets_.get(--lessThanIndex));
|
||||
offset=write(middleUnits_.charAt(--middleUnitsIndex));
|
||||
} while(ltLength>0);
|
||||
middleUnits_.setLength(middleUnitsIndex);
|
||||
}
|
||||
jumpTargets_.truncate(lessThanIndex);
|
||||
return offset;
|
||||
// Note: Building should be a little faster if we skipped registerNode() calls
|
||||
// (or make it return the input node without checking whether it is unique).
|
||||
// We would still need to fix-up linear-match nodes (for their maximum length)
|
||||
// and branch nodes (turning dynamic branch nodes into trees of runtime-equivalent nodes),
|
||||
// but the HashMap/hashCode()/equals() would be omitted.
|
||||
//
|
||||
// Drawbacks:
|
||||
// a) Requires an API option, and users having to think about the choice.
|
||||
// b) Requires additional testing. (And unit tests take more time.)
|
||||
// c) Somewhat more code and complexity.
|
||||
// d) Sometimes larger trie serializations when duplicates could be removed.
|
||||
root=root.register(this);
|
||||
root.markRightEdgesFirst(-1);
|
||||
root.write(this);
|
||||
state=State.BUILT;
|
||||
}
|
||||
|
||||
// Requires start<limit,
|
||||
// and all strings of the [start..limit[ elements must be sorted and
|
||||
// have a common prefix of length unitIndex.
|
||||
protected Node makeNode(int start, int limit, int unitIndex) {
|
||||
boolean hasValue=false;
|
||||
int value=0;
|
||||
if(unitIndex==getElementStringLength(start)) {
|
||||
// An intermediate or final value.
|
||||
value=getElementValue(start++);
|
||||
if(start==limit) {
|
||||
return registerFinalValue(value);
|
||||
}
|
||||
hasValue=true;
|
||||
}
|
||||
Node node;
|
||||
// Now all [start..limit[ strings are longer than unitIndex.
|
||||
int minUnit=getElementUnit(start, unitIndex);
|
||||
int maxUnit=getElementUnit(limit-1, unitIndex);
|
||||
if(minUnit==maxUnit) {
|
||||
// Linear-match node: All strings have the same character at unitIndex.
|
||||
int lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
|
||||
Node nextNode=makeNode(start, limit, lastUnitIndex);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
int length=lastUnitIndex-unitIndex;
|
||||
int maxLinearMatchLength=getMaxLinearMatchLength();
|
||||
while(length>maxLinearMatchLength) {
|
||||
lastUnitIndex-=maxLinearMatchLength;
|
||||
length-=maxLinearMatchLength;
|
||||
node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
|
||||
nextNode=registerNode(node);
|
||||
}
|
||||
node=createLinearMatchNode(start, unitIndex, length, nextNode);
|
||||
} else {
|
||||
// Branch node.
|
||||
int startsOffset=starts_.size();
|
||||
int length=countElementUnits(start, limit, unitIndex);
|
||||
// length>=2 because minUnit!=maxUnit.
|
||||
Node subNode=makeBranchSubNode(start, limit, unitIndex, length, startsOffset);
|
||||
starts_.truncate(startsOffset);
|
||||
node=new BranchHeadNode(length, subNode);
|
||||
}
|
||||
if(hasValue && node!=null) {
|
||||
if(matchNodesCanHaveValues()) {
|
||||
((ValueNode )node).setValue(value);
|
||||
} else {
|
||||
node=new IntermediateValueNode(value, registerNode(node));
|
||||
}
|
||||
}
|
||||
return registerNode(node);
|
||||
protected void clearImpl() {
|
||||
strings.setLength(0);
|
||||
nodes.clear();
|
||||
root=null;
|
||||
state=State.ADDING;
|
||||
}
|
||||
|
||||
// start<limit && all strings longer than unitIndex &&
|
||||
// length different units at unitIndex
|
||||
protected Node makeBranchSubNode(int start, int limit, int unitIndex,
|
||||
int length, int startsOffset) {
|
||||
int ltLength=0;
|
||||
while(length>getMaxBranchLinearSubNodeLength()) {
|
||||
// Branch on the middle unit.
|
||||
// First, find the middle unit.
|
||||
int i=starts_.get(startsOffset+length/2);
|
||||
// Create the less-than branch.
|
||||
middleUnits_.append(getElementUnit(i, unitIndex)); // middle unit
|
||||
lessThan_.add(makeBranchSubNode(start, i, unitIndex, length/2, startsOffset));
|
||||
++ltLength;
|
||||
// Continue for the greater-or-equal branch.
|
||||
start=i;
|
||||
startsOffset+=length/2;
|
||||
length=length-length/2;
|
||||
}
|
||||
ListBranchNode listNode=new ListBranchNode();
|
||||
// For each unit, find its elements array start and whether it has a final value.
|
||||
int unitNumber=0;
|
||||
do {
|
||||
char unit=getElementUnit(start, unitIndex);
|
||||
int i=starts_.get(startsOffset+unitNumber+1);
|
||||
if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
|
||||
listNode.add(unit, getElementValue(start));
|
||||
} else {
|
||||
listNode.add(unit, makeNode(start, i, unitIndex+1));
|
||||
}
|
||||
start=i;
|
||||
} while(++unitNumber<length-1);
|
||||
// unitNumber==length-1, and the maxUnit elements range is [start..limit[
|
||||
char unit=getElementUnit(start, unitIndex);
|
||||
if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
|
||||
listNode.add(unit, getElementValue(start));
|
||||
} else {
|
||||
listNode.add(unit, makeNode(start, limit, unitIndex+1));
|
||||
}
|
||||
Node node=registerNode(listNode);
|
||||
// Create the split-branch nodes.
|
||||
if(ltLength>0) {
|
||||
int lessThanIndex=lessThan_.size();
|
||||
int middleUnitsIndex=middleUnits_.length();
|
||||
do {
|
||||
--ltLength;
|
||||
node=registerNode(
|
||||
new SplitBranchNode(middleUnits_.charAt(--middleUnitsIndex),
|
||||
lessThan_.get(--lessThanIndex), node));
|
||||
} while(ltLength>0);
|
||||
lessThan_.truncate(lessThanIndex);
|
||||
middleUnits_.setLength(middleUnitsIndex);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
protected abstract int getElementStringLength(int i) /*const*/;
|
||||
protected abstract char getElementUnit(int i, int unitIndex) /*const*/;
|
||||
protected abstract int getElementValue(int i) /*const*/;
|
||||
|
||||
// Finds the first unit index after this one where
|
||||
// the first and last element have different units again.
|
||||
protected abstract int getLimitOfLinearMatch(int first, int last, int unitIndex) /*const*/;
|
||||
|
||||
// Number of different bytes at unitIndex.
|
||||
protected abstract int countElementUnits(int start, int limit, int unitIndex) /*const*/;
|
||||
|
||||
protected abstract boolean matchNodesCanHaveValues() /*const*/;
|
||||
|
||||
protected abstract int getMaxBranchLinearSubNodeLength() /*const*/;
|
||||
protected abstract int getMinLinearMatch() /*const*/;
|
||||
protected abstract int getMaxLinearMatchLength() /*const*/;
|
||||
|
||||
// max(BytesTrie::kMaxBranchLinearSubNodeLength, CharsTrie::kMaxBranchLinearSubNodeLength).
|
||||
protected static final int kMaxBranchLinearSubNodeLength=5;
|
||||
|
||||
// Maximum number of nested split-branch levels for a branch on all 2^16 possible char units.
|
||||
// log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
|
||||
protected static final int kMaxSplitBranchLevels=14;
|
||||
|
||||
/**
|
||||
* Makes sure that there is only one unique node registered that is
|
||||
* equivalent to newNode.
|
||||
@ -325,7 +103,7 @@ public abstract class StringTrieBuilder {
|
||||
* @return newNode if it is the first of its kind, or
|
||||
* an equivalent node if newNode is a duplicate.
|
||||
*/
|
||||
protected final Node registerNode(Node newNode) {
|
||||
private final Node registerNode(Node newNode) {
|
||||
Node old=nodes.get(newNode);
|
||||
if(old!=null) {
|
||||
return old;
|
||||
@ -344,36 +122,51 @@ public abstract class StringTrieBuilder {
|
||||
* @param value A final value.
|
||||
* @return A FinalValueNode with the given value.
|
||||
*/
|
||||
protected final Node registerFinalValue(int value) {
|
||||
lookupFinalValueNode.setValue(value);
|
||||
Node old=nodes.get(lookupFinalValueNode);
|
||||
if(old!=null) {
|
||||
return old;
|
||||
private final ValueNode registerFinalValue(int value) {
|
||||
lookupFinalValueNode.setFinalValue(value);
|
||||
Node oldNode=nodes.get(lookupFinalValueNode);
|
||||
if(oldNode!=null) {
|
||||
return (ValueNode)oldNode;
|
||||
}
|
||||
Node newNode=new FinalValueNode(value);
|
||||
ValueNode newNode=new ValueNode();
|
||||
newNode.setFinalValue(value);
|
||||
// If put() returns a non-null value from an equivalent, previously
|
||||
// registered node, then get() failed to find that and we will leak newNode.
|
||||
Node oldValue=nodes.put(newNode, newNode);
|
||||
assert(oldValue==null);
|
||||
oldNode=nodes.put(newNode, newNode);
|
||||
assert(oldNode==null);
|
||||
return newNode;
|
||||
}
|
||||
|
||||
// Hash set of nodes, maps from nodes to integer 1.
|
||||
protected HashMap<Node, Node> nodes;
|
||||
protected FinalValueNode lookupFinalValueNode;
|
||||
|
||||
protected static abstract class Node {
|
||||
public Node(int initialHash) {
|
||||
hash=initialHash;
|
||||
private static abstract class Node {
|
||||
public Node() {
|
||||
offset=0;
|
||||
}
|
||||
// hashCode() and equals() for use with registerNode() and the nodes hash.
|
||||
@Override
|
||||
public final int hashCode() /*const*/ { return hash; }
|
||||
public abstract int hashCode() /*const*/;
|
||||
// Base class equals() compares the actual class types.
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
return this==other || (this.getClass()==other.getClass() && hash==((Node)other).hash);
|
||||
return this==other || this.getClass()==other.getClass();
|
||||
}
|
||||
/**
|
||||
* Recursive method for adding a new (string, value) pair.
|
||||
* Matches the remaining part of s from start,
|
||||
* and adds a new node where there is a mismatch.
|
||||
* @return this or a replacement Node
|
||||
*/
|
||||
public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* Recursive method for registering unique nodes,
|
||||
* after all (string, value) pairs have been added.
|
||||
* Final-value nodes are pre-registered while add()ing (string, value) pairs.
|
||||
* Other nodes created while add()ing registerNode() themselves later
|
||||
* and might replace themselves with new types of nodes for write()ing.
|
||||
* @return The registered version of this node which implements write().
|
||||
*/
|
||||
public Node register(StringTrieBuilder builder) { return this; }
|
||||
/**
|
||||
* Traverses the Node graph and numbers branch edges, with rightmost edges first.
|
||||
* This is to avoid writing a duplicate node twice.
|
||||
@ -423,55 +216,29 @@ public abstract class StringTrieBuilder {
|
||||
}
|
||||
public final int getOffset() /*const*/ { return offset; }
|
||||
|
||||
protected int hash;
|
||||
protected int offset;
|
||||
}
|
||||
|
||||
// This class should not be overridden because
|
||||
// registerFinalValue() compares a stack-allocated FinalValueNode
|
||||
// (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
|
||||
// with the input node, and the
|
||||
// !Node::operator==(other) used inside FinalValueNode::operator==(other)
|
||||
// will be false if the typeid's are different.
|
||||
protected static final class FinalValueNode extends Node {
|
||||
public FinalValueNode(int v) {
|
||||
super(0x111111*37+v);
|
||||
// Used directly for final values, and as as a superclass for
|
||||
// match nodes with intermediate values.
|
||||
private static class ValueNode extends Node {
|
||||
public ValueNode() {}
|
||||
public final void setValue(int v) {
|
||||
assert(!hasValue);
|
||||
hasValue=true;
|
||||
value=v;
|
||||
}
|
||||
private void setFinalValue(int v) {
|
||||
hasValue=true;
|
||||
value=v;
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(this==other) {
|
||||
return true;
|
||||
public int hashCode() /*const*/ {
|
||||
int hash=0x111111;
|
||||
if(hasValue) {
|
||||
hash=hash*37+value;
|
||||
}
|
||||
if(!super.equals(other)) {
|
||||
return false;
|
||||
}
|
||||
FinalValueNode o=(FinalValueNode)other;
|
||||
return value==o.value;
|
||||
}
|
||||
@Override
|
||||
public void write(StringTrieBuilder builder) {
|
||||
offset=builder.writeValueAndFinal(value, true);
|
||||
}
|
||||
|
||||
protected int value;
|
||||
|
||||
/**
|
||||
* Must be called only by registerFinalValue() and only on the lookupFinalValueNode.
|
||||
* This is a workaround: C++ just stack-allocates a FinalValueNode
|
||||
* inside registerFinalValue().
|
||||
* In Java, we keep a FinalValueNode instance and modify it.
|
||||
* Otherwise, FinalValueNode instances are immutable.
|
||||
*/
|
||||
private void setValue(int v) {
|
||||
hash=0x111111*37+v;
|
||||
value=v;
|
||||
}
|
||||
}
|
||||
|
||||
protected static abstract class ValueNode extends Node {
|
||||
public ValueNode(int initialHash) {
|
||||
super(initialHash);
|
||||
return hash;
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
@ -484,23 +251,35 @@ public abstract class StringTrieBuilder {
|
||||
ValueNode o=(ValueNode)other;
|
||||
return hasValue==o.hasValue && (!hasValue || value==o.value);
|
||||
}
|
||||
public final void setValue(int v) {
|
||||
hasValue=true;
|
||||
value=v;
|
||||
hash=hash*37+v;
|
||||
@Override
|
||||
public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
|
||||
if(start==s.length()) {
|
||||
throw new IllegalArgumentException("Duplicate string.");
|
||||
}
|
||||
// Replace self with a node for the remaining string suffix and value.
|
||||
ValueNode node=builder.createSuffixNode(s, start, sValue);
|
||||
node.setValue(value);
|
||||
return node;
|
||||
}
|
||||
@Override
|
||||
public void write(StringTrieBuilder builder) {
|
||||
offset=builder.writeValueAndFinal(value, true);
|
||||
}
|
||||
|
||||
protected boolean hasValue;
|
||||
protected int value;
|
||||
}
|
||||
|
||||
protected static final class IntermediateValueNode extends ValueNode {
|
||||
private static final class IntermediateValueNode extends ValueNode {
|
||||
public IntermediateValueNode(int v, Node nextNode) {
|
||||
super(0x222222*37+nextNode.hashCode());
|
||||
next=nextNode;
|
||||
setValue(v);
|
||||
}
|
||||
@Override
|
||||
public int hashCode() /*const*/ {
|
||||
return (0x222222*37+value)*37+next.hashCode();
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(this==other) {
|
||||
return true;
|
||||
@ -524,16 +303,19 @@ public abstract class StringTrieBuilder {
|
||||
offset=builder.writeValueAndFinal(value, false);
|
||||
}
|
||||
|
||||
protected Node next;
|
||||
private Node next;
|
||||
}
|
||||
|
||||
protected static abstract class LinearMatchNode extends ValueNode {
|
||||
public LinearMatchNode(int len, Node nextNode) {
|
||||
super((0x333333*37+len)*37+nextNode.hashCode());
|
||||
private static final class LinearMatchNode extends ValueNode {
|
||||
public LinearMatchNode(CharSequence builderStrings, int sOffset, int len, Node nextNode) {
|
||||
strings=builderStrings;
|
||||
stringOffset=sOffset;
|
||||
length=len;
|
||||
next=nextNode;
|
||||
}
|
||||
@Override
|
||||
public int hashCode() /*const*/ { return hash; }
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(this==other) {
|
||||
return true;
|
||||
@ -542,7 +324,108 @@ public abstract class StringTrieBuilder {
|
||||
return false;
|
||||
}
|
||||
LinearMatchNode o=(LinearMatchNode)other;
|
||||
return length==o.length && next==o.next;
|
||||
if(length!=o.length || next!=o.next) {
|
||||
return false;
|
||||
}
|
||||
for(int i=stringOffset, j=o.stringOffset, limit=stringOffset+length; i<limit; ++i, ++j) {
|
||||
if(strings.charAt(i)!=strings.charAt(j)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@Override
|
||||
public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
|
||||
if(start==s.length()) {
|
||||
if(hasValue) {
|
||||
throw new IllegalArgumentException("Duplicate string.");
|
||||
} else {
|
||||
setValue(sValue);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
int limit=stringOffset+length;
|
||||
for(int i=stringOffset; i<limit; ++i, ++start) {
|
||||
if(start==s.length()) {
|
||||
// s is a prefix with a new value. Split self into two linear-match nodes.
|
||||
int prefixLength=i-stringOffset;
|
||||
LinearMatchNode suffixNode=new LinearMatchNode(strings, i, length-prefixLength, next);
|
||||
suffixNode.setValue(sValue);
|
||||
length=prefixLength;
|
||||
next=suffixNode;
|
||||
return this;
|
||||
}
|
||||
char thisChar=strings.charAt(i);
|
||||
char newChar=s.charAt(start);
|
||||
if(thisChar!=newChar) {
|
||||
// Mismatch, insert a branch node.
|
||||
DynamicBranchNode branchNode=new DynamicBranchNode();
|
||||
// Reuse this node for one of the remaining substrings, if any.
|
||||
Node result, thisSuffixNode;
|
||||
if(i==stringOffset) {
|
||||
// Mismatch on first character, turn this node into a suffix.
|
||||
if(hasValue) {
|
||||
// Move the value for prefix length "start" to the new node.
|
||||
branchNode.setValue(value);
|
||||
value=0;
|
||||
hasValue=false;
|
||||
}
|
||||
++stringOffset;
|
||||
--length;
|
||||
thisSuffixNode= length>0 ? this : next;
|
||||
// C++: if(length==0) { delete this; }
|
||||
result=branchNode;
|
||||
} else if(i==limit-1) {
|
||||
// Mismatch on last character, keep this node for the prefix.
|
||||
--length;
|
||||
thisSuffixNode=next;
|
||||
next=branchNode;
|
||||
result=this;
|
||||
} else {
|
||||
// Mismatch on intermediate character, keep this node for the prefix.
|
||||
int prefixLength=i-stringOffset;
|
||||
++i; // Suffix start offset (after thisChar).
|
||||
thisSuffixNode=new LinearMatchNode(
|
||||
strings, i, length-(prefixLength+1), next);
|
||||
length=prefixLength;
|
||||
next=branchNode;
|
||||
result=this;
|
||||
}
|
||||
ValueNode newSuffixNode=builder.createSuffixNode(s, start+1, sValue);
|
||||
branchNode.add(thisChar, thisSuffixNode);
|
||||
branchNode.add(newChar, newSuffixNode);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
// s matches all of this node's characters.
|
||||
next=next.add(builder, s, start, sValue);
|
||||
return this;
|
||||
}
|
||||
@Override
|
||||
public Node register(StringTrieBuilder builder) {
|
||||
next=next.register(builder);
|
||||
// Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
|
||||
int maxLinearMatchLength=builder.getMaxLinearMatchLength();
|
||||
while(length>maxLinearMatchLength) {
|
||||
int nextOffset=stringOffset+length-maxLinearMatchLength;
|
||||
length-=maxLinearMatchLength;
|
||||
LinearMatchNode suffixNode=
|
||||
new LinearMatchNode(strings, nextOffset, maxLinearMatchLength, next);
|
||||
suffixNode.setHashCode();
|
||||
next=builder.registerNode(suffixNode);
|
||||
}
|
||||
Node result;
|
||||
if(hasValue && !builder.matchNodesCanHaveValues()) {
|
||||
int intermediateValue=value;
|
||||
value=0;
|
||||
hasValue=false;
|
||||
setHashCode();
|
||||
result=new IntermediateValueNode(intermediateValue, builder.registerNode(this));
|
||||
} else {
|
||||
setHashCode();
|
||||
result=this;
|
||||
}
|
||||
return builder.registerNode(result);
|
||||
}
|
||||
@Override
|
||||
public int markRightEdgesFirst(int edgeNumber) {
|
||||
@ -551,22 +434,134 @@ public abstract class StringTrieBuilder {
|
||||
}
|
||||
return edgeNumber;
|
||||
}
|
||||
|
||||
protected int length;
|
||||
public Node next;
|
||||
}
|
||||
|
||||
protected static abstract class BranchNode extends Node {
|
||||
public BranchNode(int initialHash) {
|
||||
super(initialHash);
|
||||
@Override
|
||||
public void write(StringTrieBuilder builder) {
|
||||
next.write(builder);
|
||||
builder.write(stringOffset, length);
|
||||
offset=builder.writeValueAndType(hasValue, value, builder.getMinLinearMatch()+length-1);
|
||||
}
|
||||
|
||||
// Must be called just before registerNode(this).
|
||||
private void setHashCode() /*const*/ {
|
||||
hash=(0x333333*37+length)*37+next.hashCode();
|
||||
if(hasValue) {
|
||||
hash=hash*37+value;
|
||||
}
|
||||
for(int i=stringOffset, limit=stringOffset+length; i<limit; ++i) {
|
||||
hash=hash*37+strings.charAt(i);
|
||||
}
|
||||
}
|
||||
|
||||
private CharSequence strings;
|
||||
private int stringOffset;
|
||||
private int length;
|
||||
private Node next;
|
||||
private int hash;
|
||||
}
|
||||
|
||||
private static final class DynamicBranchNode extends ValueNode {
|
||||
public DynamicBranchNode() {}
|
||||
// c must not be in chars yet.
|
||||
public void add(char c, Node node) {
|
||||
int i=find(c);
|
||||
chars.insert(i, c);
|
||||
equal.add(i, node);
|
||||
}
|
||||
@Override
|
||||
public Node add(StringTrieBuilder builder, CharSequence s, int start, int sValue) {
|
||||
if(start==s.length()) {
|
||||
if(hasValue) {
|
||||
throw new IllegalArgumentException("Duplicate string.");
|
||||
} else {
|
||||
setValue(sValue);
|
||||
return this;
|
||||
}
|
||||
}
|
||||
char c=s.charAt(start++);
|
||||
int i=find(c);
|
||||
if(i<chars.length() && c==chars.charAt(i)) {
|
||||
equal.set(i, equal.get(i).add(builder, s, start, sValue));
|
||||
} else {
|
||||
chars.insert(i, c);
|
||||
equal.add(i, builder.createSuffixNode(s, start, sValue));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
@Override
|
||||
public Node register(StringTrieBuilder builder) {
|
||||
Node subNode=register(builder, 0, chars.length());
|
||||
BranchHeadNode head=new BranchHeadNode(chars.length(), subNode);
|
||||
Node result=head;
|
||||
if(hasValue) {
|
||||
if(builder.matchNodesCanHaveValues()) {
|
||||
head.setValue(value);
|
||||
} else {
|
||||
result=new IntermediateValueNode(value, builder.registerNode(head));
|
||||
}
|
||||
}
|
||||
return builder.registerNode(result);
|
||||
}
|
||||
private Node register(StringTrieBuilder builder, int start, int limit) {
|
||||
int length=limit-start;
|
||||
if(length>builder.getMaxBranchLinearSubNodeLength()) {
|
||||
// Branch on the middle unit.
|
||||
int middle=start+length/2;
|
||||
return builder.registerNode(
|
||||
new SplitBranchNode(
|
||||
chars.charAt(middle),
|
||||
register(builder, start, middle),
|
||||
register(builder, middle, limit)));
|
||||
}
|
||||
ListBranchNode listNode=new ListBranchNode(length);
|
||||
do {
|
||||
char c=chars.charAt(start);
|
||||
Node node=equal.get(start);
|
||||
if(node.getClass()==ValueNode.class) {
|
||||
// Final value.
|
||||
listNode.add(c, ((ValueNode)node).value);
|
||||
} else {
|
||||
listNode.add(c, node.register(builder));
|
||||
}
|
||||
} while(++start<limit);
|
||||
return builder.registerNode(listNode);
|
||||
}
|
||||
|
||||
private int find(char c) {
|
||||
int start=0;
|
||||
int limit=chars.length();
|
||||
while(start<limit) {
|
||||
int i=(start+limit)/2;
|
||||
char middleChar=chars.charAt(i);
|
||||
if(c<middleChar) {
|
||||
limit=i;
|
||||
} else if(c==middleChar) {
|
||||
return i;
|
||||
} else {
|
||||
start=i+1;
|
||||
}
|
||||
}
|
||||
return start;
|
||||
}
|
||||
|
||||
private StringBuilder chars=new StringBuilder();
|
||||
private ArrayList<Node> equal=new ArrayList<Node>();
|
||||
}
|
||||
|
||||
private static abstract class BranchNode extends Node {
|
||||
public BranchNode() {}
|
||||
@Override
|
||||
public int hashCode() /*const*/ { return hash; }
|
||||
|
||||
protected int hash;
|
||||
protected int firstEdgeNumber;
|
||||
}
|
||||
|
||||
protected static final class ListBranchNode extends BranchNode {
|
||||
public ListBranchNode() {
|
||||
super(0x444444);
|
||||
private static final class ListBranchNode extends BranchNode {
|
||||
public ListBranchNode(int capacity) {
|
||||
hash=0x444444*37+capacity;
|
||||
equal=new Node[capacity];
|
||||
values=new int[capacity];
|
||||
units=new char[capacity];
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
@ -661,16 +656,20 @@ public abstract class StringTrieBuilder {
|
||||
hash=(hash*37+c)*37+node.hashCode();
|
||||
}
|
||||
|
||||
protected Node[] equal=new Node[kMaxBranchLinearSubNodeLength]; // null means "has final value".
|
||||
protected int length;
|
||||
protected int[] values=new int[kMaxBranchLinearSubNodeLength];
|
||||
protected char[] units=new char[kMaxBranchLinearSubNodeLength];
|
||||
// Note: We could try to reduce memory allocations
|
||||
// by replacing these per-node arrays with per-builder ArrayLists and
|
||||
// (for units) a StringBuilder (or even use its strings for the units too).
|
||||
// It remains to be seen whether that would improve performance.
|
||||
private Node[] equal; // null means "has final value".
|
||||
private int length;
|
||||
private int[] values;
|
||||
private char[] units;
|
||||
}
|
||||
|
||||
protected static final class SplitBranchNode extends BranchNode {
|
||||
private static final class SplitBranchNode extends BranchNode {
|
||||
public SplitBranchNode(char middleUnit, Node lessThanNode, Node greaterOrEqualNode) {
|
||||
super(((0x555555*37+middleUnit)*37+
|
||||
lessThanNode.hashCode())*37+greaterOrEqualNode.hashCode());
|
||||
hash=((0x555555*37+middleUnit)*37+
|
||||
lessThanNode.hashCode())*37+greaterOrEqualNode.hashCode();
|
||||
unit=middleUnit;
|
||||
lessThan=lessThanNode;
|
||||
greaterOrEqual=greaterOrEqualNode;
|
||||
@ -707,19 +706,22 @@ public abstract class StringTrieBuilder {
|
||||
offset=builder.write(unit);
|
||||
}
|
||||
|
||||
protected char unit;
|
||||
protected Node lessThan;
|
||||
protected Node greaterOrEqual;
|
||||
private char unit;
|
||||
private Node lessThan;
|
||||
private Node greaterOrEqual;
|
||||
}
|
||||
|
||||
// Branch head node, for writing the actual node lead unit.
|
||||
protected static final class BranchHeadNode extends ValueNode {
|
||||
private static final class BranchHeadNode extends ValueNode {
|
||||
public BranchHeadNode(int len, Node subNode) {
|
||||
super((0x666666*37+len)*37+subNode.hashCode());
|
||||
length=len;
|
||||
next=subNode;
|
||||
}
|
||||
@Override
|
||||
public int hashCode() /*const*/ {
|
||||
return (0x666666*37+length)*37+next.hashCode();
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object other) {
|
||||
if(this==other) {
|
||||
return true;
|
||||
@ -748,32 +750,42 @@ public abstract class StringTrieBuilder {
|
||||
}
|
||||
}
|
||||
|
||||
protected int length;
|
||||
protected Node next; // A branch sub-node.
|
||||
private int length;
|
||||
private Node next; // A branch sub-node.
|
||||
}
|
||||
|
||||
protected abstract Node createLinearMatchNode(int i, int unitIndex, int length,
|
||||
Node nextNode) /*const*/;
|
||||
private ValueNode createSuffixNode(CharSequence s, int start, int sValue) {
|
||||
ValueNode node=registerFinalValue(sValue);
|
||||
if(start<s.length()) {
|
||||
int offset=strings.length();
|
||||
strings.append(s, start, s.length());
|
||||
node=new LinearMatchNode(strings, offset, s.length()-start, node);
|
||||
}
|
||||
return node;
|
||||
}
|
||||
|
||||
protected abstract boolean matchNodesCanHaveValues() /*const*/;
|
||||
|
||||
protected abstract int getMaxBranchLinearSubNodeLength() /*const*/;
|
||||
protected abstract int getMinLinearMatch() /*const*/;
|
||||
protected abstract int getMaxLinearMatchLength() /*const*/;
|
||||
|
||||
protected abstract int write(int unit);
|
||||
protected abstract int writeElementUnits(int i, int unitIndex, int length);
|
||||
protected abstract int write(int offset, int length);
|
||||
protected abstract int writeValueAndFinal(int i, boolean isFinal);
|
||||
protected abstract int writeValueAndType(boolean hasValue, int value, int node);
|
||||
protected abstract int writeDeltaTo(int jumpTarget);
|
||||
|
||||
// Temporary storage for recursive builder functions.
|
||||
// In C++, each function stack-allocates segments of these.
|
||||
// In Java, we reuse growable arrays to minimize allocations.
|
||||
@SuppressWarnings("serial")
|
||||
protected final class MyList<T> extends ArrayList<T> {
|
||||
// ArrayList.removeRange() is protected, so we have to subclass it
|
||||
// to truncate the list.
|
||||
public void truncate(int newLength) {
|
||||
removeRange(newLength, size());
|
||||
}
|
||||
protected enum State {
|
||||
ADDING, BUILDING, BUILT
|
||||
}
|
||||
protected MyList<Integer> starts_=new MyList<Integer>();
|
||||
private MyList<Integer> jumpTargets_=new MyList<Integer>();
|
||||
private MyList<Node> lessThan_=new MyList<Node>();
|
||||
private StringBuilder middleUnits_=new StringBuilder();
|
||||
protected State state=State.ADDING;
|
||||
|
||||
// Strings and sub-strings for linear-match nodes.
|
||||
protected StringBuilder strings=new StringBuilder();
|
||||
private Node root;
|
||||
|
||||
// Hash set of nodes, maps from nodes to integer 1.
|
||||
private HashMap<Node, Node> nodes=new HashMap<Node, Node>();
|
||||
private ValueNode lookupFinalValueNode=new ValueNode();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user