ICU-8167 port UCharsTrie and BytesTrie::next(string) to Java

X-SVN-Rev: 29276
This commit is contained in:
Markus Scherer 2011-01-07 05:25:40 +00:00
parent c12ce73a18
commit 9a43f08e82
2 changed files with 1043 additions and 12 deletions

View File

@ -10,6 +10,7 @@
package com.ibm.icu.impl;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.NoSuchElementException;
@ -58,9 +59,10 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
public static final class State {
public State() {}
private byte[] bytes;
private int root;
private int pos;
private int remainingMatchLength;
};
}
/**
* Saves the state of this trie.
@ -68,6 +70,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
*/
public BytesTrie saveState(State state) /*const*/ {
state.bytes=bytes_;
state.root=root_;
state.pos=pos_;
state.remainingMatchLength=remainingMatchLength_;
return this;
@ -81,7 +84,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
* @see #reset
*/
public BytesTrie resetToState(State state) {
if(bytes_==state.bytes && bytes_!=null) {
if(bytes_==state.bytes && bytes_!=null && root_==state.root) {
pos_=state.pos;
remainingMatchLength_=state.remainingMatchLength;
} else {
@ -203,7 +206,81 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
* </pre>
* @return The match/value Result.
*/
// public Result next(const char *s, int length);
public Result next(byte[] s, int sIndex, int sLimit) {
if(sIndex>=sLimit) {
// Empty input: report the state as it is now, without consuming anything.
return current();
}
int pos=pos_;
if(pos<0) {
// A negative pos_ marks a stopped trie; nothing can match any more.
return Result.NO_MATCH;
}
int length=remainingMatchLength_; // Actual remaining match length minus 1.
for(;;) {
// Fetch the next input byte, if there is one.
// Continue a linear-match node.
byte inByte;
for(;;) {
if(sIndex==sLimit) {
// Input exhausted: store the working position back into the fields
// and report whether a value node starts at pos.
remainingMatchLength_=length;
pos_=pos;
int node;
return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ?
valueResults_[node&kValueIsFinal] : Result.NO_VALUE;
}
inByte=s[sIndex++];
if(length<0) {
// Not inside a linear-match node: leave this loop and dispatch on the node at pos.
remainingMatchLength_=length;
break;
}
if(inByte!=bytes_[pos]) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
}
// Dispatch on the node lead byte until inByte has been consumed by a
// branch node or by the first byte of a linear-match node.
for(;;) {
int node=bytes_[pos++]&0xff;
if(node<kMinLinearMatch) {
Result result=branchNext(pos, node, inByte&0xff);
if(result==Result.NO_MATCH) {
return Result.NO_MATCH;
}
// Fetch the next input byte, if there is one.
if(sIndex==sLimit) {
return result;
}
if(result==Result.FINAL_VALUE) {
// No further matching bytes.
stop();
return Result.NO_MATCH;
}
inByte=s[sIndex++];
pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
} else if(node<kMinValueLead) {
// Match length+1 bytes.
length=node-kMinLinearMatch; // Actual match length minus 1.
if(inByte!=bytes_[pos]) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
// First byte matched; the outer loop continues the linear match.
break;
} else if((node&kValueIsFinal)!=0) {
// No further matching bytes.
stop();
return Result.NO_MATCH;
} else {
// Skip intermediate value.
pos=skipValue(pos, node);
// The next node must not also be a value node.
assert((bytes_[pos]&0xff)<kMinValueLead);
}
}
}
}
/**
* Returns a matching byte sequence's value if called immediately after
@ -222,7 +299,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
/**
* Determines whether all byte sequences reachable from the current state
* map to the same value, and if so, returns that value.
* @return the unique value in bits 32..1 with bit 0 set,
* @return The unique value in bits 32..1 with bit 0 set,
* if all byte sequences reachable from the current state
* map to the same value; otherwise returns 0.
*/
@ -242,7 +319,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
* That is, each byte b for which it would be next(b)!=Result.NO_MATCH now.
* @param out Each next byte is 0-extended to a char and appended to this object.
* (Only uses the out.append(c) method.)
* @return the number of bytes which continue the byte sequence from here
* @return The number of bytes which continue the byte sequence from here.
*/
public int getNextBytes(Appendable out) /*const*/ {
int pos=pos_;
@ -281,7 +358,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
* @return A new BytesTrie.Iterator.
*/
public Iterator iterator() {
return new Iterator(bytes_, root_, remainingMatchLength_, 0);
return new Iterator(bytes_, pos_, remainingMatchLength_, 0);
}
/**
@ -291,7 +368,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
* @return A new BytesTrie.Iterator.
*/
public Iterator iterator(int maxStringLength) {
return new Iterator(bytes_, root_, remainingMatchLength_, maxStringLength);
return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength);
}
/**
@ -315,10 +392,13 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
}
public int stringLength() { return length; }
public byte charAt(int index) { return bytes[index]; }
public byte byteAt(int index) { return bytes[index]; }
public void copyStringTo(byte[] dest, int destOffset) {
System.arraycopy(bytes, 0, dest, destOffset, length);
}
public ByteBuffer stringAsByteBuffer() {
return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer();
}
public int value;
@ -355,7 +435,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
entry_=new Entry(maxLength_!=0 ? maxLength_ : 32);
int length=remainingMatchLength_; // Actual remaining match length minus 1.
if(length>=0) {
// Pending linear-match node, append remaining bytes to str.
// Pending linear-match node, append remaining bytes to entry_.
++length;
if(maxLength_>0 && length>maxLength_) {
length=maxLength_; // This will leave remainingMatchLength>=0 as a signal.
@ -450,7 +530,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
return entry_; // Reached a final value.
}
} else {
// Linear-match node, append length bytes to str_.
// Linear-match node, append length bytes to entry_.
int length=node-kMinLinearMatch+1;
if(maxLength_>0 && entry_.length+length>maxLength_) {
entry_.append(bytes_, pos, maxLength_-entry_.length);
@ -519,7 +599,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
// and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
// (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
// but the code looks more confusing that way.)
ArrayList<Long> stack_=new ArrayList<Long>();
private ArrayList<Long> stack_=new ArrayList<Long>();
}
private void stop() {
@ -699,7 +779,7 @@ public final class BytesTrie implements Cloneable, Iterable<BytesTrie.Entry> {
return Result.NO_MATCH;
}
// Helper functions for hasUniqueValue().
// Helper functions for getUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
// uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().

View File

@ -0,0 +1,951 @@
/*
*******************************************************************************
* Copyright (C) 2011, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* created on: 2011jan06
* created by: Markus W. Scherer
* ported from ICU4C ucharstrie.h/.cpp
*/
package com.ibm.icu.impl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.NoSuchElementException;
import com.ibm.icu.impl.BytesTrie.Result;
import com.ibm.icu.text.UTF16;
/**
* Light-weight, non-const reader class for a CharsTrie.
* Traverses a char-serialized data structure with minimal state,
* for mapping strings (16-bit-unit sequences) to non-negative integer values.
*
* @author Markus W. Scherer
*/
public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
/**
 * Creates a reader over an already-serialized trie.
 * The reader starts at the root node with no linear-match node pending.
 *
 * @param trieChars CharSequence holding the serialized trie units.
 * @param offset Index of the trie's root node within {@code trieChars}.
 */
public CharsTrie(CharSequence trieChars, int offset) {
    this.chars_ = trieChars;
    this.root_ = offset;
    this.pos_ = offset;
    this.remainingMatchLength_ = -1;
}
/**
 * Copies this reader together with its current traversal state.
 * The copy is shallow: the serialized trie data is shared, not duplicated.
 *
 * @return A new reader object in the same state as this one.
 * @throws CloneNotSupportedException as declared by {@link Object#clone()}
 */
@Override
public Object clone() throws CloneNotSupportedException {
    // A field-by-field copy is exactly what is needed here.
    final Object copy = super.clone();
    return copy;
}
/**
 * Moves this reader back to the root of the trie and
 * discards any pending linear-match state.
 *
 * @return this
 */
public CharsTrie reset() {
    remainingMatchLength_ = -1;
    pos_ = root_;
    return this;
}
/**
 * Snapshot holder for a CharsTrie's traversal state.
 * Filled by {@link #saveState} and consumed by {@link #resetToState}.
 */
public static final class State {
    /** The trie data the snapshot was taken from; null until a state is saved. */
    private CharSequence chars;
    /** Root index of the trie the snapshot was taken from. */
    private int root;
    /** Saved read position. */
    private int pos;
    /** Saved remaining linear-match length, minus 1. */
    private int remainingMatchLength;

    /** Creates an empty state object that does not yet contain any state. */
    public State() {
    }
}
/**
 * Copies this trie's identity (data and root) and its current
 * traversal state into the given state object.
 *
 * @param state The object that receives the snapshot.
 * @return this
 * @see #resetToState
 */
public CharsTrie saveState(State state) /*const*/ {
    final State target = state;
    target.chars = chars_;
    target.root = root_;
    target.pos = pos_;
    target.remainingMatchLength = remainingMatchLength_;
    return this;
}
/**
 * Restores the traversal state from a snapshot taken with {@link #saveState}.
 *
 * @param state The snapshot to restore.
 * @return this
 * @throws IllegalArgumentException if the state object contains no state,
 *         or the state of a different trie
 * @see #saveState
 * @see #reset
 */
public CharsTrie resetToState(State state) {
    // The snapshot must come from the same data and the same root as this reader.
    if (chars_ != state.chars || chars_ == null || root_ != state.root) {
        throw new IllegalArgumentException("incompatible trie state");
    }
    pos_ = state.pos;
    remainingMatchLength_ = state.remainingMatchLength;
    return this;
}
/**
 * Reports the match status at the current position without consuming input:
 * whether the string so far matches, whether it has a value,
 * and whether another input char can continue a matching string.
 *
 * @return The match/value Result.
 */
public Result current() /*const*/ {
    final int pos = pos_;
    if (pos < 0) {
        return Result.NO_MATCH;
    }
    if (remainingMatchLength_ >= 0) {
        // In the middle of a linear-match node there cannot be a value.
        return Result.NO_VALUE;
    }
    final int node = chars_.charAt(pos);
    if (node >= kMinValueLead) {
        return valueResults_[node >> 15];
    }
    return Result.NO_VALUE;
}
/**
 * Starts over at the root and matches a single input char.
 * Equivalent to reset().next(inUnit).
 *
 * @param inUnit The input char (16-bit code unit) to match.
 * @return The match/value Result.
 */
public Result first(int inUnit) {
    // At the root there is never a pending linear-match node.
    remainingMatchLength_ = -1;
    final int start = root_;
    return nextImpl(start, inUnit);
}
/**
 * Starts over at the root and matches the one or two UTF-16 code units
 * of the given code point.
 * Equivalent to reset().nextForCodePoint(cp).
 *
 * @param cp The input code point.
 * @return The match/value Result.
 */
public Result firstForCodePoint(int cp) {
    if (cp <= 0xffff) {
        return first(cp);
    }
    // Supplementary code point: the lead surrogate must leave room for the trail.
    if (!first(UTF16.getLeadSurrogate(cp)).hasNext()) {
        return Result.NO_MATCH;
    }
    return next(UTF16.getTrailSurrogate(cp));
}
/**
 * Continues matching from the current state with one input char.
 *
 * @param inUnit The input char (16-bit code unit) to match.
 * @return The match/value Result.
 */
public Result next(int inUnit) {
    int pos = pos_;
    if (pos < 0) {
        return Result.NO_MATCH;
    }
    int length = remainingMatchLength_;  // Actual remaining match length minus 1.
    if (length < 0) {
        // Not inside a linear-match node: dispatch on the node at pos.
        return nextImpl(pos, inUnit);
    }
    // Remaining part of a linear-match node.
    if (inUnit != chars_.charAt(pos)) {
        stop();
        return Result.NO_MATCH;
    }
    ++pos;
    --length;
    remainingMatchLength_ = length;
    pos_ = pos;
    if (length >= 0) {
        return Result.NO_VALUE;
    }
    // The linear match is complete; check whether a value node follows.
    final int node = chars_.charAt(pos);
    return node >= kMinValueLead ? valueResults_[node >> 15] : Result.NO_VALUE;
}
/**
 * Continues matching from the current state with the one or two
 * UTF-16 code units of the given code point.
 *
 * @param cp The input code point.
 * @return The match/value Result.
 */
public Result nextForCodePoint(int cp) {
    if (cp <= 0xffff) {
        return next(cp);
    }
    // Supplementary code point: the lead surrogate must leave room for the trail.
    if (!next(UTF16.getLeadSurrogate(cp)).hasNext()) {
        return Result.NO_MATCH;
    }
    return next(UTF16.getTrailSurrogate(cp));
}
/**
* Traverses the trie from the current state for this string.
* Equivalent to
* <pre>
* Result result=current();
* for(each c in s)
* if(!result.hasNext()) return Result.NO_MATCH;
* result=next(c);
* return result;
* </pre>
* @param s Contains the input units.
* @param sIndex Index of the first input unit to consume.
* @param sLimit Index after the last input unit to consume;
* if sIndex>=sLimit then this simply returns current().
* @return The match/value Result.
*/
public Result next(CharSequence s, int sIndex, int sLimit) {
if(sIndex>=sLimit) {
// Empty input.
return current();
}
int pos=pos_;
if(pos<0) {
// A negative pos_ marks a stopped trie; nothing can match any more.
return Result.NO_MATCH;
}
int length=remainingMatchLength_; // Actual remaining match length minus 1.
for(;;) {
// Fetch the next input unit, if there is one.
// Continue a linear-match node.
char inUnit;
for(;;) {
if(sIndex==sLimit) {
// Input exhausted: store the working position back into the fields
// and report whether a value node starts at pos.
remainingMatchLength_=length;
pos_=pos;
int node;
return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
valueResults_[node>>15] : Result.NO_VALUE;
}
inUnit=s.charAt(sIndex++);
if(length<0) {
// Not inside a linear-match node: leave this loop and dispatch on the node at pos.
remainingMatchLength_=length;
break;
}
if(inUnit!=chars_.charAt(pos)) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
}
// Dispatch on the node lead unit until inUnit has been consumed by a
// branch node or by the first unit of a linear-match node.
int node=chars_.charAt(pos++);
for(;;) {
if(node<kMinLinearMatch) {
Result result=branchNext(pos, node, inUnit);
if(result==Result.NO_MATCH) {
return Result.NO_MATCH;
}
// Fetch the next input unit, if there is one.
if(sIndex==sLimit) {
return result;
}
if(result==Result.FINAL_VALUE) {
// No further matching units.
stop();
return Result.NO_MATCH;
}
inUnit=s.charAt(sIndex++);
pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
node=chars_.charAt(pos++);
} else if(node<kMinValueLead) {
// Match length+1 units.
length=node-kMinLinearMatch; // Actual match length minus 1.
if(inUnit!=chars_.charAt(pos)) {
stop();
return Result.NO_MATCH;
}
++pos;
--length;
// First unit matched; the outer loop continues the linear match.
break;
} else if((node&kValueIsFinal)!=0) {
// No further matching units.
stop();
return Result.NO_MATCH;
} else {
// Skip intermediate value.
pos=skipNodeValue(pos, node);
// Keep only the node type bits so that the match node is evaluated next.
node&=kNodeTypeMask;
}
}
}
}
/**
 * Reads the value stored at the current position.
 * Only valid immediately after current()/first()/next() returned
 * Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
 * May be called repeatedly; it does not change the trie state.
 *
 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
 *
 * @return The value of the string matched so far.
 */
public int getValue() /*const*/ {
    final int leadPos = pos_;
    final int leadUnit = chars_.charAt(leadPos);
    final int valuePos = leadPos + 1;
    assert(leadUnit >= kMinValueLead);
    if ((leadUnit & kValueIsFinal) == 0) {
        // Intermediate value that shares its lead unit with a match node.
        return readNodeValue(chars_, valuePos, leadUnit);
    }
    return readValue(chars_, valuePos, leadUnit & 0x7fff);
}
/**
 * Checks whether every string reachable from the current state
 * maps to one and the same value.
 *
 * @return The unique value in bits 32..1 with bit 0 set,
 *         if all strings reachable from the current state
 *         map to the same value; otherwise returns 0.
 */
public long getUniqueValue() /*const*/ {
    final int pos = pos_;
    if (pos < 0) {
        return 0;
    }
    // Start after the rest of a pending linear-match node, if any.
    final int nodePos = pos + remainingMatchLength_ + 1;
    final long found = findUniqueValue(chars_, nodePos, 0);
    // Drop the internally used bits 63..33 and sign-extend the value from bit 32.
    return (found << 31) >> 31;
}
/**
 * Collects every char that can continue the string from the current state,
 * that is, each char c for which next(c) would not return Result.NO_MATCH now.
 *
 * @param out Each next char is appended to this object.
 *            (Only uses the out.append(c) method.)
 * @return The number of chars which continue the string from here.
 */
public int getNextChars(Appendable out) /*const*/ {
    int pos = pos_;
    if (pos < 0) {
        return 0;
    }
    if (remainingMatchLength_ >= 0) {
        // Inside a linear-match node only its next unit can follow.
        append(out, chars_.charAt(pos));
        return 1;
    }
    int node = chars_.charAt(pos);
    ++pos;
    if (node >= kMinValueLead) {
        if ((node & kValueIsFinal) != 0) {
            // A final value ends the string; nothing can follow.
            return 0;
        }
        pos = skipNodeValue(pos, node);
        node &= kNodeTypeMask;
    }
    if (node >= kMinLinearMatch) {
        // First unit of the linear-match node.
        append(out, chars_.charAt(pos));
        return 1;
    }
    // Branch node: a lead unit of 0 means the length-1 is stored in the next unit.
    int branchLength = node;
    if (branchLength == 0) {
        branchLength = chars_.charAt(pos);
        ++pos;
    }
    ++branchLength;
    getNextBranchChars(chars_, pos, branchLength, out);
    return branchLength;
}
/**
 * Creates an iterator over the (string, value) pairs reachable from
 * the current state of this trie, returning full-length strings.
 *
 * @return A new CharsTrie.Iterator.
 */
public Iterator iterator() {
    // A maximum length of 0 means "no limit".
    return iterator(0);
}
/**
 * Creates an iterator over the (string, value) pairs reachable from
 * the current state of this trie.
 *
 * @param maxStringLength If 0, the iterator returns full strings.
 *                        Otherwise, the iterator returns strings with this maximum length.
 * @return A new CharsTrie.Iterator.
 */
public Iterator iterator(int maxStringLength) {
    final int startPos = pos_;
    final int pendingMatchLength = remainingMatchLength_;
    return new Iterator(chars_, startPos, pendingMatchLength, maxStringLength);
}
/**
 * Iterates from the root of a char-serialized CharsTrie.
 *
 * @param trieChars CharSequence that contains the serialized trie.
 * @param offset Root offset of the trie in the CharSequence.
 * @param maxStringLength If 0, the iterator returns full strings.
 *                        Otherwise, the iterator returns strings with this maximum length.
 * @return A new CharsTrie.Iterator.
 */
public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
    // -1: iteration starts on a node boundary, not inside a linear-match node.
    // The requested maximum length must be passed through; it used to be dropped
    // (a constant 0 was passed), which silently disabled truncation.
    return new Iterator(trieChars, offset, -1, maxStringLength);
}
/**
 * Holder for one (string, value) pair delivered by the Iterator.
 */
public static final class Entry {
    /** The string of this pair. */
    public CharSequence chars;

    /** The value that the string maps to. */
    public int value;
}
/**
* Iterator for all of the (string, value) pairs in a CharsTrie.
*/
public static final class Iterator implements java.util.Iterator<Entry> {
// Sets up an iteration starting at offset. If the start position is inside
// a linear-match node (remainingMatchLength>=0), the pending units are
// copied into str_ up front, limited to maxStringLength if that is positive.
private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
    chars_ = trieChars;
    initialPos_ = offset;
    pos_ = offset;
    initialRemainingMatchLength_ = remainingMatchLength;
    remainingMatchLength_ = remainingMatchLength;
    maxLength_ = maxStringLength;
    if (remainingMatchLength < 0) {
        return;  // Not inside a linear-match node; nothing to prepend.
    }
    // Pending linear-match node, append remaining units to str_.
    int pending = remainingMatchLength + 1;  // Actual remaining match length.
    if (maxLength_ > 0 && pending > maxLength_) {
        pending = maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
    }
    final int end = pos_ + pending;
    str_.append(chars_, pos_, end);
    pos_ = end;
    remainingMatchLength_ -= pending;
}
/**
 * Rewinds this iterator to the state it had right after construction.
 *
 * @return this
 */
public Iterator reset() {
    final int pendingMinusOne = initialRemainingMatchLength_;
    pos_ = initialPos_;
    remainingMatchLength_ = pendingMinusOne;
    skipValue_ = false;
    // Keep only the units of the initially pending linear-match node in str_.
    int kept = pendingMinusOne + 1;  // Remaining match length.
    if (maxLength_ > 0) {
        kept = Math.min(kept, maxLength_);
    }
    str_.setLength(kept);
    pos_ += kept;
    remainingMatchLength_ -= kept;
    stack_.clear();
    return this;
}
/**
 * Tells whether the iteration can deliver another element.
 *
 * @return true if there are more elements.
 */
public boolean hasNext() /*const*/ {
    if (pos_ >= 0) {
        return true;
    }
    // Stopped at the current position; there may still be branch edges to revisit.
    return !stack_.isEmpty();
}
/**
* Finds the next (string, value) pair if there is one.
*
* If the string is truncated to the maximum length and does not
* have a real value, then the value is set to -1.
* In this case, this "not a real value" is indistinguishable from
* a real value of -1.
*
* The same Entry object is returned from every call; its fields are overwritten.
* @return An Entry with the string and value of the next element.
* @throws NoSuchElementException - iteration has no more elements.
*/
public Entry next() {
int pos=pos_;
if(pos<0) {
if(stack_.isEmpty()) {
throw new NoSuchElementException();
}
// Pop the state off the stack and continue with the next outbound edge of
// the branch node.
long top=stack_.remove(stack_.size()-1);
// Low 32 bits: string length in bits 15..0, remaining branch length in bits 31..16.
int length=(int)top;
pos=(int)(top>>32);
str_.setLength(length&0xffff);
length>>>=16;
if(length>1) {
pos=branchNext(pos, length);
if(pos<0) {
return entry_; // Reached a final value.
}
} else {
// Last edge of the branch: only a comparison unit, followed directly by the next node.
str_.append(chars_.charAt(pos++));
}
}
if(remainingMatchLength_>=0) {
// We only get here if we started in a pending linear-match node
// with more than maxLength remaining units.
return truncateAndStop();
}
for(;;) {
int node=chars_.charAt(pos++);
if(node>=kMinValueLead) {
if(skipValue_) {
// This value was delivered by the previous call; continue with the match node.
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
skipValue_=false;
} else {
// Deliver value for the string so far.
boolean isFinal=(node&kValueIsFinal)!=0;
if(isFinal) {
entry_.value=readValue(chars_, pos, node&0x7fff);
} else {
entry_.value=readNodeValue(chars_, pos, node);
}
if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
// Nothing more to read along this path; the next call pops the stack.
pos_=-1;
} else {
// We cannot skip the value right here because it shares its
// lead unit with a match node which we have to evaluate
// next time.
// Instead, keep pos_ on the node lead unit itself.
pos_=pos-1;
skipValue_=true;
}
entry_.chars=str_;
return entry_;
}
}
if(maxLength_>0 && str_.length()==maxLength_) {
return truncateAndStop();
}
if(node<kMinLinearMatch) {
// Branch node: a lead unit of 0 means the length-1 is stored in the next unit.
if(node==0) {
node=chars_.charAt(pos++);
}
pos=branchNext(pos, node+1);
if(pos<0) {
return entry_; // Reached a final value.
}
} else {
// Linear-match node, append length units to str_.
int length=node-kMinLinearMatch+1;
if(maxLength_>0 && str_.length()+length>maxLength_) {
str_.append(chars_, pos, pos+maxLength_-str_.length());
return truncateAndStop();
}
str_.append(chars_, pos, pos+length);
pos+=length;
}
}
}
/**
 * Not supported: this iterator cannot remove elements.
 *
 * @throws UnsupportedOperationException (always)
 */
public void remove() {
    throw new UnsupportedOperationException();
}
// Delivers the string collected so far with the placeholder value -1
// and stops reading at the current position.
private Entry truncateAndStop() {
    final Entry result = entry_;
    result.value = -1;  // no real value for str
    result.chars = str_;
    pos_ = -1;
    return result;
}
// Descends into a branch (sub-)node with the given number of remaining edges,
// always following the first edge and pushing the state needed to come back
// for the other edges onto stack_.
// Appends the unit of the followed edge to str_.
// Returns the position of the next node to read, or -1 if the followed edge
// ends in a final value; in that case entry_ has been filled in and pos_ is -1.
private int branchNext(int pos, int length) {
    while(length>kMaxBranchLinearSubNodeLength) {
        ++pos;  // ignore the comparison unit
        // Push state for the greater-or-equal edge.
        // The remaining length is widened to long before shifting: for a branch with
        // 65535 or 65536 units the half-length is 0x8000, and an int shift by 16
        // would yield a negative int whose sign extension overwrites the position bits.
        stack_.add(((long)skipDelta(chars_, pos)<<32)|((long)(length-(length>>1))<<16)|str_.length());
        // Follow the less-than edge.
        length>>=1;
        pos=jumpByDelta(chars_, pos);
    }
    // List of key-value pairs where values are either final values or jump deltas.
    // Read the first (key, value) pair.
    char trieUnit=chars_.charAt(pos++);
    int node=chars_.charAt(pos++);
    boolean isFinal=(node&kValueIsFinal)!=0;
    node&=0x7fff;
    int value=readValue(chars_, pos, node);
    pos=skipValue(pos, node);
    // Remember where the remaining length-1 edges of this list start.
    stack_.add(((long)pos<<32)|((long)(length-1)<<16)|str_.length());
    str_.append(trieUnit);
    if(isFinal) {
        pos_=-1;
        entry_.chars=str_;
        entry_.value=value;
        return -1;
    } else {
        // A non-final value is a jump delta to the next node.
        return pos+value;
    }
}
// The serialized trie that is iterated over.
private CharSequence chars_;
// Index of the next trie unit to read; negative when the stack must be popped first.
private int pos_;
// Start position, restored by reset().
private int initialPos_;
// Remaining length of a pending linear-match node, minus 1. Negative if not in such a node.
private int remainingMatchLength_;
// Start value of remainingMatchLength_, restored by reset().
private int initialRemainingMatchLength_;
private boolean skipValue_; // Skip intermediate value which was already delivered.
// The string collected so far. It must be created here: the constructor, reset()
// and next() all use it, and without an initializer they throw NullPointerException.
private StringBuilder str_=new StringBuilder();
// Maximum length of delivered strings; 0 means no limit.
private int maxLength_;
// The single Entry object that next() fills in and returns.
private Entry entry_=new Entry();
// The stack stores longs for backtracking to another
// outbound edge of a branch node.
// Each long has the offset in chars_ in bits 62..32,
// the str_.length() from before the node in bits 15..0,
// and the remaining branch length in bits 31..16.
// (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
// but the code looks more confusing that way.)
private ArrayList<Long> stack_=new ArrayList<Long>();
}
// Marks this reader as stopped: a negative position means that
// no further input can match.
private void stop() {
    this.pos_ = -1;
}
// Decodes a compact 32-bit integer that occupies one, two or three units.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
private static int readValue(CharSequence chars, int pos, int leadUnit) {
    if (leadUnit < kMinTwoUnitValueLead) {
        // One unit: the lead unit is the value.
        return leadUnit;
    }
    if (leadUnit < kThreeUnitValueLead) {
        // Two units: high bits from the lead unit, low 16 bits from the next unit.
        return ((leadUnit - kMinTwoUnitValueLead) << 16) | chars.charAt(pos);
    }
    // Three units: the two units after the lead unit hold all 32 bits.
    return (chars.charAt(pos) << 16) | chars.charAt(pos + 1);
}
// Returns the position after a compact value.
// pos is already after the leadUnit, and the lead unit has bit 15 reset.
private static int skipValue(int pos, int leadUnit) {
    if (leadUnit < kMinTwoUnitValueLead) {
        return pos;  // One-unit value: nothing follows the lead unit.
    }
    return leadUnit < kThreeUnitValueLead ? pos + 1 : pos + 2;
}
// Returns the position after the compact value whose lead unit is at pos.
private static int skipValue(CharSequence chars, int pos) {
    final int leadUnit = chars.charAt(pos) & 0x7fff;  // Mask off the is-final bit.
    return skipValue(pos + 1, leadUnit);
}
// Decodes an intermediate value that shares its lead unit with a match node.
// pos is already after the leadUnit.
private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
    assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    if (leadUnit < kMinTwoUnitNodeValueLead) {
        // One unit: the value is stored in the bits above the node type bits.
        return (leadUnit >> 6) - 1;
    }
    if (leadUnit < kThreeUnitNodeValueLead) {
        // Two units: high bits from the lead unit, low 16 bits from the next unit.
        return (((leadUnit & 0x7fc0) - kMinTwoUnitNodeValueLead) << 10) | chars.charAt(pos);
    }
    // Three units: the two units after the lead unit hold all 32 bits.
    return (chars.charAt(pos) << 16) | chars.charAt(pos + 1);
}
// Returns the position after an intermediate value that shares its lead unit
// with a match node. pos is already after the leadUnit.
private static int skipNodeValue(int pos, int leadUnit) {
    assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
    if (leadUnit < kMinTwoUnitNodeValueLead) {
        return pos;  // One-unit value: nothing follows the lead unit.
    }
    return leadUnit < kThreeUnitNodeValueLead ? pos + 1 : pos + 2;
}
// Reads the compact delta whose lead unit is at pos and returns
// the position that lies delta units after the end of the delta.
private static int jumpByDelta(CharSequence chars, int pos) {
    final int lead = chars.charAt(pos);
    if (lead < kMinTwoUnitDeltaLead) {
        // One unit: the lead unit is the delta.
        return pos + 1 + lead;
    }
    if (lead == kThreeUnitDeltaLead) {
        // Three units: the two units after the lead unit hold the delta.
        final int wideDelta = (chars.charAt(pos + 1) << 16) | chars.charAt(pos + 2);
        return pos + 3 + wideDelta;
    }
    // Two units: high bits from the lead unit, low 16 bits from the next unit.
    final int delta = ((lead - kMinTwoUnitDeltaLead) << 16) | chars.charAt(pos + 1);
    return pos + 2 + delta;
}
// Returns the position after the compact delta whose lead unit is at pos.
private static int skipDelta(CharSequence chars, int pos) {
    final int lead = chars.charAt(pos);
    if (lead < kMinTwoUnitDeltaLead) {
        return pos + 1;  // One-unit delta.
    }
    return lead == kThreeUnitDeltaLead ? pos + 3 : pos + 2;
}
private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
// Handles a branch node for both next(unit) and next(string).
// pos is after the node lead unit; length is the node lead unit value
// (branch length minus 1, or 0 if the length minus 1 is stored in the next unit).
// On a match, writes the new position to pos_; otherwise calls stop().
private Result branchNext(int pos, int length, int inUnit) {
// Branch according to the current unit.
if(length==0) {
length=chars_.charAt(pos++);
}
++length;
// The length of the branch is the number of units to select from.
// The data structure encodes a binary search.
while(length>kMaxBranchLinearSubNodeLength) {
if(inUnit<chars_.charAt(pos++)) {
// Less-than edge: jump to the sub-node with the lower half of the units.
length>>=1;
pos=jumpByDelta(chars_, pos);
} else {
// Greater-or-equal edge: the sub-node with the upper half follows the delta.
length=length-(length>>1);
pos=skipDelta(chars_, pos);
}
}
// Drop down to linear search for the last few units.
// length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
// and divides length by 2.
do {
if(inUnit==chars_.charAt(pos++)) {
Result result;
int node=chars_.charAt(pos);
if((node&kValueIsFinal)!=0) {
// Leave the final value for getValue() to read.
result=Result.FINAL_VALUE;
} else {
// Use the non-final value as the jump delta.
++pos;
// int delta=readValue(pos, node);
int delta;
if(node<kMinTwoUnitValueLead) {
delta=node;
} else if(node<kThreeUnitValueLead) {
delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
} else {
delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
pos+=2;
}
// end readValue()
pos+=delta;
// Report whether the node at the jump target starts with a value.
node=chars_.charAt(pos);
result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
}
pos_=pos;
return result;
}
--length;
pos=skipValue(chars_, pos);
} while(length>1);
// The last unit of the list has no value; the next node follows it directly.
if(inUnit==chars_.charAt(pos++)) {
pos_=pos;
int node=chars_.charAt(pos);
return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
} else {
stop();
return Result.NO_MATCH;
}
}
// Matches one input unit against the node whose lead unit is at pos.
// Requires remainingLength_<0.
private Result nextImpl(int pos, int inUnit) {
    int node = chars_.charAt(pos);
    ++pos;
    if (node >= kMinValueLead) {
        if ((node & kValueIsFinal) != 0) {
            // No further matching units.
            stop();
            return Result.NO_MATCH;
        }
        // Skip intermediate value and keep only the node type bits.
        pos = skipNodeValue(pos, node);
        node &= kNodeTypeMask;
    }
    if (node < kMinLinearMatch) {
        return branchNext(pos, node, inUnit);
    }
    // Linear-match node: match the first of length+1 units.
    if (inUnit != chars_.charAt(pos)) {
        // No match.
        stop();
        return Result.NO_MATCH;
    }
    ++pos;
    final int remaining = node - kMinLinearMatch - 1;  // Remaining match length minus 1.
    remainingMatchLength_ = remaining;
    pos_ = pos;
    if (remaining >= 0) {
        return Result.NO_VALUE;
    }
    // The linear match is complete; check whether a value node follows.
    final int following = chars_.charAt(pos);
    return following >= kMinValueLead ? valueResults_[following >> 15] : Result.NO_VALUE;
}
// Helper functions for getUniqueValue().
// Recursively finds a unique value (or whether there is not a unique one)
// from a branch.
// pos: Position of the branch (sub-)node data, after the node lead unit(s).
// length: Number of units to select from in this (sub-)node.
// uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
// On return, if not 0, then bits 63..33 contain the updated non-negative pos.
private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
long uniqueValue) {
while(length>kMaxBranchLinearSubNodeLength) {
++pos; // ignore the comparison unit
// Check the less-than half recursively, then continue with the other half here.
uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
if(uniqueValue==0) {
return 0;
}
length=length-(length>>1);
pos=skipDelta(chars, pos);
}
do {
++pos; // ignore a comparison unit
// handle its value
int node=chars.charAt(pos++);
boolean isFinal=(node&kValueIsFinal)!=0;
node&=0x7fff;
int value=readValue(chars, pos, node);
pos=skipValue(pos, node);
if(isFinal) {
if(uniqueValue!=0) {
// A different value than the one seen before: not unique.
if(value!=(int)(uniqueValue>>1)) {
return 0;
}
} else {
// First value seen: store it in bits 32..1 and set bit 0.
uniqueValue=((long)value<<1)|1;
}
} else {
// A non-final value is a jump delta to the next node.
uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
if(uniqueValue==0) {
return 0;
}
}
} while(--length>1);
// ignore the last comparison unit
// Return the position after it in bits 63..33, together with the value bits 32..0.
return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
}
// Recursively finds a unique value (or whether there is not a unique one)
// starting from a position on a node lead unit.
// uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
// Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
// Returns 0 as soon as two different values have been found.
private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
int node=chars.charAt(pos++);
for(;;) {
if(node<kMinLinearMatch) {
// Branch node: a lead unit of 0 means the length-1 is stored in the next unit.
if(node==0) {
node=chars.charAt(pos++);
}
uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
if(uniqueValue==0) {
return 0;
}
// Continue with the node after the branch; its position is in bits 63..33.
pos=(int)(uniqueValue>>>33);
node=chars.charAt(pos++);
} else if(node<kMinValueLead) {
// linear-match node
pos+=node-kMinLinearMatch+1; // Ignore the match units.
node=chars.charAt(pos++);
} else {
boolean isFinal=(node&kValueIsFinal)!=0;
int value;
if(isFinal) {
value=readValue(chars, pos, node&0x7fff);
} else {
value=readNodeValue(chars, pos, node);
}
if(uniqueValue!=0) {
// A different value than the one seen before: not unique.
if(value!=(int)(uniqueValue>>1)) {
return 0;
}
} else {
// First value seen: store it in bits 32..1 and set bit 0.
uniqueValue=((long)value<<1)|1;
}
if(isFinal) {
return uniqueValue;
}
// Intermediate value: continue with the match node that shares the lead unit.
pos=skipNodeValue(pos, node);
node&=kNodeTypeMask;
}
}
}
// Helper functions for getNextChars().
// getNextChars() when pos is on a branch node:
// appends every unit that the branch (sub-)node at pos selects from.
private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
    while (length > kMaxBranchLinearSubNodeLength) {
        ++pos;  // ignore the comparison unit
        final int lowerHalf = length >> 1;
        // Collect the less-than half recursively, then continue with the other half.
        getNextBranchChars(chars, jumpByDelta(chars, pos), lowerHalf, out);
        length -= lowerHalf;
        pos = skipDelta(chars, pos);
    }
    // (key, value) pairs: collect each key and step over its value.
    for (;;) {
        append(out, chars.charAt(pos));
        pos = skipValue(chars, pos + 1);
        if (--length <= 1) {
            break;
        }
    }
    // The last key has no value.
    append(out, chars.charAt(pos));
}
// Appends c, narrowed to a char, to out.
// An IOException from the Appendable is rethrown wrapped in a RuntimeException.
private static void append(Appendable out, int c) {
    final char unit = (char) c;
    try {
        out.append(unit);
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
}
// CharsTrie data structure
//
// The trie consists of a series of char-serialized nodes for incremental
// Unicode string/char sequence matching. (char=16-bit unsigned integer)
// The root node is at the beginning of the trie data.
//
// Types of nodes are distinguished by their node lead unit ranges.
// After each node, except a final-value node, another node follows to
// encode match values or continue matching further units.
//
// Node types:
// - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
// The value is for the string/char sequence so far.
// - Match node, optionally with an intermediate value in a different compact format.
// The value, if present, is for the string/char sequence so far.
//
// Aside from the value, which uses the node lead unit's high bits:
//
// - Linear-match node: Matches a number of units.
// - Branch node: Branches to other nodes according to the current input unit.
// The node unit is the length of the branch (number of units to select from)
// minus 1. It is followed by a sub-node:
// - If the length is at most kMaxBranchLinearSubNodeLength, then
// there are length-1 (key, value) pairs and then one more comparison unit.
// If one of the key units matches, then the value is either a final value for
// the string so far, or a "jump" delta to the next node.
// If the last unit matches, then matching continues with the next node.
// (Values have the same encoding as final-value nodes.)
// - If the length is greater than kMaxBranchLinearSubNodeLength, then
// there is one unit and one "jump" delta.
// If the input unit is less than the sub-node unit, then "jump" by delta to
// the next sub-node which will have a length of length/2.
// (The delta has its own compact encoding.)
// Otherwise, skip the "jump" delta to the next sub-node
// which will have a length of length-length/2.
// Match-node lead unit values, after masking off intermediate-value bits:
// 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
// the length is one more than the next unit.
// For a branch sub-node with at most this many entries, we drop down
// to a linear search.
/*package*/ static final int kMaxBranchLinearSubNodeLength=5;
// 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
/*package*/ static final int kMinLinearMatch=0x30;
/*package*/ static final int kMaxLinearMatchLength=0x10;
// Match-node lead unit bits 14..6 for the optional intermediate value.
// If these bits are 0, then there is no intermediate value.
// Otherwise, see the *NodeValue* constants below.
/*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
/*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f
// A final-value node has bit 15 set.
/*package*/ static final int kValueIsFinal=0x8000;
// Compact value: After testing and masking off bit 15, use the following thresholds.
/*package*/ static final int kMaxOneUnitValue=0x3fff;
/*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
/*package*/ static final int kThreeUnitValueLead=0x7fff;
/*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
// Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
/*package*/ static final int kMaxOneUnitNodeValue=0xff;
/*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
/*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
/*package*/ static final int kMaxTwoUnitNodeValue=
((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
// Compact delta integers.
/*package*/ static final int kMaxOneUnitDelta=0xfbff;
/*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
/*package*/ static final int kThreeUnitDeltaLead=0xffff;
/*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
// Fixed value referencing the CharsTrie units.
private CharSequence chars_;
// Index of the root node in chars_; reset() and first() start from here.
private int root_;
// Iterator variables.
// Index of the next trie unit to read. Negative (-1, set by stop()) if no more matches.
private int pos_;
// Remaining length of a linear-match node, minus 1. Negative if not in such a node.
private int remainingMatchLength_;
}