diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/BytesTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/BytesTrie.java index e8dc8cce36..15f45d8013 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/impl/BytesTrie.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/BytesTrie.java @@ -10,6 +10,7 @@ package com.ibm.icu.impl; import java.io.IOException; +import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.NoSuchElementException; @@ -58,9 +59,10 @@ public final class BytesTrie implements Cloneable, Iterable { public static final class State { public State() {} private byte[] bytes; + private int root; private int pos; private int remainingMatchLength; - }; + } /** * Saves the state of this trie. @@ -68,6 +70,7 @@ public final class BytesTrie implements Cloneable, Iterable { */ public BytesTrie saveState(State state) /*const*/ { state.bytes=bytes_; + state.root=root_; state.pos=pos_; state.remainingMatchLength=remainingMatchLength_; return this; @@ -81,7 +84,7 @@ public final class BytesTrie implements Cloneable, Iterable { * @see #reset */ public BytesTrie resetToState(State state) { - if(bytes_==state.bytes && bytes_!=null) { + if(bytes_==state.bytes && bytes_!=null && root_==state.root) { pos_=state.pos; remainingMatchLength_=state.remainingMatchLength; } else { @@ -203,7 +206,81 @@ public final class BytesTrie implements Cloneable, Iterable { * * @return The match/value Result. */ - // public Result next(const char *s, int length); + public Result next(byte[] s, int sIndex, int sLimit) { + if(sIndex>=sLimit) { + // Empty input. + return current(); + } + int pos=pos_; + if(pos<0) { + return Result.NO_MATCH; + } + int length=remainingMatchLength_; // Actual remaining match length minus 1. + for(;;) { + // Fetch the next input byte, if there is one. + // Continue a linear-match node. 
+ byte inByte; + for(;;) { + if(sIndex==sLimit) { + remainingMatchLength_=length; + pos_=pos; + int node; + return (length<0 && (node=(bytes_[pos]&0xff))>=kMinValueLead) ? + valueResults_[node&kValueIsFinal] : Result.NO_VALUE; + } + inByte=s[sIndex++]; + if(length<0) { + remainingMatchLength_=length; + break; + } + if(inByte!=bytes_[pos]) { + stop(); + return Result.NO_MATCH; + } + ++pos; + --length; + } + for(;;) { + int node=bytes_[pos++]&0xff; + if(node { /** * Determines whether all byte sequences reachable from the current state * map to the same value, and if so, returns that value. - * @return the unique value in bits 32..1 with bit 0 set, + * @return The unique value in bits 32..1 with bit 0 set, * if all byte sequences reachable from the current state * map to the same value; otherwise returns 0. */ @@ -242,7 +319,7 @@ public final class BytesTrie implements Cloneable, Iterable { * That is, each byte b for which it would be next(b)!=Result.NO_MATCH now. * @param out Each next byte is 0-extended to a char and appended to this object. * (Only uses the out.append(c) method.) - * @return the number of bytes which continue the byte sequence from here + * @return The number of bytes which continue the byte sequence from here. */ public int getNextBytes(Appendable out) /*const*/ { int pos=pos_; @@ -281,7 +358,7 @@ public final class BytesTrie implements Cloneable, Iterable { * @return A new BytesTrie.Iterator. */ public Iterator iterator() { - return new Iterator(bytes_, root_, remainingMatchLength_, 0); + return new Iterator(bytes_, pos_, remainingMatchLength_, 0); } /** @@ -291,7 +368,7 @@ public final class BytesTrie implements Cloneable, Iterable { * @return A new BytesTrie.Iterator. 
*/ public Iterator iterator(int maxStringLength) { - return new Iterator(bytes_, root_, remainingMatchLength_, maxStringLength); + return new Iterator(bytes_, pos_, remainingMatchLength_, maxStringLength); } /** @@ -315,10 +392,13 @@ public final class BytesTrie implements Cloneable, Iterable { } public int stringLength() { return length; } - public byte charAt(int index) { return bytes[index]; } + public byte byteAt(int index) { return bytes[index]; } public void copyStringTo(byte[] dest, int destOffset) { System.arraycopy(bytes, 0, dest, destOffset, length); } + public ByteBuffer stringAsByteBuffer() { + return ByteBuffer.wrap(bytes, 0, length).asReadOnlyBuffer(); + } public int value; @@ -355,7 +435,7 @@ public final class BytesTrie implements Cloneable, Iterable { entry_=new Entry(maxLength_!=0 ? maxLength_ : 32); int length=remainingMatchLength_; // Actual remaining match length minus 1. if(length>=0) { - // Pending linear-match node, append remaining bytes to str. + // Pending linear-match node, append remaining bytes to entry_. ++length; if(maxLength_>0 && length>maxLength_) { length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. @@ -450,7 +530,7 @@ public final class BytesTrie implements Cloneable, Iterable { return entry_; // Reached a final value. } } else { - // Linear-match node, append length bytes to str_. + // Linear-match node, append length bytes to entry_. int length=node-kMinLinearMatch+1; if(maxLength_>0 && entry_.length+length>maxLength_) { entry_.append(bytes_, pos, maxLength_-entry_.length); @@ -519,7 +599,7 @@ public final class BytesTrie implements Cloneable, Iterable { // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, // but the code looks more confusing that way.) 
- ArrayList stack_=new ArrayList(); + private ArrayList stack_=new ArrayList(); } private void stop() { @@ -699,7 +779,7 @@ public final class BytesTrie implements Cloneable, Iterable { return Result.NO_MATCH; } - // Helper functions for hasUniqueValue(). + // Helper functions for getUniqueValue(). // Recursively finds a unique value (or whether there is not a unique one) // from a branch. // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). diff --git a/icu4j/main/classes/core/src/com/ibm/icu/impl/CharsTrie.java b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharsTrie.java new file mode 100644 index 0000000000..52b306cba2 --- /dev/null +++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/CharsTrie.java @@ -0,0 +1,951 @@ +/* +******************************************************************************* +* Copyright (C) 2011, International Business Machines +* Corporation and others. All Rights Reserved. +******************************************************************************* +* created on: 2011jan06 +* created by: Markus W. Scherer +* ported from ICU4C ucharstrie.h/.cpp +*/ + +package com.ibm.icu.impl; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.NoSuchElementException; + +import com.ibm.icu.impl.BytesTrie.Result; +import com.ibm.icu.text.UTF16; + +/** + * Light-weight, non-const reader class for a CharsTrie. + * Traverses a char-serialized data structure with minimal state, + * for mapping strings (16-bit-unit sequences) to non-negative integer values. + * + * @author Markus W. Scherer + */ +public final class CharsTrie implements Cloneable, Iterable { + /** + * Constructs a CharsTrie reader instance. + * @param trieChars CharSequence that contains the serialized trie. + * @param offset Root offset of the trie in the CharSequence. 
+ */ + public CharsTrie(CharSequence trieChars, int offset) { + chars_=trieChars; + pos_=root_=offset; + remainingMatchLength_=-1; + } + + /** + * Clones this trie reader object and its state, + * but not the char array which will be shared. + * @return A shallow clone of this trie. + */ + @Override + public Object clone() throws CloneNotSupportedException { + return super.clone(); // A shallow copy is just what we need. + } + + /** + * Resets this trie to its initial state. + */ + public CharsTrie reset() { + pos_=root_; + remainingMatchLength_=-1; + return this; + } + + /** + * CharsTrie state object, for saving a trie's current state + * and resetting the trie back to this state later. + */ + public static final class State { + public State() {} + private CharSequence chars; + private int root; + private int pos; + private int remainingMatchLength; + } + + /** + * Saves the state of this trie. + * @see #resetToState + */ + public CharsTrie saveState(State state) /*const*/ { + state.chars=chars_; + state.root=root_; + state.pos=pos_; + state.remainingMatchLength=remainingMatchLength_; + return this; + } + + /** + * Resets this trie to the saved state. + * @throws IllegalArgumentException if the state object contains no state, + * or the state of a different trie + * @see #saveState + * @see #reset + */ + public CharsTrie resetToState(State state) { + if(chars_==state.chars && chars_!=null && root_==state.root) { + pos_=state.pos; + remainingMatchLength_=state.remainingMatchLength; + } else { + throw new IllegalArgumentException("incompatible trie state"); + } + return this; + } + + /** + * Determines whether the string so far matches, whether it has a value, + * and whether another input char can continue a matching string. + * @return The match/value Result. + */ + public Result current() /*const*/ { + int pos=pos_; + if(pos<0) { + return Result.NO_MATCH; + } else { + int node; + return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 
+ valueResults_[node>>15] : Result.NO_VALUE; + } + } + + /** + * Traverses the trie from the initial state for this input char. + * Equivalent to reset().next(inUnit). + * @return The match/value Result. + */ + public Result first(int inUnit) { + remainingMatchLength_=-1; + return nextImpl(root_, inUnit); + } + + /** + * Traverses the trie from the initial state for the + * one or two UTF-16 code units for this input code point. + * Equivalent to reset().nextForCodePoint(cp). + * @return The match/value Result. + */ + public Result firstForCodePoint(int cp) { + return cp<=0xffff ? + first(cp) : + (first(UTF16.getLeadSurrogate(cp)).hasNext() ? + next(UTF16.getTrailSurrogate(cp)) : + Result.NO_MATCH); + } + + /** + * Traverses the trie from the current state for this input char. + * @return The match/value Result. + */ + public Result next(int inUnit) { + int pos=pos_; + if(pos<0) { + return Result.NO_MATCH; + } + int length=remainingMatchLength_; // Actual remaining match length minus 1. + if(length>=0) { + // Remaining part of a linear-match node. + if(inUnit==chars_.charAt(pos++)) { + remainingMatchLength_=--length; + pos_=pos; + int node; + return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? + valueResults_[node>>15] : Result.NO_VALUE; + } else { + stop(); + return Result.NO_MATCH; + } + } + return nextImpl(pos, inUnit); + } + + /** + * Traverses the trie from the current state for the + * one or two UTF-16 code units for this input code point. + * @return The match/value Result. + */ + public Result nextForCodePoint(int cp) { + return cp<=0xffff ? + next(cp) : + (next(UTF16.getLeadSurrogate(cp)).hasNext() ? + next(UTF16.getTrailSurrogate(cp)) : + Result.NO_MATCH); + } + + /** + * Traverses the trie from the current state for this string. + * Equivalent to + *
+     * Result result=current();
+     * for(each c in s)
+     *   if(!result.hasNext()) return Result.NO_MATCH;
+     *   result=next(c);
+     * return result;
+     * 
+ * @return The match/value Result. + */ + public Result next(CharSequence s, int sIndex, int sLimit) { + if(sIndex>=sLimit) { + // Empty input. + return current(); + } + int pos=pos_; + if(pos<0) { + return Result.NO_MATCH; + } + int length=remainingMatchLength_; // Actual remaining match length minus 1. + for(;;) { + // Fetch the next input unit, if there is one. + // Continue a linear-match node. + char inUnit; + for(;;) { + if(sIndex==sLimit) { + remainingMatchLength_=length; + pos_=pos; + int node; + return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? + valueResults_[node>>15] : Result.NO_VALUE; + } + inUnit=s.charAt(sIndex++); + if(length<0) { + remainingMatchLength_=length; + break; + } + if(inUnit!=chars_.charAt(pos)) { + stop(); + return Result.NO_MATCH; + } + ++pos; + --length; + } + int node=chars_.charAt(pos++); + for(;;) { + if(node=kMinValueLead); + return (leadUnit&kValueIsFinal)!=0 ? + readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); + } + + /** + * Determines whether all strings reachable from the current state + * map to the same value, and if so, returns that value. + * @return The unique value in bits 32..1 with bit 0 set, + * if all strings reachable from the current state + * map to the same value; otherwise returns 0. + */ + public long getUniqueValue() /*const*/ { + int pos=pos_; + if(pos<0) { + return 0; + } + // Skip the rest of a pending linear-match node. + long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); + // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. + return (uniqueValue<<31)>>31; + } + + /** + * Finds each char which continues the string from the current state. + * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. + * @param out Each next char is appended to this object. + * (Only uses the out.append(c) method.) + * @return The number of chars which continue the string from here. 
+ */ + public int getNextChars(Appendable out) /*const*/ { + int pos=pos_; + if(pos<0) { + return 0; + } + if(remainingMatchLength_>=0) { + append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. + return 1; + } + int node=chars_.charAt(pos++); + if(node>=kMinValueLead) { + if((node&kValueIsFinal)!=0) { + return 0; + } else { + pos=skipNodeValue(pos, node); + node&=kNodeTypeMask; + } + } + if(node { + private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { + chars_=trieChars; + pos_=initialPos_=offset; + remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; + maxLength_=maxStringLength; + int length=remainingMatchLength_; // Actual remaining match length minus 1. + if(length>=0) { + // Pending linear-match node, append remaining bytes to str_. + ++length; + if(maxLength_>0 && length>maxLength_) { + length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. + } + str_.append(chars_, pos_, pos_+length); + pos_+=length; + remainingMatchLength_-=length; + } + } + + /** + * Resets this iterator to its initial state. + */ + public Iterator reset() { + pos_=initialPos_; + remainingMatchLength_=initialRemainingMatchLength_; + skipValue_=false; + int length=remainingMatchLength_+1; // Remaining match length. + if(maxLength_>0 && length>maxLength_) { + length=maxLength_; + } + str_.setLength(length); + pos_+=length; + remainingMatchLength_-=length; + stack_.clear(); + return this; + } + + /** + * @return true if there are more elements. + */ + public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } + + /** + * Finds the next (string, value) pair if there is one. + * + * If the string is truncated to the maximum length and does not + * have a real value, then the value is set to -1. + * In this case, this "not a real value" is indistinguishable from + * a real value of -1. + * @return An Entry with the string and value of the next element. 
+ * @throw NoSuchElementException - iteration has no more elements. + */ + public Entry next() { + int pos=pos_; + if(pos<0) { + if(stack_.isEmpty()) { + throw new NoSuchElementException(); + } + // Pop the state off the stack and continue with the next outbound edge of + // the branch node. + long top=stack_.remove(stack_.size()-1); + int length=(int)top; + pos=(int)(top>>32); + str_.setLength(length&0xffff); + length>>>=16; + if(length>1) { + pos=branchNext(pos, length); + if(pos<0) { + return entry_; // Reached a final value. + } + } else { + str_.append(chars_.charAt(pos++)); + } + } + if(remainingMatchLength_>=0) { + // We only get here if we started in a pending linear-match node + // with more than maxLength remaining units. + return truncateAndStop(); + } + for(;;) { + int node=chars_.charAt(pos++); + if(node>=kMinValueLead) { + if(skipValue_) { + pos=skipNodeValue(pos, node); + node&=kNodeTypeMask; + skipValue_=false; + } else { + // Deliver value for the string so far. + boolean isFinal=(node&kValueIsFinal)!=0; + if(isFinal) { + entry_.value=readValue(chars_, pos, node&0x7fff); + } else { + entry_.value=readNodeValue(chars_, pos, node); + } + if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { + pos_=-1; + } else { + // We cannot skip the value right here because it shares its + // lead unit with a match node which we have to evaluate + // next time. + // Instead, keep pos_ on the node lead unit itself. + pos_=pos-1; + skipValue_=true; + } + entry_.chars=str_; + return entry_; + } + } + if(maxLength_>0 && str_.length()==maxLength_) { + return truncateAndStop(); + } + if(node0 && str_.length()+length>maxLength_) { + str_.append(chars_, pos, pos+maxLength_-str_.length()); + return truncateAndStop(); + } + str_.append(chars_, pos, pos+length); + pos+=length; + } + } + } + + /** + * Iterator.remove() is not supported. 
+ * @throws UnsupportedOperationException (always) + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + private Entry truncateAndStop() { + pos_=-1; + entry_.chars=str_; + entry_.value=-1; // no real value for str + return entry_; + } + + private int branchNext(int pos, int length) { + while(length>kMaxBranchLinearSubNodeLength) { + ++pos; // ignore the comparison unit + // Push state for the greater-or-equal edge. + stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); + // Follow the less-than edge. + length>>=1; + pos=jumpByDelta(chars_, pos); + } + // List of key-value pairs where values are either final values or jump deltas. + // Read the first (key, value) pair. + char trieUnit=chars_.charAt(pos++); + int node=chars_.charAt(pos++); + boolean isFinal=(node&kValueIsFinal)!=0; + int value=readValue(chars_, pos, node&=0x7fff); + pos=skipValue(pos, node); + stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); + str_.append(trieUnit); + if(isFinal) { + pos_=-1; + entry_.chars=str_; + entry_.value=value; + return -1; + } else { + return pos+value; + } + } + + private CharSequence chars_; + private int pos_; + private int initialPos_; + private int remainingMatchLength_; + private int initialRemainingMatchLength_; + private boolean skipValue_; // Skip intermediate value which was already delivered. + + private StringBuilder str_; + private int maxLength_; + private Entry entry_=new Entry(); + + // The stack stores longs for backtracking to another + // outbound edge of a branch node. + // Each long has the offset in chars_ in bits 62..32, + // the str_.length() from before the node in bits 15..0, + // and the remaining branch length in bits 31..16. + // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, + // but the code looks more confusing that way.) 
+ private ArrayList stack_=new ArrayList(); + } + + private void stop() { + pos_=-1; + } + + // Reads a compact 32-bit integer. + // pos is already after the leadUnit, and the lead unit has bit 15 reset. + private static int readValue(CharSequence chars, int pos, int leadUnit) { + int value; + if(leadUnit=kMinTwoUnitValueLead) { + if(leadUnit>6)-1; + } else if(leadUnit=kMinTwoUnitNodeValueLead) { + if(leadUnit=kMinTwoUnitDeltaLead) { + if(delta==kThreeUnitDeltaLead) { + delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); + pos+=2; + } else { + delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); + } + } + return pos+delta; + } + + private static int skipDelta(CharSequence chars, int pos) { + int delta=chars.charAt(pos++); + if(delta>=kMinTwoUnitDeltaLead) { + if(delta==kThreeUnitDeltaLead) { + pos+=2; + } else { + ++pos; + } + } + return pos; + } + + private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; + + // Handles a branch node for both next(unit) and next(string). + private Result branchNext(int pos, int length, int inUnit) { + // Branch according to the current unit. + if(length==0) { + length=chars_.charAt(pos++); + } + ++length; + // The length of the branch is the number of units to select from. + // The data structure encodes a binary search. + while(length>kMaxBranchLinearSubNodeLength) { + if(inUnit>=1; + pos=jumpByDelta(chars_, pos); + } else { + length=length-(length>>1); + pos=skipDelta(chars_, pos); + } + } + // Drop down to linear search for the last few units. + // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 + // and divides length by 2. + do { + if(inUnit==chars_.charAt(pos++)) { + Result result; + int node=chars_.charAt(pos); + if((node&kValueIsFinal)!=0) { + // Leave the final value for getValue() to read. + result=Result.FINAL_VALUE; + } else { + // Use the non-final value as the jump delta. 
+ ++pos; + // int delta=readValue(pos, node); + int delta; + if(node=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; + } + pos_=pos; + return result; + } + --length; + pos=skipValue(chars_, pos); + } while(length>1); + if(inUnit==chars_.charAt(pos++)) { + pos_=pos; + int node=chars_.charAt(pos); + return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; + } else { + stop(); + return Result.NO_MATCH; + } + } + + // Requires remainingLength_<0. + private Result nextImpl(int pos, int inUnit) { + int node=chars_.charAt(pos++); + for(;;) { + if(node=kMinValueLead) ? + valueResults_[node>>15] : Result.NO_VALUE; + } else { + // No match. + break; + } + } else if((node&kValueIsFinal)!=0) { + // No further matching units. + break; + } else { + // Skip intermediate value. + pos=skipNodeValue(pos, node); + node&=kNodeTypeMask; + } + } + stop(); + return Result.NO_MATCH; + } + + // Helper functions for getUniqueValue(). + // Recursively finds a unique value (or whether there is not a unique one) + // from a branch. + // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). + // On return, if not 0, then bits 63..33 contain the updated non-negative pos. 
+ private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, + long uniqueValue) { + while(length>kMaxBranchLinearSubNodeLength) { + ++pos; // ignore the comparison unit + uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); + if(uniqueValue==0) { + return 0; + } + length=length-(length>>1); + pos=skipDelta(chars, pos); + } + do { + ++pos; // ignore a comparison unit + // handle its value + int node=chars.charAt(pos++); + boolean isFinal=(node&kValueIsFinal)!=0; + node&=0x7fff; + int value=readValue(chars, pos, node); + pos=skipValue(pos, node); + if(isFinal) { + if(uniqueValue!=0) { + if(value!=(int)(uniqueValue>>1)) { + return 0; + } + } else { + uniqueValue=((long)value<<1)|1; + } + } else { + uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); + if(uniqueValue==0) { + return 0; + } + } + } while(--length>1); + // ignore the last comparison byte + return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); + } + // Recursively finds a unique value (or whether there is not a unique one) + // starting from a position on a node lead unit. + // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. + // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. + private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { + int node=chars.charAt(pos++); + for(;;) { + if(node>>33); + node=chars.charAt(pos++); + } else if(node>1)) { + return 0; + } + } else { + uniqueValue=((long)value<<1)|1; + } + if(isFinal) { + return uniqueValue; + } + pos=skipNodeValue(pos, node); + node&=kNodeTypeMask; + } + } + } + + // Helper functions for getNextChars(). + // getNextChars() when pos is on a branch node. 
+    private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
+        // Binary-search part of the branch: collect the units of the less-than half
+        // recursively, then carry on with the greater-or-equal half.
+        while(length>kMaxBranchLinearSubNodeLength) {
+            int lessThanLength=length>>1;
+            ++pos;  // ignore the comparison unit
+            getNextBranchChars(chars, jumpByDelta(chars, pos), lessThanLength, out);
+            length-=lessThanLength;
+            pos=skipDelta(chars, pos);
+        }
+        // Linear part: every key unit except the last one is followed by a value.
+        for(;;) {
+            append(out, chars.charAt(pos++));
+            pos=skipValue(chars, pos);
+            if(--length<=1) {
+                break;
+            }
+        }
+        // The last comparison unit has no value of its own.
+        append(out, chars.charAt(pos));
+    }
+    // Appends c as a char to out; an IOException from out is rethrown
+    // wrapped in a RuntimeException.
+    private static void append(Appendable out, int c) {
+        char unit=(char)c;
+        try {
+            out.append(unit);
+        } catch(IOException ioe) {
+            throw new RuntimeException(ioe);
+        }
+    }
+
+    // CharsTrie data structure
+    //
+    // The trie consists of a series of char-serialized nodes for incremental
+    // Unicode string/char sequence matching. (char=16-bit unsigned integer)
+    // The root node is at the beginning of the trie data.
+    //
+    // Types of nodes are distinguished by their node lead unit ranges.
+    // After each node, except a final-value node, another node follows to
+    // encode match values or continue matching further units.
+    //
+    // Node types:
+    //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
+    //    The value is for the string/char sequence so far.
+    //  - Match node, optionally with an intermediate value in a different compact format.
+    //    The value, if present, is for the string/char sequence so far.
+    //
+    //  Aside from the value, which uses the node lead unit's high bits:
+    //
+    //  - Linear-match node: Matches a number of units.
+    //  - Branch node: Branches to other nodes according to the current input unit.
+    //    The node unit is the length of the branch (number of units to select from)
+    //    minus 1. It is followed by a sub-node:
+    //    - If the length is at most kMaxBranchLinearSubNodeLength, then
+    //      there are length-1 (key, value) pairs and then one more comparison unit.
+    //      If one of the key units matches, then the value is either a final value for
+    //      the string so far, or a "jump" delta to the next node.
+    //      If the last unit matches, then matching continues with the next node.
+    //      (Values have the same encoding as final-value nodes.)
+    //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
+    //      there is one unit and one "jump" delta.
+    //      If the input unit is less than the sub-node unit, then "jump" by delta to
+    //      the next sub-node which will have a length of length/2.
+    //      (The delta has its own compact encoding.)
+    //      Otherwise, skip the "jump" delta to the next sub-node
+    //      which will have a length of length-length/2.
+
+    // Match-node lead unit values, after masking off intermediate-value bits:
+
+    // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
+    // the length is one more than the next unit.
+
+    // For a branch sub-node with at most this many entries, we drop down
+    // to a linear search.
+    /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
+
+    // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
+    // The number of units to match is the lead unit minus kMinLinearMatch, plus 1.
+    /*package*/ static final int kMinLinearMatch=0x30;
+    /*package*/ static final int kMaxLinearMatchLength=0x10;  // 16
+
+    // Match-node lead unit bits 14..6 for the optional intermediate value.
+    // If these bits are 0, then there is no intermediate value.
+    // Otherwise, see the *NodeValue* constants below.
+    // Lead units below kMinValueLead are plain match nodes;
+    // kNodeTypeMask extracts the match-node type from a lead unit that also carries a value.
+    /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
+    /*package*/ static final int kNodeTypeMask=kMinValueLead-1;  // 0x003f
+
+    // A final-value node has bit 15 set.
+    /*package*/ static final int kValueIsFinal=0x8000;
+
+    // Compact value: After testing and masking off bit 15, use the following thresholds.
+    // Lead units up to kMaxOneUnitValue hold the whole value themselves.
+    /*package*/ static final int kMaxOneUnitValue=0x3fff;
+
+    // Lead units from kMinTwoUnitValueLead up to (but not including) kThreeUnitValueLead
+    // are followed by one more unit; kThreeUnitValueLead is followed by two more units.
+    /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
+    /*package*/ static final int kThreeUnitValueLead=0x7fff;
+
+    /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
+
+    // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
+    /*package*/ static final int kMaxOneUnitNodeValue=0xff;
+    /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
+    /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
+
+    /*package*/ static final int kMaxTwoUnitNodeValue=
+        ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
+
+    // Compact delta integers.
+    /*package*/ static final int kMaxOneUnitDelta=0xfbff;
+    /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
+    /*package*/ static final int kThreeUnitDeltaLead=0xffff;
+
+    /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
+
+    // The serialized trie units and the offset of the root node in them.
+    // Both are assigned in the constructor.
+    private CharSequence chars_;
+    private int root_;
+
+    // Iterator variables.
+
+    // Index in chars_ of the next trie unit to read. Negative if no more matches.
+    private int pos_;
+    // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
+    private int remainingMatchLength_;
+}