ICU-1897
int trie builder X-SVN-Rev: 9127
This commit is contained in:
parent
eae341aae7
commit
4d686cffdb
453
icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java
Normal file
453
icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java
Normal file
@ -0,0 +1,453 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2000, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*
|
||||
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java,v $
|
||||
* $Date: 2002/07/12 22:02:06 $
|
||||
* $Revision: 1.1 $
|
||||
*
|
||||
******************************************************************************
|
||||
*/
|
||||
|
||||
package com.ibm.icu.impl;
|
||||
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Builder lass to manipulate and generate a trie.
|
||||
* This is useful for ICU data in primitive types.
|
||||
* Provides a compact way to store information that is indexed by Unicode
|
||||
* values, such as character properties, types, keyboard values, etc. This is
|
||||
* very useful when you have a block of Unicode data that contains significant
|
||||
* values while the rest of the Unicode data is unused in the application or
|
||||
* when you have a lot of redundance, such as where all 21,000 Han ideographs
|
||||
* have the same value. However, lookup is much faster than a hash table.
|
||||
* A trie of any primitive data type serves two purposes:
|
||||
* <UL type = round>
|
||||
* <LI>Fast access of the indexed values.
|
||||
* <LI>Smaller memory footprint.
|
||||
* </UL>
|
||||
* This is a direct port from the ICU4C version
|
||||
* @version $Revision: 1.1 $
|
||||
* @author Syn Wee Quek
|
||||
*/
|
||||
public class IntTrieBuilder extends TrieBuilder
|
||||
{
|
||||
// public constructor ----------------------------------------------
|
||||
|
||||
/**
|
||||
* Copy constructor
|
||||
*/
|
||||
public IntTrieBuilder(IntTrieBuilder table)
|
||||
{
|
||||
super(table);
|
||||
m_data_ = new int[m_dataCapacity_];
|
||||
System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
|
||||
m_initialValue_ = table.m_initialValue_;
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a build table
|
||||
* @param aliasdata data to be filled into table
|
||||
* @param maxdatalength maximum data length allowed in table
|
||||
* @param initialvalue inital data value
|
||||
* @param latin1linear is latin 1 to be linear
|
||||
* @return updated table
|
||||
*/
|
||||
public IntTrieBuilder(int aliasdata[], int maxdatalength,
|
||||
int initialvalue, boolean latin1linear)
|
||||
{
|
||||
super();
|
||||
if (maxdatalength < DATA_BLOCK_LENGTH_ || (latin1linear
|
||||
&& maxdatalength < 1024)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Argument maxdatalength is too small");
|
||||
}
|
||||
|
||||
if (aliasdata != null) {
|
||||
m_data_ = aliasdata;
|
||||
}
|
||||
else {
|
||||
m_data_ = new int[maxdatalength];
|
||||
}
|
||||
|
||||
// preallocate and reset the first data block (block index 0)
|
||||
int j = DATA_BLOCK_LENGTH_;
|
||||
|
||||
if (latin1linear) {
|
||||
// preallocate and reset the first block (number 0) and Latin-1
|
||||
// (U+0000..U+00ff) after that made sure above that
|
||||
// maxDataLength >= 1024
|
||||
// set indexes to point to consecutive data blocks
|
||||
int i = 0;
|
||||
do {
|
||||
// do this at least for trie->index[0] even if that block is
|
||||
// only partly used for Latin-1
|
||||
m_index_[i ++] = j;
|
||||
j += DATA_BLOCK_LENGTH_;
|
||||
} while (i < (256 >> SHIFT_));
|
||||
}
|
||||
|
||||
m_dataLength_ = j;
|
||||
// reset the initially allocated blocks to the initial value
|
||||
Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
|
||||
m_initialValue_ = initialvalue;
|
||||
m_dataCapacity_ = maxdatalength;
|
||||
m_isLatin1Linear_ = latin1linear;
|
||||
m_isCompacted_ = false;
|
||||
}
|
||||
|
||||
// public methods -------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Gets a 32 bit data from the table data
|
||||
* @param ch codepoint which data is to be retrieved
|
||||
* @return the 32 bit data
|
||||
*/
|
||||
public int getValue(int ch)
|
||||
{
|
||||
// valid, uncompacted trie and valid c?
|
||||
if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int block = m_index_[ch >> SHIFT_];
|
||||
return m_data_[Math.abs(block) + (ch & MASK_)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets a 32 bit data in the table data
|
||||
* @param ch codepoint which data is to be set
|
||||
* @param value to set
|
||||
* @return true if the set is successful, otherwise
|
||||
* if the table has been compacted return false
|
||||
*/
|
||||
public boolean setValue(int ch, int value)
|
||||
{
|
||||
// valid, uncompacted trie and valid c?
|
||||
if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int block = getDataBlock(ch);
|
||||
if (block < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
m_data_[block + (ch & MASK_)] = value;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializes the build table with 32 bit data
|
||||
* @param datamanipulate builder raw fold method implementation
|
||||
* @param triedatamanipulate result trie fold method
|
||||
* @return a new trie
|
||||
*/
|
||||
public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
|
||||
Trie.DataManipulate triedatamanipulate)
|
||||
{
|
||||
if (datamanipulate == null) {
|
||||
throw new IllegalArgumentException("Parameters can not be null");
|
||||
}
|
||||
// fold and compact if necessary, also checks that indexLength is
|
||||
// within limits
|
||||
if (!m_isCompacted_) {
|
||||
// compact once without overlap to improve folding
|
||||
compact(false);
|
||||
// fold the supplementary part of the index array
|
||||
fold(datamanipulate);
|
||||
// compact again with overlap for minimum data array length
|
||||
compact(true);
|
||||
m_isCompacted_ = true;
|
||||
}
|
||||
// is dataLength within limits?
|
||||
if (m_dataLength_ >= MAX_DATA_LENGTH_) {
|
||||
throw new ArrayIndexOutOfBoundsException("Data length too small");
|
||||
}
|
||||
|
||||
char index[] = new char[m_indexLength_];
|
||||
int data[] = new int[m_dataLength_];
|
||||
// write the index (stage 1) array and the 32-bit data (stage 2) array
|
||||
// write 16-bit index values shifted right by INDEX_SHIFT_
|
||||
for (int i = 0; i < m_indexLength_; i ++) {
|
||||
index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
|
||||
}
|
||||
// write 32-bit data values
|
||||
System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
|
||||
|
||||
int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
|
||||
options |= OPTIONS_DATA_IS_32_BIT_;
|
||||
if (m_isLatin1Linear_) {
|
||||
options |= OPTIONS_LATIN1_IS_LINEAR_;
|
||||
}
|
||||
return new IntTrie(index, data, m_initialValue_, options,
|
||||
triedatamanipulate);
|
||||
}
|
||||
|
||||
// public data member ---------------------------------------------
|
||||
|
||||
protected int m_data_[];
|
||||
protected boolean m_isDataAllocated_;
|
||||
protected int m_initialValue_;
|
||||
|
||||
// private methods ------------------------------------------------------
|
||||
|
||||
/**
|
||||
* No error checking for illegal arguments.
|
||||
* @param ch codepoint to look for
|
||||
* @return -1 if no new data block available (out of memory in data array)
|
||||
*/
|
||||
private int getDataBlock(int ch)
|
||||
{
|
||||
ch >>= SHIFT_;
|
||||
int indexValue = m_index_[ch];
|
||||
if (indexValue > 0) {
|
||||
return indexValue;
|
||||
}
|
||||
|
||||
// allocate a new data block
|
||||
int newBlock = m_dataLength_;
|
||||
int newTop = newBlock + DATA_BLOCK_LENGTH_;
|
||||
if (newTop > m_dataCapacity_) {
|
||||
// out of memory in the data array
|
||||
return -1;
|
||||
}
|
||||
m_dataLength_ = newTop;
|
||||
m_index_[ch] = newBlock;
|
||||
|
||||
// copy-on-write for a block from a setRange()
|
||||
Arrays.fill(m_data_, newBlock, newBlock + DATA_BLOCK_LENGTH_,
|
||||
m_initialValue_);
|
||||
return newBlock;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compact a folded build-time trie.
|
||||
* The compaction
|
||||
* - removes blocks that are identical with earlier ones
|
||||
* - overlaps adjacent blocks as much as possible (if overlap == true)
|
||||
* - moves blocks in steps of the data granularity
|
||||
*
|
||||
* It does not
|
||||
* - try to move and overlap blocks that are not already adjacent
|
||||
* - try to move and overlap blocks that overlap with multiple values in
|
||||
* the overlap region
|
||||
* @param overlap flag
|
||||
*/
|
||||
private void compact(boolean overlap)
|
||||
{
|
||||
if (m_isCompacted_) {
|
||||
return; // nothing left to do
|
||||
}
|
||||
|
||||
// compaction
|
||||
// initialize the index map with "block is used/unused" flags
|
||||
findUnusedBlocks();
|
||||
|
||||
// if Latin-1 is preallocated and linear, then do not compact Latin-1
|
||||
// data
|
||||
int overlapStart = DATA_BLOCK_LENGTH_;
|
||||
if (m_isLatin1Linear_ && SHIFT_ <= 8) {
|
||||
overlapStart += 256;
|
||||
}
|
||||
|
||||
int newStart = DATA_BLOCK_LENGTH_;
|
||||
int prevEnd = newStart - 1;
|
||||
for (int start = newStart; start < m_dataLength_;) {
|
||||
// start: index of first entry of current block
|
||||
// prevEnd: index to last entry of previous block
|
||||
// newStart: index where the current block is to be moved
|
||||
// skip blocks that are not used
|
||||
if (m_map_[start >> SHIFT_] < 0) {
|
||||
// advance start to the next block
|
||||
start += DATA_BLOCK_LENGTH_;
|
||||
// leave prevEnd and newStart with the previous block!
|
||||
continue;
|
||||
}
|
||||
// search for an identical block
|
||||
if (start >= overlapStart) {
|
||||
int i = findSameDataBlock(m_data_, newStart, start,
|
||||
overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH_);
|
||||
if (i >= 0) {
|
||||
// found an identical block, set the other block's index
|
||||
// value for the current block
|
||||
m_map_[start >> SHIFT_] = i;
|
||||
// advance start to the next block
|
||||
start += DATA_BLOCK_LENGTH_;
|
||||
// leave prevEnd and newStart with the previous block!
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// see if the beginning of this block can be overlapped with the
|
||||
// end of the previous block
|
||||
// x: first value in the current block
|
||||
int x = m_data_[start];
|
||||
int i = 0;
|
||||
if (x == m_data_[prevEnd] && overlap && start >= overlapStart)
|
||||
{
|
||||
// overlap by at least one
|
||||
for (i = 1; i < DATA_BLOCK_LENGTH_
|
||||
&& x == m_data_[start + i]
|
||||
&& x == m_data_[prevEnd - i]; ++ i)
|
||||
{
|
||||
}
|
||||
|
||||
// overlap by i, rounded down for the data block granularity
|
||||
i &= ~(DATA_GRANULARITY_ - 1);
|
||||
}
|
||||
if (i > 0) {
|
||||
// some overlap
|
||||
m_map_[start >> SHIFT_] = newStart - i;
|
||||
// move the non-overlapping indexes to their new positions
|
||||
start += i;
|
||||
for (i = DATA_BLOCK_LENGTH_ - i; i > 0; -- i) {
|
||||
m_data_[newStart ++] = m_data_[start ++];
|
||||
}
|
||||
}
|
||||
else if (newStart < start) {
|
||||
// no overlap, just move the indexes to their new positions
|
||||
m_map_[start >> SHIFT_] = newStart;
|
||||
for (i = DATA_BLOCK_LENGTH_; i > 0; -- i) {
|
||||
m_data_[newStart ++] = m_data_[start ++];
|
||||
}
|
||||
}
|
||||
else { // no overlap && newStart==start
|
||||
m_map_[start >> SHIFT_] = start;
|
||||
newStart += DATA_BLOCK_LENGTH_;
|
||||
start = newStart;
|
||||
}
|
||||
|
||||
prevEnd = newStart - 1;
|
||||
}
|
||||
|
||||
// now adjust the index (stage 1) table
|
||||
for (int i = 0; i < m_indexLength_; ++ i) {
|
||||
m_index_[i] = m_map_[Math.abs(m_index_[i]) >> SHIFT_];
|
||||
}
|
||||
m_dataLength_ = newStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find the same data block
|
||||
* @param data array
|
||||
* @param dataLength
|
||||
* @param otherBlock
|
||||
* @param step
|
||||
*/
|
||||
private static final int findSameDataBlock(int data[], int dataLength,
|
||||
int otherBlock, int step)
|
||||
{
|
||||
// ensure that we do not even partially get past dataLength
|
||||
dataLength -= DATA_BLOCK_LENGTH_;
|
||||
|
||||
for (int block = 0; block <= dataLength; block += step) {
|
||||
int i = 0;
|
||||
for (i = 0; i < DATA_BLOCK_LENGTH_; ++ i) {
|
||||
if (data[block + i] != data[otherBlock + i]) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i == DATA_BLOCK_LENGTH_) {
|
||||
return block;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fold the normalization data for supplementary code points into
|
||||
* a compact area on top of the BMP-part of the trie index,
|
||||
* with the lead surrogates indexing this compact area.
|
||||
*
|
||||
* Duplicate the index values for lead surrogates:
|
||||
* From inside the BMP area, where some may be overridden with folded values,
|
||||
* to just after the BMP area, where they can be retrieved for
|
||||
* code point lookups.
|
||||
* @param manipulate fold implementation
|
||||
*/
|
||||
private final void fold(DataManipulate manipulate)
|
||||
{
|
||||
int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
|
||||
int index[] = m_index_;
|
||||
// copy the lead surrogate indexes into a temporary array
|
||||
System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
|
||||
SURROGATE_BLOCK_COUNT_);
|
||||
|
||||
// to protect the copied lead surrogate values,
|
||||
// mark all their indexes as repeat blocks
|
||||
// (causes copy-on-write)
|
||||
for (char c = 0xd800; c <= 0xdbff; ++ c) {
|
||||
int block = index[c >> SHIFT_];
|
||||
if (block > 0) {
|
||||
index[c >> SHIFT_] =- block;
|
||||
}
|
||||
}
|
||||
|
||||
// Fold significant index values into the area just after the BMP
|
||||
// indexes.
|
||||
// In case the first lead surrogate has significant data,
|
||||
// its index block must be used first (in which case the folding is a
|
||||
// no-op).
|
||||
// Later all folded index blocks are moved up one to insert the copied
|
||||
// lead surrogate indexes.
|
||||
int indexLength = BMP_INDEX_LENGTH_;
|
||||
// search for any index (stage 1) entries for supplementary code points
|
||||
for (int c = 0x10000; c < 0x110000;) {
|
||||
if (index[c >> SHIFT_] != 0) {
|
||||
// there is data, treat the full block for a lead surrogate
|
||||
c &= ~0x3ff;
|
||||
// is there an identical index block?
|
||||
int block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
|
||||
// get a folded value for [c..c+0x400[ and, if 0, set it for
|
||||
// the lead surrogate
|
||||
int value = manipulate.getFoldedValue(c,
|
||||
block + SURROGATE_BLOCK_COUNT_);
|
||||
if (value != 0) {
|
||||
if (!setValue(0xd7c0 + (c >> 10), value)) {
|
||||
// data table overflow
|
||||
throw new ArrayIndexOutOfBoundsException(
|
||||
"Data table overflow");
|
||||
}
|
||||
// if we did not find an identical index block...
|
||||
if (block == indexLength) {
|
||||
// move the actual index (stage 1) entries from the
|
||||
// supplementary position to the new one
|
||||
System.arraycopy(index, c >> SHIFT_, index, indexLength,
|
||||
SURROGATE_BLOCK_COUNT_ << 2);
|
||||
indexLength += SURROGATE_BLOCK_COUNT_;
|
||||
}
|
||||
}
|
||||
c += 0x400;
|
||||
}
|
||||
else {
|
||||
c += DATA_BLOCK_LENGTH_;
|
||||
}
|
||||
}
|
||||
|
||||
// index array overflow?
|
||||
// This is to guarantee that a folding offset is of the form
|
||||
// UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
|
||||
// If the index is too large, then n>=1024 and more than 10 bits are
|
||||
// necessary.
|
||||
// In fact, it can only ever become n==1024 with completely unfoldable
|
||||
// data and the additional block of duplicated values for lead
|
||||
// surrogates.
|
||||
if (indexLength >= MAX_INDEX_LENGTH_) {
|
||||
throw new ArrayIndexOutOfBoundsException("Index table overflow");
|
||||
}
|
||||
// make space for the lead surrogate index block and insert it between
|
||||
// the BMP indexes and the folded ones
|
||||
System.arraycopy(index, BMP_INDEX_LENGTH_, index,
|
||||
BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
|
||||
indexLength - BMP_INDEX_LENGTH_);
|
||||
System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
|
||||
SURROGATE_BLOCK_COUNT_);
|
||||
indexLength += SURROGATE_BLOCK_COUNT_;
|
||||
m_indexLength_ = indexLength;
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user