/*
******************************************************************************
* Copyright (C) 1996-2000, International Business Machines Corporation and   *
* others. All Rights Reserved.                                               *
******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java,v $
* $Date: 2002/07/12 22:02:06 $
* $Revision: 1.1 $
*
******************************************************************************
*/

package com.ibm.icu.impl;

import com.ibm.icu.lang.UCharacter;
import java.util.Arrays;

/**
 * Builder class to manipulate and generate a trie with 32 bit data.
 * This is useful for ICU data in primitive types.
 * Provides a compact way to store information that is indexed by Unicode
 * values, such as character properties, types, keyboard values, etc. This is
 * very useful when you have a block of Unicode data that contains significant
 * values while the rest of the Unicode data is unused in the application or
 * when you have a lot of redundancy, such as where all 21,000 Han ideographs
 * have the same value. However, lookup is much faster than a hash table.
 *
 * Typical use: construct a builder, fill it with {@link #setValue}, then call
 * {@link #serialize} once to obtain the compacted, read-only trie. After
 * serialization the builder is marked compacted and rejects further
 * modification.
 *
 * This is a direct port from the ICU4C version
 * @version $Revision: 1.1 $
 * @author Syn Wee Quek
 */
public class IntTrieBuilder extends TrieBuilder
{
    // public constructor ----------------------------------------------

    /**
     * Copy constructor. The data array is duplicated so that the new builder
     * does not share storage with the argument.
     * @param table builder to copy
     */
    public IntTrieBuilder(IntTrieBuilder table)
    {
        super(table);
        m_data_ = new int[m_dataCapacity_];
        System.arraycopy(table.m_data_, 0, m_data_, 0, m_dataLength_);
        m_initialValue_ = table.m_initialValue_;
    }

    /**
     * Constructs a build table.
     * @param aliasdata array to use directly (not copied) as the data
     *        storage, or null to have an array of maxdatalength entries
     *        allocated. The capacity is always taken from maxdatalength,
     *        regardless of the length of this array.
     * @param maxdatalength maximum data length allowed in table
     * @param initialvalue initial data value
     * @param latin1linear is latin 1 to be linear
     * @exception IllegalArgumentException thrown if maxdatalength is smaller
     *            than one data block, or smaller than 1024 when latin1linear
     *            is requested
     */
    public IntTrieBuilder(int aliasdata[], int maxdatalength,
                          int initialvalue, boolean latin1linear)
    {
        super();
        if (maxdatalength < DATA_BLOCK_LENGTH_ || (latin1linear
                                                   && maxdatalength < 1024)) {
            throw new IllegalArgumentException(
                                       "Argument maxdatalength is too small");
        }

        if (aliasdata != null) {
            m_data_ = aliasdata;
        }
        else {
            m_data_ = new int[maxdatalength];
        }

        // preallocate and reset the first data block (block index 0)
        int j = DATA_BLOCK_LENGTH_;

        if (latin1linear) {
            // preallocate and reset the first block (number 0) and Latin-1
            // (U+0000..U+00ff) after that; made sure above that
            // maxdatalength >= 1024
            // set indexes to point to consecutive data blocks
            int i = 0;
            do {
                // do this at least for m_index_[0] even if that block is
                // only partly used for Latin-1
                m_index_[i ++] = j;
                j += DATA_BLOCK_LENGTH_;
            } while (i < (256 >> SHIFT_));
        }

        m_dataLength_ = j;
        // reset the initially allocated blocks to the initial value
        Arrays.fill(m_data_, 0, m_dataLength_, initialvalue);
        m_initialValue_ = initialvalue;
        m_dataCapacity_ = maxdatalength;
        m_isLatin1Linear_ = latin1linear;
        m_isCompacted_ = false;
    }

    // public methods -------------------------------------------------------

    /**
     * Gets a 32 bit data from the table data
     * @param ch codepoint which data is to be retrieved
     * @return the 32 bit data, or 0 if the table has already been compacted
     *         or ch is not a valid code point
     */
    public int getValue(int ch)
    {
        // valid, uncompacted trie and valid c?
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
            return 0;
        }

        // index entries may be stored negated (see fold()), hence the abs
        int block = m_index_[ch >> SHIFT_];
        return m_data_[Math.abs(block) + (ch & MASK_)];
    }

    /**
     * Sets a 32 bit data in the table data
     * @param ch codepoint which data is to be set
     * @param value to set
     * @return true if the set is successful, otherwise false if the table
     *         has been compacted, ch is not a valid code point, or the data
     *         array has no room for a new block
     */
    public boolean setValue(int ch, int value)
    {
        // valid, uncompacted trie and valid c?
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < 0) {
            return false;
        }

        int block = getDataBlock(ch);
        if (block < 0) {
            return false;
        }

        m_data_[block + (ch & MASK_)] = value;
        return true;
    }

    /**
     * Serializes the build table with 32 bit data.
     * On the first call the table is compacted and folded; afterwards the
     * builder is marked compacted and can no longer be modified.
     * @param datamanipulate builder raw fold method implementation
     * @param triedatamanipulate result trie fold method
     * @return a new trie
     * @exception IllegalArgumentException thrown if datamanipulate is null
     * @exception ArrayIndexOutOfBoundsException thrown if the index or data
     *            arrays exceed their limits
     */
    public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate,
                             Trie.DataManipulate triedatamanipulate)
    {
        if (datamanipulate == null) {
            throw new IllegalArgumentException("Parameters can not be null");
        }
        // fold and compact if necessary, also checks that indexLength is
        // within limits
        if (!m_isCompacted_) {
            // compact once without overlap to improve folding
            compact(false);
            // fold the supplementary part of the index array
            fold(datamanipulate);
            // compact again with overlap for minimum data array length
            compact(true);
            m_isCompacted_ = true;
        }
        // is dataLength within limits?
        if (m_dataLength_ >= MAX_DATA_LENGTH_) {
            throw new ArrayIndexOutOfBoundsException("Data length too large");
        }

        char index[] = new char[m_indexLength_];
        int data[] = new int[m_dataLength_];
        // write the index (stage 1) array and the 32-bit data (stage 2) array
        // write 16-bit index values shifted right by INDEX_SHIFT_
        for (int i = 0; i < m_indexLength_; i ++) {
            index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
        }
        // write 32-bit data values
        System.arraycopy(m_data_, 0, data, 0, m_dataLength_);

        int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
        options |= OPTIONS_DATA_IS_32_BIT_;
        if (m_isLatin1Linear_) {
            options |= OPTIONS_LATIN1_IS_LINEAR_;
        }
        return new IntTrie(index, data, m_initialValue_, options,
                           triedatamanipulate);
    }

    // protected data members ------------------------------------------

    /**
     * Data (stage 2) array; entries 0..m_dataLength_ - 1 are in use.
     */
    protected int m_data_[];
    /**
     * Not read or written anywhere in this class.
     */
    protected boolean m_isDataAllocated_;
    /**
     * Value that newly allocated data blocks are filled with.
     */
    protected int m_initialValue_;

    // private methods ------------------------------------------------------

    /**
     * Gets the start of the data block that holds the argument code point,
     * allocating a new block at the end of the used data if the index entry
     * for the code point is not positive.
     * No error checking for illegal arguments.
     * @param ch codepoint to look for
     * @return start offset of the data block, or -1 if no new data block is
     *         available (out of memory in data array)
     */
    private int getDataBlock(int ch)
    {
        ch >>= SHIFT_;
        int indexValue = m_index_[ch];
        if (indexValue > 0) {
            return indexValue;
        }

        // allocate a new data block
        int newBlock = m_dataLength_;
        int newTop = newBlock + DATA_BLOCK_LENGTH_;
        if (newTop > m_dataCapacity_) {
            // out of memory in the data array
            return -1;
        }
        m_dataLength_ = newTop;
        m_index_[ch] = newBlock;

        // the new block starts out with the initial value everywhere
        // NOTE(review): a negated index entry (see fold()) also lands here,
        // and the previous block contents are not copied into the new block
        // - confirm that this is intended
        Arrays.fill(m_data_, newBlock, newBlock + DATA_BLOCK_LENGTH_,
                    m_initialValue_);
        return newBlock;
    }

    /**
     * Compact a folded build-time trie.
     * The compaction
     * - removes blocks that are identical with earlier ones
     * - overlaps adjacent blocks as much as possible (if overlap == true)
     * - moves blocks in steps of the data granularity
     *
     * It does not
     * - try to move and overlap blocks that are not already adjacent
     * - try to move and overlap blocks that overlap with multiple values in
     *   the overlap region
     * @param overlap true if adjacent blocks may share entries
     */
    private void compact(boolean overlap)
    {
        if (m_isCompacted_) {
            return; // nothing left to do
        }

        // compaction
        // initialize the index map with "block is used/unused" flags
        findUnusedBlocks();

        // if Latin-1 is preallocated and linear, then do not compact Latin-1
        // data
        int overlapStart = DATA_BLOCK_LENGTH_;
        if (m_isLatin1Linear_ && SHIFT_ <= 8) {
            overlapStart += 256;
        }

        int newStart = DATA_BLOCK_LENGTH_;
        int prevEnd = newStart - 1;
        for (int start = newStart; start < m_dataLength_;) {
            // start: index of first entry of current block
            // prevEnd: index to last entry of previous block
            // newStart: index where the current block is to be moved
            // skip blocks that are not used
            if (m_map_[start >> SHIFT_] < 0) {
                // advance start to the next block
                start += DATA_BLOCK_LENGTH_;
                // leave prevEnd and newStart with the previous block!
                continue;
            }
            // search for an identical block
            if (start >= overlapStart) {
                int i = findSameDataBlock(m_data_, newStart, start,
                            overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH_);
                if (i >= 0) {
                    // found an identical block, set the other block's index
                    // value for the current block
                    m_map_[start >> SHIFT_] = i;
                    // advance start to the next block
                    start += DATA_BLOCK_LENGTH_;
                    // leave prevEnd and newStart with the previous block!
                    continue;
                }
            }
            // see if the beginning of this block can be overlapped with the
            // end of the previous block
            // x: first value in the current block
            int x = m_data_[start];
            int i = 0;
            if (x == m_data_[prevEnd] && overlap && start >= overlapStart)
            {
                // overlap by at least one
                for (i = 1; i < DATA_BLOCK_LENGTH_
                     && x == m_data_[start + i]
                     && x == m_data_[prevEnd - i]; ++ i)
                {
                }

                // overlap by i, rounded down for the data block granularity
                i &= ~(DATA_GRANULARITY_ - 1);
            }
            if (i > 0) {
                // some overlap
                m_map_[start >> SHIFT_] = newStart - i;
                // move the non-overlapping entries to their new positions
                start += i;
                for (i = DATA_BLOCK_LENGTH_ - i; i > 0; -- i) {
                    m_data_[newStart ++] = m_data_[start ++];
                }
            }
            else if (newStart < start) {
                // no overlap, just move the entries to their new positions
                m_map_[start >> SHIFT_] = newStart;
                for (i = DATA_BLOCK_LENGTH_; i > 0; -- i) {
                    m_data_[newStart ++] = m_data_[start ++];
                }
            }
            else { // no overlap && newStart==start
                m_map_[start >> SHIFT_] = start;
                newStart += DATA_BLOCK_LENGTH_;
                start = newStart;
            }

            prevEnd = newStart - 1;
        }

        // now adjust the index (stage 1) table
        for (int i = 0; i < m_indexLength_; ++ i) {
            m_index_[i] = m_map_[Math.abs(m_index_[i]) >> SHIFT_];
        }
        m_dataLength_ = newStart;
    }

    /**
     * Finds an earlier data block with the same contents as the argument
     * block.
     * @param data array
     * @param dataLength length of the part of the array to search; only
     *        blocks that lie completely below this limit are compared
     * @param otherBlock start offset of the block to look for
     * @param step distance between the candidate block starts
     * @return start offset of the first identical block, or -1 if there is
     *         none
     */
    private static final int findSameDataBlock(int data[], int dataLength,
                                               int otherBlock, int step)
    {
        // ensure that we do not even partially get past dataLength
        dataLength -= DATA_BLOCK_LENGTH_;

        for (int block = 0; block <= dataLength; block += step) {
            int i = 0;
            for (i = 0; i < DATA_BLOCK_LENGTH_; ++ i) {
                if (data[block + i] != data[otherBlock + i]) {
                    break;
                }
            }
            if (i == DATA_BLOCK_LENGTH_) {
                return block;
            }
        }
        return -1;
    }

    /**
     * Fold the normalization data for supplementary code points into
     * a compact area on top of the BMP-part of the trie index,
     * with the lead surrogates indexing this compact area.
     *
     * Duplicate the index values for lead surrogates:
     * From inside the BMP area, where some may be overridden with folded
     * values, to just after the BMP area, where they can be retrieved for
     * code point lookups.
     * @param manipulate fold implementation
     * @exception ArrayIndexOutOfBoundsException thrown if the data or the
     *            index array overflows
     */
    private final void fold(DataManipulate manipulate)
    {
        int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
        int index[] = m_index_;
        // copy the lead surrogate indexes into a temporary array
        System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0,
                         SURROGATE_BLOCK_COUNT_);

        // to protect the copied lead surrogate values,
        // negate all their indexes so that the next setValue() on them
        // allocates a new data block instead of writing into the old one
        for (char c = 0xd800; c <= 0xdbff; ++ c) {
            int block = index[c >> SHIFT_];
            if (block > 0) {
                index[c >> SHIFT_] = -block;
            }
        }

        // Fold significant index values into the area just after the BMP
        // indexes.
        // In case the first lead surrogate has significant data,
        // its index block must be used first (in which case the folding is a
        // no-op).
        // Later all folded index blocks are moved up one to insert the copied
        // lead surrogate indexes.
        int indexLength = BMP_INDEX_LENGTH_;
        // search for any index (stage 1) entries for supplementary code points
        for (int c = 0x10000; c < 0x110000;) {
            if (index[c >> SHIFT_] != 0) {
                // there is data, treat the full block for a lead surrogate
                c &= ~0x3ff;
                // is there an identical index block?
                int block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
                // get a folded value for [c..c+0x400[ and, if it is not 0,
                // set it for the lead surrogate
                int value = manipulate.getFoldedValue(c,
                                              block + SURROGATE_BLOCK_COUNT_);
                if (value != 0) {
                    if (!setValue(0xd7c0 + (c >> 10), value)) {
                        // data table overflow
                        throw new ArrayIndexOutOfBoundsException(
                                                        "Data table overflow");
                    }
                    // if we did not find an identical index block...
                    if (block == indexLength) {
                        // move the actual index (stage 1) entries from the
                        // supplementary position to the new one.
                        // System.arraycopy counts elements, not bytes, so
                        // exactly one surrogate index block is moved; a
                        // larger count would clobber the entries of
                        // supplementary blocks that are not yet folded
                        System.arraycopy(index, c >> SHIFT_, index, indexLength,
                                         SURROGATE_BLOCK_COUNT_);
                        indexLength += SURROGATE_BLOCK_COUNT_;
                    }
                }
                c += 0x400;
            }
            else {
                c += DATA_BLOCK_LENGTH_;
            }
        }

        // index array overflow?
        // This is to guarantee that a folding offset is of the form
        // UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023.
        // If the index is too large, then n>=1024 and more than 10 bits are
        // necessary.
        // In fact, it can only ever become n==1024 with completely unfoldable
        // data and the additional block of duplicated values for lead
        // surrogates.
        if (indexLength >= MAX_INDEX_LENGTH_) {
            throw new ArrayIndexOutOfBoundsException("Index table overflow");
        }
        // make space for the lead surrogate index block and insert it between
        // the BMP indexes and the folded ones
        System.arraycopy(index, BMP_INDEX_LENGTH_, index,
                         BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
                         indexLength - BMP_INDEX_LENGTH_);
        System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
                         SURROGATE_BLOCK_COUNT_);
        indexLength += SURROGATE_BLOCK_COUNT_;
        m_indexLength_ = indexLength;
    }
}