ICU-1897

int trie builder X-SVN-Rev: 9127
2002-07-12 22:02:06 +00:00 · 2002-07-12 22:02:06 +00:00 · 4d686cffdb
commit 4d686cffdb
parent eae341aae7
1 changed files with 453 additions and 0 deletions
--- a/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java
+++ b/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java
@ -0,0 +1,453 @@
+/*
+******************************************************************************
+* Copyright (C) 1996-2000, International Business Machines Corporation and   *
+* others. All Rights Reserved.                                               *
+******************************************************************************
+*
+* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/impl/IntTrieBuilder.java,v $ 
+* $Date: 2002/07/12 22:02:06 $ 
+* $Revision: 1.1 $
+*
+******************************************************************************
+*/
+
+package com.ibm.icu.impl;
+
+import com.ibm.icu.lang.UCharacter;
+import java.util.Arrays;
+
+/**
+ * Builder class to manipulate and generate a trie.
+ * This is useful for ICU data in primitive types.
+ * Provides a compact way to store information that is indexed by Unicode 
+ * values, such as character properties, types, keyboard values, etc. This is 
+ * very useful when you have a block of Unicode data that contains significant 
+ * values while the rest of the Unicode data is unused in the application or 
+ * when you have a lot of redundancy, such as where all 21,000 Han ideographs 
+ * have the same value.  However, lookup is much faster than a hash table.
+ * A trie of any primitive data type serves two purposes:
+ * <UL type = round>
+ *     <LI>Fast access of the indexed values.
+ *     <LI>Smaller memory footprint.
+ * </UL>
+ * This is a direct port from the ICU4C version
+ * @version            $Revision: 1.1 $
+ * @author             Syn Wee Quek
+ */
+public class IntTrieBuilder extends TrieBuilder
+{
+	// public constructor ----------------------------------------------
+		
+	/**
+	 * Copy constructor.
+	 * The new builder gets its own data array; the values are copied from 
+	 * the argument builder, they are not shared with it.
+	 * @param table builder whose data and initial value are copied
+	 */
+	public IntTrieBuilder(IntTrieBuilder table)
+    {
+        super(table);
+        // NOTE(review): m_dataCapacity_ and m_dataLength_ are presumably set
+        // up by super(table) - confirm against TrieBuilder
+        int copy[] = new int[m_dataCapacity_];
+        System.arraycopy(table.m_data_, 0, copy, 0, m_dataLength_);
+        m_data_ = copy;
+        m_initialValue_ = table.m_initialValue_;
+    }
+    
+    /**
+     * Constructs a build table.
+     * @param aliasdata array to be used as the data storage of the table, or
+     *                  null to have a new array of maxdatalength allocated
+     * @param maxdatalength maximum data length allowed in table
+     * @param initialvalue initial data value
+     * @param latin1linear is latin 1 to be linear
+     * @exception IllegalArgumentException thrown if maxdatalength is smaller
+     *            than one data block, or smaller than 1024 when latin1linear
+     *            is true
+     */
+    public IntTrieBuilder(int aliasdata[], int maxdatalength, 
+                          int initialvalue, boolean latin1linear) 
+    {
+        super();
+        // a linear Latin-1 range additionally requires room for 1024 values
+        int minimumLength = latin1linear ? Math.max(DATA_BLOCK_LENGTH_, 1024)
+                                         : DATA_BLOCK_LENGTH_;
+        if (maxdatalength < minimumLength) {
+            throw new IllegalArgumentException(
+                                   "Argument maxdatalength is too small");
+        }
+
+        m_data_ = (aliasdata == null) ? new int[maxdatalength] : aliasdata;
+        m_initialValue_ = initialvalue;
+        m_dataCapacity_ = maxdatalength;
+        m_isLatin1Linear_ = latin1linear;
+        m_isCompacted_ = false;
+
+        // the first data block (block index 0) is always preallocated
+        int preallocated = DATA_BLOCK_LENGTH_;
+        if (latin1linear) {
+            // also preallocate Latin-1 (U+0000..U+00ff) right after block 0
+            // and let the index entries point to these consecutive blocks;
+            // this is done at least for index entry 0, even if that block is
+            // only partly used for Latin-1
+            int latin1Blocks = 256 >> SHIFT_;
+            int entry = 0;
+            do {
+                m_index_[entry] = preallocated;
+                preallocated += DATA_BLOCK_LENGTH_;
+                ++ entry;
+            } while (entry < latin1Blocks);
+        }
+
+        // reset the initially allocated blocks to the initial value
+        m_dataLength_ = preallocated;
+        Arrays.fill(m_data_, 0, preallocated, initialvalue);
+    }
+
+	// public methods -------------------------------------------------------
+	
+    /**
+     * Gets a 32 bit data from the table data.
+     * @param ch codepoint which data is to be retrieved
+     * @return the 32 bit data, or 0 if the table has already been compacted
+     *         or ch is not in the range 0..UCharacter.MAX_VALUE
+     */
+    public int getValue(int ch) 
+    {
+        boolean validCodepoint = ch >= 0 && ch <= UCharacter.MAX_VALUE;
+        if (!validCodepoint || m_isCompacted_) {
+            return 0;
+        }
+    
+        // index entries may be negative, their magnitude is the block offset
+        int blockStart = Math.abs(m_index_[ch >> SHIFT_]);
+        int offsetInBlock = ch & MASK_;
+        return m_data_[blockStart + offsetInBlock];
+    }
+    
+    /**
+     * Sets a 32 bit data in the table data.
+     * @param ch codepoint which data is to be set
+     * @param value to set
+     * @return true if the set is successful, otherwise false if the table 
+     *         has been compacted, ch is not in the range 
+     *         0..UCharacter.MAX_VALUE, or no data block could be obtained 
+     *         for ch
+     */
+    public boolean setValue(int ch, int value) 
+    {
+        // only an uncompacted trie and a valid codepoint can be written
+        if (!m_isCompacted_ && ch >= 0 && ch <= UCharacter.MAX_VALUE) {
+            int blockStart = getDataBlock(ch);
+            if (blockStart >= 0) {
+                m_data_[blockStart + (ch & MASK_)] = value;
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Serializes the build table with 32 bit data.
+     * If the table has not been compacted yet, it is compacted and folded 
+     * first and from then on flagged as compacted.
+     * @param datamanipulate builder raw fold method implementation
+     * @param triedatamanipulate result trie fold method
+     * @return a new trie
+     * @exception IllegalArgumentException thrown if datamanipulate is null
+     * @exception ArrayIndexOutOfBoundsException thrown if the resulting data
+     *            array is too long to be serialized
+     */
+    public IntTrie serialize(TrieBuilder.DataManipulate datamanipulate, 
+                             Trie.DataManipulate triedatamanipulate)
+    {
+        if (datamanipulate == null) {
+            throw new IllegalArgumentException("Parameters can not be null");
+        }
+        // fold and compact if necessary, also checks that indexLength is 
+        // within limits 
+        if (!m_isCompacted_) {
+            // compact once without overlap to improve folding
+            compact(false);
+            // fold the supplementary part of the index array
+            fold(datamanipulate);
+            // compact again with overlap for minimum data array length
+            compact(true);
+            m_isCompacted_ = true;
+        }
+        // is dataLength within limits? 
+        // the check fails when there is too much data, so say so
+        if (m_dataLength_ >= MAX_DATA_LENGTH_) {
+            throw new ArrayIndexOutOfBoundsException("Data length too large");
+        }
+    
+        char index[] = new char[m_indexLength_];
+        int data[] = new int[m_dataLength_];
+        // write the index (stage 1) array and the 32-bit data (stage 2) array
+        // write 16-bit index values shifted right by INDEX_SHIFT_ 
+        for (int i = 0; i < m_indexLength_; i ++) {
+            index[i] = (char)(m_index_[i] >>> INDEX_SHIFT_);
+        }
+        // write 32-bit data values
+        System.arraycopy(m_data_, 0, data, 0, m_dataLength_);
+        
+        // record the shift widths and the data layout in the options word
+        int options = SHIFT_ | (INDEX_SHIFT_ << OPTIONS_INDEX_SHIFT_);
+        options |= OPTIONS_DATA_IS_32_BIT_;
+        if (m_isLatin1Linear_) {
+            options |= OPTIONS_LATIN1_IS_LINEAR_;
+        }
+        return new IntTrie(index, data, m_initialValue_, options, 
+                           triedatamanipulate);
+    }
+    
+	// protected data members -----------------------------------------
+		
+	// data (stage 2) array holding the 32 bit values of the table
+	protected int m_data_[];
+	// NOTE(review): neither read nor assigned anywhere in this class;
+	// presumably meant to tell an allocated data array from an aliased 
+	// one - confirm before relying on it
+	protected boolean m_isDataAllocated_;
+	// value the preallocated data blocks are reset to by the constructor;
+	// also handed to the IntTrie created by serialize()
+	protected int m_initialValue_;  
+	
+	// private methods ------------------------------------------------------
+   
+    /**
+     * Gets the start of the writable data block for a codepoint, allocating
+     * a new block at the end of the used data if the index entry does not 
+     * refer to a writable block yet.
+     * An index value of 0 or less denotes a block that must not be written
+     * in place: the initial block 0, or a block whose index entry has been 
+     * negated by fold(). The content of that block is copied into the newly
+     * allocated block, so that the values stored so far are preserved.
+     * No error checking for illegal arguments.
+     * @param ch codepoint to look for
+     * @return start offset of the data block in the data array, or -1 if no 
+     *         new data block is available (out of memory in data array)
+     */
+    private int getDataBlock(int ch) 
+    {
+        ch >>= SHIFT_;
+        int indexValue = m_index_[ch];
+        if (indexValue > 0) {
+            return indexValue;
+        }
+    
+        // allocate a new data block
+        int newBlock = m_dataLength_;
+        int newTop = newBlock + DATA_BLOCK_LENGTH_;
+        if (newTop > m_dataCapacity_) {
+            // out of memory in the data array 
+            return -1;
+        }
+        m_dataLength_ = newTop;
+        m_index_[ch] = newBlock;
+    
+        // copy-on-write: duplicate the block referred to so far instead of 
+        // resetting the new block to the initial value, otherwise the values
+        // of a block protected by a negated index entry would be lost.
+        // For an index value of 0 this copies block 0, which only holds 
+        // initial values. The length is counted in array elements.
+        System.arraycopy(m_data_, Math.abs(indexValue), m_data_, newBlock, 
+                         DATA_BLOCK_LENGTH_);
+        return newBlock;
+    }
+    
+    /**
+     * Compact a folded build-time trie.
+     * The compaction
+     * - removes blocks that are identical with earlier ones
+     * - overlaps adjacent blocks as much as possible (if overlap == true)
+     * - moves blocks in steps of the data granularity
+     *
+     * It does not
+     * - try to move and overlap blocks that are not already adjacent
+     * - try to move and overlap blocks that overlap with multiple values in 
+     * the overlap region
+     *
+     * Returns immediately if the trie is already flagged as compacted.
+     * Otherwise the data array is rewritten in place, the index (stage 1)
+     * entries are redirected to the new block positions and the data length
+     * is reduced to the end of the last kept block.
+     * @param overlap true to overlap the start of a block with the 
+     *                equal-valued end of the preceding block and to search 
+     *                for identical blocks in steps of the data granularity, 
+     *                false to search in steps of whole blocks without 
+     *                overlapping
+     */
+    private void compact(boolean overlap) 
+    {
+        if (m_isCompacted_) {
+            return; // nothing left to do
+        }
+    
+        // compaction
+        // initialize the index map with "block is used/unused" flags
+        findUnusedBlocks();
+    
+        // if Latin-1 is preallocated and linear, then do not compact Latin-1 
+        // data
+        // overlapStart: blocks starting below this offset are neither 
+        // replaced by an identical earlier block nor overlapped
+        int overlapStart = DATA_BLOCK_LENGTH_;
+        if (m_isLatin1Linear_ && SHIFT_ <= 8) {
+            overlapStart += 256;
+        }
+       
+        // block 0 stays where it is, moving starts with the second block
+        int newStart = DATA_BLOCK_LENGTH_;
+        int prevEnd = newStart - 1;
+        for (int start = newStart; start < m_dataLength_;) {
+            // start: index of first entry of current block
+            // prevEnd: index to last entry of previous block
+            // newStart: index where the current block is to be moved
+            // skip blocks that are not used 
+            if (m_map_[start >> SHIFT_] < 0) {
+                // advance start to the next block 
+                start += DATA_BLOCK_LENGTH_;
+                // leave prevEnd and newStart with the previous block!
+                continue;
+            }
+            // search for an identical block
+            // (only within the already compacted data below newStart)
+            if (start >= overlapStart) {
+                int i = findSameDataBlock(m_data_, newStart, start,
+                             overlap ? DATA_GRANULARITY_ : DATA_BLOCK_LENGTH_);
+                if (i >= 0) {
+                    // found an identical block, set the other block's index 
+                    // value for the current block
+                    m_map_[start >> SHIFT_] = i;
+                    // advance start to the next block
+                    start += DATA_BLOCK_LENGTH_;
+                    // leave prevEnd and newStart with the previous block!
+                    continue;
+                }
+            }
+            // see if the beginning of this block can be overlapped with the 
+            // end of the previous block
+            // x: first value in the current block 
+            int x = m_data_[start];
+            // i: number of leading values of the current block that are 
+            // shared with the end of the previous block
+            int i = 0;
+            if (x == m_data_[prevEnd] && overlap && start >= overlapStart) 
+            {
+                // overlap by at least one
+                // count how far the single value x repeats both forward from
+                // start and backward from prevEnd
+                for (i = 1; i < DATA_BLOCK_LENGTH_ 
+                     && x == m_data_[start + i] 
+                     && x == m_data_[prevEnd - i]; ++ i) 
+                {
+                }
+    
+                // overlap by i, rounded down for the data block granularity
+                i &= ~(DATA_GRANULARITY_ - 1);
+            } 
+            if (i > 0) {
+                // some overlap
+                m_map_[start >> SHIFT_] = newStart - i;
+                // move the non-overlapping data values to their new positions
+                start += i;
+                for (i = DATA_BLOCK_LENGTH_ - i; i > 0; -- i) {
+                    m_data_[newStart ++] = m_data_[start ++];
+                }
+            } 
+            else if (newStart < start) {
+                // no overlap, just move the data values to their new positions
+                m_map_[start >> SHIFT_] = newStart;
+                for (i = DATA_BLOCK_LENGTH_; i > 0; -- i) {
+                    m_data_[newStart ++] = m_data_[start ++];
+                }
+            } 
+            else { // no overlap && newStart==start
+                // the block already is where it belongs, nothing to move
+                m_map_[start >> SHIFT_] = start;
+                newStart += DATA_BLOCK_LENGTH_;
+                start = newStart;
+            }
+    
+            prevEnd = newStart - 1;
+        }
+    
+        // now adjust the index (stage 1) table
+        // index entries may be negative, their magnitude is the old block 
+        // offset that is looked up in the map
+        for (int i = 0; i < m_indexLength_; ++ i) {
+            m_index_[i] = m_map_[Math.abs(m_index_[i]) >> SHIFT_];
+        }
+        m_dataLength_ = newStart;
+    }
+
+    /**
+     * Searches the start of the data array for a block that holds exactly 
+     * the same values as the given block.
+     * @param data array
+     * @param dataLength length of the leading part of the data array that is 
+     *                   searched; candidate blocks lie completely within it
+     * @param otherBlock start offset of the block to find a duplicate of
+     * @param step distance between the start offsets of two candidates
+     * @return start offset of the first identical block, or -1 if there is 
+     *         none
+     */
+    private static final int findSameDataBlock(int data[], int dataLength,
+                                                int otherBlock, int step) 
+    {
+        // last start offset for which a whole block still fits, so that we 
+        // do not even partially get past dataLength
+        int lastStart = dataLength - DATA_BLOCK_LENGTH_;
+
+        int candidate = 0;
+        while (candidate <= lastStart) {
+            // count the leading values that both blocks have in common
+            int matched = 0;
+            while (matched < DATA_BLOCK_LENGTH_ 
+                   && data[candidate + matched] == data[otherBlock + matched])
+            {
+                ++ matched;
+            }
+            if (matched == DATA_BLOCK_LENGTH_) {
+                return candidate;
+            }
+            candidate += step;
+        }
+        return -1;
+    }
+    
+    /**
+     * Fold the normalization data for supplementary code points into
+     * a compact area on top of the BMP-part of the trie index,
+     * with the lead surrogates indexing this compact area.
+     *
+     * Duplicate the index values for lead surrogates:
+     * From inside the BMP area, where some may be overridden with folded values,
+     * to just after the BMP area, where they can be retrieved for
+     * code point lookups.
+     * @param manipulate fold implementation
+     * @exception ArrayIndexOutOfBoundsException thrown if a folded value can
+     *            not be stored because the data array is full, or if the 
+     *            folded index grows to MAX_INDEX_LENGTH_ or beyond
+     */
+    private final void fold(DataManipulate manipulate) 
+    {
+        int leadIndexes[] = new int[SURROGATE_BLOCK_COUNT_];
+        int index[] = m_index_;
+        // copy the lead surrogate indexes into a temporary array
+        System.arraycopy(index, 0xd800 >> SHIFT_, leadIndexes, 0, 
+                         SURROGATE_BLOCK_COUNT_);
+    
+        // to protect the copied lead surrogate values,
+        // mark all their indexes as repeat blocks
+        // (causes copy-on-write)
+        // an entry that has been negated once is skipped afterwards because
+        // it is no longer positive
+        for (int c = 0xd800; c <= 0xdbff; ++ c) {
+            int block = index[c >> SHIFT_];
+            if (block > 0) {
+                index[c >> SHIFT_] = -block;
+            }
+        }
+    
+        // Fold significant index values into the area just after the BMP 
+        // indexes.
+        // In case the first lead surrogate has significant data,
+        // its index block must be used first (in which case the folding is a 
+        // no-op).
+        // Later all folded index blocks are moved up one to insert the copied
+        // lead surrogate indexes.
+        int indexLength = BMP_INDEX_LENGTH_;
+        // search for any index (stage 1) entries for supplementary code points 
+        for (int c = 0x10000; c < 0x110000;) {
+            if (index[c >> SHIFT_] != 0) {
+                // there is data, treat the full block for a lead surrogate
+                c &= ~0x3ff;
+                // is there an identical index block?
+                int block = findSameIndexBlock(index, indexLength, c >> SHIFT_);
+                // get a folded value for [c..c+0x400[ and, if it is not 0, 
+                // set it for the lead surrogate 
+                int value = manipulate.getFoldedValue(c, 
+                                                block + SURROGATE_BLOCK_COUNT_);
+                if (value != 0) {
+                    // 0xd7c0 + (c >> 10) is the lead surrogate of c
+                    if (!setValue(0xd7c0 + (c >> 10), value)) {
+                        // data table overflow 
+                        throw new ArrayIndexOutOfBoundsException(
+                                                        "Data table overflow");
+                    }
+                    // if we did not find an identical index block...
+                    if (block == indexLength) {
+                        // move the actual index (stage 1) entries from the 
+                        // supplementary position to the new one;
+                        // the length is counted in array elements, not in 
+                        // bytes: exactly one surrogate block is moved
+                        System.arraycopy(index, c >> SHIFT_, index, indexLength,
+                                         SURROGATE_BLOCK_COUNT_);
+                        indexLength += SURROGATE_BLOCK_COUNT_;
+                    }
+                }
+                c += 0x400;
+            } 
+            else {
+                c += DATA_BLOCK_LENGTH_;
+            }
+        }
+    
+        // index array overflow?
+        // This is to guarantee that a folding offset is of the form
+        // BMP_INDEX_LENGTH_ + n * SURROGATE_BLOCK_COUNT_ with n=0..1023.
+        // If the index is too large, then n>=1024 and more than 10 bits are 
+        // necessary.
+        // In fact, it can only ever become n==1024 with completely unfoldable 
+        // data and the additional block of duplicated values for lead 
+        // surrogates.
+        if (indexLength >= MAX_INDEX_LENGTH_) {
+            throw new ArrayIndexOutOfBoundsException("Index table overflow");
+        }
+        // make space for the lead surrogate index block and insert it between 
+        // the BMP indexes and the folded ones
+        System.arraycopy(index, BMP_INDEX_LENGTH_, index, 
+                         BMP_INDEX_LENGTH_ + SURROGATE_BLOCK_COUNT_,
+                         indexLength - BMP_INDEX_LENGTH_);
+        System.arraycopy(leadIndexes, 0, index, BMP_INDEX_LENGTH_,
+                         SURROGATE_BLOCK_COUNT_);
+        indexLength += SURROGATE_BLOCK_COUNT_;
+        m_indexLength_ = indexLength;
+    }
+}
+