/*============================================================================ * Copyright (C) 1997-1999, International Business Machines Corporation * and others. All Rights Reserved. * * File cmpshrta.cpp * * Modification History: * * Date Name Description * 2/5/97 aliu Added CompactIntArray streamIn and streamOut methods. * 3/4/97 aliu Tuned performance of CompactIntArray constructor, * based on performance data indicating that this_obj was slow. * 05/07/97 helena Added isBogus() * 04/26/99 Madhu Ported to C for C Implementation * 11/12/99 srl macroized ucmp32_get() *=============================================================================== */ #include "ucmp32.h" #include "cmemory.h" #include "filestrm.h" #include static int32_t ucmp32_findOverlappingPosition(CompactIntArray* this_obj, uint32_t start, const UChar *tempIndex, int32_t tempIndexCount, uint32_t cycle); static UBool debugSmall = FALSE; static uint32_t debugSmallLimit = 30000; /** debug flags *======================================================= */ int32_t ucmp32_getkUnicodeCount() { return UCMP32_kUnicodeCount;} int32_t ucmp32_getkBlockCount() { return UCMP32_kBlockCount;} U_CAPI void ucmp32_streamIn(CompactIntArray* this_obj, FileStream* is) { int32_t newCount, len; char c; if (!T_FileStream_error(is)) { T_FileStream_read(is, &newCount, sizeof(newCount)); if (this_obj->fCount != newCount) { this_obj->fCount = newCount; uprv_free(this_obj->fArray); this_obj->fArray = 0; this_obj->fArray = (int32_t*)uprv_malloc(this_obj->fCount * sizeof(int32_t)); if (!this_obj->fArray) { this_obj->fBogus = TRUE; return; } } T_FileStream_read(is, this_obj->fArray, sizeof(*(this_obj->fArray)) * this_obj->fCount); T_FileStream_read(is, &len, sizeof(len)); if (len == 0) { uprv_free(this_obj->fIndex); this_obj->fIndex = 0; } else if (len == UCMP32_kIndexCount) { if (this_obj->fIndex == 0) this_obj->fIndex =(uint16_t*)uprv_malloc(UCMP32_kIndexCount * sizeof(uint16_t)); if (!this_obj->fIndex) { this_obj->fBogus = TRUE; uprv_free(this_obj->fArray); this_obj->fArray = 0; return; } T_FileStream_read(is, this_obj->fIndex, sizeof(*(this_obj->fIndex)) * UCMP32_kIndexCount); } else { this_obj->fBogus = TRUE; return; } /* char instead of int8_t for Mac compilation*/ T_FileStream_read(is, (char*)&c, sizeof(c)); this_obj->fCompact = (c != 0); } } U_CAPI void ucmp32_streamOut(CompactIntArray* this_obj, FileStream* os) { char c; if (!T_FileStream_error(os)) { if (this_obj->fCount != 0 && this_obj->fArray != 0) { T_FileStream_write(os, &(this_obj->fCount), sizeof(this_obj->fCount)); T_FileStream_write(os, this_obj->fArray, sizeof(*(this_obj->fArray)) * this_obj->fCount); } else { int32_t zero = 0; T_FileStream_write(os, &zero, sizeof(zero)); } if (this_obj->fIndex == 0) { int32_t len = 0; T_FileStream_write(os, &len, sizeof(len)); } else { int32_t len = UCMP32_kIndexCount; T_FileStream_write(os, &len, sizeof(len)); T_FileStream_write(os, this_obj->fIndex, sizeof(*(this_obj->fIndex)) * UCMP32_kIndexCount); } c = this_obj->fCompact ? 1 : 0; /* char instead of int8_t for Mac compilation*/ T_FileStream_write(os, (const char*)&c, sizeof(c)); } } U_CAPI void ucmp32_streamMemIn(CompactIntArray* this_obj, UMemoryStream* is) { int32_t newCount, len; char c; if (!uprv_mstrm_error(is)) { uprv_mstrm_read(is, &newCount, sizeof(newCount)); if (this_obj->fCount != newCount) { this_obj->fCount = newCount; uprv_free(this_obj->fArray); this_obj->fArray = 0; this_obj->fArray = (int32_t*)uprv_malloc(this_obj->fCount * sizeof(int32_t)); if (!this_obj->fArray) { this_obj->fBogus = TRUE; return; } } uprv_mstrm_read(is, this_obj->fArray, sizeof(*(this_obj->fArray)) * this_obj->fCount); uprv_mstrm_read(is, &len, sizeof(len)); if (len == 0) { uprv_free(this_obj->fIndex); this_obj->fIndex = 0; } else if (len == UCMP32_kIndexCount) { if (this_obj->fIndex == 0) this_obj->fIndex =(uint16_t*)uprv_malloc(UCMP32_kIndexCount * sizeof(uint16_t)); if (!this_obj->fIndex) { this_obj->fBogus = TRUE; uprv_free(this_obj->fArray); this_obj->fArray = 0; return; } uprv_mstrm_read(is, this_obj->fIndex, sizeof(*(this_obj->fIndex)) * UCMP32_kIndexCount); } else { this_obj->fBogus = TRUE; return; } /* char instead of int8_t for Mac compilation*/ uprv_mstrm_read(is, (char*)&c, sizeof(c)); this_obj->fCompact = (c != 0); } } U_CAPI void ucmp32_streamMemOut(CompactIntArray* this_obj, UMemoryStream* os) { char c; if (!uprv_mstrm_error(os)) { if (this_obj->fCount != 0 && this_obj->fArray != 0) { uprv_mstrm_write(os, (uint8_t *)&(this_obj->fCount), sizeof(this_obj->fCount)); uprv_mstrm_write(os, (uint8_t *)this_obj->fArray, sizeof(*(this_obj->fArray)) * this_obj->fCount); } else { int32_t zero = 0; uprv_mstrm_write(os, (uint8_t *)&zero, sizeof(zero)); } if (this_obj->fIndex == 0) { int32_t len = 0; uprv_mstrm_write(os, (uint8_t *)&len, sizeof(len)); } else { int32_t len = UCMP32_kIndexCount; uprv_mstrm_write(os, (uint8_t *)&len, sizeof(len)); uprv_mstrm_write(os, (uint8_t *)this_obj->fIndex, sizeof(*(this_obj->fIndex)) * UCMP32_kIndexCount); } c = this_obj->fCompact ? 1 : 0; /* char instead of int8_t for Mac compilation*/ uprv_mstrm_write(os, (uint8_t *)&c, sizeof(c)); } } CompactIntArray* ucmp32_open(int32_t defaultValue) { uint16_t i; int32_t *p, *p_end; uint16_t *q, *q_end; CompactIntArray* this_obj = (CompactIntArray*) uprv_malloc(sizeof(CompactIntArray)); if (this_obj == NULL) return NULL; this_obj->fCount = UCMP32_kUnicodeCount; this_obj->fCompact = FALSE; this_obj->fBogus = FALSE; this_obj->fArray = NULL; this_obj->fIndex = NULL; /*set up the index array and the data array. * the index array always points into particular parts of the data array * it is initially set up to point at regular block boundaries * The following example uses blocks of 4 for simplicity * Example: Expanded * INDEX# 0 1 2 3 4 * INDEX 0 4 8 12 16 ... * ARRAY abcdeababcedzyabcdea... * | | | | | |... * whenever you set an element in the array, it unpacks to this_obj state * After compression, the index will point to various places in the data array * wherever there is a runs of the same elements as in the original * Example: Compressed * INDEX# 0 1 2 3 4 * INDEX 0 4 1 8 2 ... * ARRAY abcdeabazyabc... * If you look at the example, index# 2 in the expanded version points * to data position number 8, which has elements "bced". In the compressed * version, index# 2 points to data position 1, which also has "bced" */ this_obj->fArray = (int32_t*)uprv_malloc(UCMP32_kUnicodeCount * sizeof(int32_t)); if (this_obj->fArray == NULL) { this_obj->fBogus = TRUE; return NULL; } this_obj->fIndex = (uint16_t*)uprv_malloc(UCMP32_kIndexCount * sizeof(uint16_t)); if (!this_obj->fIndex) { uprv_free(this_obj->fArray); this_obj->fArray = NULL; this_obj->fBogus = TRUE; return NULL; } p = this_obj->fArray; p_end = p + UCMP32_kUnicodeCount; while (p < p_end) *p++ = defaultValue; q = this_obj->fIndex; q_end = q + UCMP32_kIndexCount; i = 0; while (q < q_end) { *q++ = i; i += (1 << UCMP32_kBlockShift); } return this_obj; } CompactIntArray* ucmp32_openAdopt(uint16_t *indexArray, int32_t *newValues, int32_t count) { CompactIntArray* this_obj = (CompactIntArray*) uprv_malloc(sizeof(CompactIntArray)); if (this_obj == NULL) return NULL; this_obj->fCount = count; this_obj->fBogus = FALSE; this_obj->fArray = newValues; this_obj->fIndex = indexArray; this_obj->fCompact = (count < UCMP32_kUnicodeCount) ? TRUE : FALSE; return this_obj; } /*=======================================================*/ void ucmp32_close( CompactIntArray* this_obj) { if(this_obj != NULL) { if(this_obj->fArray != NULL) { uprv_free(this_obj->fArray); } if(this_obj->fIndex != NULL) { uprv_free(this_obj->fIndex); } uprv_free(this_obj); } } UBool ucmp32_isBogus(const CompactIntArray* this_obj) { return this_obj == NULL || this_obj->fBogus; } void ucmp32_expand(CompactIntArray* this_obj) { /* can optimize later. * if we have to expand, then walk through the blocks instead of using Get * this_obj code unpacks the array by copying the blocks to the normalized position. * Example: Compressed * INDEX# 0 1 2 3 4 * INDEX 0 4 1 8 2 ... * ARRAY abcdeabazyabc... * turns into * Example: Expanded * INDEX# 0 1 2 3 4 * INDEX 0 4 8 12 16 ... * ARRAY abcdeababcedzyabcdea... */ int32_t i; int32_t* tempArray; if (this_obj->fCompact) { tempArray = (int32_t*)uprv_malloc(UCMP32_kUnicodeCount * sizeof(int32_t)); if (tempArray == NULL) { this_obj->fBogus = TRUE; return; } for (i = 0; i < UCMP32_kUnicodeCount; ++i) { tempArray[i] = ucmp32_get(this_obj, (UChar)i); /* HSYS : How expand?*/ } for (i = 0; i < UCMP32_kIndexCount; ++i) { this_obj->fIndex[i] = (uint16_t)(i<fArray); this_obj->fArray = tempArray; this_obj->fCompact = FALSE; } } uint32_t ucmp32_getCount(const CompactIntArray* this_obj) { return this_obj->fCount; } const int32_t* ucmp32_getArray(const CompactIntArray* this_obj) { return this_obj->fArray; } const uint16_t* ucmp32_getIndex(const CompactIntArray* this_obj) { return this_obj->fIndex; } void ucmp32_set(CompactIntArray* this_obj, UChar c, int32_t value) { if (this_obj->fCompact == TRUE) { ucmp32_expand(this_obj); if (this_obj->fBogus) return; } this_obj->fArray[(int32_t)c] = value; } void ucmp32_setRange(CompactIntArray* this_obj, UChar start, UChar end, int32_t value) { int32_t i; if (this_obj->fCompact == TRUE) { ucmp32_expand(this_obj); if (this_obj->fBogus) return; } for (i = start; i <= end; ++i) { this_obj->fArray[i] = value; } } /*======================================================= * this_obj->fArray: an array to be overlapped * start and count: specify the block to be overlapped * tempIndex: the overlapped array (actually indices back into inputContents) * inputHash: an index of hashes for tempIndex, where * inputHash[i] = XOR of values from i-count+1 to i */ int32_t ucmp32_findOverlappingPosition(CompactIntArray* this_obj, uint32_t start, const UChar* tempIndex, int32_t tempIndexCount, uint32_t cycle) { /* this_obj is a utility routine for finding blocks that overlap. * IMPORTANT: the cycle number is very important. Small cycles take a lot * longer to work. In some cases, they may be able to get better compaction. */ int32_t i; int32_t j; int32_t currentCount; for (i = 0; i < tempIndexCount; i += cycle) { currentCount = UCMP32_kBlockCount; if (i + UCMP32_kBlockCount > tempIndexCount) { currentCount = tempIndexCount - i; } for (j = 0; j < currentCount; ++j) { if (this_obj->fArray[start + j] != this_obj->fArray[tempIndex[i + j]]) break; } if (j == currentCount) break; } return i; } /*=======================================================*/ void ucmp32_compact(CompactIntArray* this_obj, int32_t cycle) { /* this_obj actually does the compaction. * it walks throught the contents of the expanded array, finding the * first block in the data that matches the contents of the current index. * As it works, it keeps an updated pointer to the last position, * so that it knows how big to make the final array * If the matching succeeds, then the index will point into the data * at some earlier position. * If the matching fails, then last position pointer will be bumped, * and the index will point to that last block of data. */ UChar* tempIndex; int32_t tempIndexCount; int32_t* tempArray; int32_t iBlock, iIndex; int32_t newCount, firstPosition; uint32_t block; if (!this_obj->fCompact) { /* fix cycle, must be 0 < cycle <= blockcount*/ if (cycle < 0) cycle = 1; else if (cycle > UCMP32_kBlockCount) cycle = UCMP32_kBlockCount; /* make temp storage, larger than we need*/ tempIndex =(UChar*)uprv_malloc(UCMP32_kUnicodeCount * sizeof(uint32_t)); if (tempIndex == NULL) { this_obj->fBogus = TRUE; return; } /* set up first block.*/ tempIndexCount = UCMP32_kBlockCount; for (iIndex = 0; iIndex < UCMP32_kBlockCount; ++iIndex) { tempIndex[iIndex] = (uint16_t)iIndex; }; /* endfor (iIndex = 0; .....)*/ this_obj->fIndex[0] = 0; /* for each successive block, find out its first position in the compacted array*/ for (iBlock = 1; iBlock < UCMP32_kIndexCount; ++iBlock) { block = iBlock< debugSmallLimit) break; firstPosition = ucmp32_findOverlappingPosition(this_obj, block, tempIndex, tempIndexCount, cycle); /* if not contained in the current list, copy the remainder * invariant; cumulativeHash[iBlock] = XOR of values from iBlock-kBlockCount+1 to iBlock * we do this_obj by XORing out cumulativeHash[iBlock-kBlockCount] */ newCount = firstPosition + UCMP32_kBlockCount; if (newCount > tempIndexCount) { for (iIndex = tempIndexCount; iIndex < newCount; ++iIndex) { tempIndex[iIndex] = (uint16_t)(iIndex - firstPosition + block); } /* endfor (iIndex = tempIndexCount....)*/ tempIndexCount = newCount; } /*endif (newCount > tempIndexCount)*/ this_obj->fIndex[iBlock] = (uint16_t)firstPosition; } /* endfor (iBlock = 1.....)*/ /* now allocate and copy the items into the array*/ tempArray = (int32_t*)uprv_malloc(tempIndexCount * sizeof(uint32_t)); if (tempArray == NULL) { this_obj->fBogus = TRUE; uprv_free(tempIndex); return; } for (iIndex = 0; iIndex < tempIndexCount; ++iIndex) { tempArray[iIndex] = this_obj->fArray[tempIndex[iIndex]]; } uprv_free(this_obj->fArray); this_obj->fArray = tempArray; this_obj->fCount = tempIndexCount; /* free up temp storage*/ uprv_free(tempIndex); this_obj->fCompact = TRUE; #ifdef _DEBUG /*the following line is useful for specific debugging purposes*/ /*fprintf(stderr, "Compacted to %ld with cycle %d\n", fCount, cycle);*/ #endif } /* endif (!this_obj->fCompact)*/ }