scuffed-code/icu4c/source/common/ucmpe32.c

523 lines
16 KiB
C
Raw Normal View History

/*
**********************************************************************
* Copyright (C) 2001, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ucmpe32.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2001aug03
* created by: Vladimir Weinstein
*
* This is basically a rip-off of trie developed by Markus for
* normalization data, but using a reduced ucmp interface
* Interface is implemented as much as required by the collation
* framework.
* This table is slow on data addition, but should support surrogates
* nicely.
*/
#include "ucmpe32.h"
#include "cmemory.h"
/* builder data ------------------------------------------------------------- */
CompactEIntArray*
ucmpe32_open(int32_t defaultValue, int32_t surrogateValue, int32_t leadSurrogateValue, UErrorCode *status)
{
int32_t *bla;
CompactEIntArray* this_obj = NULL;
if (status == NULL || U_FAILURE(*status)) {
return NULL;
}
this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray));
if (this_obj == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
this_obj->fAlias = FALSE;
this_obj->fStructSize = sizeof(CompactEIntArray);
this_obj->stage1Top = _UCMPE32_STAGE_1_MAX_COUNT;
this_obj->stage1 = (uint16_t *)uprv_malloc(_UCMPE32_STAGE_1_MAX_COUNT*sizeof(uint16_t));
if(this_obj->stage1 == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(this_obj);
return NULL;
}
/* reset stage 1 of the trie */
uprv_memset(this_obj->stage1, 0, this_obj->stage1Top*sizeof(uint16_t));
/* allocate stage 2 of the trie and reset the first block */
this_obj->stage2= (int32_t*)uprv_malloc(INIT_UCMPE32_STAGE2_SIZE*sizeof(*(this_obj->stage2)));
if(this_obj->stage2 == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(this_obj->stage1);
uprv_free(this_obj);
return NULL;
}
this_obj->stage2Size = INIT_UCMPE32_STAGE2_SIZE;
this_obj->stage2DefaultTop = 0;
this_obj->fDefaultValue = defaultValue;
this_obj->fSurrogateValue = surrogateValue;
this_obj->fLeadSurrogateValue = leadSurrogateValue;
for(bla = this_obj->stage2; bla<this_obj->stage2+INIT_UCMPE32_STAGE2_SIZE; bla++) {
/*for(bla = this_obj->stage2; bla<this_obj->stage2+_UCMPE32_STAGE_2_BLOCK_COUNT; bla++) {*/
*bla = this_obj->fDefaultValue;
}
this_obj->stage2Top = _UCMPE32_STAGE_2_BLOCK_COUNT;
this_obj->fCompact = FALSE;
this_obj->fBogus = FALSE;
this_obj->fInitPhase = TRUE;
return this_obj;
}
/*
* Set a range of UnicodeChars to the same value
*/
void
ucmpe32_setRange32(CompactEIntArray* this_obj, UChar32 start, UChar32 end, int32_t value) {
UChar32 code = 0;
uint32_t s1Start = start>>_UCMPE32_TRIE_SHIFT;
uint32_t s1End = end>>_UCMPE32_TRIE_SHIFT;
int32_t *bla;
uint32_t i = 0;
/* Allocate a block for repeat stuff */
int32_t repeatBlock = 0;
if (this_obj->fInitPhase == FALSE || this_obj->fCompact == TRUE || (start > end)) {
return;
}
/* if there is stuff that doesn't fit */
if((start & _UCMPE32_STAGE_2_MASK) != 0) { /* at the start */
s1Start++;
for(code = start; code < (UChar32)(s1Start<<_UCMPE32_TRIE_SHIFT); code++) {
ucmpe32_set32(this_obj, code, value);
}
}
repeatBlock = this_obj->stage2Top;
this_obj->stage2Top += _UCMPE32_STAGE_2_BLOCK_COUNT;
for(bla = this_obj->stage2+repeatBlock; bla<this_obj->stage2+this_obj->stage2Top; bla++) {
*bla = value;
}
for(i = s1Start; i< s1End; i++) {
this_obj->stage1[i] = (uint16_t)(repeatBlock);
}
if((end & _UCMPE32_STAGE_2_MASK) != 0) { /* at the end */
for(code = (s1End<<_UCMPE32_TRIE_SHIFT); code <= end; code++) {
ucmpe32_set32(this_obj, code, value);
}
} else {
this_obj->stage1[s1End] = (uint16_t)(repeatBlock);
}
this_obj->stage2DefaultTop = this_obj->stage2Top;
this_obj->fInitPhase = TRUE;
}
int32_t
ucmpe32_get32(CompactEIntArray* this_obj, UChar32 code) {
int16_t stage1 = (this_obj->stage1[(code >> _UCMPE32_TRIE_SHIFT)]);
int32_t offset = (code & _UCMPE32_STAGE_2_MASK);
int32_t result = this_obj->stage2[stage1 + offset];
return result;
}
/********* THIS IS THE ADD FUNCTION ********************/
void
ucmpe32_set32(CompactEIntArray* this_obj, UChar32 code, int32_t value)
{
uint16_t stage2Block, k;
if (this_obj->fCompact == TRUE) {
return;
}
this_obj->fInitPhase = FALSE;
{
uint32_t i;
uint16_t j;
i=code>>_UCMPE32_TRIE_SHIFT;
j=this_obj->stage1[i];
if(j<=this_obj->stage2DefaultTop) {
/* allocate a stage 2 block */
int32_t *p=NULL, bla=0;
if(this_obj->stage2Size < (this_obj->stage2Top + _UCMPE32_STAGE_2_BLOCK_COUNT)) {
this_obj->stage2 = (int32_t *)uprv_realloc(this_obj->stage2, 2*this_obj->stage2Size);
if(this_obj->stage2 == NULL) {
}
this_obj->stage2Size *= 2;
}
p = this_obj->stage2+this_obj->stage2Top;
for(bla = 0; bla<_UCMPE32_STAGE_2_BLOCK_COUNT; bla++) {
*(p+bla) = this_obj->stage2[j+bla];
/* fill the newly allocated block with the default values for that block */
}
this_obj->stage2Top += _UCMPE32_STAGE_2_BLOCK_COUNT;
this_obj->stage1[i]=j=(uint16_t)(p-this_obj->stage2);
}
stage2Block=j;
}
k=(uint16_t)(stage2Block+(code&_UCMPE32_STAGE_2_MASK));
this_obj->stage2[k] = value;
}
void
ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead, UChar trail, int32_t value)
{
if (this_obj->fCompact == TRUE) {
return;
}
ucmpe32_set(this_obj, (int32_t)UTF16_GET_PAIR_VALUE(lead, trail), value);
}
/*
* Fold the supplementary code point data for one lead surrogate.
*/
static uint16_t
foldLeadSurrogate(CompactEIntArray* this_obj,
uint32_t base, int32_t top) {
uint32_t leadNorm32=0;
int32_t i, j, s2;
uint32_t leadSurrogate=0xd7c0+(base>>10);
#if 0
printf("supplementary data for lead surrogate U+%04lx\n", (long)leadSurrogate);
#endif
/* calculate the 32-bit data word for the lead surrogate */
for(i=0; i<_UCMPE32_SURROGATE_BLOCK_COUNT; ++i) {
s2=this_obj->stage1[(base>>_UCMPE32_TRIE_SHIFT)+i];
if(s2!=0) {
for(j=0; j<_UCMPE32_STAGE_2_BLOCK_COUNT; ++j) {
/* basically, or all 32-bit data into the one for the lead surrogate */
leadNorm32|=this_obj->stage2[s2+j];
}
}
}
if(leadNorm32==0) {
return 0;
}
/*
* For FCD, replace the entire combined value by the surrogate index
* and make sure that it is not 0 (by not offsetting it by the BMP top,
* since here we have enough bits for this);
* lead surrogates are tested at runtime on the character code itself
* instead on special values of the trie data -
* this is because 16 bits in the FCD trie data do not allow for anything
* but the two leading and trailing combining classes of the canonical decomposition.
*/
leadNorm32= this_obj->fSurrogateValue | ((top<<_UCMPE32_TRIE_SHIFT)&~_UCMPE32_STAGE_2_MASK);
/* enter the lead surrogate's data */
s2=this_obj->stage1[leadSurrogate>>_UCMPE32_TRIE_SHIFT];
if(s2<=this_obj->stage2DefaultTop) {
/* allocate a new stage 2 block in stage (the memory is there from makeAll32()/makeFCD()) */
s2=this_obj->stage1[leadSurrogate>>_UCMPE32_TRIE_SHIFT]=(uint16_t)this_obj->stage2Top;
for(i = 0; i<_UCMPE32_STAGE_2_BLOCK_COUNT; i++) {
this_obj->stage2[this_obj->stage2Top+i] = this_obj->fLeadSurrogateValue;
}
this_obj->stage2Top+=_UCMPE32_STAGE_2_BLOCK_COUNT;
}
this_obj->stage2[s2+(leadSurrogate&_UCMPE32_STAGE_2_MASK)]=leadNorm32;
/* move the actual stage 1 indexes from the supplementary position to the new one */
uprv_memmove(this_obj->stage1+top, this_obj->stage1+(base>>_UCMPE32_TRIE_SHIFT), _UCMPE32_SURROGATE_BLOCK_COUNT*2);
/* increment stage 1 top */
return _UCMPE32_SURROGATE_BLOCK_COUNT;
}
/*
* Fold the normalization data for supplementary code points into
* a compact area on top of the BMP-part of the trie index,
* with the lead surrogates indexing this compact area.
*
* Use after makeAll32().
*/
static uint32_t
foldSupplementary(CompactEIntArray* this_obj, int32_t top) {
uint32_t c;
uint16_t i;
/* search for any stage 1 entries for supplementary code points */
for(c=0x10000; c<0x110000;) {
i=this_obj->stage1[c>>_UCMPE32_TRIE_SHIFT];
if(i!=0) {
/* there is data, treat the full block for a lead surrogate */
c&=~0x3ff;
top+=foldLeadSurrogate(this_obj, c, top);
c+=0x400;
} else {
c+=_UCMPE32_STAGE_2_BLOCK_COUNT;
}
}
#if 0
printf("trie index count: BMP %u all Unicode %lu folded %u\n",
_UCMPE32_STAGE_1_BMP_COUNT, (long)_UCMPE32_STAGE_1_MAX_COUNT, parentCount);
#endif
return top;
}
void
ucmpe32_compact(CompactEIntArray* this_obj) {
if(this_obj->fCompact == FALSE) { /* compacting can be done only once */
/*
* This function is the common implementation for compacting
* the stage 2 tables of 32-bit values.
* It is a copy of genprops/store.c's compactStage() adapted for the 32-bit stage 2 tables.
*/
static uint16_t map[0x10000>>_UCMPE32_TRIE_SHIFT];
int32_t x;
uint16_t i, start, prevEnd, newStart;
/* fold supplementary code points into lead surrogates */
this_obj->stage1Top=foldSupplementary(this_obj, _UCMPE32_STAGE_1_BMP_COUNT);
map[0]=0;
newStart=_UCMPE32_STAGE_2_BLOCK_COUNT;
for(start=newStart; start<this_obj->stage2Top;) {
prevEnd=(uint16_t)(newStart-1);
x=this_obj->stage2[start];
if(x==this_obj->stage2[prevEnd]) {
/* overlap by at least one */
for(i=1; i<_UCMPE32_STAGE_2_BLOCK_COUNT
&& x==this_obj->stage2[start+i]
&& x==this_obj->stage2[prevEnd-i]; ++i) {}
/* overlap by i */
map[start>>_UCMPE32_TRIE_SHIFT]=(uint16_t)(newStart-i);
/* move the non-overlapping indexes to their new positions */
start+=i;
for(i=(uint16_t)(_UCMPE32_STAGE_2_BLOCK_COUNT-i); i>0; --i) {
this_obj->stage2[newStart++]=this_obj->stage2[start++];
}
} else if(newStart<start) {
/* move the indexes to their new positions */
map[start>>_UCMPE32_TRIE_SHIFT]=newStart;
for(i=_UCMPE32_STAGE_2_BLOCK_COUNT; i>0; --i) {
this_obj->stage2[newStart++]=this_obj->stage2[start++];
}
} else /* no overlap && newStart==start */ {
map[start>>_UCMPE32_TRIE_SHIFT]=start;
newStart+=_UCMPE32_STAGE_2_BLOCK_COUNT;
start=newStart;
}
}
/* now adjust the stage1 table */
for(i=0; i<this_obj->stage1Top; ++i) {
this_obj->stage1[i]=map[this_obj->stage1[i]>>_UCMPE32_TRIE_SHIFT];
}
#if 0
/* we saved some space */
printf("compacting trie: count of 32-bit words %lu->%lu\n",
(long)this_obj->stage2Top, (long)newStart);
#endif
this_obj->stage2Top = newStart;
this_obj->fCompact = TRUE;
}
}
CompactEIntArray*
ucmpe32_clone(CompactEIntArray* orig, UErrorCode *status)
{
CompactEIntArray* this_obj = NULL;
if (status == NULL || U_FAILURE(*status)) {
return NULL;
}
this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray));
if(orig == NULL || orig->fBogus == TRUE || this_obj == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
this_obj->fAlias = FALSE;
this_obj->fDefaultValue = orig->fDefaultValue;
this_obj->fSurrogateValue = orig->fSurrogateValue;
this_obj->fLeadSurrogateValue = orig->fLeadSurrogateValue;
this_obj->stage1Top = orig->stage1Top;
this_obj->stage1 = (uint16_t *)uprv_malloc(this_obj->stage1Top*sizeof(uint16_t));
if(this_obj->stage1 == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(this_obj);
return NULL;
}
uprv_memcpy(this_obj->stage1, orig->stage1, this_obj->stage1Top*sizeof(uint16_t));
this_obj->stage2Size = orig->stage2Size;
this_obj->stage2DefaultTop = orig->stage2DefaultTop;
this_obj->stage2Top = orig->stage2Top;
this_obj->stage2 = (int32_t*)uprv_malloc(60000*sizeof(*(this_obj->stage2)));
if(this_obj->stage2 == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
uprv_free(this_obj->stage1);
uprv_free(this_obj);
return NULL;
}
uprv_memcpy(this_obj->stage2, orig->stage2, this_obj->stage2Top*sizeof(*(this_obj->stage2)));
this_obj->fBogus = FALSE;
this_obj->fStructSize = sizeof(CompactEIntArray);
this_obj->fCompact = orig->fCompact;
this_obj->fInitPhase = orig->fInitPhase;
return this_obj;
}
CompactEIntArray*
ucmpe32_openFromData( const uint8_t **source,
UErrorCode *status)
{
uint32_t i;
CompactEIntArray* this_obj = NULL;
if (status == NULL || U_FAILURE(*status)) {
return NULL;
}
this_obj = (CompactEIntArray*) uprv_malloc(sizeof(CompactEIntArray));
if(*source == NULL || this_obj == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
i = * ((const uint32_t*) *source);
(*source) += 4;
if(i != ICU_UCMPE32_VERSION)
{
*status = U_INVALID_FORMAT_ERROR;
return NULL;
}
this_obj->fAlias = TRUE;
this_obj->stage1 = NULL;
this_obj->stage2 = NULL;
this_obj->fBogus = FALSE;
this_obj->fStructSize = sizeof(CompactEIntArray);
this_obj->fCompact = TRUE;
this_obj->stage1Top = * ((const uint32_t*)*source);
(*source) += 4;
this_obj->stage1 = (uint16_t*) *source;
(*source) += sizeof(this_obj->stage1[0])*this_obj->stage1Top;
this_obj->stage2Top = * ((const uint32_t*)*source);
(*source) += 4;
this_obj->stage2 = (int32_t*) *source;
(*source) += sizeof(this_obj->stage2[0])*this_obj->stage2Top;
return this_obj;
}
uint32_t
ucmpe32_flattenMem (const CompactEIntArray* this_obj, UMemoryStream *MS)
{
/* This dumps stuff in memory */
/* there is no padding, as there is always an even number of 16-bit values */
/* (stage1), so everything is always 32 bit aligned */
int32_t size = 0;
if(this_obj->fCompact == TRUE) {
uprv_mstrm_write32(MS, ICU_UCMPE32_VERSION);
size += 4;
uprv_mstrm_write32(MS, this_obj->stage1Top);
size += 4;
uprv_mstrm_writeBlock(MS, this_obj->stage1, this_obj->stage1Top*sizeof(this_obj->stage1[0]));
size += this_obj->stage1Top*sizeof(this_obj->stage1[0]);
uprv_mstrm_write32(MS, this_obj->stage2Top);
size += 4;
uprv_mstrm_writeBlock(MS, this_obj->stage2, this_obj->stage2Top*sizeof(this_obj->stage2[0]));
size += this_obj->stage2Top*sizeof(this_obj->stage2[0]);
}
return size;
}
/*=======================================================*/
void ucmpe32_close(CompactEIntArray* this_obj)
{
if(this_obj != NULL) {
if(this_obj->fAlias == FALSE) {
if(this_obj->stage1 != NULL) {
uprv_free(this_obj->stage1);
}
if(this_obj->stage2 != NULL) {
uprv_free(this_obj->stage2);
}
}
uprv_free(this_obj);
}
}
int32_t
ucmpe32_getSurrogateEx(CompactEIntArray *array, UChar lead, UChar trail) {
if(array->fCompact == FALSE) {
return(ucmpe32_get(array, (int32_t)UTF16_GET_PAIR_VALUE(lead, trail)));
} else {
return(ucmpe32_getSurrogate(array, ucmpe32_get(array, lead), trail));
}
}
/*=======================================================*/
/* retrieval stuff as functions */
#if 0
int32_t
ucmpe32_get32(CompactEIntArray *this_obj, UChar32 index) {
int32_t index_lookup = this_obj->stage1[index >> _UCMPE32_TRIE_SHIFT] ;
int32_t addition = (index & _UCMPE32_STAGE_2_MASK);
return (this_obj->stage2[index_lookup + addition]);
}
/* Lead surrogate data needs to be in the following format: */
/* F50XXY000 - where X mask is 1111 (F) and Y mask is 1100 (C) */
/* The ten bits for access will be in the middle of the field */
int32_t
ucmpe32_getSurrogate(CompactEIntArray *array, int32_t leadValue32, UChar trail) {
int32_t c = ((leadValue32 & 0xffc00) | (trail & 0x3ff));
int32_t index_lookup = array->stage1[(c >> _UCMPE32_TRIE_SHIFT)];
int32_t addition = (c & _UCMPE32_STAGE_2_MASK);
return (array->stage2[index_lookup+ addition]);
}
#endif