scuffed-code/icu4c/source/common/uhash.c

/*
**********************************************************************
*   Copyright (C) 1997-1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*
* File uhash.c
*
* Modification History:
*
*   Date        Name        Description
*   03/12/99    stephen     Creation.
*******************************************************************************
*/

#include "uhash.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "cmemory.h"

/* Private function prototypes */
void
uhash_initialize(UHashtable *hash,
         int32_t primeIndex,
         UErrorCode *status);

int32_t 
uhash_leastGreaterPrimeIndex(int32_t source);

void
uhash_rehash(UHashtable *hash,
         UErrorCode *status);

void
uhash_putInternal(UHashtable *hash,
          int32_t hashCode, 
          void *value);

int32_t 
uhash_find(const UHashtable *hash,
       int32_t hashCode);


/*
  INVARIANT: the size of the table MUST be prime for this algorithm to work!
  Prime table can be tuned for different performance/storage characteristics
  We avoid computing primes by precomputing a table that we use.
*/
int32_t UHASH_PRIMES [] =
{
  17, 37, 67, 131, 257,
  521, 1031, 2053, 4099, 8209, 16411, 32771, 65537,
  131101, 262147, 524309, 1048583, 2097169, 4194319, 8388617, 16777259,
  33554467, 67108879, 134217757, 268435459, 536870923, 1073741827, 2147483647
};

#define UHASH_PRIMES_LENGTH 28

#define UHASH_SLOT_DELETED      ((int32_t) 0x80000000)
#define UHASH_SLOT_EMPTY      ((int32_t) UHASH_SLOT_DELETED + 1)
#define UHASH_MAX_UNUSED      ((int32_t) UHASH_SLOT_EMPTY)

/*
  INVARIANTS:
  DELETED <= MAX_UNUSED
  EMPTY <= MAX_UNUSED
  Any hash > MAX_UNUSED*
  * hashcodes may not start out this way, but internally, 
  they are adjusted so that they are always positive, and this is always true.
  Note here that we are assuming 32-bit ints.
*/

U_CAPI UHashtable*
uhash_open(UHashFunction func,
       UErrorCode *status)
{
  UHashtable* myUHT =  uhash_openSize(func, 3, status);
  if (U_SUCCESS(*status)) myUHT->isGrowable = TRUE;

  return myUHT;
}

U_CAPI UHashtable*
uhash_openSize(UHashFunction func,
	       int32_t size,
	       UErrorCode *status)
{
  UHashtable *result;
  
  if(U_FAILURE(*status)) return NULL;
  
  result = (UHashtable*) uprv_malloc(sizeof(UHashtable));
  if(result == 0) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return 0;
  }
  
  result->highWaterFactor     = 0.5F;
  result->lowWaterFactor     = 0.0F;
  result->hashFunction         = func;
  result->valueDelete        = NULL;
  result->toBeDeleted        = NULL;
  result->toBeDeletedCount    = 0;
  result->isGrowable    = FALSE;

  uhash_initialize(result, uhash_leastGreaterPrimeIndex(size), status);

  if(U_FAILURE(*status)) {
    uprv_free(result);
    return 0;
  }

  return result;
}

U_CAPI void
uhash_setValueDeleter(UHashtable *hash, ValueDeleter del )
{
    hash->valueDelete = del;
}

U_CAPI void
uhash_close(UHashtable *hash)
{
  if (hash->valueDelete)
  {
      ValueDeleter my_free = hash->valueDelete;
      void** vals = hash->values;
      void** toBeDeleted = hash->toBeDeleted;
      int32_t i;
      int32_t count = hash->count;
      int32_t toBeDeletedCount = hash->toBeDeletedCount;
      for (i = 0; i < count; i++)  my_free(vals[i]);
      while (toBeDeletedCount--)    my_free(toBeDeleted[toBeDeletedCount]);

  }
  uprv_free(hash->values);
  uprv_free(hash->hashes);
  uprv_free(hash->toBeDeleted);
}

U_CAPI int32_t
uhash_size(const UHashtable *hash)
{
  return hash->count;
}
U_CAPI int32_t
uhash_putKey(UHashtable *hash,
         int32_t valueKey,
         void *value,
         UErrorCode *status)
{
  /* Put finds the position in the table for the new value. */
  
  int32_t hashCode;
  int32_t index;
  
  if(U_FAILURE(*status)) return UHASH_INVALID;

  if(hash->count > hash->highWaterMark) {
    if (hash->isGrowable)    uhash_rehash(hash, status);
    else  {
      *status = U_INDEX_OUTOFBOUNDS_ERROR;
      return UHASH_INVALID;
    }
  }

  hashCode     = valueKey;
  index     = uhash_find(hash, hashCode);
  
  /* deleted or empty */
  if(hash->hashes[index] <= UHASH_MAX_UNUSED) {
    /* make new object */
    hash->hashes[index] = hashCode;
    ++(hash->count);
  }
  
  /* delete old value? */
  if (hash->valueDelete) 
    {
      void * result = hash->values[index];
    if (result != value) /*Make sure the same object isn't scheduled for a double deletion*/
      {
        hash->toBeDeleted = (void**) uprv_realloc(hash->toBeDeleted, sizeof(void*)*(++(hash->toBeDeletedCount)));
        hash->toBeDeleted[(hash->toBeDeletedCount)-1] = result;
      }
      hash->values[index] = 0;
    }
  
  
  /* store value */
  hash->values[index] = value;
  
  /* return the hash code to the user */
  return hashCode;
}

U_CAPI int32_t
uhash_put(UHashtable *hash,
      void *value,
      UErrorCode *status)
{
  /* Put finds the position in the table for the new value. */
  
  int32_t hashCode;
  int32_t index;
  
  if(U_FAILURE(*status)) return UHASH_INVALID;

  if(hash->count > hash->highWaterMark) {
    if (hash->isGrowable)    uhash_rehash(hash, status);
    else  {
      *status = U_INDEX_OUTOFBOUNDS_ERROR;
      return UHASH_INVALID;
    }
  }

  hashCode     = (hash->hashFunction)(value);
  index     = uhash_find(hash, hashCode);
  
  /* deleted or empty */
  if(hash->hashes[index] <= UHASH_MAX_UNUSED) {
    /* make new object */
    hash->hashes[index] = hashCode;
    ++(hash->count);
  }
  
  /* delete old value? */
  if (hash->valueDelete) 
    {
      void* result = hash->values[index];
    if (result != value) /*Make sure the same object isn't scheduled for a double deletion*/
      {
        hash->toBeDeleted = (void**) uprv_realloc(hash->toBeDeleted,
                         sizeof(void*)*(++(hash->toBeDeletedCount)));
        hash->toBeDeleted[(hash->toBeDeletedCount)-1] = result;
      }
      hash->values[index] = 0;
    }
  
    
  /* store value */
  hash->values[index] = value;

  /* return the hash code to the user */
  return hashCode;
}

U_CAPI void*
uhash_get(const UHashtable *hash, 
      int32_t key)
{
	/* srl: Shouldn't we check to see if hash->hashes[uhash_find(hash, key)] == UHASH_SLOT_DELETED ?
		Perhaps in theory hash->values[...] should have been set to 0, but can we depend
		on this?
	 */

  void *result         = hash->values[uhash_find(hash, key)];
  return result;  
}

U_CAPI void*
uhash_remove(UHashtable *hash,
         int32_t key,
         UErrorCode *status)
{
  /*
    First find the position of the key in the table
    If the object has not be removed already, remove it.
    We have to put a special value in that position that means that
    something has been deleted, since when we do a find,
    we have to continue PAST any deleted values
  */
  int32_t index     = uhash_find(hash, key);
  void *result         = 0;

  /* neither deleted nor empty */
  if(hash->hashes[index] > UHASH_MAX_UNUSED) {
    /* set to deleted */
    hash->hashes[index] = UHASH_SLOT_DELETED;
    result = hash->values[index];

    /* delete old value? */
    if (hash->valueDelete) 
    {
        hash->valueDelete(result);
    }
    hash->values[index] = 0; /* srl .. always null out the value even if there's no deletor!! */

    --(hash->count);

    if(hash->count < hash->lowWaterMark) {
      uhash_rehash(hash, status);
    }
  }
  
  return result;
}

U_CAPI void*
uhash_nextElement(const UHashtable *hash,
          int32_t *pos)
{
  /*
    Walk through the array until you find an element that is not EMPTY and 
    not DELETED
  */
  
  int32_t i;
  void *value;
  
  for(i = *pos + 1; i < hash->length; ++i) {
    if(hash->hashes[i] > UHASH_MAX_UNUSED) {
      *pos = i;
      value = hash->values[i];
      return value;
    }
  }
  
  /* No more elements */
  return 0; 
}

/* ================================================== */
/* Private functions */
/* ================================================== */
void
uhash_initialize(UHashtable *hash,
         int32_t primeIndex,
         UErrorCode *status)
{
  int32_t i;
  
  if(U_FAILURE(*status)) return;

  if(primeIndex < 0) {
    primeIndex = 0;
  }
  else if(primeIndex >= UHASH_PRIMES_LENGTH) {
    primeIndex = UHASH_PRIMES_LENGTH - 1;
  }
  
  hash->primeIndex     = primeIndex;
  hash->length         = UHASH_PRIMES[primeIndex];

  hash->values         = (void**) uprv_malloc(sizeof(void*) * hash->length);
  if(hash->values == 0) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return;
  }

  hash->hashes         = (int32_t*) uprv_malloc(sizeof(int32_t) * hash->length);
  if(hash->values == 0) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    uprv_free(hash->values);
    return;
  }

  for(i = 0; i < hash->length; ++i) {
    hash->hashes[i]     = UHASH_SLOT_EMPTY;
    hash->values[i]     = 0;
  }
  
  hash->count         = 0;
  hash->lowWaterMark     = (int32_t)(hash->length * hash->lowWaterFactor);
  hash->highWaterMark     = (int32_t)(hash->length * hash->highWaterFactor);
}

int32_t 
uhash_leastGreaterPrimeIndex(int32_t source)
{
  int32_t i;
  for(i = 0; i < UHASH_PRIMES_LENGTH; ++i) {
    if(source < UHASH_PRIMES[i]) {
      break;
    }
  }
  return (i == 0 ? 0 : (i - 1));
}

void
uhash_rehash(UHashtable *hash,
         UErrorCode *status)
{
  /*
    Rebuild the table from the start. This clears out deadwood, in case
    we have a lot of deleted values. See Find
    It is also used when the table grows or shrinks a lot.
    INVARIANT: The size of the table MUST be prime for this algorithm to work!
  */
  
  void         **oldValues     = hash->values;
  int32_t     *oldHashList     = hash->hashes;
  int32_t     old_length     = hash->length;
  int32_t     newPrimeIndex     = hash->primeIndex;
  int32_t     i;

  if(U_FAILURE(*status)) return;

  if(hash->count > hash->highWaterMark) {
    ++newPrimeIndex;
  }
  else if(hash->count < hash->lowWaterMark) {
    newPrimeIndex -= 2;
  }

  uhash_initialize(hash, newPrimeIndex, status);
  for(i = old_length - 1; i >= 0; --i) {
    void *value = oldValues[i];
    if(value != 0) {
      uhash_putInternal(hash, oldHashList[i], value);
    }
  }
  
  uprv_free(oldValues);
  uprv_free(oldHashList);
}

void
uhash_putInternal(UHashtable *hash,
          int32_t hashCode, 
          void *value)
{
  int32_t index = uhash_find(hash, hashCode);
  if(hash->hashes[index] <= UHASH_MAX_UNUSED) {  
    /* deleted or empty */
    hash->hashes[index] = hashCode;               
    ++(hash->count);
  }
  
  /* reset value */
  hash->values[index] = value; 
}

int32_t 
uhash_find(const UHashtable *hash,
       int32_t hashCode)
{
  /*
    This is the key routine. It looks for a particular key in the following 
    way. First find the start position, which is basically the key modulo the
    length. Test it to see if it is 
    a. Identical (same hash values)
    b. Deleted
    c. Empty
    Stop if it is identical or empty, otherwise continue by adding a "jump"
    value (moduloing by the length again to keep it within range) and
    retesting. For efficiency, it needs enough empty values so that the
    searches stop within a reasonable amount of time. This can be changed by
    changing the high/low water marks.
    INVARIANT: the size of the table MUST be prime for this algorithm to work!
  */

  int32_t firstDeleted     = -1; 
  int32_t index     = (hashCode ^ 0x4000000) % hash->length;
  int32_t jump         = 0;

  while(TRUE) {
    int32_t tableHash = hash->hashes[index];

    /* Compare hash codes */
    if(tableHash == hashCode) {     
      return index;
    } 
    
    /* neither correct nor unused */
    else if(tableHash > UHASH_MAX_UNUSED) {   
      /* ignore */
    }

    /* empty, end o' the line */
    else if(tableHash == UHASH_SLOT_EMPTY) {   
      if(firstDeleted >= 0) {
    /* reset if had deleted slot */
    index = firstDeleted;   
      }
      return index;
    }
    
    /* remember first deleted */
    else if(firstDeleted < 0) { 
      firstDeleted = index;
    }
    
    /* lazy compute jump */
    if(jump == 0) {
      jump = (hashCode % (hash->length - 1)) + 1;
    }
    
    index = (index + jump) % hash->length;
  }

  /* This never happens -- just make the compiler happy */
  return -1;
}

/* Predefined hash functions */

U_CAPI int32_t
uhash_hashUString(const void *parm)
{
  if(parm != NULL) {
    const UChar *key     = (const UChar*) parm;
    int32_t len         = u_strlen(key);
    int32_t hash         = UHASH_INVALID;
    const UChar *limit     = key + len;
    int32_t inc         = (len >= 128 ? len/64 : 1);

    /*
      We compute the hash by iterating sparsely over 64 (at most) characters
      spaced evenly through the string.  For each character, we multiply the
      previous hash value by a prime number and add the new character in,
      in the manner of a additive linear congruential random number generator,
      thus producing a pseudorandom deterministic value which should be well
      distributed over the output range. [LIU]
    */
  
    while(key < limit) {
      hash = (hash * 37) + *key;
      key += inc;
    }
  
    if(hash == UHASH_INVALID) {
      hash = UHASH_EMPTY;
    }
  
    return hash & 0x7FFFFFFF;
  } else {
    return UHASH_INVALID;
  }
}

U_CAPI int32_t
uhash_hashString(const void *parm)
{
  if(parm != NULL) {
    const char *key     = (const char*) parm;
    int32_t len         = uprv_strlen(key);
    int32_t hash         = UHASH_INVALID;
    const char *limit     = key + len;
    int32_t inc         = (len >= 128 ? len/64 : 1);

    /*
      We compute the hash by iterating sparsely over 64 (at most) characters
      spaced evenly through the string.  For each character, we multiply the
      previous hash value by a prime number and add the new character in,
      in the manner of a additive linear congruential random number generator,
      thus producing a pseudorandom deterministic value which should be well
      distributed over the output range. [LIU]
    */
  
    while(key < limit) {
      hash = (hash * 37) + *key;
      key += inc;
    }
  
    if(hash == UHASH_INVALID) {
      hash = UHASH_EMPTY;
    }
  
    return hash & 0x7FFFFFFF;
  } else {
    return UHASH_INVALID;
  }
}

U_CAPI int32_t
uhash_hashLong(const void *parm)
{
  int32_t hash = (int32_t) parm;
  return (int32_t) (hash & 0x7FFFFFFF);
}